fs/btrfs/disk-io.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/fs.h>
   7 #include <linux/blkdev.h>
   8 #include <linux/radix-tree.h>
   9 #include <linux/writeback.h>
  10 #include <linux/workqueue.h>
  11 #include <linux/kthread.h>
  12 #include <linux/slab.h>
  13 #include <linux/migrate.h>
  14 #include <linux/ratelimit.h>
  15 #include <linux/uuid.h>
  16 #include <linux/semaphore.h>
  17 #include <linux/error-injection.h>
  18 #include <linux/crc32c.h>
  19 #include <linux/sched/mm.h>
  20 #include <asm/unaligned.h>
  21 #include <crypto/hash.h>
  22 #include "ctree.h"
  23 #include "disk-io.h"
  24 #include "transaction.h"
  25 #include "btrfs_inode.h"
  26 #include "bio.h"
  27 #include "print-tree.h"
  28 #include "locking.h"
  29 #include "tree-log.h"
  30 #include "free-space-cache.h"
  31 #include "free-space-tree.h"
  32 #include "check-integrity.h"
  33 #include "rcu-string.h"
  34 #include "dev-replace.h"
  35 #include "raid56.h"
  36 #include "sysfs.h"
  37 #include "qgroup.h"
  38 #include "compression.h"
  39 #include "tree-checker.h"
  40 #include "ref-verify.h"
  41 #include "block-group.h"
  42 #include "discard.h"
  43 #include "space-info.h"
  44 #include "zoned.h"
  45 #include "subpage.h"
  46 #include "fs.h"
  47 #include "accessors.h"
  48 #include "extent-tree.h"
  49 #include "root-tree.h"
  50 #include "defrag.h"
  51 #include "uuid-tree.h"
  52 #include "relocation.h"
  53 #include "scrub.h"
  54 #include "super.h"
  55
  56 #define BTRFS_SUPER_FLAG_SUPP   (BTRFS_HEADER_FLAG_WRITTEN |\
  57                                  BTRFS_HEADER_FLAG_RELOC |\
  58                                  BTRFS_SUPER_FLAG_ERROR |\
  59                                  BTRFS_SUPER_FLAG_SEEDING |\
  60                                  BTRFS_SUPER_FLAG_METADUMP |\
  61                                  BTRFS_SUPER_FLAG_METADUMP_V2)
  62
  63 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
  64 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
  65
  66 static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
  67 {
  68         if (fs_info->csum_shash)
  69                 crypto_free_shash(fs_info->csum_shash);
  70 }
  71
  72 /*
  73  * Compute the csum of a btree block and store the result to provided buffer.
  74  */
  75 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
  76 {
  77         struct btrfs_fs_info *fs_info = buf->fs_info;
  78         const int num_pages = num_extent_pages(buf);
  79         const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
  80         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
  81         char *kaddr;
  82         int i;
  83
  84         shash->tfm = fs_info->csum_shash;
  85         crypto_shash_init(shash);
  86         kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
  87         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
  88                             first_page_part - BTRFS_CSUM_SIZE);
  89
  90         for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
  91                 kaddr = page_address(buf->pages[i]);
  92                 crypto_shash_update(shash, kaddr, PAGE_SIZE);
  93         }
  94         memset(result, 0, BTRFS_CSUM_SIZE);
  95         crypto_shash_final(shash, result);
  96 }
  97
  98 /*
  99  * we can't consider a given block up to date unless the transid of the
 100  * block matches the transid in the parent node's pointer.  This is how we
 101  * detect blocks that either didn't get written at all or got written
 102  * in the wrong place.
 103  */
 104 int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
 105 {
 106         if (!extent_buffer_uptodate(eb))
 107                 return 0;
 108
 109         if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 110                 return 1;
 111
 112         if (atomic)
 113                 return -EAGAIN;
 114
 115         if (!extent_buffer_uptodate(eb) ||
 116             btrfs_header_generation(eb) != parent_transid) {
 117                 btrfs_err_rl(eb->fs_info,
 118 "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
 119                         eb->start, eb->read_mirror,
 120                         parent_transid, btrfs_header_generation(eb));
 121                 clear_extent_buffer_uptodate(eb);
 122                 return 0;
 123         }
 124         return 1;
 125 }
 126
 127 static bool btrfs_supported_super_csum(u16 csum_type)
 128 {
 129         switch (csum_type) {
 130         case BTRFS_CSUM_TYPE_CRC32:
 131         case BTRFS_CSUM_TYPE_XXHASH:
 132         case BTRFS_CSUM_TYPE_SHA256:
 133         case BTRFS_CSUM_TYPE_BLAKE2:
 134                 return true;
 135         default:
 136                 return false;
 137         }
 138 }
 139
 140 /*
 141  * Return 0 if the superblock checksum type matches the checksum value of that
 142  * algorithm. Pass the raw disk superblock data.
 143  */
 144 int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
 145                            const struct btrfs_super_block *disk_sb)
 146 {
 147         char result[BTRFS_CSUM_SIZE];
 148         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 149
 150         shash->tfm = fs_info->csum_shash;
 151
 152         /*
 153          * The super_block structure does not span the whole
 154          * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
 155          * filled with zeros and is included in the checksum.
 156          */
 157         crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
 158                             BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
 159
 160         if (memcmp(disk_sb->csum, result, fs_info->csum_size))
 161                 return 1;
 162
 163         return 0;
 164 }
 165
 166 static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
 167                                       int mirror_num)
 168 {
 169         struct btrfs_fs_info *fs_info = eb->fs_info;
 170         int i, num_pages = num_extent_pages(eb);
 171         int ret = 0;
 172
 173         if (sb_rdonly(fs_info->sb))
 174                 return -EROFS;
 175
 176         for (i = 0; i < num_pages; i++) {
 177                 struct page *p = eb->pages[i];
 178                 u64 start = max_t(u64, eb->start, page_offset(p));
 179                 u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
 180                 u32 len = end - start;
 181
 182                 ret = btrfs_repair_io_failure(fs_info, 0, start, len,
 183                                 start, p, offset_in_page(start), mirror_num);
 184                 if (ret)
 185                         break;
 186         }
 187
 188         return ret;
 189 }
 190
 191 /*
 192  * helper to read a given tree block, doing retries as required when
 193  * the checksums don't match and we have alternate mirrors to try.
 194  *
 195  * @check:              expected tree parentness check, see the comments of the
 196  *                      structure for details.
 197  */
 198 int btrfs_read_extent_buffer(struct extent_buffer *eb,
 199                              struct btrfs_tree_parent_check *check)
 200 {
 201         struct btrfs_fs_info *fs_info = eb->fs_info;
 202         int failed = 0;
 203         int ret;
 204         int num_copies = 0;
 205         int mirror_num = 0;
 206         int failed_mirror = 0;
 207
 208         ASSERT(check);
 209
 210         while (1) {
 211                 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 212                 ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
 213                 if (!ret)
 214                         break;
 215
 216                 num_copies = btrfs_num_copies(fs_info,
 217                                               eb->start, eb->len);
 218                 if (num_copies == 1)
 219                         break;
 220
 221                 if (!failed_mirror) {
 222                         failed = 1;
 223                         failed_mirror = eb->read_mirror;
 224                 }
 225
 226                 mirror_num++;
 227                 if (mirror_num == failed_mirror)
 228                         mirror_num++;
 229
 230                 if (mirror_num > num_copies)
 231                         break;
 232         }
 233
 234         if (failed && !ret && failed_mirror)
 235                 btrfs_repair_eb_io_failure(eb, failed_mirror);
 236
 237         return ret;
 238 }
 239
 240 /*
 241  * Checksum a dirty tree block before IO.
 242  */
 243 blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
 244 {
 245         struct extent_buffer *eb = bbio->private;
 246         struct btrfs_fs_info *fs_info = eb->fs_info;
 247         u64 found_start = btrfs_header_bytenr(eb);
 248         u8 result[BTRFS_CSUM_SIZE];
 249         int ret;
 250
 251         /* Btree blocks are always contiguous on disk. */
 252         if (WARN_ON_ONCE(bbio->file_offset != eb->start))
 253                 return BLK_STS_IOERR;
 254         if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
 255                 return BLK_STS_IOERR;
 256
 257         if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
 258                 WARN_ON_ONCE(found_start != 0);
 259                 return BLK_STS_OK;
 260         }
 261
 262         if (WARN_ON_ONCE(found_start != eb->start))
 263                 return BLK_STS_IOERR;
 264         if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
 265                                               eb->len)))
 266                 return BLK_STS_IOERR;
 267
 268         ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
 269                                     offsetof(struct btrfs_header, fsid),
 270                                     BTRFS_FSID_SIZE) == 0);
 271         csum_tree_block(eb, result);
 272
 273         if (btrfs_header_level(eb))
 274                 ret = btrfs_check_node(eb);
 275         else
 276                 ret = btrfs_check_leaf(eb);
 277
 278         if (ret < 0)
 279                 goto error;
 280
 281         /*
 282          * Also check the generation, the eb reached here must be newer than
 283          * last committed. Or something seriously wrong happened.
 284          */
 285         if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
 286                 ret = -EUCLEAN;
 287                 btrfs_err(fs_info,
 288                         "block=%llu bad generation, have %llu expect > %llu",
 289                           eb->start, btrfs_header_generation(eb),
 290                           fs_info->last_trans_committed);
 291                 goto error;
 292         }
 293         write_extent_buffer(eb, result, 0, fs_info->csum_size);
 294         return BLK_STS_OK;
 295
 296 error:
 297         btrfs_print_tree(eb, 0);
 298         btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
 299                   eb->start);
 300         /*
 301          * Be noisy if this is an extent buffer from a log tree. We don't abort
 302          * a transaction in case there's a bad log tree extent buffer, we just
 303          * fallback to a transaction commit. Still we want to know when there is
 304          * a bad log tree extent buffer, as that may signal a bug somewhere.
 305          */
 306         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
 307                 btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
 308         return errno_to_blk_status(ret);
 309 }
 310
 311 static bool check_tree_block_fsid(struct extent_buffer *eb)
 312 {
 313         struct btrfs_fs_info *fs_info = eb->fs_info;
 314         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
 315         u8 fsid[BTRFS_FSID_SIZE];
 316         u8 *metadata_uuid;
 317
 318         read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
 319                            BTRFS_FSID_SIZE);
 320         /*
 321          * Checking the incompat flag is only valid for the current fs. For
 322          * seed devices it's forbidden to have their uuid changed so reading
 323          * ->fsid in this case is fine
 324          */
 325         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
 326                 metadata_uuid = fs_devices->metadata_uuid;
 327         else
 328                 metadata_uuid = fs_devices->fsid;
 329
 330         if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
 331                 return false;
 332
 333         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
 334                 if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
 335                         return false;
 336
 337         return true;
 338 }
 339
 340 /* Do basic extent buffer checks at read time */
 341 int btrfs_validate_extent_buffer(struct extent_buffer *eb,
 342                                  struct btrfs_tree_parent_check *check)
 343 {
 344         struct btrfs_fs_info *fs_info = eb->fs_info;
 345         u64 found_start;
 346         const u32 csum_size = fs_info->csum_size;
 347         u8 found_level;
 348         u8 result[BTRFS_CSUM_SIZE];
 349         const u8 *header_csum;
 350         int ret = 0;
 351
 352         ASSERT(check);
 353
 354         found_start = btrfs_header_bytenr(eb);
 355         if (found_start != eb->start) {
 356                 btrfs_err_rl(fs_info,
 357                         "bad tree block start, mirror %u want %llu have %llu",
 358                              eb->read_mirror, eb->start, found_start);
 359                 ret = -EIO;
 360                 goto out;
 361         }
 362         if (check_tree_block_fsid(eb)) {
 363                 btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
 364                              eb->start, eb->read_mirror);
 365                 ret = -EIO;
 366                 goto out;
 367         }
 368         found_level = btrfs_header_level(eb);
 369         if (found_level >= BTRFS_MAX_LEVEL) {
 370                 btrfs_err(fs_info,
 371                         "bad tree block level, mirror %u level %d on logical %llu",
 372                         eb->read_mirror, btrfs_header_level(eb), eb->start);
 373                 ret = -EIO;
 374                 goto out;
 375         }
 376
 377         csum_tree_block(eb, result);
 378         header_csum = page_address(eb->pages[0]) +
 379                 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
 380
 381         if (memcmp(result, header_csum, csum_size) != 0) {
 382                 btrfs_warn_rl(fs_info,
 383 "checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
 384                               eb->start, eb->read_mirror,
 385                               CSUM_FMT_VALUE(csum_size, header_csum),
 386                               CSUM_FMT_VALUE(csum_size, result),
 387                               btrfs_header_level(eb));
 388                 ret = -EUCLEAN;
 389                 goto out;
 390         }
 391
 392         if (found_level != check->level) {
 393                 btrfs_err(fs_info,
 394                 "level verify failed on logical %llu mirror %u wanted %u found %u",
 395                           eb->start, eb->read_mirror, check->level, found_level);
 396                 ret = -EIO;
 397                 goto out;
 398         }
 399         if (unlikely(check->transid &&
 400                      btrfs_header_generation(eb) != check->transid)) {
 401                 btrfs_err_rl(eb->fs_info,
 402 "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
 403                                 eb->start, eb->read_mirror, check->transid,
 404                                 btrfs_header_generation(eb));
 405                 ret = -EIO;
 406                 goto out;
 407         }
 408         if (check->has_first_key) {
 409                 struct btrfs_key *expect_key = &check->first_key;
 410                 struct btrfs_key found_key;
 411
 412                 if (found_level)
 413                         btrfs_node_key_to_cpu(eb, &found_key, 0);
 414                 else
 415                         btrfs_item_key_to_cpu(eb, &found_key, 0);
 416                 if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
 417                         btrfs_err(fs_info,
 418 "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
 419                                   eb->start, check->transid,
 420                                   expect_key->objectid,
 421                                   expect_key->type, expect_key->offset,
 422                                   found_key.objectid, found_key.type,
 423                                   found_key.offset);
 424                         ret = -EUCLEAN;
 425                         goto out;
 426                 }
 427         }
 428         if (check->owner_root) {
 429                 ret = btrfs_check_eb_owner(eb, check->owner_root);
 430                 if (ret < 0)
 431                         goto out;
 432         }
 433
 434         /*
 435          * If this is a leaf block and it is corrupt, set the corrupt bit so
 436          * that we don't try and read the other copies of this block, just
 437          * return -EIO.
 438          */
 439         if (found_level == 0 && btrfs_check_leaf(eb)) {
 440                 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 441                 ret = -EIO;
 442         }
 443
 444         if (found_level > 0 && btrfs_check_node(eb))
 445                 ret = -EIO;
 446
 447         if (ret)
 448                 btrfs_err(fs_info,
 449                 "read time tree block corruption detected on logical %llu mirror %u",
 450                           eb->start, eb->read_mirror);
 451 out:
 452         return ret;
 453 }
 454
 455 #ifdef CONFIG_MIGRATION
 456 static int btree_migrate_folio(struct address_space *mapping,
 457                 struct folio *dst, struct folio *src, enum migrate_mode mode)
 458 {
 459         /*
 460          * we can't safely write a btree page from here,
 461          * we haven't done the locking hook
 462          */
 463         if (folio_test_dirty(src))
 464                 return -EAGAIN;
 465         /*
 466          * Buffers may be managed in a filesystem specific way.
 467          * We must have no buffers or drop them.
 468          */
 469         if (folio_get_private(src) &&
 470             !filemap_release_folio(src, GFP_KERNEL))
 471                 return -EAGAIN;
 472         return migrate_folio(mapping, dst, src, mode);
 473 }
 474 #else
 475 #define btree_migrate_folio NULL
 476 #endif
 477
 478 static int btree_writepages(struct address_space *mapping,
 479                             struct writeback_control *wbc)
 480 {
 481         struct btrfs_fs_info *fs_info;
 482         int ret;
 483
 484         if (wbc->sync_mode == WB_SYNC_NONE) {
 485
 486                 if (wbc->for_kupdate)
 487                         return 0;
 488
 489                 fs_info = BTRFS_I(mapping->host)->root->fs_info;
 490                 /* this is a bit racy, but that's ok */
 491                 ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
 492                                              BTRFS_DIRTY_METADATA_THRESH,
 493                                              fs_info->dirty_metadata_batch);
 494                 if (ret < 0)
 495                         return 0;
 496         }
 497         return btree_write_cache_pages(mapping, wbc);
 498 }
 499
 500 static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
 501 {
 502         if (folio_test_writeback(folio) || folio_test_dirty(folio))
 503                 return false;
 504
 505         return try_release_extent_buffer(&folio->page);
 506 }
 507
 508 static void btree_invalidate_folio(struct folio *folio, size_t offset,
 509                                  size_t length)
 510 {
 511         struct extent_io_tree *tree;
 512         tree = &BTRFS_I(folio->mapping->host)->io_tree;
 513         extent_invalidate_folio(tree, folio, offset);
 514         btree_release_folio(folio, GFP_NOFS);
 515         if (folio_get_private(folio)) {
 516                 btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
 517                            "folio private not zero on folio %llu",
 518                            (unsigned long long)folio_pos(folio));
 519                 folio_detach_private(folio);
 520         }
 521 }
 522
 523 #ifdef DEBUG
 524 static bool btree_dirty_folio(struct address_space *mapping,
 525                 struct folio *folio)
 526 {
 527         struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
 528         struct btrfs_subpage *subpage;
 529         struct extent_buffer *eb;
 530         int cur_bit = 0;
 531         u64 page_start = folio_pos(folio);
 532
 533         if (fs_info->sectorsize == PAGE_SIZE) {
 534                 eb = folio_get_private(folio);
 535                 BUG_ON(!eb);
 536                 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 537                 BUG_ON(!atomic_read(&eb->refs));
 538                 btrfs_assert_tree_write_locked(eb);
 539                 return filemap_dirty_folio(mapping, folio);
 540         }
 541         subpage = folio_get_private(folio);
 542
 543         ASSERT(subpage->dirty_bitmap);
 544         while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
 545                 unsigned long flags;
 546                 u64 cur;
 547                 u16 tmp = (1 << cur_bit);
 548
 549                 spin_lock_irqsave(&subpage->lock, flags);
 550                 if (!(tmp & subpage->dirty_bitmap)) {
 551                         spin_unlock_irqrestore(&subpage->lock, flags);
 552                         cur_bit++;
 553                         continue;
 554                 }
 555                 spin_unlock_irqrestore(&subpage->lock, flags);
 556                 cur = page_start + cur_bit * fs_info->sectorsize;
 557
 558                 eb = find_extent_buffer(fs_info, cur);
 559                 ASSERT(eb);
 560                 ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 561                 ASSERT(atomic_read(&eb->refs));
 562                 btrfs_assert_tree_write_locked(eb);
 563                 free_extent_buffer(eb);
 564
 565                 cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
 566         }
 567         return filemap_dirty_folio(mapping, folio);
 568 }
 569 #else
 570 #define btree_dirty_folio filemap_dirty_folio
 571 #endif
 572
 573 static const struct address_space_operations btree_aops = {
 574         .writepages     = btree_writepages,
 575         .release_folio  = btree_release_folio,
 576         .invalidate_folio = btree_invalidate_folio,
 577         .migrate_folio  = btree_migrate_folio,
 578         .dirty_folio    = btree_dirty_folio,
 579 };
 580
 581 struct extent_buffer *btrfs_find_create_tree_block(
 582                                                 struct btrfs_fs_info *fs_info,
 583                                                 u64 bytenr, u64 owner_root,
 584                                                 int level)
 585 {
 586         if (btrfs_is_testing(fs_info))
 587                 return alloc_test_extent_buffer(fs_info, bytenr);
 588         return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
 589 }
 590
 591 /*
 592  * Read tree block at logical address @bytenr and do variant basic but critical
 593  * verification.
 594  *
 595  * @check:              expected tree parentness check, see comments of the
 596  *                      structure for details.
 597  */
 598 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
 599                                       struct btrfs_tree_parent_check *check)
 600 {
 601         struct extent_buffer *buf = NULL;
 602         int ret;
 603
 604         ASSERT(check);
 605
 606         buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
 607                                            check->level);
 608         if (IS_ERR(buf))
 609                 return buf;
 610
 611         ret = btrfs_read_extent_buffer(buf, check);
 612         if (ret) {
 613                 free_extent_buffer_stale(buf);
 614                 return ERR_PTR(ret);
 615         }
 616         if (btrfs_check_eb_owner(buf, check->owner_root)) {
 617                 free_extent_buffer_stale(buf);
 618                 return ERR_PTR(-EUCLEAN);
 619         }
 620         return buf;
 621
 622 }
 623
 624 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 625                          u64 objectid)
 626 {
 627         bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
 628
 629         memset(&root->root_key, 0, sizeof(root->root_key));
 630         memset(&root->root_item, 0, sizeof(root->root_item));
 631         memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 632         root->fs_info = fs_info;
 633         root->root_key.objectid = objectid;
 634         root->node = NULL;
 635         root->commit_root = NULL;
 636         root->state = 0;
 637         RB_CLEAR_NODE(&root->rb_node);
 638
 639         root->last_trans = 0;
 640         root->free_objectid = 0;
 641         root->nr_delalloc_inodes = 0;
 642         root->nr_ordered_extents = 0;
 643         root->inode_tree = RB_ROOT;
 644         INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
 645
 646         btrfs_init_root_block_rsv(root);
 647
 648         INIT_LIST_HEAD(&root->dirty_list);
 649         INIT_LIST_HEAD(&root->root_list);
 650         INIT_LIST_HEAD(&root->delalloc_inodes);
 651         INIT_LIST_HEAD(&root->delalloc_root);
 652         INIT_LIST_HEAD(&root->ordered_extents);
 653         INIT_LIST_HEAD(&root->ordered_root);
 654         INIT_LIST_HEAD(&root->reloc_dirty_list);
 655         INIT_LIST_HEAD(&root->logged_list[0]);
 656         INIT_LIST_HEAD(&root->logged_list[1]);
 657         spin_lock_init(&root->inode_lock);
 658         spin_lock_init(&root->delalloc_lock);
 659         spin_lock_init(&root->ordered_extent_lock);
 660         spin_lock_init(&root->accounting_lock);
 661         spin_lock_init(&root->log_extents_lock[0]);
 662         spin_lock_init(&root->log_extents_lock[1]);
 663         spin_lock_init(&root->qgroup_meta_rsv_lock);
 664         mutex_init(&root->objectid_mutex);
 665         mutex_init(&root->log_mutex);
 666         mutex_init(&root->ordered_extent_mutex);
 667         mutex_init(&root->delalloc_mutex);
 668         init_waitqueue_head(&root->qgroup_flush_wait);
 669         init_waitqueue_head(&root->log_writer_wait);
 670         init_waitqueue_head(&root->log_commit_wait[0]);
 671         init_waitqueue_head(&root->log_commit_wait[1]);
 672         INIT_LIST_HEAD(&root->log_ctxs[0]);
 673         INIT_LIST_HEAD(&root->log_ctxs[1]);
 674         atomic_set(&root->log_commit[0], 0);
 675         atomic_set(&root->log_commit[1], 0);
 676         atomic_set(&root->log_writers, 0);
 677         atomic_set(&root->log_batch, 0);
 678         refcount_set(&root->refs, 1);
 679         atomic_set(&root->snapshot_force_cow, 0);
 680         atomic_set(&root->nr_swapfiles, 0);
 681         root->log_transid = 0;
 682         root->log_transid_committed = -1;
 683         root->last_log_commit = 0;
 684         root->anon_dev = 0;
 685         if (!dummy) {
 686                 extent_io_tree_init(fs_info, &root->dirty_log_pages,
 687                                     IO_TREE_ROOT_DIRTY_LOG_PAGES);
 688                 extent_io_tree_init(fs_info, &root->log_csum_range,
 689                                     IO_TREE_LOG_CSUM_RANGE);
 690         }
 691
 692         spin_lock_init(&root->root_item_lock);
 693         btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
 694 #ifdef CONFIG_BTRFS_DEBUG
 695         INIT_LIST_HEAD(&root->leak_list);
 696         spin_lock(&fs_info->fs_roots_radix_lock);
 697         list_add_tail(&root->leak_list, &fs_info->allocated_roots);
 698         spin_unlock(&fs_info->fs_roots_radix_lock);
 699 #endif
 700 }
 701
 702 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
 703                                            u64 objectid, gfp_t flags)
 704 {
 705         struct btrfs_root *root = kzalloc(sizeof(*root), flags);
 706         if (root)
 707                 __setup_root(root, fs_info, objectid);
 708         return root;
 709 }
 710
 711 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 712 /* Should only be used by the testing infrastructure */
 713 struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
 714 {
 715         struct btrfs_root *root;
 716
 717         if (!fs_info)
 718                 return ERR_PTR(-EINVAL);
 719
 720         root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
 721         if (!root)
 722                 return ERR_PTR(-ENOMEM);
 723
 724         /* We don't use the stripesize in selftest, set it as sectorsize */
 725         root->alloc_bytenr = 0;
 726
 727         return root;
 728 }
 729 #endif
 730
 731 static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
 732 {
 733         const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
 734         const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
 735
 736         return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
 737 }
 738
 739 static int global_root_key_cmp(const void *k, const struct rb_node *node)
 740 {
 741         const struct btrfs_key *key = k;
 742         const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
 743
 744         return btrfs_comp_cpu_keys(key, &root->root_key);
 745 }
 746
 747 int btrfs_global_root_insert(struct btrfs_root *root)
 748 {
 749         struct btrfs_fs_info *fs_info = root->fs_info;
 750         struct rb_node *tmp;
 751         int ret = 0;
 752
 753         write_lock(&fs_info->global_root_lock);
 754         tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
 755         write_unlock(&fs_info->global_root_lock);
 756
 757         if (tmp) {
 758                 ret = -EEXIST;
 759                 btrfs_warn(fs_info, "global root %llu %llu already exists",
 760                                 root->root_key.objectid, root->root_key.offset);
 761         }
 762         return ret;
 763 }
 764
 765 void btrfs_global_root_delete(struct btrfs_root *root)
 766 {
 767         struct btrfs_fs_info *fs_info = root->fs_info;
 768
 769         write_lock(&fs_info->global_root_lock);
 770         rb_erase(&root->rb_node, &fs_info->global_root_tree);
 771         write_unlock(&fs_info->global_root_lock);
 772 }
 773
 774 struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
 775                                      struct btrfs_key *key)
 776 {
 777         struct rb_node *node;
 778         struct btrfs_root *root = NULL;
 779
 780         read_lock(&fs_info->global_root_lock);
 781         node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
 782         if (node)
 783                 root = container_of(node, struct btrfs_root, rb_node);
 784         read_unlock(&fs_info->global_root_lock);
 785
 786         return root;
 787 }
 788
 789 static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
 790 {
 791         struct btrfs_block_group *block_group;
 792         u64 ret;
 793
 794         if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
 795                 return 0;
 796
 797         if (bytenr)
 798                 block_group = btrfs_lookup_block_group(fs_info, bytenr);
 799         else
 800                 block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
 801         ASSERT(block_group);
 802         if (!block_group)
 803                 return 0;
 804         ret = block_group->global_root_id;
 805         btrfs_put_block_group(block_group);
 806
 807         return ret;
 808 }
 809
 810 struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
 811 {
 812         struct btrfs_key key = {
 813                 .objectid = BTRFS_CSUM_TREE_OBJECTID,
 814                 .type = BTRFS_ROOT_ITEM_KEY,
 815                 .offset = btrfs_global_root_id(fs_info, bytenr),
 816         };
 817
 818         return btrfs_global_root(fs_info, &key);
 819 }
 820
 821 struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
 822 {
 823         struct btrfs_key key = {
 824                 .objectid = BTRFS_EXTENT_TREE_OBJECTID,
 825                 .type = BTRFS_ROOT_ITEM_KEY,
 826                 .offset = btrfs_global_root_id(fs_info, bytenr),
 827         };
 828
 829         return btrfs_global_root(fs_info, &key);
 830 }
 831
 832 struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
 833 {
 834         if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
 835                 return fs_info->block_group_root;
 836         return btrfs_extent_root(fs_info, 0);
 837 }
 838
 839 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 840                                      u64 objectid)
 841 {
 842         struct btrfs_fs_info *fs_info = trans->fs_info;
 843         struct extent_buffer *leaf;
 844         struct btrfs_root *tree_root = fs_info->tree_root;
 845         struct btrfs_root *root;
 846         struct btrfs_key key;
 847         unsigned int nofs_flag;
 848         int ret = 0;
 849
 850         /*
 851          * We're holding a transaction handle, so use a NOFS memory allocation
 852          * context to avoid deadlock if reclaim happens.
 853          */
 854         nofs_flag = memalloc_nofs_save();
 855         root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
 856         memalloc_nofs_restore(nofs_flag);
 857         if (!root)
 858                 return ERR_PTR(-ENOMEM);
 859
 860         root->root_key.objectid = objectid;
 861         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 862         root->root_key.offset = 0;
 863
 864         leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
 865                                       BTRFS_NESTING_NORMAL);
 866         if (IS_ERR(leaf)) {
 867                 ret = PTR_ERR(leaf);
 868                 leaf = NULL;
 869                 goto fail;
 870         }
 871
 872         root->node = leaf;
 873         btrfs_mark_buffer_dirty(leaf);
 874
 875         root->commit_root = btrfs_root_node(root);
 876         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
 877
 878         btrfs_set_root_flags(&root->root_item, 0);
 879         btrfs_set_root_limit(&root->root_item, 0);
 880         btrfs_set_root_bytenr(&root->root_item, leaf->start);
 881         btrfs_set_root_generation(&root->root_item, trans->transid);
 882         btrfs_set_root_level(&root->root_item, 0);
 883         btrfs_set_root_refs(&root->root_item, 1);
 884         btrfs_set_root_used(&root->root_item, leaf->len);
 885         btrfs_set_root_last_snapshot(&root->root_item, 0);
 886         btrfs_set_root_dirid(&root->root_item, 0);
 887         if (is_fstree(objectid))
 888                 generate_random_guid(root->root_item.uuid);
 889         else
 890                 export_guid(root->root_item.uuid, &guid_null);
 891         btrfs_set_root_drop_level(&root->root_item, 0);
 892
 893         btrfs_tree_unlock(leaf);
 894
 895         key.objectid = objectid;
 896         key.type = BTRFS_ROOT_ITEM_KEY;
 897         key.offset = 0;
 898         ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
 899         if (ret)
 900                 goto fail;
 901
 902         return root;
 903
 904 fail:
 905         btrfs_put_root(root);
 906
 907         return ERR_PTR(ret);
 908 }
 909
 910 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 911                                          struct btrfs_fs_info *fs_info)
 912 {
 913         struct btrfs_root *root;
 914
 915         root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
 916         if (!root)
 917                 return ERR_PTR(-ENOMEM);
 918
 919         root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
 920         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 921         root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
 922
 923         return root;
 924 }
 925
 926 int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
 927                               struct btrfs_root *root)
 928 {
 929         struct extent_buffer *leaf;
 930
 931         /*
 932          * DON'T set SHAREABLE bit for log trees.
 933          *
 934          * Log trees are not exposed to user space thus can't be snapshotted,
 935          * and they go away before a real commit is actually done.
 936          *
 937          * They do store pointers to file data extents, and those reference
 938          * counts still get updated (along with back refs to the log tree).
 939          */
 940
 941         leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
 942                         NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
 943         if (IS_ERR(leaf))
 944                 return PTR_ERR(leaf);
 945
 946         root->node = leaf;
 947
 948         btrfs_mark_buffer_dirty(root->node);
 949         btrfs_tree_unlock(root->node);
 950
 951         return 0;
 952 }
 953
 954 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 955                              struct btrfs_fs_info *fs_info)
 956 {
 957         struct btrfs_root *log_root;
 958
 959         log_root = alloc_log_tree(trans, fs_info);
 960         if (IS_ERR(log_root))
 961                 return PTR_ERR(log_root);
 962
 963         if (!btrfs_is_zoned(fs_info)) {
 964                 int ret = btrfs_alloc_log_tree_node(trans, log_root);
 965
 966                 if (ret) {
 967                         btrfs_put_root(log_root);
 968                         return ret;
 969                 }
 970         }
 971
 972         WARN_ON(fs_info->log_root_tree);
 973         fs_info->log_root_tree = log_root;
 974         return 0;
 975 }
 976
 977 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 978                        struct btrfs_root *root)
 979 {
 980         struct btrfs_fs_info *fs_info = root->fs_info;
 981         struct btrfs_root *log_root;
 982         struct btrfs_inode_item *inode_item;
 983         int ret;
 984
 985         log_root = alloc_log_tree(trans, fs_info);
 986         if (IS_ERR(log_root))
 987                 return PTR_ERR(log_root);
 988
 989         ret = btrfs_alloc_log_tree_node(trans, log_root);
 990         if (ret) {
 991                 btrfs_put_root(log_root);
 992                 return ret;
 993         }
 994
 995         log_root->last_trans = trans->transid;
 996         log_root->root_key.offset = root->root_key.objectid;
 997
 998         inode_item = &log_root->root_item.inode;
 999         btrfs_set_stack_inode_generation(inode_item, 1);
1000         btrfs_set_stack_inode_size(inode_item, 3);
1001         btrfs_set_stack_inode_nlink(inode_item, 1);
1002         btrfs_set_stack_inode_nbytes(inode_item,
1003                                      fs_info->nodesize);
1004         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1005
1006         btrfs_set_root_node(&log_root->root_item, log_root->node);
1007
1008         WARN_ON(root->log_root);
1009         root->log_root = log_root;
1010         root->log_transid = 0;
1011         root->log_transid_committed = -1;
1012         root->last_log_commit = 0;
1013         return 0;
1014 }
1015
1016 static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
1017                                               struct btrfs_path *path,
1018                                               struct btrfs_key *key)
1019 {
1020         struct btrfs_root *root;
1021         struct btrfs_tree_parent_check check = { 0 };
1022         struct btrfs_fs_info *fs_info = tree_root->fs_info;
1023         u64 generation;
1024         int ret;
1025         int level;
1026
1027         root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
1028         if (!root)
1029                 return ERR_PTR(-ENOMEM);
1030
1031         ret = btrfs_find_root(tree_root, key, path,
1032                               &root->root_item, &root->root_key);
1033         if (ret) {
1034                 if (ret > 0)
1035                         ret = -ENOENT;
1036                 goto fail;
1037         }
1038
1039         generation = btrfs_root_generation(&root->root_item);
1040         level = btrfs_root_level(&root->root_item);
1041         check.level = level;
1042         check.transid = generation;
1043         check.owner_root = key->objectid;
1044         root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
1045                                      &check);
1046         if (IS_ERR(root->node)) {
1047                 ret = PTR_ERR(root->node);
1048                 root->node = NULL;
1049                 goto fail;
1050         }
1051         if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1052                 ret = -EIO;
1053                 goto fail;
1054         }
1055
1056         /*
1057          * For real fs, and not log/reloc trees, root owner must
1058          * match its root node owner
1059          */
1060         if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
1061             root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1062             root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1063             root->root_key.objectid != btrfs_header_owner(root->node)) {
1064                 btrfs_crit(fs_info,
1065 "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
1066                            root->root_key.objectid, root->node->start,
1067                            btrfs_header_owner(root->node),
1068                            root->root_key.objectid);
1069                 ret = -EUCLEAN;
1070                 goto fail;
1071         }
1072         root->commit_root = btrfs_root_node(root);
1073         return root;
1074 fail:
1075         btrfs_put_root(root);
1076         return ERR_PTR(ret);
1077 }
1078
1079 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1080                                         struct btrfs_key *key)
1081 {
1082         struct btrfs_root *root;
1083         struct btrfs_path *path;
1084
1085         path = btrfs_alloc_path();
1086         if (!path)
1087                 return ERR_PTR(-ENOMEM);
1088         root = read_tree_root_path(tree_root, path, key);
1089         btrfs_free_path(path);
1090
1091         return root;
1092 }
1093
1094 /*
1095  * Initialize subvolume root in-memory structure
1096  *
1097  * @anon_dev:   anonymous device to attach to the root, if zero, allocate new
1098  */
1099 static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1100 {
1101         int ret;
1102
1103         btrfs_drew_lock_init(&root->snapshot_lock);
1104
1105         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1106             !btrfs_is_data_reloc_root(root)) {
1107                 set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
1108                 btrfs_check_and_init_root_item(&root->root_item);
1109         }
1110
1111         /*
1112          * Don't assign anonymous block device to roots that are not exposed to
1113          * userspace, the id pool is limited to 1M
1114          */
1115         if (is_fstree(root->root_key.objectid) &&
1116             btrfs_root_refs(&root->root_item) > 0) {
1117                 if (!anon_dev) {
1118                         ret = get_anon_bdev(&root->anon_dev);
1119                         if (ret)
1120                                 goto fail;
1121                 } else {
1122                         root->anon_dev = anon_dev;
1123                 }
1124         }
1125
1126         mutex_lock(&root->objectid_mutex);
1127         ret = btrfs_init_root_free_objectid(root);
1128         if (ret) {
1129                 mutex_unlock(&root->objectid_mutex);
1130                 goto fail;
1131         }
1132
1133         ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1134
1135         mutex_unlock(&root->objectid_mutex);
1136
1137         return 0;
1138 fail:
1139         /* The caller is responsible to call btrfs_free_fs_root */
1140         return ret;
1141 }
1142
1143 static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1144                                                u64 root_id)
1145 {
1146         struct btrfs_root *root;
1147
1148         spin_lock(&fs_info->fs_roots_radix_lock);
1149         root = radix_tree_lookup(&fs_info->fs_roots_radix,
1150                                  (unsigned long)root_id);
1151         root = btrfs_grab_root(root);
1152         spin_unlock(&fs_info->fs_roots_radix_lock);
1153         return root;
1154 }
1155
1156 static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
1157                                                 u64 objectid)
1158 {
1159         struct btrfs_key key = {
1160                 .objectid = objectid,
1161                 .type = BTRFS_ROOT_ITEM_KEY,
1162                 .offset = 0,
1163         };
1164
1165         switch (objectid) {
1166         case BTRFS_ROOT_TREE_OBJECTID:
1167                 return btrfs_grab_root(fs_info->tree_root);
1168         case BTRFS_EXTENT_TREE_OBJECTID:
1169                 return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1170         case BTRFS_CHUNK_TREE_OBJECTID:
1171                 return btrfs_grab_root(fs_info->chunk_root);
1172         case BTRFS_DEV_TREE_OBJECTID:
1173                 return btrfs_grab_root(fs_info->dev_root);
1174         case BTRFS_CSUM_TREE_OBJECTID:
1175                 return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1176         case BTRFS_QUOTA_TREE_OBJECTID:
1177                 return btrfs_grab_root(fs_info->quota_root);
1178         case BTRFS_UUID_TREE_OBJECTID:
1179                 return btrfs_grab_root(fs_info->uuid_root);
1180         case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
1181                 return btrfs_grab_root(fs_info->block_group_root);
1182         case BTRFS_FREE_SPACE_TREE_OBJECTID:
1183                 return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1184         default:
1185                 return NULL;
1186         }
1187 }
1188
1189 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1190                          struct btrfs_root *root)
1191 {
1192         int ret;
1193
1194         ret = radix_tree_preload(GFP_NOFS);
1195         if (ret)
1196                 return ret;
1197
1198         spin_lock(&fs_info->fs_roots_radix_lock);
1199         ret = radix_tree_insert(&fs_info->fs_roots_radix,
1200                                 (unsigned long)root->root_key.objectid,
1201                                 root);
1202         if (ret == 0) {
1203                 btrfs_grab_root(root);
1204                 set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1205         }
1206         spin_unlock(&fs_info->fs_roots_radix_lock);
1207         radix_tree_preload_end();
1208
1209         return ret;
1210 }
1211
1212 void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
1213 {
1214 #ifdef CONFIG_BTRFS_DEBUG
1215         struct btrfs_root *root;
1216
1217         while (!list_empty(&fs_info->allocated_roots)) {
1218                 char buf[BTRFS_ROOT_NAME_BUF_LEN];
1219
1220                 root = list_first_entry(&fs_info->allocated_roots,
1221                                         struct btrfs_root, leak_list);
1222                 btrfs_err(fs_info, "leaked root %s refcount %d",
1223                           btrfs_root_name(&root->root_key, buf),
1224                           refcount_read(&root->refs));
1225                 while (refcount_read(&root->refs) > 1)
1226                         btrfs_put_root(root);
1227                 btrfs_put_root(root);
1228         }
1229 #endif
1230 }
1231
1232 static void free_global_roots(struct btrfs_fs_info *fs_info)
1233 {
1234         struct btrfs_root *root;
1235         struct rb_node *node;
1236
1237         while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
1238                 root = rb_entry(node, struct btrfs_root, rb_node);
1239                 rb_erase(&root->rb_node, &fs_info->global_root_tree);
1240                 btrfs_put_root(root);
1241         }
1242 }
1243
1244 void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1245 {
1246         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
1247         percpu_counter_destroy(&fs_info->delalloc_bytes);
1248         percpu_counter_destroy(&fs_info->ordered_bytes);
1249         percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
1250         btrfs_free_csum_hash(fs_info);
1251         btrfs_free_stripe_hash_table(fs_info);
1252         btrfs_free_ref_cache(fs_info);
1253         kfree(fs_info->balance_ctl);
1254         kfree(fs_info->delayed_root);
1255         free_global_roots(fs_info);
1256         btrfs_put_root(fs_info->tree_root);
1257         btrfs_put_root(fs_info->chunk_root);
1258         btrfs_put_root(fs_info->dev_root);
1259         btrfs_put_root(fs_info->quota_root);
1260         btrfs_put_root(fs_info->uuid_root);
1261         btrfs_put_root(fs_info->fs_root);
1262         btrfs_put_root(fs_info->data_reloc_root);
1263         btrfs_put_root(fs_info->block_group_root);
1264         btrfs_check_leaked_roots(fs_info);
1265         btrfs_extent_buffer_leak_debug_check(fs_info);
1266         kfree(fs_info->super_copy);
1267         kfree(fs_info->super_for_commit);
1268         kfree(fs_info->subpage_info);
1269         kvfree(fs_info);
1270 }
1271
1272
1273 /*
1274  * Get an in-memory reference of a root structure.
1275  *
1276  * For essential trees like root/extent tree, we grab it from fs_info directly.
1277  * For subvolume trees, we check the cached filesystem roots first. If not
1278  * found, then read it from disk and add it to cached fs roots.
1279  *
1280  * Caller should release the root by calling btrfs_put_root() after the usage.
1281  *
1282  * NOTE: Reloc and log trees can't be read by this function as they share the
1283  *       same root objectid.
1284  *
1285  * @objectid:   root id
1286  * @anon_dev:   preallocated anonymous block device number for new roots,
1287  *              pass 0 for new allocation.
1288  * @check_ref:  whether to check root item references, If true, return -ENOENT
1289  *              for orphan roots
1290  */
1291 static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1292                                              u64 objectid, dev_t anon_dev,
1293                                              bool check_ref)
1294 {
1295         struct btrfs_root *root;
1296         struct btrfs_path *path;
1297         struct btrfs_key key;
1298         int ret;
1299
1300         root = btrfs_get_global_root(fs_info, objectid);
1301         if (root)
1302                 return root;
1303 again:
1304         root = btrfs_lookup_fs_root(fs_info, objectid);
1305         if (root) {
1306                 /* Shouldn't get preallocated anon_dev for cached roots */
1307                 ASSERT(!anon_dev);
1308                 if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1309                         btrfs_put_root(root);
1310                         return ERR_PTR(-ENOENT);
1311                 }
1312                 return root;
1313         }
1314
1315         key.objectid = objectid;
1316         key.type = BTRFS_ROOT_ITEM_KEY;
1317         key.offset = (u64)-1;
1318         root = btrfs_read_tree_root(fs_info->tree_root, &key);
1319         if (IS_ERR(root))
1320                 return root;
1321
1322         if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1323                 ret = -ENOENT;
1324                 goto fail;
1325         }
1326
1327         ret = btrfs_init_fs_root(root, anon_dev);
1328         if (ret)
1329                 goto fail;
1330
1331         path = btrfs_alloc_path();
1332         if (!path) {
1333                 ret = -ENOMEM;
1334                 goto fail;
1335         }
1336         key.objectid = BTRFS_ORPHAN_OBJECTID;
1337         key.type = BTRFS_ORPHAN_ITEM_KEY;
1338         key.offset = objectid;
1339
1340         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1341         btrfs_free_path(path);
1342         if (ret < 0)
1343                 goto fail;
1344         if (ret == 0)
1345                 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1346
1347         ret = btrfs_insert_fs_root(fs_info, root);
1348         if (ret) {
1349                 if (ret == -EEXIST) {
1350                         btrfs_put_root(root);
1351                         goto again;
1352                 }
1353                 goto fail;
1354         }
1355         return root;
1356 fail:
1357         /*
1358          * If our caller provided us an anonymous device, then it's his
1359          * responsibility to free it in case we fail. So we have to set our
1360          * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
1361          * and once again by our caller.
1362          */
1363         if (anon_dev)
1364                 root->anon_dev = 0;
1365         btrfs_put_root(root);
1366         return ERR_PTR(ret);
1367 }
1368
1369 /*
1370  * Get in-memory reference of a root structure
1371  *
1372  * @objectid:   tree objectid
1373  * @check_ref:  if set, verify that the tree exists and the item has at least
1374  *              one reference
1375  */
1376 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1377                                      u64 objectid, bool check_ref)
1378 {
1379         return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
1380 }
1381
1382 /*
1383  * Get in-memory reference of a root structure, created as new, optionally pass
1384  * the anonymous block device id
1385  *
1386  * @objectid:   tree objectid
1387  * @anon_dev:   if zero, allocate a new anonymous block device or use the
1388  *              parameter value
1389  */
1390 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1391                                          u64 objectid, dev_t anon_dev)
1392 {
1393         return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
1394 }
1395
1396 /*
1397  * btrfs_get_fs_root_commit_root - return a root for the given objectid
1398  * @fs_info:    the fs_info
1399  * @objectid:   the objectid we need to lookup
1400  *
1401  * This is exclusively used for backref walking, and exists specifically because
1402  * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
1403  * creation time, which means we may have to read the tree_root in order to look
1404  * up a fs root that is not in memory.  If the root is not in memory we will
1405  * read the tree root commit root and look up the fs root from there.  This is a
1406  * temporary root, it will not be inserted into the radix tree as it doesn't
1407  * have the most uptodate information, it'll simply be discarded once the
1408  * backref code is finished using the root.
1409  */
1410 struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1411                                                  struct btrfs_path *path,
1412                                                  u64 objectid)
1413 {
1414         struct btrfs_root *root;
1415         struct btrfs_key key;
1416
1417         ASSERT(path->search_commit_root && path->skip_locking);
1418
1419         /*
1420          * This can return -ENOENT if we ask for a root that doesn't exist, but
1421          * since this is called via the backref walking code we won't be looking
1422          * up a root that doesn't exist, unless there's corruption.  So if root
1423          * != NULL just return it.
1424          */
1425         root = btrfs_get_global_root(fs_info, objectid);
1426         if (root)
1427                 return root;
1428
1429         root = btrfs_lookup_fs_root(fs_info, objectid);
1430         if (root)
1431                 return root;
1432
1433         key.objectid = objectid;
1434         key.type = BTRFS_ROOT_ITEM_KEY;
1435         key.offset = (u64)-1;
1436         root = read_tree_root_path(fs_info->tree_root, path, &key);
1437         btrfs_release_path(path);
1438
1439         return root;
1440 }
1441
1442 static int cleaner_kthread(void *arg)
1443 {
1444         struct btrfs_fs_info *fs_info = arg;
1445         int again;
1446
1447         while (1) {
1448                 again = 0;
1449
1450                 set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1451
1452                 /* Make the cleaner go to sleep early. */
1453                 if (btrfs_need_cleaner_sleep(fs_info))
1454                         goto sleep;
1455
1456                 /*
1457                  * Do not do anything if we might cause open_ctree() to block
1458                  * before we have finished mounting the filesystem.
1459                  */
1460                 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1461                         goto sleep;
1462
1463                 if (!mutex_trylock(&fs_info->cleaner_mutex))
1464                         goto sleep;
1465
1466                 /*
1467                  * Avoid the problem that we change the status of the fs
1468                  * during the above check and trylock.
1469                  */
1470                 if (btrfs_need_cleaner_sleep(fs_info)) {
1471                         mutex_unlock(&fs_info->cleaner_mutex);
1472                         goto sleep;
1473                 }
1474
1475                 if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
1476                         btrfs_sysfs_feature_update(fs_info);
1477
1478                 btrfs_run_delayed_iputs(fs_info);
1479
1480                 again = btrfs_clean_one_deleted_snapshot(fs_info);
1481                 mutex_unlock(&fs_info->cleaner_mutex);
1482
1483                 /*
1484                  * The defragger has dealt with the R/O remount and umount,
1485                  * needn't do anything special here.
1486                  */
1487                 btrfs_run_defrag_inodes(fs_info);
1488
1489                 /*
1490                  * Acquires fs_info->reclaim_bgs_lock to avoid racing
1491                  * with relocation (btrfs_relocate_chunk) and relocation
1492                  * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1493                  * after acquiring fs_info->reclaim_bgs_lock. So we
1494                  * can't hold, nor need to, fs_info->cleaner_mutex when deleting
1495                  * unused block groups.
1496                  */
1497                 btrfs_delete_unused_bgs(fs_info);
1498
1499                 /*
1500                  * Reclaim block groups in the reclaim_bgs list after we deleted
1501                  * all unused block_groups. This possibly gives us some more free
1502                  * space.
1503                  */
1504                 btrfs_reclaim_bgs(fs_info);
1505 sleep:
1506                 clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1507                 if (kthread_should_park())
1508                         kthread_parkme();
1509                 if (kthread_should_stop())
1510                         return 0;
1511                 if (!again) {
1512                         set_current_state(TASK_INTERRUPTIBLE);
1513                         schedule();
1514                         __set_current_state(TASK_RUNNING);
1515                 }
1516         }
1517 }
1518
1519 static int transaction_kthread(void *arg)
1520 {
1521         struct btrfs_root *root = arg;
1522         struct btrfs_fs_info *fs_info = root->fs_info;
1523         struct btrfs_trans_handle *trans;
1524         struct btrfs_transaction *cur;
1525         u64 transid;
1526         time64_t delta;
1527         unsigned long delay;
1528         bool cannot_commit;
1529
1530         do {
1531                 cannot_commit = false;
1532                 delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
1533                 mutex_lock(&fs_info->transaction_kthread_mutex);
1534
1535                 spin_lock(&fs_info->trans_lock);
1536                 cur = fs_info->running_transaction;
1537                 if (!cur) {
1538                         spin_unlock(&fs_info->trans_lock);
1539                         goto sleep;
1540                 }
1541
1542                 delta = ktime_get_seconds() - cur->start_time;
1543                 if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
1544                     cur->state < TRANS_STATE_COMMIT_START &&
1545                     delta < fs_info->commit_interval) {
1546                         spin_unlock(&fs_info->trans_lock);
1547                         delay -= msecs_to_jiffies((delta - 1) * 1000);
1548                         delay = min(delay,
1549                                     msecs_to_jiffies(fs_info->commit_interval * 1000));
1550                         goto sleep;
1551                 }
1552                 transid = cur->transid;
1553                 spin_unlock(&fs_info->trans_lock);
1554
1555                 /* If the file system is aborted, this will always fail. */
1556                 trans = btrfs_attach_transaction(root);
1557                 if (IS_ERR(trans)) {
1558                         if (PTR_ERR(trans) != -ENOENT)
1559                                 cannot_commit = true;
1560                         goto sleep;
1561                 }
1562                 if (transid == trans->transid) {
1563                         btrfs_commit_transaction(trans);
1564                 } else {
1565                         btrfs_end_transaction(trans);
1566                 }
1567 sleep:
1568                 wake_up_process(fs_info->cleaner_kthread);
1569                 mutex_unlock(&fs_info->transaction_kthread_mutex);
1570
1571                 if (BTRFS_FS_ERROR(fs_info))
1572                         btrfs_cleanup_transaction(fs_info);
1573                 if (!kthread_should_stop() &&
1574                                 (!btrfs_transaction_blocked(fs_info) ||
1575                                  cannot_commit))
1576                         schedule_timeout_interruptible(delay);
1577         } while (!kthread_should_stop());
1578         return 0;
1579 }
1580
1581 /*
1582  * This will find the highest generation in the array of root backups.  The
1583  * index of the highest array is returned, or -EINVAL if we can't find
1584  * anything.
1585  *
1586  * We check to make sure the array is valid by comparing the
1587  * generation of the latest  root in the array with the generation
1588  * in the super block.  If they don't match we pitch it.
1589  */
1590 static int find_newest_super_backup(struct btrfs_fs_info *info)
1591 {
1592         const u64 newest_gen = btrfs_super_generation(info->super_copy);
1593         u64 cur;
1594         struct btrfs_root_backup *root_backup;
1595         int i;
1596
1597         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1598                 root_backup = info->super_copy->super_roots + i;
1599                 cur = btrfs_backup_tree_root_gen(root_backup);
1600                 if (cur == newest_gen)
1601                         return i;
1602         }
1603
1604         return -EINVAL;
1605 }
1606
1607 /*
1608  * copy all the root pointers into the super backup array.
1609  * this will bump the backup pointer by one when it is
1610  * done
1611  */
1612 static void backup_super_roots(struct btrfs_fs_info *info)
1613 {
1614         const int next_backup = info->backup_root_index;
1615         struct btrfs_root_backup *root_backup;
1616
1617         root_backup = info->super_for_commit->super_roots + next_backup;
1618
1619         /*
1620          * make sure all of our padding and empty slots get zero filled
1621          * regardless of which ones we use today
1622          */
1623         memset(root_backup, 0, sizeof(*root_backup));
1624
1625         info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1626
1627         btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1628         btrfs_set_backup_tree_root_gen(root_backup,
1629                                btrfs_header_generation(info->tree_root->node));
1630
1631         btrfs_set_backup_tree_root_level(root_backup,
1632                                btrfs_header_level(info->tree_root->node));
1633
1634         btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1635         btrfs_set_backup_chunk_root_gen(root_backup,
1636                                btrfs_header_generation(info->chunk_root->node));
1637         btrfs_set_backup_chunk_root_level(root_backup,
1638                                btrfs_header_level(info->chunk_root->node));
1639
1640         if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
1641                 struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
1642                 struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
1643
1644                 btrfs_set_backup_extent_root(root_backup,
1645                                              extent_root->node->start);
1646                 btrfs_set_backup_extent_root_gen(root_backup,
1647                                 btrfs_header_generation(extent_root->node));
1648                 btrfs_set_backup_extent_root_level(root_backup,
1649                                         btrfs_header_level(extent_root->node));
1650
1651                 btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
1652                 btrfs_set_backup_csum_root_gen(root_backup,
1653                                                btrfs_header_generation(csum_root->node));
1654                 btrfs_set_backup_csum_root_level(root_backup,
1655                                                  btrfs_header_level(csum_root->node));
1656         }
1657
1658         /*
1659          * we might commit during log recovery, which happens before we set
1660          * the fs_root.  Make sure it is valid before we fill it in.
1661          */
1662         if (info->fs_root && info->fs_root->node) {
1663                 btrfs_set_backup_fs_root(root_backup,
1664                                          info->fs_root->node->start);
1665                 btrfs_set_backup_fs_root_gen(root_backup,
1666                                btrfs_header_generation(info->fs_root->node));
1667                 btrfs_set_backup_fs_root_level(root_backup,
1668                                btrfs_header_level(info->fs_root->node));
1669         }
1670
1671         btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1672         btrfs_set_backup_dev_root_gen(root_backup,
1673                                btrfs_header_generation(info->dev_root->node));
1674         btrfs_set_backup_dev_root_level(root_backup,
1675                                        btrfs_header_level(info->dev_root->node));
1676
1677         btrfs_set_backup_total_bytes(root_backup,
1678                              btrfs_super_total_bytes(info->super_copy));
1679         btrfs_set_backup_bytes_used(root_backup,
1680                              btrfs_super_bytes_used(info->super_copy));
1681         btrfs_set_backup_num_devices(root_backup,
1682                              btrfs_super_num_devices(info->super_copy));
1683
1684         /*
1685          * if we don't copy this out to the super_copy, it won't get remembered
1686          * for the next commit
1687          */
1688         memcpy(&info->super_copy->super_roots,
1689                &info->super_for_commit->super_roots,
1690                sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1691 }
1692
1693 /*
1694  * read_backup_root - Reads a backup root based on the passed priority. Prio 0
1695  * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
1696  *
1697  * fs_info - filesystem whose backup roots need to be read
1698  * priority - priority of backup root required
1699  *
1700  * Returns backup root index on success and -EINVAL otherwise.
1701  */
1702 static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
1703 {
1704         int backup_index = find_newest_super_backup(fs_info);
1705         struct btrfs_super_block *super = fs_info->super_copy;
1706         struct btrfs_root_backup *root_backup;
1707
1708         if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
1709                 if (priority == 0)
1710                         return backup_index;
1711
1712                 backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
1713                 backup_index %= BTRFS_NUM_BACKUP_ROOTS;
1714         } else {
1715                 return -EINVAL;
1716         }
1717
1718         root_backup = super->super_roots + backup_index;
1719
1720         btrfs_set_super_generation(super,
1721                                    btrfs_backup_tree_root_gen(root_backup));
1722         btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1723         btrfs_set_super_root_level(super,
1724                                    btrfs_backup_tree_root_level(root_backup));
1725         btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1726
1727         /*
1728          * Fixme: the total bytes and num_devices need to match or we should
1729          * need a fsck
1730          */
1731         btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1732         btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1733
1734         return backup_index;
1735 }
1736
1737 /* helper to cleanup workers */
1738 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1739 {
1740         btrfs_destroy_workqueue(fs_info->fixup_workers);
1741         btrfs_destroy_workqueue(fs_info->delalloc_workers);
1742         btrfs_destroy_workqueue(fs_info->workers);
1743         if (fs_info->endio_workers)
1744                 destroy_workqueue(fs_info->endio_workers);
1745         if (fs_info->rmw_workers)
1746                 destroy_workqueue(fs_info->rmw_workers);
1747         if (fs_info->compressed_write_workers)
1748                 destroy_workqueue(fs_info->compressed_write_workers);
1749         btrfs_destroy_workqueue(fs_info->endio_write_workers);
1750         btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
1751         btrfs_destroy_workqueue(fs_info->delayed_workers);
1752         btrfs_destroy_workqueue(fs_info->caching_workers);
1753         btrfs_destroy_workqueue(fs_info->flush_workers);
1754         btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
1755         if (fs_info->discard_ctl.discard_workers)
1756                 destroy_workqueue(fs_info->discard_ctl.discard_workers);
1757         /*
1758          * Now that all other work queues are destroyed, we can safely destroy
1759          * the queues used for metadata I/O, since tasks from those other work
1760          * queues can do metadata I/O operations.
1761          */
1762         if (fs_info->endio_meta_workers)
1763                 destroy_workqueue(fs_info->endio_meta_workers);
1764 }
1765
1766 static void free_root_extent_buffers(struct btrfs_root *root)
1767 {
1768         if (root) {
1769                 free_extent_buffer(root->node);
1770                 free_extent_buffer(root->commit_root);
1771                 root->node = NULL;
1772                 root->commit_root = NULL;
1773         }
1774 }
1775
1776 static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
1777 {
1778         struct btrfs_root *root, *tmp;
1779
1780         rbtree_postorder_for_each_entry_safe(root, tmp,
1781                                              &fs_info->global_root_tree,
1782                                              rb_node)
1783                 free_root_extent_buffers(root);
1784 }
1785
1786 /* helper to cleanup tree roots */
1787 static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
1788 {
1789         free_root_extent_buffers(info->tree_root);
1790
1791         free_global_root_pointers(info);
1792         free_root_extent_buffers(info->dev_root);
1793         free_root_extent_buffers(info->quota_root);
1794         free_root_extent_buffers(info->uuid_root);
1795         free_root_extent_buffers(info->fs_root);
1796         free_root_extent_buffers(info->data_reloc_root);
1797         free_root_extent_buffers(info->block_group_root);
1798         if (free_chunk_root)
1799                 free_root_extent_buffers(info->chunk_root);
1800 }
1801
1802 void btrfs_put_root(struct btrfs_root *root)
1803 {
1804         if (!root)
1805                 return;
1806
1807         if (refcount_dec_and_test(&root->refs)) {
1808                 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
1809                 WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
1810                 if (root->anon_dev)
1811                         free_anon_bdev(root->anon_dev);
1812                 free_root_extent_buffers(root);
1813 #ifdef CONFIG_BTRFS_DEBUG
1814                 spin_lock(&root->fs_info->fs_roots_radix_lock);
1815                 list_del_init(&root->leak_list);
1816                 spin_unlock(&root->fs_info->fs_roots_radix_lock);
1817 #endif
1818                 kfree(root);
1819         }
1820 }
1821
1822 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
1823 {
1824         int ret;
1825         struct btrfs_root *gang[8];
1826         int i;
1827
1828         while (!list_empty(&fs_info->dead_roots)) {
1829                 gang[0] = list_entry(fs_info->dead_roots.next,
1830                                      struct btrfs_root, root_list);
1831                 list_del(&gang[0]->root_list);
1832
1833                 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
1834                         btrfs_drop_and_free_fs_root(fs_info, gang[0]);
1835                 btrfs_put_root(gang[0]);
1836         }
1837
1838         while (1) {
1839                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1840                                              (void **)gang, 0,
1841                                              ARRAY_SIZE(gang));
1842                 if (!ret)
1843                         break;
1844                 for (i = 0; i < ret; i++)
1845                         btrfs_drop_and_free_fs_root(fs_info, gang[i]);
1846         }
1847 }
1848
1849 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
1850 {
1851         mutex_init(&fs_info->scrub_lock);
1852         atomic_set(&fs_info->scrubs_running, 0);
1853         atomic_set(&fs_info->scrub_pause_req, 0);
1854         atomic_set(&fs_info->scrubs_paused, 0);
1855         atomic_set(&fs_info->scrub_cancel_req, 0);
1856         init_waitqueue_head(&fs_info->scrub_pause_wait);
1857         refcount_set(&fs_info->scrub_workers_refcnt, 0);
1858 }
1859
1860 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
1861 {
1862         spin_lock_init(&fs_info->balance_lock);
1863         mutex_init(&fs_info->balance_mutex);
1864         atomic_set(&fs_info->balance_pause_req, 0);
1865         atomic_set(&fs_info->balance_cancel_req, 0);
1866         fs_info->balance_ctl = NULL;
1867         init_waitqueue_head(&fs_info->balance_wait_q);
1868         atomic_set(&fs_info->reloc_cancel_req, 0);
1869 }
1870
1871 static int btrfs_init_btree_inode(struct super_block *sb)
1872 {
1873         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1874         unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
1875                                               fs_info->tree_root);
1876         struct inode *inode;
1877
1878         inode = new_inode(sb);
1879         if (!inode)
1880                 return -ENOMEM;
1881
1882         inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
1883         set_nlink(inode, 1);
1884         /*
1885          * we set the i_size on the btree inode to the max possible int.
1886          * the real end of the address space is determined by all of
1887          * the devices in the system
1888          */
1889         inode->i_size = OFFSET_MAX;
1890         inode->i_mapping->a_ops = &btree_aops;
1891         mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
1892
1893         RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
1894         extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
1895                             IO_TREE_BTREE_INODE_IO);
1896         extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
1897
1898         BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
1899         BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
1900         BTRFS_I(inode)->location.type = 0;
1901         BTRFS_I(inode)->location.offset = 0;
1902         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
1903         __insert_inode_hash(inode, hash);
1904         fs_info->btree_inode = inode;
1905
1906         return 0;
1907 }
1908
1909 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
1910 {
1911         mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
1912         init_rwsem(&fs_info->dev_replace.rwsem);
1913         init_waitqueue_head(&fs_info->dev_replace.replace_wait);
1914 }
1915
1916 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
1917 {
1918         spin_lock_init(&fs_info->qgroup_lock);
1919         mutex_init(&fs_info->qgroup_ioctl_lock);
1920         fs_info->qgroup_tree = RB_ROOT;
1921         INIT_LIST_HEAD(&fs_info->dirty_qgroups);
1922         fs_info->qgroup_seq = 1;
1923         fs_info->qgroup_ulist = NULL;
1924         fs_info->qgroup_rescan_running = false;
1925         fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
1926         mutex_init(&fs_info->qgroup_rescan_lock);
1927 }
1928
1929 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
1930 {
1931         u32 max_active = fs_info->thread_pool_size;
1932         unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
1933         unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
1934
1935         fs_info->workers =
1936                 btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
1937
1938         fs_info->delalloc_workers =
1939                 btrfs_alloc_workqueue(fs_info, "delalloc",
1940                                       flags, max_active, 2);
1941
1942         fs_info->flush_workers =
1943                 btrfs_alloc_workqueue(fs_info, "flush_delalloc",
1944                                       flags, max_active, 0);
1945
1946         fs_info->caching_workers =
1947                 btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
1948
1949         fs_info->fixup_workers =
1950                 btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
1951
1952         fs_info->endio_workers =
1953                 alloc_workqueue("btrfs-endio", flags, max_active);
1954         fs_info->endio_meta_workers =
1955                 alloc_workqueue("btrfs-endio-meta", flags, max_active);
1956         fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
1957         fs_info->endio_write_workers =
1958                 btrfs_alloc_workqueue(fs_info, "endio-write", flags,
1959                                       max_active, 2);
1960         fs_info->compressed_write_workers =
1961                 alloc_workqueue("btrfs-compressed-write", flags, max_active);
1962         fs_info->endio_freespace_worker =
1963                 btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
1964                                       max_active, 0);
1965         fs_info->delayed_workers =
1966                 btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
1967                                       max_active, 0);
1968         fs_info->qgroup_rescan_workers =
1969                 btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
1970                                               ordered_flags);
1971         fs_info->discard_ctl.discard_workers =
1972                 alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);
1973
1974         if (!(fs_info->workers &&
1975               fs_info->delalloc_workers && fs_info->flush_workers &&
1976               fs_info->endio_workers && fs_info->endio_meta_workers &&
1977               fs_info->compressed_write_workers &&
1978               fs_info->endio_write_workers &&
1979               fs_info->endio_freespace_worker && fs_info->rmw_workers &&
1980               fs_info->caching_workers && fs_info->fixup_workers &&
1981               fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
1982               fs_info->discard_ctl.discard_workers)) {
1983                 return -ENOMEM;
1984         }
1985
1986         return 0;
1987 }
1988
1989 static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
1990 {
1991         struct crypto_shash *csum_shash;
1992         const char *csum_driver = btrfs_super_csum_driver(csum_type);
1993
1994         csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
1995
1996         if (IS_ERR(csum_shash)) {
1997                 btrfs_err(fs_info, "error allocating %s hash for checksum",
1998                           csum_driver);
1999                 return PTR_ERR(csum_shash);
2000         }
2001
2002         fs_info->csum_shash = csum_shash;
2003
2004         /*
2005          * Check if the checksum implementation is a fast accelerated one.
2006          * As-is this is a bit of a hack and should be replaced once the csum
2007          * implementations provide that information themselves.
2008          */
2009         switch (csum_type) {
2010         case BTRFS_CSUM_TYPE_CRC32:
2011                 if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
2012                         set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
2013                 break;
2014         case BTRFS_CSUM_TYPE_XXHASH:
2015                 set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
2016                 break;
2017         default:
2018                 break;
2019         }
2020
2021         btrfs_info(fs_info, "using %s (%s) checksum algorithm",
2022                         btrfs_super_csum_name(csum_type),
2023                         crypto_shash_driver_name(csum_shash));
2024         return 0;
2025 }
2026
2027 static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2028                             struct btrfs_fs_devices *fs_devices)
2029 {
2030         int ret;
2031         struct btrfs_tree_parent_check check = { 0 };
2032         struct btrfs_root *log_tree_root;
2033         struct btrfs_super_block *disk_super = fs_info->super_copy;
2034         u64 bytenr = btrfs_super_log_root(disk_super);
2035         int level = btrfs_super_log_root_level(disk_super);
2036
2037         if (fs_devices->rw_devices == 0) {
2038                 btrfs_warn(fs_info, "log replay required on RO media");
2039                 return -EIO;
2040         }
2041
2042         log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2043                                          GFP_KERNEL);
2044         if (!log_tree_root)
2045                 return -ENOMEM;
2046
2047         check.level = level;
2048         check.transid = fs_info->generation + 1;
2049         check.owner_root = BTRFS_TREE_LOG_OBJECTID;
2050         log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
2051         if (IS_ERR(log_tree_root->node)) {
2052                 btrfs_warn(fs_info, "failed to read log tree");
2053                 ret = PTR_ERR(log_tree_root->node);
2054                 log_tree_root->node = NULL;
2055                 btrfs_put_root(log_tree_root);
2056                 return ret;
2057         }
2058         if (!extent_buffer_uptodate(log_tree_root->node)) {
2059                 btrfs_err(fs_info, "failed to read log tree");
2060                 btrfs_put_root(log_tree_root);
2061                 return -EIO;
2062         }
2063
2064         /* returns with log_tree_root freed on success */
2065         ret = btrfs_recover_log_trees(log_tree_root);
2066         if (ret) {
2067                 btrfs_handle_fs_error(fs_info, ret,
2068                                       "Failed to recover log tree");
2069                 btrfs_put_root(log_tree_root);
2070                 return ret;
2071         }
2072
2073         if (sb_rdonly(fs_info->sb)) {
2074                 ret = btrfs_commit_super(fs_info);
2075                 if (ret)
2076                         return ret;
2077         }
2078
2079         return 0;
2080 }
2081
2082 static int load_global_roots_objectid(struct btrfs_root *tree_root,
2083                                       struct btrfs_path *path, u64 objectid,
2084                                       const char *name)
2085 {
2086         struct btrfs_fs_info *fs_info = tree_root->fs_info;
2087         struct btrfs_root *root;
2088         u64 max_global_id = 0;
2089         int ret;
2090         struct btrfs_key key = {
2091                 .objectid = objectid,
2092                 .type = BTRFS_ROOT_ITEM_KEY,
2093                 .offset = 0,
2094         };
2095         bool found = false;
2096
2097         /* If we have IGNOREDATACSUMS skip loading these roots. */
2098         if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
2099             btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2100                 set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2101                 return 0;
2102         }
2103
2104         while (1) {
2105                 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2106                 if (ret < 0)
2107                         break;
2108
2109                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2110                         ret = btrfs_next_leaf(tree_root, path);
2111                         if (ret) {
2112                                 if (ret > 0)
2113                                         ret = 0;
2114                                 break;
2115                         }
2116                 }
2117                 ret = 0;
2118
2119                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2120                 if (key.objectid != objectid)
2121                         break;
2122                 btrfs_release_path(path);
2123
2124                 /*
2125                  * Just worry about this for extent tree, it'll be the same for
2126                  * everybody.
2127                  */
2128                 if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2129                         max_global_id = max(max_global_id, key.offset);
2130
2131                 found = true;
2132                 root = read_tree_root_path(tree_root, path, &key);
2133                 if (IS_ERR(root)) {
2134                         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2135                                 ret = PTR_ERR(root);
2136                         break;
2137                 }
2138                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2139                 ret = btrfs_global_root_insert(root);
2140                 if (ret) {
2141                         btrfs_put_root(root);
2142                         break;
2143                 }
2144                 key.offset++;
2145         }
2146         btrfs_release_path(path);
2147
2148         if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2149                 fs_info->nr_global_roots = max_global_id + 1;
2150
2151         if (!found || ret) {
2152                 if (objectid == BTRFS_CSUM_TREE_OBJECTID)
2153                         set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2154
2155                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2156                         ret = ret ? ret : -ENOENT;
2157                 else
2158                         ret = 0;
2159                 btrfs_err(fs_info, "failed to load root %s", name);
2160         }
2161         return ret;
2162 }
2163
2164 static int load_global_roots(struct btrfs_root *tree_root)
2165 {
2166         struct btrfs_path *path;
2167         int ret = 0;
2168
2169         path = btrfs_alloc_path();
2170         if (!path)
2171                 return -ENOMEM;
2172
2173         ret = load_global_roots_objectid(tree_root, path,
2174                                          BTRFS_EXTENT_TREE_OBJECTID, "extent");
2175         if (ret)
2176                 goto out;
2177         ret = load_global_roots_objectid(tree_root, path,
2178                                          BTRFS_CSUM_TREE_OBJECTID, "csum");
2179         if (ret)
2180                 goto out;
2181         if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
2182                 goto out;
2183         ret = load_global_roots_objectid(tree_root, path,
2184                                          BTRFS_FREE_SPACE_TREE_OBJECTID,
2185                                          "free space");
2186 out:
2187         btrfs_free_path(path);
2188         return ret;
2189 }
2190
2191 static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2192 {
2193         struct btrfs_root *tree_root = fs_info->tree_root;
2194         struct btrfs_root *root;
2195         struct btrfs_key location;
2196         int ret;
2197
2198         BUG_ON(!fs_info->tree_root);
2199
2200         ret = load_global_roots(tree_root);
2201         if (ret)
2202                 return ret;
2203
2204         location.type = BTRFS_ROOT_ITEM_KEY;
2205         location.offset = 0;
2206
2207         if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
2208                 location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
2209                 root = btrfs_read_tree_root(tree_root, &location);
2210                 if (IS_ERR(root)) {
2211                         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2212                                 ret = PTR_ERR(root);
2213                                 goto out;
2214                         }
2215                 } else {
2216                         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2217                         fs_info->block_group_root = root;
2218                 }
2219         }
2220
2221         location.objectid = BTRFS_DEV_TREE_OBJECTID;
2222         root = btrfs_read_tree_root(tree_root, &location);
2223         if (IS_ERR(root)) {
2224                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2225                         ret = PTR_ERR(root);
2226                         goto out;
2227                 }
2228         } else {
2229                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2230                 fs_info->dev_root = root;
2231         }
2232         /* Initialize fs_info for all devices in any case */
2233         ret = btrfs_init_devices_late(fs_info);
2234         if (ret)
2235                 goto out;
2236
2237         /*
2238          * This tree can share blocks with some other fs tree during relocation
2239          * and we need a proper setup by btrfs_get_fs_root
2240          */
2241         root = btrfs_get_fs_root(tree_root->fs_info,
2242                                  BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2243         if (IS_ERR(root)) {
2244                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2245                         ret = PTR_ERR(root);
2246                         goto out;
2247                 }
2248         } else {
2249                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2250                 fs_info->data_reloc_root = root;
2251         }
2252
2253         location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2254         root = btrfs_read_tree_root(tree_root, &location);
2255         if (!IS_ERR(root)) {
2256                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2257                 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
2258                 fs_info->quota_root = root;
2259         }
2260
2261         location.objectid = BTRFS_UUID_TREE_OBJECTID;
2262         root = btrfs_read_tree_root(tree_root, &location);
2263         if (IS_ERR(root)) {
2264                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2265                         ret = PTR_ERR(root);
2266                         if (ret != -ENOENT)
2267                                 goto out;
2268                 }
2269         } else {
2270                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2271                 fs_info->uuid_root = root;
2272         }
2273
2274         return 0;
2275 out:
2276         btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2277                    location.objectid, ret);
2278         return ret;
2279 }
2280
2281 /*
2282  * Real super block validation
2283  * NOTE: super csum type and incompat features will not be checked here.
2284  *
2285  * @sb:         super block to check
2286  * @mirror_num: the super block number to check its bytenr:
2287  *              0       the primary (1st) sb
2288  *              1, 2    2nd and 3rd backup copy
2289  *             -1       skip bytenr check
2290  */
2291 int btrfs_validate_super(struct btrfs_fs_info *fs_info,
2292                          struct btrfs_super_block *sb, int mirror_num)
2293 {
2294         u64 nodesize = btrfs_super_nodesize(sb);
2295         u64 sectorsize = btrfs_super_sectorsize(sb);
2296         int ret = 0;
2297
2298         if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2299                 btrfs_err(fs_info, "no valid FS found");
2300                 ret = -EINVAL;
2301         }
2302         if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
2303                 btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
2304                                 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2305                 ret = -EINVAL;
2306         }
2307         if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2308                 btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2309                                 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2310                 ret = -EINVAL;
2311         }
2312         if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2313                 btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2314                                 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2315                 ret = -EINVAL;
2316         }
2317         if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2318                 btrfs_err(fs_info, "log_root level too big: %d >= %d",
2319                                 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2320                 ret = -EINVAL;
2321         }
2322
2323         /*
2324          * Check sectorsize and nodesize first, other check will need it.
2325          * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
2326          */
2327         if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
2328             sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2329                 btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2330                 ret = -EINVAL;
2331         }
2332
2333         /*
2334          * We only support at most two sectorsizes: 4K and PAGE_SIZE.
2335          *
2336          * We can support 16K sectorsize with 64K page size without problem,
2337          * but such sectorsize/pagesize combination doesn't make much sense.
2338          * 4K will be our future standard, PAGE_SIZE is supported from the very
2339          * beginning.
2340          */
2341         if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
2342                 btrfs_err(fs_info,
2343                         "sectorsize %llu not yet supported for page size %lu",
2344                         sectorsize, PAGE_SIZE);
2345                 ret = -EINVAL;
2346         }
2347
2348         if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2349             nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2350                 btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2351                 ret = -EINVAL;
2352         }
2353         if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2354                 btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2355                           le32_to_cpu(sb->__unused_leafsize), nodesize);
2356                 ret = -EINVAL;
2357         }
2358
2359         /* Root alignment check */
2360         if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2361                 btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2362                            btrfs_super_root(sb));
2363                 ret = -EINVAL;
2364         }
2365         if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2366                 btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2367                            btrfs_super_chunk_root(sb));
2368                 ret = -EINVAL;
2369         }
2370         if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2371                 btrfs_warn(fs_info, "log_root block unaligned: %llu",
2372                            btrfs_super_log_root(sb));
2373                 ret = -EINVAL;
2374         }
2375
2376         if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
2377                    BTRFS_FSID_SIZE)) {
2378                 btrfs_err(fs_info,
2379                 "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2380                         fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
2381                 ret = -EINVAL;
2382         }
2383
2384         if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
2385             memcmp(fs_info->fs_devices->metadata_uuid,
2386                    fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
2387                 btrfs_err(fs_info,
2388 "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2389                         fs_info->super_copy->metadata_uuid,
2390                         fs_info->fs_devices->metadata_uuid);
2391                 ret = -EINVAL;
2392         }
2393
2394         if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2395                    BTRFS_FSID_SIZE) != 0) {
2396                 btrfs_err(fs_info,
2397                         "dev_item UUID does not match metadata fsid: %pU != %pU",
2398                         fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2399                 ret = -EINVAL;
2400         }
2401
2402         /*
2403          * Artificial requirement for block-group-tree to force newer features
2404          * (free-space-tree, no-holes) so the test matrix is smaller.
2405          */
2406         if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
2407             (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
2408              !btrfs_fs_incompat(fs_info, NO_HOLES))) {
2409                 btrfs_err(fs_info,
2410                 "block-group-tree feature requires fres-space-tree and no-holes");
2411                 ret = -EINVAL;
2412         }
2413
2414         /*
2415          * Hint to catch really bogus numbers, bitflips or so, more exact checks are
2416          * done later
2417          */
2418         if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2419                 btrfs_err(fs_info, "bytes_used is too small %llu",
2420                           btrfs_super_bytes_used(sb));
2421                 ret = -EINVAL;
2422         }
2423         if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2424                 btrfs_err(fs_info, "invalid stripesize %u",
2425                           btrfs_super_stripesize(sb));
2426                 ret = -EINVAL;
2427         }
2428         if (btrfs_super_num_devices(sb) > (1UL << 31))
2429                 btrfs_warn(fs_info, "suspicious number of devices: %llu",
2430                            btrfs_super_num_devices(sb));
2431         if (btrfs_super_num_devices(sb) == 0) {
2432                 btrfs_err(fs_info, "number of devices is 0");
2433                 ret = -EINVAL;
2434         }
2435
2436         if (mirror_num >= 0 &&
2437             btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2438                 btrfs_err(fs_info, "super offset mismatch %llu != %u",
2439                           btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2440                 ret = -EINVAL;
2441         }
2442
2443         /*
2444          * Obvious sys_chunk_array corruptions, it must hold at least one key
2445          * and one chunk
2446          */
2447         if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2448                 btrfs_err(fs_info, "system chunk array too big %u > %u",
2449                           btrfs_super_sys_array_size(sb),
2450                           BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2451                 ret = -EINVAL;
2452         }
2453         if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2454                         + sizeof(struct btrfs_chunk)) {
2455                 btrfs_err(fs_info, "system chunk array too small %u < %zu",
2456                           btrfs_super_sys_array_size(sb),
2457                           sizeof(struct btrfs_disk_key)
2458                           + sizeof(struct btrfs_chunk));
2459                 ret = -EINVAL;
2460         }
2461
2462         /*
2463          * The generation is a global counter, we'll trust it more than the others
2464          * but it's still possible that it's the one that's wrong.
2465          */
2466         if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2467                 btrfs_warn(fs_info,
2468                         "suspicious: generation < chunk_root_generation: %llu < %llu",
2469                         btrfs_super_generation(sb),
2470                         btrfs_super_chunk_root_generation(sb));
2471         if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2472             && btrfs_super_cache_generation(sb) != (u64)-1)
2473                 btrfs_warn(fs_info,
2474                         "suspicious: generation < cache_generation: %llu < %llu",
2475                         btrfs_super_generation(sb),
2476                         btrfs_super_cache_generation(sb));
2477
2478         return ret;
2479 }
2480
2481 /*
2482  * Validation of super block at mount time.
2483  * Some checks already done early at mount time, like csum type and incompat
2484  * flags will be skipped.
2485  */
2486 static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2487 {
2488         return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
2489 }
2490
2491 /*
2492  * Validation of super block at write time.
2493  * Some checks like bytenr check will be skipped as their values will be
2494  * overwritten soon.
2495  * Extra checks like csum type and incompat flags will be done here.
2496  */
2497 static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2498                                       struct btrfs_super_block *sb)
2499 {
2500         int ret;
2501
2502         ret = btrfs_validate_super(fs_info, sb, -1);
2503         if (ret < 0)
2504                 goto out;
2505         if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2506                 ret = -EUCLEAN;
2507                 btrfs_err(fs_info, "invalid csum type, has %u want %u",
2508                           btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2509                 goto out;
2510         }
2511         if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2512                 ret = -EUCLEAN;
2513                 btrfs_err(fs_info,
2514                 "invalid incompat flags, has 0x%llx valid mask 0x%llx",
2515                           btrfs_super_incompat_flags(sb),
2516                           (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2517                 goto out;
2518         }
2519 out:
2520         if (ret < 0)
2521                 btrfs_err(fs_info,
2522                 "super block corruption detected before writing it to disk");
2523         return ret;
2524 }
2525
2526 static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
2527 {
2528         struct btrfs_tree_parent_check check = {
2529                 .level = level,
2530                 .transid = gen,
2531                 .owner_root = root->root_key.objectid
2532         };
2533         int ret = 0;
2534
2535         root->node = read_tree_block(root->fs_info, bytenr, &check);
2536         if (IS_ERR(root->node)) {
2537                 ret = PTR_ERR(root->node);
2538                 root->node = NULL;
2539                 return ret;
2540         }
2541         if (!extent_buffer_uptodate(root->node)) {
2542                 free_extent_buffer(root->node);
2543                 root->node = NULL;
2544                 return -EIO;
2545         }
2546
2547         btrfs_set_root_node(&root->root_item, root->node);
2548         root->commit_root = btrfs_root_node(root);
2549         btrfs_set_root_refs(&root->root_item, 1);
2550         return ret;
2551 }
2552
2553 static int load_important_roots(struct btrfs_fs_info *fs_info)
2554 {
2555         struct btrfs_super_block *sb = fs_info->super_copy;
2556         u64 gen, bytenr;
2557         int level, ret;
2558
2559         bytenr = btrfs_super_root(sb);
2560         gen = btrfs_super_generation(sb);
2561         level = btrfs_super_root_level(sb);
2562         ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
2563         if (ret) {
2564                 btrfs_warn(fs_info, "couldn't read tree root");
2565                 return ret;
2566         }
2567         return 0;
2568 }
2569
2570 static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2571 {
2572         int backup_index = find_newest_super_backup(fs_info);
2573         struct btrfs_super_block *sb = fs_info->super_copy;
2574         struct btrfs_root *tree_root = fs_info->tree_root;
2575         bool handle_error = false;
2576         int ret = 0;
2577         int i;
2578
2579         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2580                 if (handle_error) {
2581                         if (!IS_ERR(tree_root->node))
2582                                 free_extent_buffer(tree_root->node);
2583                         tree_root->node = NULL;
2584
2585                         if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2586                                 break;
2587
2588                         free_root_pointers(fs_info, 0);
2589
2590                         /*
2591                          * Don't use the log in recovery mode, it won't be
2592                          * valid
2593                          */
2594                         btrfs_set_super_log_root(sb, 0);
2595
2596                         /* We can't trust the free space cache either */
2597                         btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2598
2599                         btrfs_warn(fs_info, "try to load backup roots slot %d", i);
2600                         ret = read_backup_root(fs_info, i);
2601                         backup_index = ret;
2602                         if (ret < 0)
2603                                 return ret;
2604                 }
2605
2606                 ret = load_important_roots(fs_info);
2607                 if (ret) {
2608                         handle_error = true;
2609                         continue;
2610                 }
2611
2612                 /*
2613                  * No need to hold btrfs_root::objectid_mutex since the fs
2614                  * hasn't been fully initialised and we are the only user
2615                  */
2616                 ret = btrfs_init_root_free_objectid(tree_root);
2617                 if (ret < 0) {
2618                         handle_error = true;
2619                         continue;
2620                 }
2621
2622                 ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
2623
2624                 ret = btrfs_read_roots(fs_info);
2625                 if (ret < 0) {
2626                         handle_error = true;
2627                         continue;
2628                 }
2629
2630                 /* All successful */
2631                 fs_info->generation = btrfs_header_generation(tree_root->node);
2632                 fs_info->last_trans_committed = fs_info->generation;
2633                 fs_info->last_reloc_trans = 0;
2634
2635                 /* Always begin writing backup roots after the one being used */
2636                 if (backup_index < 0) {
2637                         fs_info->backup_root_index = 0;
2638                 } else {
2639                         fs_info->backup_root_index = backup_index + 1;
2640                         fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
2641                 }
2642                 break;
2643         }
2644
2645         return ret;
2646 }
2647
2648 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
2649 {
2650         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2651         INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2652         INIT_LIST_HEAD(&fs_info->trans_list);
2653         INIT_LIST_HEAD(&fs_info->dead_roots);
2654         INIT_LIST_HEAD(&fs_info->delayed_iputs);
2655         INIT_LIST_HEAD(&fs_info->delalloc_roots);
2656         INIT_LIST_HEAD(&fs_info->caching_block_groups);
2657         spin_lock_init(&fs_info->delalloc_root_lock);
2658         spin_lock_init(&fs_info->trans_lock);
2659         spin_lock_init(&fs_info->fs_roots_radix_lock);
2660         spin_lock_init(&fs_info->delayed_iput_lock);
2661         spin_lock_init(&fs_info->defrag_inodes_lock);
2662         spin_lock_init(&fs_info->super_lock);
2663         spin_lock_init(&fs_info->buffer_lock);
2664         spin_lock_init(&fs_info->unused_bgs_lock);
2665         spin_lock_init(&fs_info->treelog_bg_lock);
2666         spin_lock_init(&fs_info->zone_active_bgs_lock);
2667         spin_lock_init(&fs_info->relocation_bg_lock);
2668         rwlock_init(&fs_info->tree_mod_log_lock);
2669         rwlock_init(&fs_info->global_root_lock);
2670         mutex_init(&fs_info->unused_bg_unpin_mutex);
2671         mutex_init(&fs_info->reclaim_bgs_lock);
2672         mutex_init(&fs_info->reloc_mutex);
2673         mutex_init(&fs_info->delalloc_root_mutex);
2674         mutex_init(&fs_info->zoned_meta_io_lock);
2675         mutex_init(&fs_info->zoned_data_reloc_io_lock);
2676         seqlock_init(&fs_info->profiles_lock);
2677
2678         btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
2679         btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
2680         btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
2681         btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
2682         btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
2683                                      BTRFS_LOCKDEP_TRANS_COMMIT_START);
2684         btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
2685                                      BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2686         btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
2687                                      BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2688         btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
2689                                      BTRFS_LOCKDEP_TRANS_COMPLETED);
2690
2691         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2692         INIT_LIST_HEAD(&fs_info->space_info);
2693         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2694         INIT_LIST_HEAD(&fs_info->unused_bgs);
2695         INIT_LIST_HEAD(&fs_info->reclaim_bgs);
2696         INIT_LIST_HEAD(&fs_info->zone_active_bgs);
2697 #ifdef CONFIG_BTRFS_DEBUG
2698         INIT_LIST_HEAD(&fs_info->allocated_roots);
2699         INIT_LIST_HEAD(&fs_info->allocated_ebs);
2700         spin_lock_init(&fs_info->eb_leak_lock);
2701 #endif
2702         extent_map_tree_init(&fs_info->mapping_tree);
2703         btrfs_init_block_rsv(&fs_info->global_block_rsv,
2704                              BTRFS_BLOCK_RSV_GLOBAL);
2705         btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2706         btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2707         btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2708         btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2709                              BTRFS_BLOCK_RSV_DELOPS);
2710         btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
2711                              BTRFS_BLOCK_RSV_DELREFS);
2712
2713         atomic_set(&fs_info->async_delalloc_pages, 0);
2714         atomic_set(&fs_info->defrag_running, 0);
2715         atomic_set(&fs_info->nr_delayed_iputs, 0);
2716         atomic64_set(&fs_info->tree_mod_seq, 0);
2717         fs_info->global_root_tree = RB_ROOT;
2718         fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2719         fs_info->metadata_ratio = 0;
2720         fs_info->defrag_inodes = RB_ROOT;
2721         atomic64_set(&fs_info->free_chunk_space, 0);
2722         fs_info->tree_mod_log = RB_ROOT;
2723         fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2724         btrfs_init_ref_verify(fs_info);
2725
2726         fs_info->thread_pool_size = min_t(unsigned long,
2727                                           num_online_cpus() + 2, 8);
2728
2729         INIT_LIST_HEAD(&fs_info->ordered_roots);
2730         spin_lock_init(&fs_info->ordered_root_lock);
2731
2732         btrfs_init_scrub(fs_info);
2733 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2734         fs_info->check_integrity_print_mask = 0;
2735 #endif
2736         btrfs_init_balance(fs_info);
2737         btrfs_init_async_reclaim_work(fs_info);
2738
2739         rwlock_init(&fs_info->block_group_cache_lock);
2740         fs_info->block_group_cache_tree = RB_ROOT_CACHED;
2741
2742         extent_io_tree_init(fs_info, &fs_info->excluded_extents,
2743                             IO_TREE_FS_EXCLUDED_EXTENTS);
2744
2745         mutex_init(&fs_info->ordered_operations_mutex);
2746         mutex_init(&fs_info->tree_log_mutex);
2747         mutex_init(&fs_info->chunk_mutex);
2748         mutex_init(&fs_info->transaction_kthread_mutex);
2749         mutex_init(&fs_info->cleaner_mutex);
2750         mutex_init(&fs_info->ro_block_group_mutex);
2751         init_rwsem(&fs_info->commit_root_sem);
2752         init_rwsem(&fs_info->cleanup_work_sem);
2753         init_rwsem(&fs_info->subvol_sem);
2754         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2755
2756         btrfs_init_dev_replace_locks(fs_info);
2757         btrfs_init_qgroup(fs_info);
2758         btrfs_discard_init(fs_info);
2759
2760         btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2761         btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2762
2763         init_waitqueue_head(&fs_info->transaction_throttle);
2764         init_waitqueue_head(&fs_info->transaction_wait);
2765         init_waitqueue_head(&fs_info->transaction_blocked_wait);
2766         init_waitqueue_head(&fs_info->async_submit_wait);
2767         init_waitqueue_head(&fs_info->delayed_iputs_wait);
2768
2769         /* Usable values until the real ones are cached from the superblock */
2770         fs_info->nodesize = 4096;
2771         fs_info->sectorsize = 4096;
2772         fs_info->sectorsize_bits = ilog2(4096);
2773         fs_info->stripesize = 4096;
2774
2775         fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
2776
2777         spin_lock_init(&fs_info->swapfile_pins_lock);
2778         fs_info->swapfile_pins = RB_ROOT;
2779
2780         fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
2781         INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
2782 }
2783
2784 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
2785 {
2786         int ret;
2787
2788         fs_info->sb = sb;
2789         sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
2790         sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
2791
2792         ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
2793         if (ret)
2794                 return ret;
2795
2796         ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2797         if (ret)
2798                 return ret;
2799
2800         fs_info->dirty_metadata_batch = PAGE_SIZE *
2801                                         (1 + ilog2(nr_cpu_ids));
2802
2803         ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2804         if (ret)
2805                 return ret;
2806
2807         ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
2808                         GFP_KERNEL);
2809         if (ret)
2810                 return ret;
2811
2812         fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2813                                         GFP_KERNEL);
2814         if (!fs_info->delayed_root)
2815                 return -ENOMEM;
2816         btrfs_init_delayed_root(fs_info->delayed_root);
2817
2818         if (sb_rdonly(sb))
2819                 set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
2820
2821         return btrfs_alloc_stripe_hash_table(fs_info);
2822 }
2823
2824 static int btrfs_uuid_rescan_kthread(void *data)
2825 {
2826         struct btrfs_fs_info *fs_info = data;
2827         int ret;
2828
2829         /*
2830          * 1st step is to iterate through the existing UUID tree and
2831          * to delete all entries that contain outdated data.
2832          * 2nd step is to add all missing entries to the UUID tree.
2833          */
2834         ret = btrfs_uuid_tree_iterate(fs_info);
2835         if (ret < 0) {
2836                 if (ret != -EINTR)
2837                         btrfs_warn(fs_info, "iterating uuid_tree failed %d",
2838                                    ret);
2839                 up(&fs_info->uuid_tree_rescan_sem);
2840                 return ret;
2841         }
2842         return btrfs_uuid_scan_kthread(data);
2843 }
2844
2845 static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
2846 {
2847         struct task_struct *task;
2848
2849         down(&fs_info->uuid_tree_rescan_sem);
2850         task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
2851         if (IS_ERR(task)) {
2852                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
2853                 btrfs_warn(fs_info, "failed to start uuid_rescan task");
2854                 up(&fs_info->uuid_tree_rescan_sem);
2855                 return PTR_ERR(task);
2856         }
2857
2858         return 0;
2859 }
2860
2861 /*
2862  * Some options only have meaning at mount time and shouldn't persist across
2863  * remounts, or be displayed. Clear these at the end of mount and remount
2864  * code paths.
2865  */
2866 void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
2867 {
2868         btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
2869         btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
2870 }
2871
2872 /*
2873  * Mounting logic specific to read-write file systems. Shared by open_ctree
2874  * and btrfs_remount when remounting from read-only to read-write.
2875  */
2876 int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
2877 {
2878         int ret;
2879         const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
2880         bool rebuild_free_space_tree = false;
2881
2882         if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
2883             btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
2884                 rebuild_free_space_tree = true;
2885         } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
2886                    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
2887                 btrfs_warn(fs_info, "free space tree is invalid");
2888                 rebuild_free_space_tree = true;
2889         }
2890
2891         if (rebuild_free_space_tree) {
2892                 btrfs_info(fs_info, "rebuilding free space tree");
2893                 ret = btrfs_rebuild_free_space_tree(fs_info);
2894                 if (ret) {
2895                         btrfs_warn(fs_info,
2896                                    "failed to rebuild free space tree: %d", ret);
2897                         goto out;
2898                 }
2899         }
2900
2901         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
2902             !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
2903                 btrfs_info(fs_info, "disabling free space tree");
2904                 ret = btrfs_delete_free_space_tree(fs_info);
2905                 if (ret) {
2906                         btrfs_warn(fs_info,
2907                                    "failed to disable free space tree: %d", ret);
2908                         goto out;
2909                 }
2910         }
2911
2912         /*
2913          * btrfs_find_orphan_roots() is responsible for finding all the dead
2914          * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
2915          * them into the fs_info->fs_roots_radix tree. This must be done before
2916          * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
2917          * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
2918          * item before the root's tree is deleted - this means that if we unmount
2919          * or crash before the deletion completes, on the next mount we will not
2920          * delete what remains of the tree because the orphan item does not
2921          * exists anymore, which is what tells us we have a pending deletion.
2922          */
2923         ret = btrfs_find_orphan_roots(fs_info);
2924         if (ret)
2925                 goto out;
2926
2927         ret = btrfs_cleanup_fs_roots(fs_info);
2928         if (ret)
2929                 goto out;
2930
2931         down_read(&fs_info->cleanup_work_sem);
2932         if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
2933             (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
2934                 up_read(&fs_info->cleanup_work_sem);
2935                 goto out;
2936         }
2937         up_read(&fs_info->cleanup_work_sem);
2938
2939         mutex_lock(&fs_info->cleaner_mutex);
2940         ret = btrfs_recover_relocation(fs_info);
2941         mutex_unlock(&fs_info->cleaner_mutex);
2942         if (ret < 0) {
2943                 btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
2944                 goto out;
2945         }
2946
2947         if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
2948             !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
2949                 btrfs_info(fs_info, "creating free space tree");
2950                 ret = btrfs_create_free_space_tree(fs_info);
2951                 if (ret) {
2952                         btrfs_warn(fs_info,
2953                                 "failed to create free space tree: %d", ret);
2954                         goto out;
2955                 }
2956         }
2957
2958         if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
2959                 ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
2960                 if (ret)
2961                         goto out;
2962         }
2963
2964         ret = btrfs_resume_balance_async(fs_info);
2965         if (ret)
2966                 goto out;
2967
2968         ret = btrfs_resume_dev_replace_async(fs_info);
2969         if (ret) {
2970                 btrfs_warn(fs_info, "failed to resume dev_replace");
2971                 goto out;
2972         }
2973
2974         btrfs_qgroup_rescan_resume(fs_info);
2975
2976         if (!fs_info->uuid_root) {
2977                 btrfs_info(fs_info, "creating UUID tree");
2978                 ret = btrfs_create_uuid_tree(fs_info);
2979                 if (ret) {
2980                         btrfs_warn(fs_info,
2981                                    "failed to create the UUID tree %d", ret);
2982                         goto out;
2983                 }
2984         }
2985
2986 out:
2987         return ret;
2988 }
2989
2990 /*
2991  * Do various sanity and dependency checks of different features.
2992  *
2993  * @is_rw_mount:        If the mount is read-write.
2994  *
2995  * This is the place for less strict checks (like for subpage or artificial
2996  * feature dependencies).
2997  *
2998  * For strict checks or possible corruption detection, see
2999  * btrfs_validate_super().
3000  *
3001  * This should be called after btrfs_parse_options(), as some mount options
3002  * (space cache related) can modify on-disk format like free space tree and
3003  * screw up certain feature dependencies.
3004  */
3005 int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
3006 {
3007         struct btrfs_super_block *disk_super = fs_info->super_copy;
3008         u64 incompat = btrfs_super_incompat_flags(disk_super);
3009         const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
3010         const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
3011
3012         if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
3013                 btrfs_err(fs_info,
3014                 "cannot mount because of unknown incompat features (0x%llx)",
3015                     incompat);
3016                 return -EINVAL;
3017         }
3018
3019         /* Runtime limitation for mixed block groups. */
3020         if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3021             (fs_info->sectorsize != fs_info->nodesize)) {
3022                 btrfs_err(fs_info,
3023 "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3024                         fs_info->nodesize, fs_info->sectorsize);
3025                 return -EINVAL;
3026         }
3027
3028         /* Mixed backref is an always-enabled feature. */
3029         incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3030
3031         /* Set compression related flags just in case. */
3032         if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3033                 incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3034         else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3035                 incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3036
3037         /*
3038          * An ancient flag, which should really be marked deprecated.
3039          * Such runtime limitation doesn't really need a incompat flag.
3040          */
3041         if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
3042                 incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3043
3044         if (compat_ro_unsupp && is_rw_mount) {
3045                 btrfs_err(fs_info,
3046         "cannot mount read-write because of unknown compat_ro features (0x%llx)",
3047                        compat_ro);
3048                 return -EINVAL;
3049         }
3050
3051         /*
3052          * We have unsupported RO compat features, although RO mounted, we
3053          * should not cause any metadata writes, including log replay.
3054          * Or we could screw up whatever the new feature requires.
3055          */
3056         if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
3057             !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3058                 btrfs_err(fs_info,
3059 "cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
3060                           compat_ro);
3061                 return -EINVAL;
3062         }
3063
3064         /*
3065          * Artificial limitations for block group tree, to force
3066          * block-group-tree to rely on no-holes and free-space-tree.
3067          */
3068         if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
3069             (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
3070              !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
3071                 btrfs_err(fs_info,
3072 "block-group-tree feature requires no-holes and free-space-tree features");
3073                 return -EINVAL;
3074         }
3075
3076         /*
3077          * Subpage runtime limitation on v1 cache.
3078          *
3079          * V1 space cache still has some hard codeed PAGE_SIZE usage, while
3080          * we're already defaulting to v2 cache, no need to bother v1 as it's
3081          * going to be deprecated anyway.
3082          */
3083         if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
3084                 btrfs_warn(fs_info,
3085         "v1 space cache is not supported for page size %lu with sectorsize %u",
3086                            PAGE_SIZE, fs_info->sectorsize);
3087                 return -EINVAL;
3088         }
3089
3090         /* This can be called by remount, we need to protect the super block. */
3091         spin_lock(&fs_info->super_lock);
3092         btrfs_set_super_incompat_flags(disk_super, incompat);
3093         spin_unlock(&fs_info->super_lock);
3094
3095         return 0;
3096 }
3097
3098 int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
3099                       char *options)
3100 {
3101         u32 sectorsize;
3102         u32 nodesize;
3103         u32 stripesize;
3104         u64 generation;
3105         u64 features;
3106         u16 csum_type;
3107         struct btrfs_super_block *disk_super;
3108         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3109         struct btrfs_root *tree_root;
3110         struct btrfs_root *chunk_root;
3111         int ret;
3112         int level;
3113
3114         ret = init_mount_fs_info(fs_info, sb);
3115         if (ret)
3116                 goto fail;
3117
3118         /* These need to be init'ed before we start creating inodes and such. */
3119         tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3120                                      GFP_KERNEL);
3121         fs_info->tree_root = tree_root;
3122         chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3123                                       GFP_KERNEL);
3124         fs_info->chunk_root = chunk_root;
3125         if (!tree_root || !chunk_root) {
3126                 ret = -ENOMEM;
3127                 goto fail;
3128         }
3129
3130         ret = btrfs_init_btree_inode(sb);
3131         if (ret)
3132                 goto fail;
3133
3134         invalidate_bdev(fs_devices->latest_dev->bdev);
3135
3136         /*
3137          * Read super block and check the signature bytes only
3138          */
3139         disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
3140         if (IS_ERR(disk_super)) {
3141                 ret = PTR_ERR(disk_super);
3142                 goto fail_alloc;
3143         }
3144
3145         /*
3146          * Verify the type first, if that or the checksum value are
3147          * corrupted, we'll find out
3148          */
3149         csum_type = btrfs_super_csum_type(disk_super);
3150         if (!btrfs_supported_super_csum(csum_type)) {
3151                 btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3152                           csum_type);
3153                 ret = -EINVAL;
3154                 btrfs_release_disk_super(disk_super);
3155                 goto fail_alloc;
3156         }
3157
3158         fs_info->csum_size = btrfs_super_csum_size(disk_super);
3159
3160         ret = btrfs_init_csum_hash(fs_info, csum_type);
3161         if (ret) {
3162                 btrfs_release_disk_super(disk_super);
3163                 goto fail_alloc;
3164         }
3165
3166         /*
3167          * We want to check superblock checksum, the type is stored inside.
3168          * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3169          */
3170         if (btrfs_check_super_csum(fs_info, disk_super)) {
3171                 btrfs_err(fs_info, "superblock checksum mismatch");
3172                 ret = -EINVAL;
3173                 btrfs_release_disk_super(disk_super);
3174                 goto fail_alloc;
3175         }
3176
3177         /*
3178          * super_copy is zeroed at allocation time and we never touch the
3179          * following bytes up to INFO_SIZE, the checksum is calculated from
3180          * the whole block of INFO_SIZE
3181          */
3182         memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3183         btrfs_release_disk_super(disk_super);
3184
3185         disk_super = fs_info->super_copy;
3186
3187
3188         features = btrfs_super_flags(disk_super);
3189         if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
3190                 features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
3191                 btrfs_set_super_flags(disk_super, features);
3192                 btrfs_info(fs_info,
3193                         "found metadata UUID change in progress flag, clearing");
3194         }
3195
3196         memcpy(fs_info->super_for_commit, fs_info->super_copy,
3197                sizeof(*fs_info->super_for_commit));
3198
3199         ret = btrfs_validate_mount_super(fs_info);
3200         if (ret) {
3201                 btrfs_err(fs_info, "superblock contains fatal errors");
3202                 ret = -EINVAL;
3203                 goto fail_alloc;
3204         }
3205
3206         if (!btrfs_super_root(disk_super)) {
3207                 btrfs_err(fs_info, "invalid superblock tree root bytenr");
3208                 ret = -EINVAL;
3209                 goto fail_alloc;
3210         }
3211
3212         /* check FS state, whether FS is broken. */
3213         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3214                 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
3215
3216         /*
3217          * In the long term, we'll store the compression type in the super
3218          * block, and it'll be used for per file compression control.
3219          */
3220         fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
3221
3222
3223         /* Set up fs_info before parsing mount options */
3224         nodesize = btrfs_super_nodesize(disk_super);
3225         sectorsize = btrfs_super_sectorsize(disk_super);
3226         stripesize = sectorsize;
3227         fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3228         fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3229
3230         fs_info->nodesize = nodesize;
3231         fs_info->sectorsize = sectorsize;
3232         fs_info->sectorsize_bits = ilog2(sectorsize);
3233         fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3234         fs_info->stripesize = stripesize;
3235
3236         ret = btrfs_parse_options(fs_info, options, sb->s_flags);
3237         if (ret)
3238                 goto fail_alloc;
3239
3240         ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
3241         if (ret < 0)
3242                 goto fail_alloc;
3243
3244         if (sectorsize < PAGE_SIZE) {
3245                 struct btrfs_subpage_info *subpage_info;
3246
3247                 /*
3248                  * V1 space cache has some hardcoded PAGE_SIZE usage, and is
3249                  * going to be deprecated.
3250                  *
3251                  * Force to use v2 cache for subpage case.
3252                  */
3253                 btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
3254                 btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
3255                         "forcing free space tree for sector size %u with page size %lu",
3256                         sectorsize, PAGE_SIZE);
3257
3258                 btrfs_warn(fs_info,
3259                 "read-write for sector size %u with page size %lu is experimental",
3260                            sectorsize, PAGE_SIZE);
3261                 subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
3262                 if (!subpage_info) {
3263                         ret = -ENOMEM;
3264                         goto fail_alloc;
3265                 }
3266                 btrfs_init_subpage_info(subpage_info, sectorsize);
3267                 fs_info->subpage_info = subpage_info;
3268         }
3269
3270         ret = btrfs_init_workqueues(fs_info);
3271         if (ret)
3272                 goto fail_sb_buffer;
3273
3274         sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3275         sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3276
3277         sb->s_blocksize = sectorsize;
3278         sb->s_blocksize_bits = blksize_bits(sectorsize);
3279         memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3280
3281         mutex_lock(&fs_info->chunk_mutex);
3282         ret = btrfs_read_sys_array(fs_info);
3283         mutex_unlock(&fs_info->chunk_mutex);
3284         if (ret) {
3285                 btrfs_err(fs_info, "failed to read the system array: %d", ret);
3286                 goto fail_sb_buffer;
3287         }
3288
3289         generation = btrfs_super_chunk_root_generation(disk_super);
3290         level = btrfs_super_chunk_root_level(disk_super);
3291         ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
3292                               generation, level);
3293         if (ret) {
3294                 btrfs_err(fs_info, "failed to read chunk root");
3295                 goto fail_tree_roots;
3296         }
3297
3298         read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3299                            offsetof(struct btrfs_header, chunk_tree_uuid),
3300                            BTRFS_UUID_SIZE);
3301
3302         ret = btrfs_read_chunk_tree(fs_info);
3303         if (ret) {
3304                 btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3305                 goto fail_tree_roots;
3306         }
3307
3308         /*
3309          * At this point we know all the devices that make this filesystem,
3310          * including the seed devices but we don't know yet if the replace
3311          * target is required. So free devices that are not part of this
3312          * filesystem but skip the replace target device which is checked
3313          * below in btrfs_init_dev_replace().
3314          */
3315         btrfs_free_extra_devids(fs_devices);
3316         if (!fs_devices->latest_dev->bdev) {
3317                 btrfs_err(fs_info, "failed to read devices");
3318                 ret = -EIO;
3319                 goto fail_tree_roots;
3320         }
3321
3322         ret = init_tree_roots(fs_info);
3323         if (ret)
3324                 goto fail_tree_roots;
3325
3326         /*
3327          * Get zone type information of zoned block devices. This will also
3328          * handle emulation of a zoned filesystem if a regular device has the
3329          * zoned incompat feature flag set.
3330          */
3331         ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3332         if (ret) {
3333                 btrfs_err(fs_info,
3334                           "zoned: failed to read device zone info: %d", ret);
3335                 goto fail_block_groups;
3336         }
3337
3338         /*
3339          * If we have a uuid root and we're not being told to rescan we need to
3340          * check the generation here so we can set the
3341          * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
3342          * transaction during a balance or the log replay without updating the
3343          * uuid generation, and then if we crash we would rescan the uuid tree,
3344          * even though it was perfectly fine.
3345          */
3346         if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3347             fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3348                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3349
3350         ret = btrfs_verify_dev_extents(fs_info);
3351         if (ret) {
3352                 btrfs_err(fs_info,
3353                           "failed to verify dev extents against chunks: %d",
3354                           ret);
3355                 goto fail_block_groups;
3356         }
3357         ret = btrfs_recover_balance(fs_info);
3358         if (ret) {
3359                 btrfs_err(fs_info, "failed to recover balance: %d", ret);
3360                 goto fail_block_groups;
3361         }
3362
3363         ret = btrfs_init_dev_stats(fs_info);
3364         if (ret) {
3365                 btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3366                 goto fail_block_groups;
3367         }
3368
3369         ret = btrfs_init_dev_replace(fs_info);
3370         if (ret) {
3371                 btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3372                 goto fail_block_groups;
3373         }
3374
3375         ret = btrfs_check_zoned_mode(fs_info);
3376         if (ret) {
3377                 btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3378                           ret);
3379                 goto fail_block_groups;
3380         }
3381
3382         ret = btrfs_sysfs_add_fsid(fs_devices);
3383         if (ret) {
3384                 btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3385                                 ret);
3386                 goto fail_block_groups;
3387         }
3388
3389         ret = btrfs_sysfs_add_mounted(fs_info);
3390         if (ret) {
3391                 btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3392                 goto fail_fsdev_sysfs;
3393         }
3394
3395         ret = btrfs_init_space_info(fs_info);
3396         if (ret) {
3397                 btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3398                 goto fail_sysfs;
3399         }
3400
3401         ret = btrfs_read_block_groups(fs_info);
3402         if (ret) {
3403                 btrfs_err(fs_info, "failed to read block groups: %d", ret);
3404                 goto fail_sysfs;
3405         }
3406
3407         btrfs_free_zone_cache(fs_info);
3408
3409         if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
3410             !btrfs_check_rw_degradable(fs_info, NULL)) {
3411                 btrfs_warn(fs_info,
3412                 "writable mount is not allowed due to too many missing devices");
3413                 ret = -EINVAL;
3414                 goto fail_sysfs;
3415         }
3416
3417         fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
3418                                                "btrfs-cleaner");
3419         if (IS_ERR(fs_info->cleaner_kthread)) {
3420                 ret = PTR_ERR(fs_info->cleaner_kthread);
3421                 goto fail_sysfs;
3422         }
3423
3424         fs_info->transaction_kthread = kthread_run(transaction_kthread,
3425                                                    tree_root,
3426                                                    "btrfs-transaction");
3427         if (IS_ERR(fs_info->transaction_kthread)) {
3428                 ret = PTR_ERR(fs_info->transaction_kthread);
3429                 goto fail_cleaner;
3430         }
3431
3432         if (!btrfs_test_opt(fs_info, NOSSD) &&
3433             !fs_info->fs_devices->rotating) {
3434                 btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
3435         }
3436
3437         /*
3438          * For devices supporting discard turn on discard=async automatically,
3439          * unless it's already set or disabled. This could be turned off by
3440          * nodiscard for the same mount.
3441          */
3442         if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
3443               btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
3444               btrfs_test_opt(fs_info, NODISCARD)) &&
3445             fs_info->fs_devices->discardable) {
3446                 btrfs_set_and_info(fs_info, DISCARD_ASYNC,
3447                                    "auto enabling async discard");
3448         }
3449
3450 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3451         if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
3452                 ret = btrfsic_mount(fs_info, fs_devices,
3453                                     btrfs_test_opt(fs_info,
3454                                         CHECK_INTEGRITY_DATA) ? 1 : 0,
3455                                     fs_info->check_integrity_print_mask);
3456                 if (ret)
3457                         btrfs_warn(fs_info,
3458                                 "failed to initialize integrity check module: %d",
3459                                 ret);
3460         }
3461 #endif
3462         ret = btrfs_read_qgroup_config(fs_info);
3463         if (ret)
3464                 goto fail_trans_kthread;
3465
3466         if (btrfs_build_ref_tree(fs_info))
3467                 btrfs_err(fs_info, "couldn't build ref tree");
3468
3469         /* do not make disk changes in broken FS or nologreplay is given */
3470         if (btrfs_super_log_root(disk_super) != 0 &&
3471             !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3472                 btrfs_info(fs_info, "start tree-log replay");
3473                 ret = btrfs_replay_log(fs_info, fs_devices);
3474                 if (ret)
3475                         goto fail_qgroup;
3476         }
3477
3478         fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3479         if (IS_ERR(fs_info->fs_root)) {
3480                 ret = PTR_ERR(fs_info->fs_root);
3481                 btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
3482                 fs_info->fs_root = NULL;
3483                 goto fail_qgroup;
3484         }
3485
3486         if (sb_rdonly(sb))
3487                 goto clear_oneshot;
3488
3489         ret = btrfs_start_pre_rw_mount(fs_info);
3490         if (ret) {
3491                 close_ctree(fs_info);
3492                 return ret;
3493         }
3494         btrfs_discard_resume(fs_info);
3495
3496         if (fs_info->uuid_root &&
3497             (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3498              fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
3499                 btrfs_info(fs_info, "checking UUID tree");
3500                 ret = btrfs_check_uuid_tree(fs_info);
3501                 if (ret) {
3502                         btrfs_warn(fs_info,
3503                                 "failed to check the UUID tree: %d", ret);
3504                         close_ctree(fs_info);
3505                         return ret;
3506                 }
3507         }
3508
3509         set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3510
3511         /* Kick the cleaner thread so it'll start deleting snapshots. */
3512         if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3513                 wake_up_process(fs_info->cleaner_kthread);
3514
3515 clear_oneshot:
3516         btrfs_clear_oneshot_options(fs_info);
3517         return 0;
3518
3519 fail_qgroup:
3520         btrfs_free_qgroup_config(fs_info);
3521 fail_trans_kthread:
3522         kthread_stop(fs_info->transaction_kthread);
3523         btrfs_cleanup_transaction(fs_info);
3524         btrfs_free_fs_roots(fs_info);
3525 fail_cleaner:
3526         kthread_stop(fs_info->cleaner_kthread);
3527
3528         /*
3529          * make sure we're done with the btree inode before we stop our
3530          * kthreads
3531          */
3532         filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3533
3534 fail_sysfs:
3535         btrfs_sysfs_remove_mounted(fs_info);
3536
3537 fail_fsdev_sysfs:
3538         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3539
3540 fail_block_groups:
3541         btrfs_put_block_group_cache(fs_info);
3542
3543 fail_tree_roots:
3544         if (fs_info->data_reloc_root)
3545                 btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3546         free_root_pointers(fs_info, true);
3547         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3548
3549 fail_sb_buffer:
3550         btrfs_stop_all_workers(fs_info);
3551         btrfs_free_block_groups(fs_info);
3552 fail_alloc:
3553         btrfs_mapping_tree_free(&fs_info->mapping_tree);
3554
3555         iput(fs_info->btree_inode);
3556 fail:
3557         btrfs_close_devices(fs_info->fs_devices);
3558         ASSERT(ret < 0);
3559         return ret;
3560 }
3561 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3562
3563 static void btrfs_end_super_write(struct bio *bio)
3564 {
3565         struct btrfs_device *device = bio->bi_private;
3566         struct bio_vec *bvec;
3567         struct bvec_iter_all iter_all;
3568         struct page *page;
3569
3570         bio_for_each_segment_all(bvec, bio, iter_all) {
3571                 page = bvec->bv_page;
3572
3573                 if (bio->bi_status) {
3574                         btrfs_warn_rl_in_rcu(device->fs_info,
3575                                 "lost page write due to IO error on %s (%d)",
3576                                 btrfs_dev_name(device),
3577                                 blk_status_to_errno(bio->bi_status));
3578                         ClearPageUptodate(page);
3579                         SetPageError(page);
3580                         btrfs_dev_stat_inc_and_print(device,
3581                                                      BTRFS_DEV_STAT_WRITE_ERRS);
3582                 } else {
3583                         SetPageUptodate(page);
3584                 }
3585
3586                 put_page(page);
3587                 unlock_page(page);
3588         }
3589
3590         bio_put(bio);
3591 }
3592
3593 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
3594                                                    int copy_num, bool drop_cache)
3595 {
3596         struct btrfs_super_block *super;
3597         struct page *page;
3598         u64 bytenr, bytenr_orig;
3599         struct address_space *mapping = bdev->bd_inode->i_mapping;
3600         int ret;
3601
3602         bytenr_orig = btrfs_sb_offset(copy_num);
3603         ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
3604         if (ret == -ENOENT)
3605                 return ERR_PTR(-EINVAL);
3606         else if (ret)
3607                 return ERR_PTR(ret);
3608
3609         if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
3610                 return ERR_PTR(-EINVAL);
3611
3612         if (drop_cache) {
3613                 /* This should only be called with the primary sb. */
3614                 ASSERT(copy_num == 0);
3615
3616                 /*
3617                  * Drop the page of the primary superblock, so later read will
3618                  * always read from the device.
3619                  */
3620                 invalidate_inode_pages2_range(mapping,
3621                                 bytenr >> PAGE_SHIFT,
3622                                 (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
3623         }
3624
3625         page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
3626         if (IS_ERR(page))
3627                 return ERR_CAST(page);
3628
3629         super = page_address(page);
3630         if (btrfs_super_magic(super) != BTRFS_MAGIC) {
3631                 btrfs_release_disk_super(super);
3632                 return ERR_PTR(-ENODATA);
3633         }
3634
3635         if (btrfs_super_bytenr(super) != bytenr_orig) {
3636                 btrfs_release_disk_super(super);
3637                 return ERR_PTR(-EINVAL);
3638         }
3639
3640         return super;
3641 }
3642
3643
3644 struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
3645 {
3646         struct btrfs_super_block *super, *latest = NULL;
3647         int i;
3648         u64 transid = 0;
3649
3650         /* we would like to check all the supers, but that would make
3651          * a btrfs mount succeed after a mkfs from a different FS.
3652          * So, we need to add a special mount option to scan for
3653          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3654          */
3655         for (i = 0; i < 1; i++) {
3656                 super = btrfs_read_dev_one_super(bdev, i, false);
3657                 if (IS_ERR(super))
3658                         continue;
3659
3660                 if (!latest || btrfs_super_generation(super) > transid) {
3661                         if (latest)
3662                                 btrfs_release_disk_super(super);
3663
3664                         latest = super;
3665                         transid = btrfs_super_generation(super);
3666                 }
3667         }
3668
3669         return super;
3670 }
3671
3672 /*
3673  * Write superblock @sb to the @device. Do not wait for completion, all the
3674  * pages we use for writing are locked.
3675  *
3676  * Write @max_mirrors copies of the superblock, where 0 means default that fit
3677  * the expected device size at commit time. Note that max_mirrors must be
3678  * same for write and wait phases.
3679  *
3680  * Return number of errors when page is not found or submission fails.
3681  */
3682 static int write_dev_supers(struct btrfs_device *device,
3683                             struct btrfs_super_block *sb, int max_mirrors)
3684 {
3685         struct btrfs_fs_info *fs_info = device->fs_info;
3686         struct address_space *mapping = device->bdev->bd_inode->i_mapping;
3687         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3688         int i;
3689         int errors = 0;
3690         int ret;
3691         u64 bytenr, bytenr_orig;
3692
3693         if (max_mirrors == 0)
3694                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3695
3696         shash->tfm = fs_info->csum_shash;
3697
3698         for (i = 0; i < max_mirrors; i++) {
3699                 struct page *page;
3700                 struct bio *bio;
3701                 struct btrfs_super_block *disk_super;
3702
3703                 bytenr_orig = btrfs_sb_offset(i);
3704                 ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
3705                 if (ret == -ENOENT) {
3706                         continue;
3707                 } else if (ret < 0) {
3708                         btrfs_err(device->fs_info,
3709                                 "couldn't get super block location for mirror %d",
3710                                 i);
3711                         errors++;
3712                         continue;
3713                 }
3714                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3715                     device->commit_total_bytes)
3716                         break;
3717
3718                 btrfs_set_super_bytenr(sb, bytenr_orig);
3719
3720                 crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
3721                                     BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
3722                                     sb->csum);
3723
3724                 page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
3725                                            GFP_NOFS);
3726                 if (!page) {
3727                         btrfs_err(device->fs_info,
3728                             "couldn't get super block page for bytenr %llu",
3729                             bytenr);
3730                         errors++;
3731                         continue;
3732                 }
3733
3734                 /* Bump the refcount for wait_dev_supers() */
3735                 get_page(page);
3736
3737                 disk_super = page_address(page);
3738                 memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
3739
3740                 /*
3741                  * Directly use bios here instead of relying on the page cache
3742                  * to do I/O, so we don't lose the ability to do integrity
3743                  * checking.
3744                  */
3745                 bio = bio_alloc(device->bdev, 1,
3746                                 REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
3747                                 GFP_NOFS);
3748                 bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
3749                 bio->bi_private = device;
3750                 bio->bi_end_io = btrfs_end_super_write;
3751                 __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
3752                                offset_in_page(bytenr));
3753
3754                 /*
3755                  * We FUA only the first super block.  The others we allow to
3756                  * go down lazy and there's a short window where the on-disk
3757                  * copies might still contain the older version.
3758                  */
3759                 if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
3760                         bio->bi_opf |= REQ_FUA;
3761
3762                 btrfsic_check_bio(bio);
3763                 submit_bio(bio);
3764
3765                 if (btrfs_advance_sb_log(device, i))
3766                         errors++;
3767         }
3768         return errors < i ? 0 : -1;
3769 }
3770
3771 /*
3772  * Wait for write completion of superblocks done by write_dev_supers,
3773  * @max_mirrors same for write and wait phases.
3774  *
3775  * Return number of errors when page is not found or not marked up to
3776  * date.
3777  */
3778 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
3779 {
3780         int i;
3781         int errors = 0;
3782         bool primary_failed = false;
3783         int ret;
3784         u64 bytenr;
3785
3786         if (max_mirrors == 0)
3787                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3788
3789         for (i = 0; i < max_mirrors; i++) {
3790                 struct page *page;
3791
3792                 ret = btrfs_sb_log_location(device, i, READ, &bytenr);
3793                 if (ret == -ENOENT) {
3794                         break;
3795                 } else if (ret < 0) {
3796                         errors++;
3797                         if (i == 0)
3798                                 primary_failed = true;
3799                         continue;
3800                 }
3801                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3802                     device->commit_total_bytes)
3803                         break;
3804
3805                 page = find_get_page(device->bdev->bd_inode->i_mapping,
3806                                      bytenr >> PAGE_SHIFT);
3807                 if (!page) {
3808                         errors++;
3809                         if (i == 0)
3810                                 primary_failed = true;
3811                         continue;
3812                 }
3813                 /* Page is submitted locked and unlocked once the IO completes */
3814                 wait_on_page_locked(page);
3815                 if (PageError(page)) {
3816                         errors++;
3817                         if (i == 0)
3818                                 primary_failed = true;
3819                 }
3820
3821                 /* Drop our reference */
3822                 put_page(page);
3823
3824                 /* Drop the reference from the writing run */
3825                 put_page(page);
3826         }
3827
3828         /* log error, force error return */
3829         if (primary_failed) {
3830                 btrfs_err(device->fs_info, "error writing primary super block to device %llu",
3831                           device->devid);
3832                 return -1;
3833         }
3834
3835         return errors < i ? 0 : -1;
3836 }
3837
3838 /*
3839  * endio for the write_dev_flush, this will wake anyone waiting
3840  * for the barrier when it is done
3841  */
3842 static void btrfs_end_empty_barrier(struct bio *bio)
3843 {
3844         bio_uninit(bio);
3845         complete(bio->bi_private);
3846 }
3847
3848 /*
3849  * Submit a flush request to the device if it supports it. Error handling is
3850  * done in the waiting counterpart.
3851  */
3852 static void write_dev_flush(struct btrfs_device *device)
3853 {
3854         struct bio *bio = &device->flush_bio;
3855
3856         device->last_flush_error = BLK_STS_OK;
3857
3858 #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3859         /*
3860          * When a disk has write caching disabled, we skip submission of a bio
3861          * with flush and sync requests before writing the superblock, since
3862          * it's not needed. However when the integrity checker is enabled, this
3863          * results in reports that there are metadata blocks referred by a
3864          * superblock that were not properly flushed. So don't skip the bio
3865          * submission only when the integrity checker is enabled for the sake
3866          * of simplicity, since this is a debug tool and not meant for use in
3867          * non-debug builds.
3868          */
3869         if (!bdev_write_cache(device->bdev))
3870                 return;
3871 #endif
3872
3873         bio_init(bio, device->bdev, NULL, 0,
3874                  REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
3875         bio->bi_end_io = btrfs_end_empty_barrier;
3876         init_completion(&device->flush_wait);
3877         bio->bi_private = &device->flush_wait;
3878
3879         btrfsic_check_bio(bio);
3880         submit_bio(bio);
3881         set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
3882 }
3883
3884 /*
3885  * If the flush bio has been submitted by write_dev_flush, wait for it.
3886  * Return true for any error, and false otherwise.
3887  */
3888 static bool wait_dev_flush(struct btrfs_device *device)
3889 {
3890         struct bio *bio = &device->flush_bio;
3891
3892         if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
3893                 return false;
3894
3895         wait_for_completion_io(&device->flush_wait);
3896
3897         if (bio->bi_status) {
3898                 device->last_flush_error = bio->bi_status;
3899                 btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
3900                 return true;
3901         }
3902
3903         return false;
3904 }
3905
3906 /*
3907  * send an empty flush down to each device in parallel,
3908  * then wait for them
3909  */
3910 static int barrier_all_devices(struct btrfs_fs_info *info)
3911 {
3912         struct list_head *head;
3913         struct btrfs_device *dev;
3914         int errors_wait = 0;
3915
3916         lockdep_assert_held(&info->fs_devices->device_list_mutex);
3917         /* send down all the barriers */
3918         head = &info->fs_devices->devices;
3919         list_for_each_entry(dev, head, dev_list) {
3920                 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3921                         continue;
3922                 if (!dev->bdev)
3923                         continue;
3924                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3925                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3926                         continue;
3927
3928                 write_dev_flush(dev);
3929         }
3930
3931         /* wait for all the barriers */
3932         list_for_each_entry(dev, head, dev_list) {
3933                 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3934                         continue;
3935                 if (!dev->bdev) {
3936                         errors_wait++;
3937                         continue;
3938                 }
3939                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3940                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3941                         continue;
3942
3943                 if (wait_dev_flush(dev))
3944                         errors_wait++;
3945         }
3946
3947         /*
3948          * Checks last_flush_error of disks in order to determine the device
3949          * state.
3950          */
3951         if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
3952                 return -EIO;
3953
3954         return 0;
3955 }
3956
3957 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3958 {
3959         int raid_type;
3960         int min_tolerated = INT_MAX;
3961
3962         if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3963             (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3964                 min_tolerated = min_t(int, min_tolerated,
3965                                     btrfs_raid_array[BTRFS_RAID_SINGLE].
3966                                     tolerated_failures);
3967
3968         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3969                 if (raid_type == BTRFS_RAID_SINGLE)
3970                         continue;
3971                 if (!(flags & btrfs_raid_array[raid_type].bg_flag))
3972                         continue;
3973                 min_tolerated = min_t(int, min_tolerated,
3974                                     btrfs_raid_array[raid_type].
3975                                     tolerated_failures);
3976         }
3977
3978         if (min_tolerated == INT_MAX) {
3979                 pr_warn("BTRFS: unknown raid flag: %llu", flags);
3980                 min_tolerated = 0;
3981         }
3982
3983         return min_tolerated;
3984 }
3985
3986 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
3987 {
3988         struct list_head *head;
3989         struct btrfs_device *dev;
3990         struct btrfs_super_block *sb;
3991         struct btrfs_dev_item *dev_item;
3992         int ret;
3993         int do_barriers;
3994         int max_errors;
3995         int total_errors = 0;
3996         u64 flags;
3997
3998         do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
3999
4000         /*
4001          * max_mirrors == 0 indicates we're from commit_transaction,
4002          * not from fsync where the tree roots in fs_info have not
4003          * been consistent on disk.
4004          */
4005         if (max_mirrors == 0)
4006                 backup_super_roots(fs_info);
4007
4008         sb = fs_info->super_for_commit;
4009         dev_item = &sb->dev_item;
4010
4011         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4012         head = &fs_info->fs_devices->devices;
4013         max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
4014
4015         if (do_barriers) {
4016                 ret = barrier_all_devices(fs_info);
4017                 if (ret) {
4018                         mutex_unlock(
4019                                 &fs_info->fs_devices->device_list_mutex);
4020                         btrfs_handle_fs_error(fs_info, ret,
4021                                               "errors while submitting device barriers.");
4022                         return ret;
4023                 }
4024         }
4025
4026         list_for_each_entry(dev, head, dev_list) {
4027                 if (!dev->bdev) {
4028                         total_errors++;
4029                         continue;
4030                 }
4031                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4032                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4033                         continue;
4034
4035                 btrfs_set_stack_device_generation(dev_item, 0);
4036                 btrfs_set_stack_device_type(dev_item, dev->type);
4037                 btrfs_set_stack_device_id(dev_item, dev->devid);
4038                 btrfs_set_stack_device_total_bytes(dev_item,
4039                                                    dev->commit_total_bytes);
4040                 btrfs_set_stack_device_bytes_used(dev_item,
4041                                                   dev->commit_bytes_used);
4042                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
4043                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
4044                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
4045                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4046                 memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4047                        BTRFS_FSID_SIZE);
4048
4049                 flags = btrfs_super_flags(sb);
4050                 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
4051
4052                 ret = btrfs_validate_write_super(fs_info, sb);
4053                 if (ret < 0) {
4054                         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4055                         btrfs_handle_fs_error(fs_info, -EUCLEAN,
4056                                 "unexpected superblock corruption detected");
4057                         return -EUCLEAN;
4058                 }
4059
4060                 ret = write_dev_supers(dev, sb, max_mirrors);
4061                 if (ret)
4062                         total_errors++;
4063         }
4064         if (total_errors > max_errors) {
4065                 btrfs_err(fs_info, "%d errors while writing supers",
4066                           total_errors);
4067                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4068
4069                 /* FUA is masked off if unsupported and can't be the reason */
4070                 btrfs_handle_fs_error(fs_info, -EIO,
4071                                       "%d errors while writing supers",
4072                                       total_errors);
4073                 return -EIO;
4074         }
4075
4076         total_errors = 0;
4077         list_for_each_entry(dev, head, dev_list) {
4078                 if (!dev->bdev)
4079                         continue;
4080                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4081                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4082                         continue;
4083
4084                 ret = wait_dev_supers(dev, max_mirrors);
4085                 if (ret)
4086                         total_errors++;
4087         }
4088         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4089         if (total_errors > max_errors) {
4090                 btrfs_handle_fs_error(fs_info, -EIO,
4091                                       "%d errors while writing supers",
4092                                       total_errors);
4093                 return -EIO;
4094         }
4095         return 0;
4096 }
4097
4098 /* Drop a fs root from the radix tree and free it. */
4099 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4100                                   struct btrfs_root *root)
4101 {
4102         bool drop_ref = false;
4103
4104         spin_lock(&fs_info->fs_roots_radix_lock);
4105         radix_tree_delete(&fs_info->fs_roots_radix,
4106                           (unsigned long)root->root_key.objectid);
4107         if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
4108                 drop_ref = true;
4109         spin_unlock(&fs_info->fs_roots_radix_lock);
4110
4111         if (BTRFS_FS_ERROR(fs_info)) {
4112                 ASSERT(root->log_root == NULL);
4113                 if (root->reloc_root) {
4114                         btrfs_put_root(root->reloc_root);
4115                         root->reloc_root = NULL;
4116                 }
4117         }
4118
4119         if (drop_ref)
4120                 btrfs_put_root(root);
4121 }
4122
4123 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
4124 {
4125         u64 root_objectid = 0;
4126         struct btrfs_root *gang[8];
4127         int i = 0;
4128         int err = 0;
4129         unsigned int ret = 0;
4130
4131         while (1) {
4132                 spin_lock(&fs_info->fs_roots_radix_lock);
4133                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4134                                              (void **)gang, root_objectid,
4135                                              ARRAY_SIZE(gang));
4136                 if (!ret) {
4137                         spin_unlock(&fs_info->fs_roots_radix_lock);
4138                         break;
4139                 }
4140                 root_objectid = gang[ret - 1]->root_key.objectid + 1;
4141
4142                 for (i = 0; i < ret; i++) {
4143                         /* Avoid to grab roots in dead_roots */
4144                         if (btrfs_root_refs(&gang[i]->root_item) == 0) {
4145                                 gang[i] = NULL;
4146                                 continue;
4147                         }
4148                         /* grab all the search result for later use */
4149                         gang[i] = btrfs_grab_root(gang[i]);
4150                 }
4151                 spin_unlock(&fs_info->fs_roots_radix_lock);
4152
4153                 for (i = 0; i < ret; i++) {
4154                         if (!gang[i])
4155                                 continue;
4156                         root_objectid = gang[i]->root_key.objectid;
4157                         err = btrfs_orphan_cleanup(gang[i]);
4158                         if (err)
4159                                 goto out;
4160                         btrfs_put_root(gang[i]);
4161                 }
4162                 root_objectid++;
4163         }
4164 out:
4165         /* release the uncleaned roots due to error */
4166         for (; i < ret; i++) {
4167                 if (gang[i])
4168                         btrfs_put_root(gang[i]);
4169         }
4170         return err;
4171 }
4172
4173 int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4174 {
4175         struct btrfs_root *root = fs_info->tree_root;
4176         struct btrfs_trans_handle *trans;
4177
4178         mutex_lock(&fs_info->cleaner_mutex);
4179         btrfs_run_delayed_iputs(fs_info);
4180         mutex_unlock(&fs_info->cleaner_mutex);
4181         wake_up_process(fs_info->cleaner_kthread);
4182
4183         /* wait until ongoing cleanup work done */
4184         down_write(&fs_info->cleanup_work_sem);
4185         up_write(&fs_info->cleanup_work_sem);
4186
4187         trans = btrfs_join_transaction(root);
4188         if (IS_ERR(trans))
4189                 return PTR_ERR(trans);
4190         return btrfs_commit_transaction(trans);
4191 }
4192
4193 static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
4194 {
4195         struct btrfs_transaction *trans;
4196         struct btrfs_transaction *tmp;
4197         bool found = false;
4198
4199         if (list_empty(&fs_info->trans_list))
4200                 return;
4201
4202         /*
4203          * This function is only called at the very end of close_ctree(),
4204          * thus no other running transaction, no need to take trans_lock.
4205          */
4206         ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
4207         list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
4208                 struct extent_state *cached = NULL;
4209                 u64 dirty_bytes = 0;
4210                 u64 cur = 0;
4211                 u64 found_start;
4212                 u64 found_end;
4213
4214                 found = true;
4215                 while (!find_first_extent_bit(&trans->dirty_pages, cur,
4216                         &found_start, &found_end, EXTENT_DIRTY, &cached)) {
4217                         dirty_bytes += found_end + 1 - found_start;
4218                         cur = found_end + 1;
4219                 }
4220                 btrfs_warn(fs_info,
4221         "transaction %llu (with %llu dirty metadata bytes) is not committed",
4222                            trans->transid, dirty_bytes);
4223                 btrfs_cleanup_one_transaction(trans, fs_info);
4224
4225                 if (trans == fs_info->running_transaction)
4226                         fs_info->running_transaction = NULL;
4227                 list_del_init(&trans->list);
4228
4229                 btrfs_put_transaction(trans);
4230                 trace_btrfs_transaction_commit(fs_info);
4231         }
4232         ASSERT(!found);
4233 }
4234
4235 void __cold close_ctree(struct btrfs_fs_info *fs_info)
4236 {
4237         int ret;
4238
4239         set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
4240
4241         /*
4242          * If we had UNFINISHED_DROPS we could still be processing them, so
4243          * clear that bit and wake up relocation so it can stop.
4244          * We must do this before stopping the block group reclaim task, because
4245          * at btrfs_relocate_block_group() we wait for this bit, and after the
4246          * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
4247          * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
4248          * return 1.
4249          */
4250         btrfs_wake_unfinished_drop(fs_info);
4251
4252         /*
4253          * We may have the reclaim task running and relocating a data block group,
4254          * in which case it may create delayed iputs. So stop it before we park
4255          * the cleaner kthread otherwise we can get new delayed iputs after
4256          * parking the cleaner, and that can make the async reclaim task to hang
4257          * if it's waiting for delayed iputs to complete, since the cleaner is
4258          * parked and can not run delayed iputs - this will make us hang when
4259          * trying to stop the async reclaim task.
4260          */
4261         cancel_work_sync(&fs_info->reclaim_bgs_work);
4262         /*
4263          * We don't want the cleaner to start new transactions, add more delayed
4264          * iputs, etc. while we're closing. We can't use kthread_stop() yet
4265          * because that frees the task_struct, and the transaction kthread might
4266          * still try to wake up the cleaner.
4267          */
4268         kthread_park(fs_info->cleaner_kthread);
4269
4270         /* wait for the qgroup rescan worker to stop */
4271         btrfs_qgroup_wait_for_completion(fs_info, false);
4272
4273         /* wait for the uuid_scan task to finish */
4274         down(&fs_info->uuid_tree_rescan_sem);
4275         /* avoid complains from lockdep et al., set sem back to initial state */
4276         up(&fs_info->uuid_tree_rescan_sem);
4277
4278         /* pause restriper - we want to resume on mount */
4279         btrfs_pause_balance(fs_info);
4280
4281         btrfs_dev_replace_suspend_for_unmount(fs_info);
4282
4283         btrfs_scrub_cancel(fs_info);
4284
4285         /* wait for any defraggers to finish */
4286         wait_event(fs_info->transaction_wait,
4287                    (atomic_read(&fs_info->defrag_running) == 0));
4288
4289         /* clear out the rbtree of defraggable inodes */
4290         btrfs_cleanup_defrag_inodes(fs_info);
4291
4292         /*
4293          * After we parked the cleaner kthread, ordered extents may have
4294          * completed and created new delayed iputs. If one of the async reclaim
4295          * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
4296          * can hang forever trying to stop it, because if a delayed iput is
4297          * added after it ran btrfs_run_delayed_iputs() and before it called
4298          * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
4299          * no one else to run iputs.
4300          *
4301          * So wait for all ongoing ordered extents to complete and then run
4302          * delayed iputs. This works because once we reach this point no one
4303          * can either create new ordered extents nor create delayed iputs
4304          * through some other means.
4305          *
4306          * Also note that btrfs_wait_ordered_roots() is not safe here, because
4307          * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
4308          * but the delayed iput for the respective inode is made only when doing
4309          * the final btrfs_put_ordered_extent() (which must happen at
4310          * btrfs_finish_ordered_io() when we are unmounting).
4311          */
4312         btrfs_flush_workqueue(fs_info->endio_write_workers);
4313         /* Ordered extents for free space inodes. */
4314         btrfs_flush_workqueue(fs_info->endio_freespace_worker);
4315         btrfs_run_delayed_iputs(fs_info);
4316
4317         cancel_work_sync(&fs_info->async_reclaim_work);
4318         cancel_work_sync(&fs_info->async_data_reclaim_work);
4319         cancel_work_sync(&fs_info->preempt_reclaim_work);
4320
4321         /* Cancel or finish ongoing discard work */
4322         btrfs_discard_cleanup(fs_info);
4323
4324         if (!sb_rdonly(fs_info->sb)) {
4325                 /*
4326                  * The cleaner kthread is stopped, so do one final pass over
4327                  * unused block groups.
4328                  */
4329                 btrfs_delete_unused_bgs(fs_info);
4330
4331                 /*
4332                  * There might be existing delayed inode workers still running
4333                  * and holding an empty delayed inode item. We must wait for
4334                  * them to complete first because they can create a transaction.
4335                  * This happens when someone calls btrfs_balance_delayed_items()
4336                  * and then a transaction commit runs the same delayed nodes
4337                  * before any delayed worker has done something with the nodes.
4338                  * We must wait for any worker here and not at transaction
4339                  * commit time since that could cause a deadlock.
4340                  * This is a very rare case.
4341                  */
4342                 btrfs_flush_workqueue(fs_info->delayed_workers);
4343
4344                 ret = btrfs_commit_super(fs_info);
4345                 if (ret)
4346                         btrfs_err(fs_info, "commit super ret %d", ret);
4347         }
4348
4349         if (BTRFS_FS_ERROR(fs_info))
4350                 btrfs_error_commit_super(fs_info);
4351
4352         kthread_stop(fs_info->transaction_kthread);
4353         kthread_stop(fs_info->cleaner_kthread);
4354
4355         ASSERT(list_empty(&fs_info->delayed_iputs));
4356         set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4357
4358         if (btrfs_check_quota_leak(fs_info)) {
4359                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4360                 btrfs_err(fs_info, "qgroup reserved space leaked");
4361         }
4362
4363         btrfs_free_qgroup_config(fs_info);
4364         ASSERT(list_empty(&fs_info->delalloc_roots));
4365
4366         if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
4367                 btrfs_info(fs_info, "at unmount delalloc count %lld",
4368                        percpu_counter_sum(&fs_info->delalloc_bytes));
4369         }
4370
4371         if (percpu_counter_sum(&fs_info->ordered_bytes))
4372                 btrfs_info(fs_info, "at unmount dio bytes count %lld",
4373                            percpu_counter_sum(&fs_info->ordered_bytes));
4374
4375         btrfs_sysfs_remove_mounted(fs_info);
4376         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4377
4378         btrfs_put_block_group_cache(fs_info);
4379
4380         /*
4381          * we must make sure there is not any read request to
4382          * submit after we stopping all workers.
4383          */
4384         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
4385         btrfs_stop_all_workers(fs_info);
4386
4387         /* We shouldn't have any transaction open at this point */
4388         warn_about_uncommitted_trans(fs_info);
4389
4390         clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
4391         free_root_pointers(fs_info, true);
4392         btrfs_free_fs_roots(fs_info);
4393
4394         /*
4395          * We must free the block groups after dropping the fs_roots as we could
4396          * have had an IO error and have left over tree log blocks that aren't
4397          * cleaned up until the fs roots are freed.  This makes the block group
4398          * accounting appear to be wrong because there's pending reserved bytes,
4399          * so make sure we do the block group cleanup afterwards.
4400          */
4401         btrfs_free_block_groups(fs_info);
4402
4403         iput(fs_info->btree_inode);
4404
4405 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4406         if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
4407                 btrfsic_unmount(fs_info->fs_devices);
4408 #endif
4409
4410         btrfs_mapping_tree_free(&fs_info->mapping_tree);
4411         btrfs_close_devices(fs_info->fs_devices);
4412 }
4413
4414 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4415 {
4416         struct btrfs_fs_info *fs_info = buf->fs_info;
4417         u64 transid = btrfs_header_generation(buf);
4418
4419 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4420         /*
4421          * This is a fast path so only do this check if we have sanity tests
4422          * enabled.  Normal people shouldn't be using unmapped buffers as dirty
4423          * outside of the sanity tests.
4424          */
4425         if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4426                 return;
4427 #endif
4428         btrfs_assert_tree_write_locked(buf);
4429         if (transid != fs_info->generation)
4430                 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
4431                         buf->start, transid, fs_info->generation);
4432         set_extent_buffer_dirty(buf);
4433 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4434         /*
4435          * btrfs_check_leaf() won't check item data if we don't have WRITTEN
4436          * set, so this will only validate the basic structure of the items.
4437          */
4438         if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) {
4439                 btrfs_print_leaf(buf);
4440                 ASSERT(0);
4441         }
4442 #endif
4443 }
4444
4445 static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4446                                         int flush_delayed)
4447 {
4448         /*
4449          * looks as though older kernels can get into trouble with
4450          * this code, they end up stuck in balance_dirty_pages forever
4451          */
4452         int ret;
4453
4454         if (current->flags & PF_MEMALLOC)
4455                 return;
4456
4457         if (flush_delayed)
4458                 btrfs_balance_delayed_items(fs_info);
4459
4460         ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4461                                      BTRFS_DIRTY_METADATA_THRESH,
4462                                      fs_info->dirty_metadata_batch);
4463         if (ret > 0) {
4464                 balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4465         }
4466 }
4467
4468 void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
4469 {
4470         __btrfs_btree_balance_dirty(fs_info, 1);
4471 }
4472
4473 void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
4474 {
4475         __btrfs_btree_balance_dirty(fs_info, 0);
4476 }
4477
4478 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4479 {
4480         /* cleanup FS via transaction */
4481         btrfs_cleanup_transaction(fs_info);
4482
4483         mutex_lock(&fs_info->cleaner_mutex);
4484         btrfs_run_delayed_iputs(fs_info);
4485         mutex_unlock(&fs_info->cleaner_mutex);
4486
4487         down_write(&fs_info->cleanup_work_sem);
4488         up_write(&fs_info->cleanup_work_sem);
4489 }
4490
4491 static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4492 {
4493         struct btrfs_root *gang[8];
4494         u64 root_objectid = 0;
4495         int ret;
4496
4497         spin_lock(&fs_info->fs_roots_radix_lock);
4498         while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4499                                              (void **)gang, root_objectid,
4500                                              ARRAY_SIZE(gang))) != 0) {
4501                 int i;
4502
4503                 for (i = 0; i < ret; i++)
4504                         gang[i] = btrfs_grab_root(gang[i]);
4505                 spin_unlock(&fs_info->fs_roots_radix_lock);
4506
4507                 for (i = 0; i < ret; i++) {
4508                         if (!gang[i])
4509                                 continue;
4510                         root_objectid = gang[i]->root_key.objectid;
4511                         btrfs_free_log(NULL, gang[i]);
4512                         btrfs_put_root(gang[i]);
4513                 }
4514                 root_objectid++;
4515                 spin_lock(&fs_info->fs_roots_radix_lock);
4516         }
4517         spin_unlock(&fs_info->fs_roots_radix_lock);
4518         btrfs_free_log_root_tree(NULL, fs_info);
4519 }
4520
4521 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4522 {
4523         struct btrfs_ordered_extent *ordered;
4524
4525         spin_lock(&root->ordered_extent_lock);
4526         /*
4527          * This will just short circuit the ordered completion stuff which will
4528          * make sure the ordered extent gets properly cleaned up.
4529          */
4530         list_for_each_entry(ordered, &root->ordered_extents,
4531                             root_extent_list)
4532                 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4533         spin_unlock(&root->ordered_extent_lock);
4534 }
4535
4536 static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4537 {
4538         struct btrfs_root *root;
4539         struct list_head splice;
4540
4541         INIT_LIST_HEAD(&splice);
4542
4543         spin_lock(&fs_info->ordered_root_lock);
4544         list_splice_init(&fs_info->ordered_roots, &splice);
4545         while (!list_empty(&splice)) {
4546                 root = list_first_entry(&splice, struct btrfs_root,
4547                                         ordered_root);
4548                 list_move_tail(&root->ordered_root,
4549                                &fs_info->ordered_roots);
4550
4551                 spin_unlock(&fs_info->ordered_root_lock);
4552                 btrfs_destroy_ordered_extents(root);
4553
4554                 cond_resched();
4555                 spin_lock(&fs_info->ordered_root_lock);
4556         }
4557         spin_unlock(&fs_info->ordered_root_lock);
4558
4559         /*
4560          * We need this here because if we've been flipped read-only we won't
4561          * get sync() from the umount, so we need to make sure any ordered
4562          * extents that haven't had their dirty pages IO start writeout yet
4563          * actually get run and error out properly.
4564          */
4565         btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
4566 }
4567
4568 static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4569                                        struct btrfs_fs_info *fs_info)
4570 {
4571         struct rb_node *node;
4572         struct btrfs_delayed_ref_root *delayed_refs;
4573         struct btrfs_delayed_ref_node *ref;
4574
4575         delayed_refs = &trans->delayed_refs;
4576
4577         spin_lock(&delayed_refs->lock);
4578         if (atomic_read(&delayed_refs->num_entries) == 0) {
4579                 spin_unlock(&delayed_refs->lock);
4580                 btrfs_debug(fs_info, "delayed_refs has NO entry");
4581                 return;
4582         }
4583
4584         while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
4585                 struct btrfs_delayed_ref_head *head;
4586                 struct rb_node *n;
4587                 bool pin_bytes = false;
4588
4589                 head = rb_entry(node, struct btrfs_delayed_ref_head,
4590                                 href_node);
4591                 if (btrfs_delayed_ref_lock(delayed_refs, head))
4592                         continue;
4593
4594                 spin_lock(&head->lock);
4595                 while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
4596                         ref = rb_entry(n, struct btrfs_delayed_ref_node,
4597                                        ref_node);
4598                         rb_erase_cached(&ref->ref_node, &head->ref_tree);
4599                         RB_CLEAR_NODE(&ref->ref_node);
4600                         if (!list_empty(&ref->add_list))
4601                                 list_del(&ref->add_list);
4602                         atomic_dec(&delayed_refs->num_entries);
4603                         btrfs_put_delayed_ref(ref);
4604                 }
4605                 if (head->must_insert_reserved)
4606                         pin_bytes = true;
4607                 btrfs_free_delayed_extent_op(head->extent_op);
4608                 btrfs_delete_ref_head(delayed_refs, head);
4609                 spin_unlock(&head->lock);
4610                 spin_unlock(&delayed_refs->lock);
4611                 mutex_unlock(&head->mutex);
4612
4613                 if (pin_bytes) {
4614                         struct btrfs_block_group *cache;
4615
4616                         cache = btrfs_lookup_block_group(fs_info, head->bytenr);
4617                         BUG_ON(!cache);
4618
4619                         spin_lock(&cache->space_info->lock);
4620                         spin_lock(&cache->lock);
4621                         cache->pinned += head->num_bytes;
4622                         btrfs_space_info_update_bytes_pinned(fs_info,
4623                                 cache->space_info, head->num_bytes);
4624                         cache->reserved -= head->num_bytes;
4625                         cache->space_info->bytes_reserved -= head->num_bytes;
4626                         spin_unlock(&cache->lock);
4627                         spin_unlock(&cache->space_info->lock);
4628
4629                         btrfs_put_block_group(cache);
4630
4631                         btrfs_error_unpin_extent_range(fs_info, head->bytenr,
4632                                 head->bytenr + head->num_bytes - 1);
4633                 }
4634                 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
4635                 btrfs_put_delayed_ref_head(head);
4636                 cond_resched();
4637                 spin_lock(&delayed_refs->lock);
4638         }
4639         btrfs_qgroup_destroy_extent_records(trans);
4640
4641         spin_unlock(&delayed_refs->lock);
4642 }
4643
4644 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4645 {
4646         struct btrfs_inode *btrfs_inode;
4647         struct list_head splice;
4648
4649         INIT_LIST_HEAD(&splice);
4650
4651         spin_lock(&root->delalloc_lock);
4652         list_splice_init(&root->delalloc_inodes, &splice);
4653
4654         while (!list_empty(&splice)) {
4655                 struct inode *inode = NULL;
4656                 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4657                                                delalloc_inodes);
4658                 __btrfs_del_delalloc_inode(root, btrfs_inode);
4659                 spin_unlock(&root->delalloc_lock);
4660
4661                 /*
4662                  * Make sure we get a live inode and that it'll not disappear
4663                  * meanwhile.
4664                  */
4665                 inode = igrab(&btrfs_inode->vfs_inode);
4666                 if (inode) {
4667                         unsigned int nofs_flag;
4668
4669                         nofs_flag = memalloc_nofs_save();
4670                         invalidate_inode_pages2(inode->i_mapping);
4671                         memalloc_nofs_restore(nofs_flag);
4672                         iput(inode);
4673                 }
4674                 spin_lock(&root->delalloc_lock);
4675         }
4676         spin_unlock(&root->delalloc_lock);
4677 }
4678
4679 static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4680 {
4681         struct btrfs_root *root;
4682         struct list_head splice;
4683
4684         INIT_LIST_HEAD(&splice);
4685
4686         spin_lock(&fs_info->delalloc_root_lock);
4687         list_splice_init(&fs_info->delalloc_roots, &splice);
4688         while (!list_empty(&splice)) {
4689                 root = list_first_entry(&splice, struct btrfs_root,
4690                                          delalloc_root);
4691                 root = btrfs_grab_root(root);
4692                 BUG_ON(!root);
4693                 spin_unlock(&fs_info->delalloc_root_lock);
4694
4695                 btrfs_destroy_delalloc_inodes(root);
4696                 btrfs_put_root(root);
4697
4698                 spin_lock(&fs_info->delalloc_root_lock);
4699         }
4700         spin_unlock(&fs_info->delalloc_root_lock);
4701 }
4702
4703 static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
4704                                         struct extent_io_tree *dirty_pages,
4705                                         int mark)
4706 {
4707         int ret;
4708         struct extent_buffer *eb;
4709         u64 start = 0;
4710         u64 end;
4711
4712         while (1) {
4713                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
4714                                             mark, NULL);
4715                 if (ret)
4716                         break;
4717
4718                 clear_extent_bits(dirty_pages, start, end, mark);
4719                 while (start <= end) {
4720                         eb = find_extent_buffer(fs_info, start);
4721                         start += fs_info->nodesize;
4722                         if (!eb)
4723                                 continue;
4724
4725                         btrfs_tree_lock(eb);
4726                         wait_on_extent_buffer_writeback(eb);
4727                         btrfs_clear_buffer_dirty(NULL, eb);
4728                         btrfs_tree_unlock(eb);
4729
4730                         free_extent_buffer_stale(eb);
4731                 }
4732         }
4733
4734         return ret;
4735 }
4736
4737 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
4738                                        struct extent_io_tree *unpin)
4739 {
4740         u64 start;
4741         u64 end;
4742         int ret;
4743
4744         while (1) {
4745                 struct extent_state *cached_state = NULL;
4746
4747                 /*
4748                  * The btrfs_finish_extent_commit() may get the same range as
4749                  * ours between find_first_extent_bit and clear_extent_dirty.
4750                  * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
4751                  * the same extent range.
4752                  */
4753                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
4754                 ret = find_first_extent_bit(unpin, 0, &start, &end,
4755                                             EXTENT_DIRTY, &cached_state);
4756                 if (ret) {
4757                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4758                         break;
4759                 }
4760
4761                 clear_extent_dirty(unpin, start, end, &cached_state);
4762                 free_extent_state(cached_state);
4763                 btrfs_error_unpin_extent_range(fs_info, start, end);
4764                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4765                 cond_resched();
4766         }
4767
4768         return 0;
4769 }
4770
4771 static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
4772 {
4773         struct inode *inode;
4774
4775         inode = cache->io_ctl.inode;
4776         if (inode) {
4777                 unsigned int nofs_flag;
4778
4779                 nofs_flag = memalloc_nofs_save();
4780                 invalidate_inode_pages2(inode->i_mapping);
4781                 memalloc_nofs_restore(nofs_flag);
4782
4783                 BTRFS_I(inode)->generation = 0;
4784                 cache->io_ctl.inode = NULL;
4785                 iput(inode);
4786         }
4787         ASSERT(cache->io_ctl.pages == NULL);
4788         btrfs_put_block_group(cache);
4789 }
4790
4791 void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
4792                              struct btrfs_fs_info *fs_info)
4793 {
4794         struct btrfs_block_group *cache;
4795
4796         spin_lock(&cur_trans->dirty_bgs_lock);
4797         while (!list_empty(&cur_trans->dirty_bgs)) {
4798                 cache = list_first_entry(&cur_trans->dirty_bgs,
4799                                          struct btrfs_block_group,
4800                                          dirty_list);
4801
4802                 if (!list_empty(&cache->io_list)) {
4803                         spin_unlock(&cur_trans->dirty_bgs_lock);
4804                         list_del_init(&cache->io_list);
4805                         btrfs_cleanup_bg_io(cache);
4806                         spin_lock(&cur_trans->dirty_bgs_lock);
4807                 }
4808
4809                 list_del_init(&cache->dirty_list);
4810                 spin_lock(&cache->lock);
4811                 cache->disk_cache_state = BTRFS_DC_ERROR;
4812                 spin_unlock(&cache->lock);
4813
4814                 spin_unlock(&cur_trans->dirty_bgs_lock);
4815                 btrfs_put_block_group(cache);
4816                 btrfs_delayed_refs_rsv_release(fs_info, 1);
4817                 spin_lock(&cur_trans->dirty_bgs_lock);
4818         }
4819         spin_unlock(&cur_trans->dirty_bgs_lock);
4820
4821         /*
4822          * Refer to the definition of io_bgs member for details why it's safe
4823          * to use it without any locking
4824          */
4825         while (!list_empty(&cur_trans->io_bgs)) {
4826                 cache = list_first_entry(&cur_trans->io_bgs,
4827                                          struct btrfs_block_group,
4828                                          io_list);
4829
4830                 list_del_init(&cache->io_list);
4831                 spin_lock(&cache->lock);
4832                 cache->disk_cache_state = BTRFS_DC_ERROR;
4833                 spin_unlock(&cache->lock);
4834                 btrfs_cleanup_bg_io(cache);
4835         }
4836 }
4837
4838 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4839                                    struct btrfs_fs_info *fs_info)
4840 {
4841         struct btrfs_device *dev, *tmp;
4842
4843         btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
4844         ASSERT(list_empty(&cur_trans->dirty_bgs));
4845         ASSERT(list_empty(&cur_trans->io_bgs));
4846
4847         list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
4848                                  post_commit_list) {
4849                 list_del_init(&dev->post_commit_list);
4850         }
4851
4852         btrfs_destroy_delayed_refs(cur_trans, fs_info);
4853
4854         cur_trans->state = TRANS_STATE_COMMIT_START;
4855         wake_up(&fs_info->transaction_blocked_wait);
4856
4857         cur_trans->state = TRANS_STATE_UNBLOCKED;
4858         wake_up(&fs_info->transaction_wait);
4859
4860         btrfs_destroy_delayed_inodes(fs_info);
4861
4862         btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
4863                                      EXTENT_DIRTY);
4864         btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
4865
4866         cur_trans->state =TRANS_STATE_COMPLETED;
4867         wake_up(&cur_trans->commit_wait);
4868 }
4869
4870 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
4871 {
4872         struct btrfs_transaction *t;
4873
4874         mutex_lock(&fs_info->transaction_kthread_mutex);
4875
4876         spin_lock(&fs_info->trans_lock);
4877         while (!list_empty(&fs_info->trans_list)) {
4878                 t = list_first_entry(&fs_info->trans_list,
4879                                      struct btrfs_transaction, list);
4880                 if (t->state >= TRANS_STATE_COMMIT_START) {
4881                         refcount_inc(&t->use_count);
4882                         spin_unlock(&fs_info->trans_lock);
4883                         btrfs_wait_for_commit(fs_info, t->transid);
4884                         btrfs_put_transaction(t);
4885                         spin_lock(&fs_info->trans_lock);
4886                         continue;
4887                 }
4888                 if (t == fs_info->running_transaction) {
4889                         t->state = TRANS_STATE_COMMIT_DOING;
4890                         spin_unlock(&fs_info->trans_lock);
4891                         /*
4892                          * We wait for 0 num_writers since we don't hold a trans
4893                          * handle open currently for this transaction.
4894                          */
4895                         wait_event(t->writer_wait,
4896                                    atomic_read(&t->num_writers) == 0);
4897                 } else {
4898                         spin_unlock(&fs_info->trans_lock);
4899                 }
4900                 btrfs_cleanup_one_transaction(t, fs_info);
4901
4902                 spin_lock(&fs_info->trans_lock);
4903                 if (t == fs_info->running_transaction)
4904                         fs_info->running_transaction = NULL;
4905                 list_del_init(&t->list);
4906                 spin_unlock(&fs_info->trans_lock);
4907
4908                 btrfs_put_transaction(t);
4909                 trace_btrfs_transaction_commit(fs_info);
4910                 spin_lock(&fs_info->trans_lock);
4911         }
4912         spin_unlock(&fs_info->trans_lock);
4913         btrfs_destroy_all_ordered_extents(fs_info);
4914         btrfs_destroy_delayed_inodes(fs_info);
4915         btrfs_assert_delayed_root_empty(fs_info);
4916         btrfs_destroy_all_delalloc_inodes(fs_info);
4917         btrfs_drop_all_logs(fs_info);
4918         mutex_unlock(&fs_info->transaction_kthread_mutex);
4919
4920         return 0;
4921 }
4922
4923 int btrfs_init_root_free_objectid(struct btrfs_root *root)
4924 {
4925         struct btrfs_path *path;
4926         int ret;
4927         struct extent_buffer *l;
4928         struct btrfs_key search_key;
4929         struct btrfs_key found_key;
4930         int slot;
4931
4932         path = btrfs_alloc_path();
4933         if (!path)
4934                 return -ENOMEM;
4935
4936         search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
4937         search_key.type = -1;
4938         search_key.offset = (u64)-1;
4939         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
4940         if (ret < 0)
4941                 goto error;
4942         BUG_ON(ret == 0); /* Corruption */
4943         if (path->slots[0] > 0) {
4944                 slot = path->slots[0] - 1;
4945                 l = path->nodes[0];
4946                 btrfs_item_key_to_cpu(l, &found_key, slot);
4947                 root->free_objectid = max_t(u64, found_key.objectid + 1,
4948                                             BTRFS_FIRST_FREE_OBJECTID);
4949         } else {
4950                 root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
4951         }
4952         ret = 0;
4953 error:
4954         btrfs_free_path(path);
4955         return ret;
4956 }
4957
4958 int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
4959 {
4960         int ret;
4961         mutex_lock(&root->objectid_mutex);
4962
4963         if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
4964                 btrfs_warn(root->fs_info,
4965                            "the objectid of root %llu reaches its highest value",
4966                            root->root_key.objectid);
4967                 ret = -ENOSPC;
4968                 goto out;
4969         }
4970
4971         *objectid = root->free_objectid++;
4972         ret = 0;
4973 out:
4974         mutex_unlock(&root->objectid_mutex);
4975         return ret;
4976 }