fs/btrfs/block-group.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "misc.h"
   4 #include "ctree.h"
   5 #include "block-group.h"
   6 #include "space-info.h"
   7 #include "disk-io.h"
   8 #include "free-space-cache.h"
   9 #include "free-space-tree.h"
  10 #include "volumes.h"
  11 #include "transaction.h"
  12 #include "ref-verify.h"
  13 #include "sysfs.h"
  14 #include "tree-log.h"
  15 #include "delalloc-space.h"
  16 #include "discard.h"
  17 #include "raid56.h"
  18
  19 /*
  20  * Return target flags in extended format or 0 if restripe for this chunk_type
  21  * is not in progress
  22  *
  23  * Should be called with balance_lock held
  24  */
  25 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  26 {
  27         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  28         u64 target = 0;
  29
  30         if (!bctl)
  31                 return 0;
  32
  33         if (flags & BTRFS_BLOCK_GROUP_DATA &&
  34             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  35                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
  36         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
  37                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  38                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
  39         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
  40                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  41                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
  42         }
  43
  44         return target;
  45 }
  46
  47 /*
  48  * @flags: available profiles in extended format (see ctree.h)
  49  *
  50  * Return reduced profile in chunk format.  If profile changing is in progress
  51  * (either running or paused) picks the target profile (if it's already
  52  * available), otherwise falls back to plain reducing.
  53  */
  54 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
  55 {
  56         u64 num_devices = fs_info->fs_devices->rw_devices;
  57         u64 target;
  58         u64 raid_type;
  59         u64 allowed = 0;
  60
  61         /*
  62          * See if restripe for this chunk_type is in progress, if so try to
  63          * reduce to the target profile
  64          */
  65         spin_lock(&fs_info->balance_lock);
  66         target = get_restripe_target(fs_info, flags);
  67         if (target) {
  68                 spin_unlock(&fs_info->balance_lock);
  69                 return extended_to_chunk(target);
  70         }
  71         spin_unlock(&fs_info->balance_lock);
  72
  73         /* First, mask out the RAID levels which aren't possible */
  74         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
  75                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
  76                         allowed |= btrfs_raid_array[raid_type].bg_flag;
  77         }
  78         allowed &= flags;
  79
  80         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
  81                 allowed = BTRFS_BLOCK_GROUP_RAID6;
  82         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
  83                 allowed = BTRFS_BLOCK_GROUP_RAID5;
  84         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
  85                 allowed = BTRFS_BLOCK_GROUP_RAID10;
  86         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
  87                 allowed = BTRFS_BLOCK_GROUP_RAID1;
  88         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
  89                 allowed = BTRFS_BLOCK_GROUP_RAID0;
  90
  91         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
  92
  93         return extended_to_chunk(flags | allowed);
  94 }
  95
  96 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
  97 {
  98         unsigned seq;
  99         u64 flags;
 100
 101         do {
 102                 flags = orig_flags;
 103                 seq = read_seqbegin(&fs_info->profiles_lock);
 104
 105                 if (flags & BTRFS_BLOCK_GROUP_DATA)
 106                         flags |= fs_info->avail_data_alloc_bits;
 107                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 108                         flags |= fs_info->avail_system_alloc_bits;
 109                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 110                         flags |= fs_info->avail_metadata_alloc_bits;
 111         } while (read_seqretry(&fs_info->profiles_lock, seq));
 112
 113         return btrfs_reduce_alloc_profile(fs_info, flags);
 114 }
 115
 116 void btrfs_get_block_group(struct btrfs_block_group *cache)
 117 {
 118         refcount_inc(&cache->refs);
 119 }
 120
 121 void btrfs_put_block_group(struct btrfs_block_group *cache)
 122 {
 123         if (refcount_dec_and_test(&cache->refs)) {
 124                 WARN_ON(cache->pinned > 0);
 125                 WARN_ON(cache->reserved > 0);
 126
 127                 /*
 128                  * A block_group shouldn't be on the discard_list anymore.
 129                  * Remove the block_group from the discard_list to prevent us
 130                  * from causing a panic due to NULL pointer dereference.
 131                  */
 132                 if (WARN_ON(!list_empty(&cache->discard_list)))
 133                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 134                                                   cache);
 135
 136                 /*
 137                  * If not empty, someone is still holding mutex of
 138                  * full_stripe_lock, which can only be released by caller.
 139                  * And it will definitely cause use-after-free when caller
 140                  * tries to release full stripe lock.
 141                  *
 142                  * No better way to resolve, but only to warn.
 143                  */
 144                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 145                 kfree(cache->free_space_ctl);
 146                 kfree(cache);
 147         }
 148 }
 149
 150 /*
 151  * This adds the block group to the fs_info rb tree for the block group cache
 152  */
 153 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 154                                        struct btrfs_block_group *block_group)
 155 {
 156         struct rb_node **p;
 157         struct rb_node *parent = NULL;
 158         struct btrfs_block_group *cache;
 159
 160         ASSERT(block_group->length != 0);
 161
 162         spin_lock(&info->block_group_cache_lock);
 163         p = &info->block_group_cache_tree.rb_node;
 164
 165         while (*p) {
 166                 parent = *p;
 167                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
 168                 if (block_group->start < cache->start) {
 169                         p = &(*p)->rb_left;
 170                 } else if (block_group->start > cache->start) {
 171                         p = &(*p)->rb_right;
 172                 } else {
 173                         spin_unlock(&info->block_group_cache_lock);
 174                         return -EEXIST;
 175                 }
 176         }
 177
 178         rb_link_node(&block_group->cache_node, parent, p);
 179         rb_insert_color(&block_group->cache_node,
 180                         &info->block_group_cache_tree);
 181
 182         if (info->first_logical_byte > block_group->start)
 183                 info->first_logical_byte = block_group->start;
 184
 185         spin_unlock(&info->block_group_cache_lock);
 186
 187         return 0;
 188 }
 189
 190 /*
 191  * This will return the block group at or after bytenr if contains is 0, else
 192  * it will return the block group that contains the bytenr
 193  */
 194 static struct btrfs_block_group *block_group_cache_tree_search(
 195                 struct btrfs_fs_info *info, u64 bytenr, int contains)
 196 {
 197         struct btrfs_block_group *cache, *ret = NULL;
 198         struct rb_node *n;
 199         u64 end, start;
 200
 201         spin_lock(&info->block_group_cache_lock);
 202         n = info->block_group_cache_tree.rb_node;
 203
 204         while (n) {
 205                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
 206                 end = cache->start + cache->length - 1;
 207                 start = cache->start;
 208
 209                 if (bytenr < start) {
 210                         if (!contains && (!ret || start < ret->start))
 211                                 ret = cache;
 212                         n = n->rb_left;
 213                 } else if (bytenr > start) {
 214                         if (contains && bytenr <= end) {
 215                                 ret = cache;
 216                                 break;
 217                         }
 218                         n = n->rb_right;
 219                 } else {
 220                         ret = cache;
 221                         break;
 222                 }
 223         }
 224         if (ret) {
 225                 btrfs_get_block_group(ret);
 226                 if (bytenr == 0 && info->first_logical_byte > ret->start)
 227                         info->first_logical_byte = ret->start;
 228         }
 229         spin_unlock(&info->block_group_cache_lock);
 230
 231         return ret;
 232 }
 233
 234 /*
 235  * Return the block group that starts at or after bytenr
 236  */
 237 struct btrfs_block_group *btrfs_lookup_first_block_group(
 238                 struct btrfs_fs_info *info, u64 bytenr)
 239 {
 240         return block_group_cache_tree_search(info, bytenr, 0);
 241 }
 242
 243 /*
 244  * Return the block group that contains the given bytenr
 245  */
 246 struct btrfs_block_group *btrfs_lookup_block_group(
 247                 struct btrfs_fs_info *info, u64 bytenr)
 248 {
 249         return block_group_cache_tree_search(info, bytenr, 1);
 250 }
 251
 252 struct btrfs_block_group *btrfs_next_block_group(
 253                 struct btrfs_block_group *cache)
 254 {
 255         struct btrfs_fs_info *fs_info = cache->fs_info;
 256         struct rb_node *node;
 257
 258         spin_lock(&fs_info->block_group_cache_lock);
 259
 260         /* If our block group was removed, we need a full search. */
 261         if (RB_EMPTY_NODE(&cache->cache_node)) {
 262                 const u64 next_bytenr = cache->start + cache->length;
 263
 264                 spin_unlock(&fs_info->block_group_cache_lock);
 265                 btrfs_put_block_group(cache);
 266                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
 267         }
 268         node = rb_next(&cache->cache_node);
 269         btrfs_put_block_group(cache);
 270         if (node) {
 271                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
 272                 btrfs_get_block_group(cache);
 273         } else
 274                 cache = NULL;
 275         spin_unlock(&fs_info->block_group_cache_lock);
 276         return cache;
 277 }
 278
 279 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 280 {
 281         struct btrfs_block_group *bg;
 282         bool ret = true;
 283
 284         bg = btrfs_lookup_block_group(fs_info, bytenr);
 285         if (!bg)
 286                 return false;
 287
 288         spin_lock(&bg->lock);
 289         if (bg->ro)
 290                 ret = false;
 291         else
 292                 atomic_inc(&bg->nocow_writers);
 293         spin_unlock(&bg->lock);
 294
 295         /* No put on block group, done by btrfs_dec_nocow_writers */
 296         if (!ret)
 297                 btrfs_put_block_group(bg);
 298
 299         return ret;
 300 }
 301
 302 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 303 {
 304         struct btrfs_block_group *bg;
 305
 306         bg = btrfs_lookup_block_group(fs_info, bytenr);
 307         ASSERT(bg);
 308         if (atomic_dec_and_test(&bg->nocow_writers))
 309                 wake_up_var(&bg->nocow_writers);
 310         /*
 311          * Once for our lookup and once for the lookup done by a previous call
 312          * to btrfs_inc_nocow_writers()
 313          */
 314         btrfs_put_block_group(bg);
 315         btrfs_put_block_group(bg);
 316 }
 317
 318 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
 319 {
 320         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 321 }
 322
 323 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 324                                         const u64 start)
 325 {
 326         struct btrfs_block_group *bg;
 327
 328         bg = btrfs_lookup_block_group(fs_info, start);
 329         ASSERT(bg);
 330         if (atomic_dec_and_test(&bg->reservations))
 331                 wake_up_var(&bg->reservations);
 332         btrfs_put_block_group(bg);
 333 }
 334
 335 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
 336 {
 337         struct btrfs_space_info *space_info = bg->space_info;
 338
 339         ASSERT(bg->ro);
 340
 341         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
 342                 return;
 343
 344         /*
 345          * Our block group is read only but before we set it to read only,
 346          * some task might have had allocated an extent from it already, but it
 347          * has not yet created a respective ordered extent (and added it to a
 348          * root's list of ordered extents).
 349          * Therefore wait for any task currently allocating extents, since the
 350          * block group's reservations counter is incremented while a read lock
 351          * on the groups' semaphore is held and decremented after releasing
 352          * the read access on that semaphore and creating the ordered extent.
 353          */
 354         down_write(&space_info->groups_sem);
 355         up_write(&space_info->groups_sem);
 356
 357         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 358 }
 359
 360 struct btrfs_caching_control *btrfs_get_caching_control(
 361                 struct btrfs_block_group *cache)
 362 {
 363         struct btrfs_caching_control *ctl;
 364
 365         spin_lock(&cache->lock);
 366         if (!cache->caching_ctl) {
 367                 spin_unlock(&cache->lock);
 368                 return NULL;
 369         }
 370
 371         ctl = cache->caching_ctl;
 372         refcount_inc(&ctl->count);
 373         spin_unlock(&cache->lock);
 374         return ctl;
 375 }
 376
 377 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 378 {
 379         if (refcount_dec_and_test(&ctl->count))
 380                 kfree(ctl);
 381 }
 382
 383 /*
 384  * When we wait for progress in the block group caching, its because our
 385  * allocation attempt failed at least once.  So, we must sleep and let some
 386  * progress happen before we try again.
 387  *
 388  * This function will sleep at least once waiting for new free space to show
 389  * up, and then it will check the block group free space numbers for our min
 390  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 391  * a free extent of a given size, but this is a good start.
 392  *
 393  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 394  * any of the information in this block group.
 395  */
 396 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
 397                                            u64 num_bytes)
 398 {
 399         struct btrfs_caching_control *caching_ctl;
 400
 401         caching_ctl = btrfs_get_caching_control(cache);
 402         if (!caching_ctl)
 403                 return;
 404
 405         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
 406                    (cache->free_space_ctl->free_space >= num_bytes));
 407
 408         btrfs_put_caching_control(caching_ctl);
 409 }
 410
 411 int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
 412 {
 413         struct btrfs_caching_control *caching_ctl;
 414         int ret = 0;
 415
 416         caching_ctl = btrfs_get_caching_control(cache);
 417         if (!caching_ctl)
 418                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 419
 420         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
 421         if (cache->cached == BTRFS_CACHE_ERROR)
 422                 ret = -EIO;
 423         btrfs_put_caching_control(caching_ctl);
 424         return ret;
 425 }
 426
 427 static bool space_cache_v1_done(struct btrfs_block_group *cache)
 428 {
 429         bool ret;
 430
 431         spin_lock(&cache->lock);
 432         ret = cache->cached != BTRFS_CACHE_FAST;
 433         spin_unlock(&cache->lock);
 434
 435         return ret;
 436 }
 437
 438 void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
 439                                 struct btrfs_caching_control *caching_ctl)
 440 {
 441         wait_event(caching_ctl->wait, space_cache_v1_done(cache));
 442 }
 443
 444 #ifdef CONFIG_BTRFS_DEBUG
 445 static void fragment_free_space(struct btrfs_block_group *block_group)
 446 {
 447         struct btrfs_fs_info *fs_info = block_group->fs_info;
 448         u64 start = block_group->start;
 449         u64 len = block_group->length;
 450         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 451                 fs_info->nodesize : fs_info->sectorsize;
 452         u64 step = chunk << 1;
 453
 454         while (len > chunk) {
 455                 btrfs_remove_free_space(block_group, start, chunk);
 456                 start += step;
 457                 if (len < step)
 458                         len = 0;
 459                 else
 460                         len -= step;
 461         }
 462 }
 463 #endif
 464
 465 /*
 466  * This is only called by btrfs_cache_block_group, since we could have freed
 467  * extents we need to check the pinned_extents for any extents that can't be
 468  * used yet since their free space will be released as soon as the transaction
 469  * commits.
 470  */
 471 u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
 472 {
 473         struct btrfs_fs_info *info = block_group->fs_info;
 474         u64 extent_start, extent_end, size, total_added = 0;
 475         int ret;
 476
 477         while (start < end) {
 478                 ret = find_first_extent_bit(&info->excluded_extents, start,
 479                                             &extent_start, &extent_end,
 480                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 481                                             NULL);
 482                 if (ret)
 483                         break;
 484
 485                 if (extent_start <= start) {
 486                         start = extent_end + 1;
 487                 } else if (extent_start > start && extent_start < end) {
 488                         size = extent_start - start;
 489                         total_added += size;
 490                         ret = btrfs_add_free_space_async_trimmed(block_group,
 491                                                                  start, size);
 492                         BUG_ON(ret); /* -ENOMEM or logic error */
 493                         start = extent_end + 1;
 494                 } else {
 495                         break;
 496                 }
 497         }
 498
 499         if (start < end) {
 500                 size = end - start;
 501                 total_added += size;
 502                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
 503                                                          size);
 504                 BUG_ON(ret); /* -ENOMEM or logic error */
 505         }
 506
 507         return total_added;
 508 }
 509
 510 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 511 {
 512         struct btrfs_block_group *block_group = caching_ctl->block_group;
 513         struct btrfs_fs_info *fs_info = block_group->fs_info;
 514         struct btrfs_root *extent_root = fs_info->extent_root;
 515         struct btrfs_path *path;
 516         struct extent_buffer *leaf;
 517         struct btrfs_key key;
 518         u64 total_found = 0;
 519         u64 last = 0;
 520         u32 nritems;
 521         int ret;
 522         bool wakeup = true;
 523
 524         path = btrfs_alloc_path();
 525         if (!path)
 526                 return -ENOMEM;
 527
 528         last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
 529
 530 #ifdef CONFIG_BTRFS_DEBUG
 531         /*
 532          * If we're fragmenting we don't want to make anybody think we can
 533          * allocate from this block group until we've had a chance to fragment
 534          * the free space.
 535          */
 536         if (btrfs_should_fragment_free_space(block_group))
 537                 wakeup = false;
 538 #endif
 539         /*
 540          * We don't want to deadlock with somebody trying to allocate a new
 541          * extent for the extent root while also trying to search the extent
 542          * root to add free space.  So we skip locking and search the commit
 543          * root, since its read-only
 544          */
 545         path->skip_locking = 1;
 546         path->search_commit_root = 1;
 547         path->reada = READA_FORWARD;
 548
 549         key.objectid = last;
 550         key.offset = 0;
 551         key.type = BTRFS_EXTENT_ITEM_KEY;
 552
 553 next:
 554         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 555         if (ret < 0)
 556                 goto out;
 557
 558         leaf = path->nodes[0];
 559         nritems = btrfs_header_nritems(leaf);
 560
 561         while (1) {
 562                 if (btrfs_fs_closing(fs_info) > 1) {
 563                         last = (u64)-1;
 564                         break;
 565                 }
 566
 567                 if (path->slots[0] < nritems) {
 568                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 569                 } else {
 570                         ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
 571                         if (ret)
 572                                 break;
 573
 574                         if (need_resched() ||
 575                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 576                                 if (wakeup)
 577                                         caching_ctl->progress = last;
 578                                 btrfs_release_path(path);
 579                                 up_read(&fs_info->commit_root_sem);
 580                                 mutex_unlock(&caching_ctl->mutex);
 581                                 cond_resched();
 582                                 mutex_lock(&caching_ctl->mutex);
 583                                 down_read(&fs_info->commit_root_sem);
 584                                 goto next;
 585                         }
 586
 587                         ret = btrfs_next_leaf(extent_root, path);
 588                         if (ret < 0)
 589                                 goto out;
 590                         if (ret)
 591                                 break;
 592                         leaf = path->nodes[0];
 593                         nritems = btrfs_header_nritems(leaf);
 594                         continue;
 595                 }
 596
 597                 if (key.objectid < last) {
 598                         key.objectid = last;
 599                         key.offset = 0;
 600                         key.type = BTRFS_EXTENT_ITEM_KEY;
 601
 602                         if (wakeup)
 603                                 caching_ctl->progress = last;
 604                         btrfs_release_path(path);
 605                         goto next;
 606                 }
 607
 608                 if (key.objectid < block_group->start) {
 609                         path->slots[0]++;
 610                         continue;
 611                 }
 612
 613                 if (key.objectid >= block_group->start + block_group->length)
 614                         break;
 615
 616                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 617                     key.type == BTRFS_METADATA_ITEM_KEY) {
 618                         total_found += add_new_free_space(block_group, last,
 619                                                           key.objectid);
 620                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 621                                 last = key.objectid +
 622                                         fs_info->nodesize;
 623                         else
 624                                 last = key.objectid + key.offset;
 625
 626                         if (total_found > CACHING_CTL_WAKE_UP) {
 627                                 total_found = 0;
 628                                 if (wakeup)
 629                                         wake_up(&caching_ctl->wait);
 630                         }
 631                 }
 632                 path->slots[0]++;
 633         }
 634         ret = 0;
 635
 636         total_found += add_new_free_space(block_group, last,
 637                                 block_group->start + block_group->length);
 638         caching_ctl->progress = (u64)-1;
 639
 640 out:
 641         btrfs_free_path(path);
 642         return ret;
 643 }
 644
 645 static noinline void caching_thread(struct btrfs_work *work)
 646 {
 647         struct btrfs_block_group *block_group;
 648         struct btrfs_fs_info *fs_info;
 649         struct btrfs_caching_control *caching_ctl;
 650         int ret;
 651
 652         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 653         block_group = caching_ctl->block_group;
 654         fs_info = block_group->fs_info;
 655
 656         mutex_lock(&caching_ctl->mutex);
 657         down_read(&fs_info->commit_root_sem);
 658
 659         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 660                 ret = load_free_space_cache(block_group);
 661                 if (ret == 1) {
 662                         ret = 0;
 663                         goto done;
 664                 }
 665
 666                 /*
 667                  * We failed to load the space cache, set ourselves to
 668                  * CACHE_STARTED and carry on.
 669                  */
 670                 spin_lock(&block_group->lock);
 671                 block_group->cached = BTRFS_CACHE_STARTED;
 672                 spin_unlock(&block_group->lock);
 673                 wake_up(&caching_ctl->wait);
 674         }
 675
 676         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 677                 ret = load_free_space_tree(caching_ctl);
 678         else
 679                 ret = load_extent_tree_free(caching_ctl);
 680 done:
 681         spin_lock(&block_group->lock);
 682         block_group->caching_ctl = NULL;
 683         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 684         spin_unlock(&block_group->lock);
 685
 686 #ifdef CONFIG_BTRFS_DEBUG
 687         if (btrfs_should_fragment_free_space(block_group)) {
 688                 u64 bytes_used;
 689
 690                 spin_lock(&block_group->space_info->lock);
 691                 spin_lock(&block_group->lock);
 692                 bytes_used = block_group->length - block_group->used;
 693                 block_group->space_info->bytes_used += bytes_used >> 1;
 694                 spin_unlock(&block_group->lock);
 695                 spin_unlock(&block_group->space_info->lock);
 696                 fragment_free_space(block_group);
 697         }
 698 #endif
 699
 700         caching_ctl->progress = (u64)-1;
 701
 702         up_read(&fs_info->commit_root_sem);
 703         btrfs_free_excluded_extents(block_group);
 704         mutex_unlock(&caching_ctl->mutex);
 705
 706         wake_up(&caching_ctl->wait);
 707
 708         btrfs_put_caching_control(caching_ctl);
 709         btrfs_put_block_group(block_group);
 710 }
 711
 712 int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
 713 {
 714         DEFINE_WAIT(wait);
 715         struct btrfs_fs_info *fs_info = cache->fs_info;
 716         struct btrfs_caching_control *caching_ctl = NULL;
 717         int ret = 0;
 718
 719         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 720         if (!caching_ctl)
 721                 return -ENOMEM;
 722
 723         INIT_LIST_HEAD(&caching_ctl->list);
 724         mutex_init(&caching_ctl->mutex);
 725         init_waitqueue_head(&caching_ctl->wait);
 726         caching_ctl->block_group = cache;
 727         caching_ctl->progress = cache->start;
 728         refcount_set(&caching_ctl->count, 2);
 729         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 730
 731         spin_lock(&cache->lock);
 732         if (cache->cached != BTRFS_CACHE_NO) {
 733                 kfree(caching_ctl);
 734
 735                 caching_ctl = cache->caching_ctl;
 736                 if (caching_ctl)
 737                         refcount_inc(&caching_ctl->count);
 738                 spin_unlock(&cache->lock);
 739                 goto out;
 740         }
 741         WARN_ON(cache->caching_ctl);
 742         cache->caching_ctl = caching_ctl;
 743         if (btrfs_test_opt(fs_info, SPACE_CACHE))
 744                 cache->cached = BTRFS_CACHE_FAST;
 745         else
 746                 cache->cached = BTRFS_CACHE_STARTED;
 747         cache->has_caching_ctl = 1;
 748         spin_unlock(&cache->lock);
 749
 750         spin_lock(&fs_info->block_group_cache_lock);
 751         refcount_inc(&caching_ctl->count);
 752         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 753         spin_unlock(&fs_info->block_group_cache_lock);
 754
 755         btrfs_get_block_group(cache);
 756
 757         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 758 out:
 759         if (load_cache_only && caching_ctl)
 760                 btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
 761         if (caching_ctl)
 762                 btrfs_put_caching_control(caching_ctl);
 763
 764         return ret;
 765 }
 766
 767 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 768 {
 769         u64 extra_flags = chunk_to_extended(flags) &
 770                                 BTRFS_EXTENDED_PROFILE_MASK;
 771
 772         write_seqlock(&fs_info->profiles_lock);
 773         if (flags & BTRFS_BLOCK_GROUP_DATA)
 774                 fs_info->avail_data_alloc_bits &= ~extra_flags;
 775         if (flags & BTRFS_BLOCK_GROUP_METADATA)
 776                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 777         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 778                 fs_info->avail_system_alloc_bits &= ~extra_flags;
 779         write_sequnlock(&fs_info->profiles_lock);
 780 }
 781
 782 /*
 783  * Drop incompat feature bits that no remaining block group needs:
 784  *
 785  * - RAID56   - cleared when no space info has a RAID5 or RAID6 block
 786  *              group left
 787  *
 788  * - RAID1C34 - likewise for RAID1C3 and RAID1C4 block groups
 789  */
 790 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
 791 {
 792         const u64 watched = BTRFS_BLOCK_GROUP_RAID56_MASK |
 793                             BTRFS_BLOCK_GROUP_RAID1C3 |
 794                             BTRFS_BLOCK_GROUP_RAID1C4;
 795         struct btrfs_space_info *si;
 796         bool have_raid56 = false;
 797         bool have_raid1c34 = false;
 798
 799         /* Nothing to re-evaluate unless @flags carries one of the profiles. */
 800         if (!(flags & watched))
 801                 return;
 802
 803         list_for_each_entry_rcu(si, &fs_info->space_info, list) {
 804                 down_read(&si->groups_sem);
 805                 have_raid56 = have_raid56 ||
 806                         !list_empty(&si->block_groups[BTRFS_RAID_RAID5]) ||
 807                         !list_empty(&si->block_groups[BTRFS_RAID_RAID6]);
 808                 have_raid1c34 = have_raid1c34 ||
 809                         !list_empty(&si->block_groups[BTRFS_RAID_RAID1C3]) ||
 810                         !list_empty(&si->block_groups[BTRFS_RAID_RAID1C4]);
 811                 up_read(&si->groups_sem);
 812         }
 813
 814         if (!have_raid56)
 815                 btrfs_clear_fs_incompat(fs_info, RAID56);
 816         if (!have_raid1c34)
 817                 btrfs_clear_fs_incompat(fs_info, RAID1C34);
 818 }
 819
 820 /* Delete @block_group's item from the extent tree; -ENOENT if it is absent. */
 821 static int remove_block_group_item(struct btrfs_trans_handle *trans,
 822                                    struct btrfs_path *path,
 823                                    struct btrfs_block_group *block_group)
 824 {
 825         struct btrfs_root *extent_root = trans->fs_info->extent_root;
 826         struct btrfs_key bg_key = {
 827                 .objectid = block_group->start,
 828                 .type = BTRFS_BLOCK_GROUP_ITEM_KEY,
 829                 .offset = block_group->length,
 830         };
 831         int err;
 832
 833         /* The item is keyed (start, BLOCK_GROUP_ITEM, length). */
 834         err = btrfs_search_slot(trans, extent_root, &bg_key, path, -1, 1);
 835         if (err < 0)
 836                 return err;
 837         /* A positive return is reported to the caller as -ENOENT. */
 838         if (err > 0)
 839                 return -ENOENT;
 840
 841         return btrfs_del_item(trans, extent_root, path);
 842 }
 843
     /*
      * Remove the block group that starts at @group_start: detach it from the
      * in-memory structures of trans->fs_info and delete its free space cache
      * item, its free space tree entries and its block group item.
      *
      * The block group must exist and must already be read-only; both are
      * enforced with BUG_ON().  @em is removed from fs_info->mapping_tree at the
      * end, but only when the block group is not frozen at that point.
      *
      * Returns 0 on success.  On failure returns -ENOMEM if no path could be
      * allocated, otherwise the error of the step that failed.
      */
 844 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 845                              u64 group_start, struct extent_map *em)
 846 {
 847         struct btrfs_fs_info *fs_info = trans->fs_info;
 848         struct btrfs_path *path;
 849         struct btrfs_block_group *block_group;
 850         struct btrfs_free_cluster *cluster;
 851         struct btrfs_root *tree_root = fs_info->tree_root;
 852         struct btrfs_key key;
 853         struct inode *inode;
 854         struct kobject *kobj = NULL;
 855         int ret;
 856         int index;
 857         int factor;
 858         struct btrfs_caching_control *caching_ctl = NULL;
 859         bool remove_em;
 860         bool remove_rsv = false;
 861
             /* The reference from this lookup is dropped at the out label. */
 862         block_group = btrfs_lookup_block_group(fs_info, group_start);
 863         BUG_ON(!block_group);
 864         BUG_ON(!block_group->ro);
 865
 866         trace_btrfs_remove_block_group(block_group);
 867         /*
 868          * Free the reserved super bytes from this block group before
 869          * remove it.
 870          */
 871         btrfs_free_excluded_extents(block_group);
 872         btrfs_free_ref_tree_range(fs_info, block_group->start,
 873                                   block_group->length);
 874
             /* Used below for the per-profile list and the disk_total update. */
 875         index = btrfs_bg_flags_to_raid_index(block_group->flags);
 876         factor = btrfs_bg_type_to_factor(block_group->flags);
 877
 878         /* make sure this block group isn't part of an allocation cluster */
 879         cluster = &fs_info->data_alloc_cluster;
 880         spin_lock(&cluster->refill_lock);
 881         btrfs_return_cluster_to_free_space(block_group, cluster);
 882         spin_unlock(&cluster->refill_lock);
 883
 884         /*
 885          * make sure this block group isn't part of a metadata
 886          * allocation cluster
 887          */
 888         cluster = &fs_info->meta_alloc_cluster;
 889         spin_lock(&cluster->refill_lock);
 890         btrfs_return_cluster_to_free_space(block_group, cluster);
 891         spin_unlock(&cluster->refill_lock);
 892
 893         path = btrfs_alloc_path();
 894         if (!path) {
 895                 ret = -ENOMEM;
 896                 goto out;
 897         }
 898
 899         /*
 900          * get the inode first so any iput calls done for the io_list
 901          * aren't the final iput (no unlinks allowed now)
 902          */
 903         inode = lookup_free_space_inode(block_group, path);
 904
 905         mutex_lock(&trans->transaction->cache_write_mutex);
 906         /*
 907          * Make sure our free space cache IO is done before removing the
 908          * free space inode
 909          */
 910         spin_lock(&trans->transaction->dirty_bgs_lock);
 911         if (!list_empty(&block_group->io_list)) {
 912                 list_del_init(&block_group->io_list);
 913
 914                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
 915
                     /* dirty_bgs_lock is dropped while waiting for the cache IO. */
 916                 spin_unlock(&trans->transaction->dirty_bgs_lock);
 917                 btrfs_wait_cache_io(trans, block_group, path);
 918                 btrfs_put_block_group(block_group);
 919                 spin_lock(&trans->transaction->dirty_bgs_lock);
 920         }
 921
 922         if (!list_empty(&block_group->dirty_list)) {
 923                 list_del_init(&block_group->dirty_list);
                     /* Makes the out label release one delayed refs rsv unit. */
 924                 remove_rsv = true;
 925                 btrfs_put_block_group(block_group);
 926         }
 927         spin_unlock(&trans->transaction->dirty_bgs_lock);
 928         mutex_unlock(&trans->transaction->cache_write_mutex);
 929
             /* A free space inode was found: orphan it and clear its link count. */
 930         if (!IS_ERR(inode)) {
 931                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 932                 if (ret) {
 933                         btrfs_add_delayed_iput(inode);
 934                         goto out;
 935                 }
 936                 clear_nlink(inode);
 937                 /* One for the block groups ref */
 938                 spin_lock(&block_group->lock);
 939                 if (block_group->iref) {
 940                         block_group->iref = 0;
 941                         block_group->inode = NULL;
 942                         spin_unlock(&block_group->lock);
 943                         iput(inode);
 944                 } else {
 945                         spin_unlock(&block_group->lock);
 946                 }
 947                 /* One for our lookup ref */
 948                 btrfs_add_delayed_iput(inode);
 949         }
 950
             /*
              * Delete the item keyed (BTRFS_FREE_SPACE_OBJECTID, 0, start) from
              * the tree root if there is one; not finding it is not an error.
              */
 951         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 952         key.type = 0;
 953         key.offset = block_group->start;
 954
 955         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
 956         if (ret < 0)
 957                 goto out;
 958         if (ret > 0)
 959                 btrfs_release_path(path);
 960         if (ret == 0) {
 961                 ret = btrfs_del_item(trans, tree_root, path);
 962                 if (ret)
 963                         goto out;
 964                 btrfs_release_path(path);
 965         }
 966
 967         spin_lock(&fs_info->block_group_cache_lock);
 968         rb_erase(&block_group->cache_node,
 969                  &fs_info->block_group_cache_tree);
 970         RB_CLEAR_NODE(&block_group->cache_node);
 971
 972         /* Once for the block groups rbtree */
 973         btrfs_put_block_group(block_group);
 974
 975         if (fs_info->first_logical_byte == block_group->start)
 976                 fs_info->first_logical_byte = (u64)-1;
 977         spin_unlock(&fs_info->block_group_cache_lock);
 978
 979         down_write(&block_group->space_info->groups_sem);
 980         /*
 981          * we must use list_del_init so people can check to see if they
 982          * are still on the list after taking the semaphore
 983          */
 984         list_del_init(&block_group->list);
             /* That was the last block group of this RAID index in the space info. */
 985         if (list_empty(&block_group->space_info->block_groups[index])) {
 986                 kobj = block_group->space_info->block_group_kobjs[index];
 987                 block_group->space_info->block_group_kobjs[index] = NULL;
 988                 clear_avail_alloc_bits(fs_info, block_group->flags);
 989         }
 990         up_write(&block_group->space_info->groups_sem);
 991         clear_incompat_bg_bits(fs_info, block_group->flags);
             /* The kobject is torn down only after groups_sem has been released. */
 992         if (kobj) {
 993                 kobject_del(kobj);
 994                 kobject_put(kobj);
 995         }
 996
             /*
              * Wait for caching that has started, then take the caching control
              * (if there is one) off fs_info->caching_block_groups.
              */
 997         if (block_group->has_caching_ctl)
 998                 caching_ctl = btrfs_get_caching_control(block_group);
 999         if (block_group->cached == BTRFS_CACHE_STARTED)
1000                 btrfs_wait_block_group_cache_done(block_group);
1001         if (block_group->has_caching_ctl) {
1002                 spin_lock(&fs_info->block_group_cache_lock);
                     /* No control from the lookup above: search the list for it. */
1003                 if (!caching_ctl) {
1004                         struct btrfs_caching_control *ctl;
1005
1006                         list_for_each_entry(ctl,
1007                                     &fs_info->caching_block_groups, list)
1008                                 if (ctl->block_group == block_group) {
1009                                         caching_ctl = ctl;
1010                                         refcount_inc(&caching_ctl->count);
1011                                         break;
1012                                 }
1013                 }
1014                 if (caching_ctl)
1015                         list_del_init(&caching_ctl->list);
1016                 spin_unlock(&fs_info->block_group_cache_lock);
1017                 if (caching_ctl) {
1018                         /* Once for the caching bgs list and once for us. */
1019                         btrfs_put_caching_control(caching_ctl);
1020                         btrfs_put_caching_control(caching_ctl);
1021                 }
1022         }
1023
             /* Both lists were emptied above; anything left here is unexpected. */
1024         spin_lock(&trans->transaction->dirty_bgs_lock);
1025         WARN_ON(!list_empty(&block_group->dirty_list));
1026         WARN_ON(!list_empty(&block_group->io_list));
1027         spin_unlock(&trans->transaction->dirty_bgs_lock);
1028
1029         btrfs_remove_free_space_cache(block_group);
1030
             /* Take the block group's length out of the space info counters. */
1031         spin_lock(&block_group->space_info->lock);
1032         list_del_init(&block_group->ro_list);
1033
1034         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1035                 WARN_ON(block_group->space_info->total_bytes
1036                         < block_group->length);
1037                 WARN_ON(block_group->space_info->bytes_readonly
1038                         < block_group->length);
1039                 WARN_ON(block_group->space_info->disk_total
1040                         < block_group->length * factor);
1041         }
1042         block_group->space_info->total_bytes -= block_group->length;
1043         block_group->space_info->bytes_readonly -= block_group->length;
1044         block_group->space_info->disk_total -= block_group->length * factor;
1045
1046         spin_unlock(&block_group->space_info->lock);
1047
1048         /*
1049          * Remove the free space for the block group from the free space tree
1050          * and the block group's item from the extent tree before marking the
1051          * block group as removed. This is to prevent races with tasks that
1052          * freeze and unfreeze a block group, this task and another task
1053          * allocating a new block group - the unfreeze task ends up removing
1054          * the block group's extent map before the task calling this function
1055          * deletes the block group item from the extent tree, allowing for
1056          * another task to attempt to create another block group with the same
1057          * item key (and failing with -EEXIST and a transaction abort).
1058          */
1059         ret = remove_block_group_free_space(trans, block_group);
1060         if (ret)
1061                 goto out;
1062
1063         ret = remove_block_group_item(trans, path, block_group);
1064         if (ret < 0)
1065                 goto out;
1066
1067         spin_lock(&block_group->lock);
1068         block_group->removed = 1;
1069         /*
1070          * At this point trimming or scrub can't start on this block group,
1071          * because we removed the block group from the rbtree
1072          * fs_info->block_group_cache_tree so no one can find it anymore and
1073          * even if someone already got this block group before we removed it
1074          * from the rbtree, they have already incremented block_group->frozen -
1075          * if they didn't, for the trimming case they won't find any free space
1076          * entries because we already removed them all when we called
1077          * btrfs_remove_free_space_cache().
1078          *
1079          * And we must not remove the extent map from the fs_info->mapping_tree
1080          * to prevent the same logical address range and physical device space
1081          * ranges from being reused for a new block group. This is needed to
1082          * avoid races with trimming and scrub.
1083          *
1084          * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1085          * completely transactionless, so while it is trimming a range the
1086          * currently running transaction might finish and a new one start,
1087          * allowing for new block groups to be created that can reuse the same
1088          * physical device locations unless we take this special care.
1089          *
1090          * There may also be an implicit trim operation if the file system
1091          * is mounted with -odiscard. The same protections must remain
1092          * in place until the extents have been discarded completely when
1093          * the transaction commit has completed.
1094          */
1095         remove_em = (atomic_read(&block_group->frozen) == 0);
1096         spin_unlock(&block_group->lock);
1097
1098         if (remove_em) {
1099                 struct extent_map_tree *em_tree;
1100
1101                 em_tree = &fs_info->mapping_tree;
1102                 write_lock(&em_tree->lock);
1103                 remove_extent_mapping(em_tree, em);
1104                 write_unlock(&em_tree->lock);
1105                 /* once for the tree */
1106                 free_extent_map(em);
1107         }
1108
1109 out:
1110         /* Once for the lookup reference */
1111         btrfs_put_block_group(block_group);
1112         if (remove_rsv)
1113                 btrfs_delayed_refs_rsv_release(fs_info, 1);
1114         btrfs_free_path(path);
1115         return ret;
1116 }
1117
1118 /*
1119  * Start a transaction on the extent root that asks for the units needed to
1120  * remove the block group whose chunk mapping starts at @chunk_offset.
1121  */
1122 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1123                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1124 {
1125         struct extent_map_tree *mapping_tree = &fs_info->mapping_tree;
1126         struct extent_map *chunk_em;
1127         unsigned int nr_units;
1128
1129         read_lock(&mapping_tree->lock);
1130         chunk_em = lookup_extent_mapping(mapping_tree, chunk_offset, 1);
1131         read_unlock(&mapping_tree->lock);
1132         ASSERT(chunk_em && chunk_em->start == chunk_offset);
1133
1134         /*
1135          * Metadata units to reserve for btrfs_remove_chunk() and
1136          * btrfs_remove_block_group(), 3 + N in total:
1137          *
1138          *   1  to add the orphan item of the free space inode (tree of tree
1139          *      roots)
1140          *   1  to delete the block group item (extent tree)
1141          *   1  to delete the free space item (tree of tree roots)
1142          *   N  to delete the device extent item of each of the N stripes
1143          *      (device tree)
1144          *
1145          * Updating the chunk tree (one or more device items change, one chunk
1146          * item goes away) needs units from the system space info as well.
1147          * Those are not reserved here: btrfs_remove_chunk() does it through
1148          * a call to check_system_chunk().
1149          */
1150         nr_units = 3 + chunk_em->map_lookup->num_stripes;
1151         /* Only the stripe count was needed from the extent map. */
1152         free_extent_map(chunk_em);
1153
1154         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
1155                                                            nr_units);
1156 }
1157
1158 /*
1159  * Mark block group @cache read-only, so later write won't happen to block
1160  * group @cache.
1161  *
1162  * If @force is not set, this function will only mark the block group readonly
1163  * if we have enough free space (1M) in other metadata/system block groups.
1164  * If @force is set, this function will mark the block group readonly
1165  * without checking free space.
1166  *
1167  * NOTE: This function doesn't care if other block groups can contain all the
1168  * data in this block group. That check should be done by relocation routine,
1169  * not this function.
1170  */
1171 static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
1172 {
1173         struct btrfs_fs_info *fs_info = cache->fs_info;
1174         struct btrfs_space_info *space_info = cache->space_info;
1175         bool allowed;
1176         u64 unused;
1177
1178         spin_lock(&space_info->lock);
1179         spin_lock(&cache->lock);
1180
1181         /* Already read-only: only the nesting count has to go up. */
1182         if (cache->ro) {
1183                 cache->ro++;
1184                 spin_unlock(&cache->lock);
1185                 spin_unlock(&space_info->lock);
1186                 return 0;
1187         }
1188
1189         /* What is left after reserved, pinned, super and used bytes. */
1190         unused = cache->length - cache->reserved - cache->pinned -
1191                  cache->bytes_super - cache->used;
1192
1193         if (force) {
1194                 /* The caller asked to skip the free space check. */
1195                 allowed = true;
1196         } else if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) {
1197                 /*
1198                  * Data never overcommits, even in mixed mode, so a straight
1199                  * check is enough: with @unused turned read-only, what is
1200                  * in use must still fit into total_bytes.
1201                  */
1202                 allowed = btrfs_space_info_used(space_info, true) + unused <=
1203                           space_info->total_bytes;
1204         } else {
1205                 /*
1206                  * Metadata is overcommitted, so btrfs_can_overcommit() has
1207                  * to decide.  BTRFS_RESERVE_NO_FLUSH gives the most leeway
1208                  * for marking this block group read-only.
1209                  */
1210                 allowed = btrfs_can_overcommit(fs_info, space_info, unused,
1211                                                BTRFS_RESERVE_NO_FLUSH);
1212         }
1213
1214         if (allowed) {
1215                 space_info->bytes_readonly += unused;
1216                 cache->ro++;
1217                 list_add_tail(&cache->ro_list, &space_info->ro_bgs);
1218         }
1219         spin_unlock(&cache->lock);
1220         spin_unlock(&space_info->lock);
1221         if (allowed)
1222                 return 0;
1223
1224         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1225                 btrfs_info(fs_info,
1226                         "unable to make block group %llu ro", cache->start);
1227                 btrfs_dump_space_info(fs_info, space_info, 0, 0);
1228         }
1229         return -ENOSPC;
1230 }
1231
1232 static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1233                                  struct btrfs_block_group *bg)
1234 {
1235         struct btrfs_fs_info *fs_info = bg->fs_info;
1236         struct btrfs_transaction *cur = trans->transaction;
1237         struct btrfs_transaction *prev = NULL;
1238         const u64 first = bg->start;
1239         const u64 last = first + bg->length - 1;
1240         int err = 0;
1241
1242         /* Take a reference on the previous transaction, if there is one. */
1243         spin_lock(&fs_info->trans_lock);
1244         if (cur->list.prev != &fs_info->trans_list) {
1245                 prev = list_last_entry(&cur->list, struct btrfs_transaction,
1246                                        list);
1247                 refcount_inc(&prev->use_count);
1248         }
1249         spin_unlock(&fs_info->trans_lock);
1250
1251         /*
1252          * unused_bg_unpin_mutex keeps btrfs_finish_extent_commit() out while
1253          * the ranges are cleared.  With this task at transaction N, another
1254          * one may be in finish_extent_commit() for transaction N - 1 and may
1255          * have seen a range of this block group in pinned_extents before the
1256          * whole block group range was cleared from it.  That task could then
1257          * look up the block group after it was unpinned and removed, which
1258          * ends in a BUG_ON() at unpin_extent_range().
1259          */
1260         mutex_lock(&fs_info->unused_bg_unpin_mutex);
1261         if (prev)
1262                 err = clear_extent_bits(&prev->pinned_extents, first, last,
1263                                         EXTENT_DIRTY);
1264         /* The current transaction is only touched if the above worked. */
1265         if (!err)
1266                 err = clear_extent_bits(&cur->pinned_extents, first, last,
1267                                         EXTENT_DIRTY);
1268         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1269
1270         if (prev)
1271                 btrfs_put_transaction(prev);
1272
1273         /* True only when every range was cleared without an error. */
1274         return !err;
1275 }
1276
1277 /*
1278  * Process the unused_bgs list and remove any that don't have any allocated
1279  * space inside of them.
      *
      * Does nothing unless BTRFS_FS_OPEN is set.  Every entry is taken off
      * fs_info->unused_bgs and one block group reference is dropped once the
      * entry has been dealt with.  After a failure to start the transaction or
      * to remove a chunk (ret != 0), and for mixed space infos, the remaining
      * entries are only taken off the list.  A block group is left alone when
      * it has reserved, pinned or used bytes, is already read-only, or
      * list_is_singular() is true for its list node.
1280  */
1281 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1282 {
1283         struct btrfs_block_group *block_group;
1284         struct btrfs_space_info *space_info;
1285         struct btrfs_trans_handle *trans;
             /* Sampled once; compared with the live option further down. */
1286         const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1287         int ret = 0;
1288
1289         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1290                 return;
1291
1292         spin_lock(&fs_info->unused_bgs_lock);
1293         while (!list_empty(&fs_info->unused_bgs)) {
1294                 int trimming;
1295
1296                 block_group = list_first_entry(&fs_info->unused_bgs,
1297                                                struct btrfs_block_group,
1298                                                bg_list);
1299                 list_del_init(&block_group->bg_list);
1300
1301                 space_info = block_group->space_info;
1302
                     /* After an error, or for mixed space infos, only drain the list. */
1303                 if (ret || btrfs_mixed_space_info(space_info)) {
1304                         btrfs_put_block_group(block_group);
1305                         continue;
1306                 }
                     /* unused_bgs_lock is retaken at the next label. */
1307                 spin_unlock(&fs_info->unused_bgs_lock);
1308
1309                 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1310
1311                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
1312
1313                 /* Don't want to race with allocators so take the groups_sem */
1314                 down_write(&space_info->groups_sem);
1315
1316                 /*
1317                  * Async discard moves the final block group discard to be prior
1318                  * to the unused_bgs code path.  Therefore, if it's not fully
1319                  * trimmed, punt it back to the async discard lists.
1320                  */
1321                 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1322                     !btrfs_is_free_space_trimmed(block_group)) {
1323                         trace_btrfs_skip_unused_block_group(block_group);
1324                         up_write(&space_info->groups_sem);
1325                         /* Requeue if we failed because of async discard */
1326                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1327                                                  block_group);
1328                         goto next;
1329                 }
1330
1331                 spin_lock(&block_group->lock);
1332                 if (block_group->reserved || block_group->pinned ||
1333                     block_group->used || block_group->ro ||
1334                     list_is_singular(&block_group->list)) {
1335                         /*
1336                          * We want to bail if we made new allocations or have
1337                          * outstanding allocations in this block group.  We do
1338                          * the ro check in case balance is currently acting on
1339                          * this block group.
1340                          */
1341                         trace_btrfs_skip_unused_block_group(block_group);
1342                         spin_unlock(&block_group->lock);
1343                         up_write(&space_info->groups_sem);
1344                         goto next;
1345                 }
1346                 spin_unlock(&block_group->lock);
1347
1348                 /* We don't want to force the issue, only flip if it's ok. */
1349                 ret = inc_block_group_ro(block_group, 0);
1350                 up_write(&space_info->groups_sem);
                     /* Failing to go read-only is not an error: ret is reset. */
1351                 if (ret < 0) {
1352                         ret = 0;
1353                         goto next;
1354                 }
1355
1356                 /*
1357                  * Want to do this before we do anything else so we can recover
1358                  * properly if we fail to join the transaction.
1359                  */
1360                 trans = btrfs_start_trans_remove_block_group(fs_info,
1361                                                      block_group->start);
1362                 if (IS_ERR(trans)) {
1363                         btrfs_dec_block_group_ro(block_group);
1364                         ret = PTR_ERR(trans);
1365                         goto next;
1366                 }
1367
1368                 /*
1369                  * We could have pending pinned extents for this block group,
1370                  * just delete them, we don't care about them anymore.
1371                  */
1372                 if (!clean_pinned_extents(trans, block_group)) {
1373                         btrfs_dec_block_group_ro(block_group);
1374                         goto end_trans;
1375                 }
1376
1377                 /*
1378                  * At this point, the block_group is read only and should fail
1379                  * new allocations.  However, btrfs_finish_extent_commit() can
1380                  * cause this block_group to be placed back on the discard
1381                  * lists because now the block_group isn't fully discarded.
1382                  * Bail here and try again later after discarding everything.
1383                  */
1384                 spin_lock(&fs_info->discard_ctl.lock);
1385                 if (!list_empty(&block_group->discard_list)) {
1386                         spin_unlock(&fs_info->discard_ctl.lock);
1387                         btrfs_dec_block_group_ro(block_group);
1388                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1389                                                  block_group);
1390                         goto end_trans;
1391                 }
1392                 spin_unlock(&fs_info->discard_ctl.lock);
1393
1394                 /* Reset pinned so btrfs_put_block_group doesn't complain */
1395                 spin_lock(&space_info->lock);
1396                 spin_lock(&block_group->lock);
1397
1398                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1399                                                      -block_group->pinned);
1400                 space_info->bytes_readonly += block_group->pinned;
1401                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
1402                                    -block_group->pinned,
1403                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
1404                 block_group->pinned = 0;
1405
1406                 spin_unlock(&block_group->lock);
1407                 spin_unlock(&space_info->lock);
1408
1409                 /*
1410                  * The normal path here is an unused block group is passed here,
1411                  * then trimming is handled in the transaction commit path.
1412                  * Async discard interposes before this to do the trimming
1413                  * before coming down the unused block group path as trimming
1414                  * will no longer be done later in the transaction commit path.
1415                  */
                     /* DISCARD_ASYNC was off when this function started but is on now. */
1416                 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1417                         goto flip_async;
1418
1419                 /* DISCARD can flip during remount */
1420                 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
1421
1422                 /* Implicit trim during transaction commit. */
1423                 if (trimming)
1424                         btrfs_freeze_block_group(block_group);
1425
1426                 /*
1427                  * Btrfs_remove_chunk will abort the transaction if things go
1428                  * horribly wrong.
1429                  */
1430                 ret = btrfs_remove_chunk(trans, block_group->start);
1431
1432                 if (ret) {
1433                         if (trimming)
1434                                 btrfs_unfreeze_block_group(block_group);
1435                         goto end_trans;
1436                 }
1437
1438                 /*
1439                  * If we're not mounted with -odiscard, we can just forget
1440                  * about this block group. Otherwise we'll need to wait
1441                  * until transaction commit to do the actual discard.
1442                  */
1443                 if (trimming) {
1444                         spin_lock(&fs_info->unused_bgs_lock);
1445                         /*
1446                          * A concurrent scrub might have added us to the list
1447                          * fs_info->unused_bgs, so use a list_move operation
1448                          * to add the block group to the deleted_bgs list.
1449                          */
1450                         list_move(&block_group->bg_list,
1451                                   &trans->transaction->deleted_bgs);
1452                         spin_unlock(&fs_info->unused_bgs_lock);
                             /* Presumably the reference for the deleted_bgs entry. */
1453                         btrfs_get_block_group(block_group);
1454                 }
1455 end_trans:
1456                 btrfs_end_transaction(trans);
1457 next:
1458                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1459                 btrfs_put_block_group(block_group);
1460                 spin_lock(&fs_info->unused_bgs_lock);
1461         }
1462         spin_unlock(&fs_info->unused_bgs_lock);
1463         return;
1464
             /*
              * Entered from inside the loop with unused_bgs_lock not held: unwind
              * like end_trans/next, then hand the list over to
              * btrfs_discard_punt_unused_bgs_list().
              */
1465 flip_async:
1466         btrfs_end_transaction(trans);
1467         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1468         btrfs_put_block_group(block_group);
1469         btrfs_discard_punt_unused_bgs_list(fs_info);
1470 }
1471
1472 /* Queue @bg on fs_info->unused_bgs unless its bg_list is already linked. */
1473 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1474 {
1475         spin_lock(&bg->fs_info->unused_bgs_lock);
1476         if (list_empty(&bg->bg_list)) {
1477                 /* Taken on behalf of the list entry added below. */
1478                 btrfs_get_block_group(bg);
1479                 trace_btrfs_add_unused_block_group(bg);
1480                 list_add_tail(&bg->bg_list, &bg->fs_info->unused_bgs);
1481         }
1482         spin_unlock(&bg->fs_info->unused_bgs_lock);
1483 }
1484
1485 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
1486                            struct btrfs_path *path)
1487 {
1488         struct extent_map_tree *chunks = &fs_info->mapping_tree;
1489         struct extent_buffer *eb = path->nodes[0];
1490         const int item_slot = path->slots[0];
1491         struct btrfs_block_group_item bgi;
1492         struct extent_map *chunk_em;
1493         u64 bg_type;
1494         u64 chunk_type;
1495         int ret = -EUCLEAN;
1496
1497         /* A block group item needs a chunk mapping for its range. */
1498         read_lock(&chunks->lock);
1499         chunk_em = lookup_extent_mapping(chunks, key->objectid, key->offset);
1500         read_unlock(&chunks->lock);
1501         if (!chunk_em) {
1502                 btrfs_err(fs_info,
1503                           "logical %llu len %llu found bg but no related chunk",
1504                           key->objectid, key->offset);
1505                 return -ENOENT;
1506         }
1507
1508         /* The mapping has to match the key's start and length exactly. */
1509         if (chunk_em->start != key->objectid || chunk_em->len != key->offset) {
1510                 btrfs_err(fs_info,
1511                         "block group %llu len %llu mismatch with chunk %llu len %llu",
1512                         key->objectid, key->offset, chunk_em->start,
1513                         chunk_em->len);
1514                 goto out_free_em;
1515         }
1516
1517         /* Compare only the type bits of the item in the leaf and of the chunk. */
1518         read_extent_buffer(eb, &bgi, btrfs_item_ptr_offset(eb, item_slot),
1519                            sizeof(bgi));
1520         bg_type = btrfs_stack_block_group_flags(&bgi) &
1521                   BTRFS_BLOCK_GROUP_TYPE_MASK;
1522         chunk_type = chunk_em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK;
1523         if (bg_type != chunk_type) {
1524                 btrfs_err(fs_info,
1525 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1526                           key->objectid, key->offset, bg_type, chunk_type);
1527                 goto out_free_em;
1528         }
1529         ret = 0;
1530
1531 out_free_em:
1532         free_extent_map(chunk_em);
1533         return ret;
1534 }
1535
1536 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1537                                   struct btrfs_path *path,
1538                                   struct btrfs_key *key)
1539 {
1540         struct btrfs_root *root = fs_info->extent_root;
1541         int ret;
1542         struct btrfs_key found_key;
1543         struct extent_buffer *leaf;
1544         int slot;
1545
1546         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1547         if (ret < 0)
1548                 return ret;
1549
1550         while (1) {
1551                 slot = path->slots[0];
1552                 leaf = path->nodes[0];
1553                 if (slot >= btrfs_header_nritems(leaf)) {
1554                         ret = btrfs_next_leaf(root, path);
1555                         if (ret == 0)
1556                                 continue;
1557                         if (ret < 0)
1558                                 goto out;
1559                         break;
1560                 }
1561                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1562
1563                 if (found_key.objectid >= key->objectid &&
1564                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1565                         ret = read_bg_from_eb(fs_info, &found_key, path);
1566                         break;
1567                 }
1568
1569                 path->slots[0]++;
1570         }
1571 out:
1572         return ret;
1573 }
1574
1575 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1576 {
1577         u64 extra_flags = chunk_to_extended(flags) &
1578                                 BTRFS_EXTENDED_PROFILE_MASK;
1579
1580         write_seqlock(&fs_info->profiles_lock);
1581         if (flags & BTRFS_BLOCK_GROUP_DATA)
1582                 fs_info->avail_data_alloc_bits |= extra_flags;
1583         if (flags & BTRFS_BLOCK_GROUP_METADATA)
1584                 fs_info->avail_metadata_alloc_bits |= extra_flags;
1585         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1586                 fs_info->avail_system_alloc_bits |= extra_flags;
1587         write_sequnlock(&fs_info->profiles_lock);
1588 }
1589
1590 /**
1591  * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
1592  * @chunk_start:   logical address of block group
1593  * @physical:      physical address to map to logical addresses
1594  * @logical:       return array of logical addresses which map to @physical
1595  * @naddrs:        length of @logical
1596  * @stripe_len:    size of IO stripe for the given block group
1597  *
1598  * Maps a particular @physical disk address to a list of @logical addresses.
1599  * Used primarily to exclude those portions of a block group that contain super
1600  * block copies.
1601  */
1602 EXPORT_FOR_TESTS
1603 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1604                      u64 physical, u64 **logical, int *naddrs, int *stripe_len)
1605 {
1606         struct extent_map *em;
1607         struct map_lookup *map;
1608         u64 *buf;
1609         u64 bytenr;
1610         u64 data_stripe_length;
1611         u64 io_stripe_size;
1612         int i, nr = 0;
1613         int ret = 0;
1614
1615         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1616         if (IS_ERR(em))
1617                 return -EIO;
1618
1619         map = em->map_lookup;
1620         data_stripe_length = em->orig_block_len;
1621         io_stripe_size = map->stripe_len;
1622
1623         /* For RAID5/6 adjust to a full IO stripe length */
1624         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
1625                 io_stripe_size = map->stripe_len * nr_data_stripes(map);
1626
1627         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1628         if (!buf) {
1629                 ret = -ENOMEM;
1630                 goto out;
1631         }
1632
1633         for (i = 0; i < map->num_stripes; i++) {
1634                 bool already_inserted = false;
1635                 u64 stripe_nr;
1636                 int j;
1637
1638                 if (!in_range(physical, map->stripes[i].physical,
1639                               data_stripe_length))
1640                         continue;
1641
1642                 stripe_nr = physical - map->stripes[i].physical;
1643                 stripe_nr = div64_u64(stripe_nr, map->stripe_len);
1644
1645                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1646                         stripe_nr = stripe_nr * map->num_stripes + i;
1647                         stripe_nr = div_u64(stripe_nr, map->sub_stripes);
1648                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1649                         stripe_nr = stripe_nr * map->num_stripes + i;
1650                 }
1651                 /*
1652                  * The remaining case would be for RAID56, multiply by
1653                  * nr_data_stripes().  Alternatively, just use rmap_len below
1654                  * instead of map->stripe_len
1655                  */
1656
1657                 bytenr = chunk_start + stripe_nr * io_stripe_size;
1658
1659                 /* Ensure we don't add duplicate addresses */
1660                 for (j = 0; j < nr; j++) {
1661                         if (buf[j] == bytenr) {
1662                                 already_inserted = true;
1663                                 break;
1664                         }
1665                 }
1666
1667                 if (!already_inserted)
1668                         buf[nr++] = bytenr;
1669         }
1670
1671         *logical = buf;
1672         *naddrs = nr;
1673         *stripe_len = io_stripe_size;
1674 out:
1675         free_extent_map(em);
1676         return ret;
1677 }
1678
1679 static int exclude_super_stripes(struct btrfs_block_group *cache)
1680 {
1681         struct btrfs_fs_info *fs_info = cache->fs_info;
1682         u64 bytenr;
1683         u64 *logical;
1684         int stripe_len;
1685         int i, nr, ret;
1686
1687         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
1688                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
1689                 cache->bytes_super += stripe_len;
1690                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
1691                                                 stripe_len);
1692                 if (ret)
1693                         return ret;
1694         }
1695
1696         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1697                 bytenr = btrfs_sb_offset(i);
1698                 ret = btrfs_rmap_block(fs_info, cache->start,
1699                                        bytenr, &logical, &nr, &stripe_len);
1700                 if (ret)
1701                         return ret;
1702
1703                 while (nr--) {
1704                         u64 len = min_t(u64, stripe_len,
1705                                 cache->start + cache->length - logical[nr]);
1706
1707                         cache->bytes_super += len;
1708                         ret = btrfs_add_excluded_extent(fs_info, logical[nr],
1709                                                         len);
1710                         if (ret) {
1711                                 kfree(logical);
1712                                 return ret;
1713                         }
1714                 }
1715
1716                 kfree(logical);
1717         }
1718         return 0;
1719 }
1720
1721 static void link_block_group(struct btrfs_block_group *cache)
1722 {
1723         struct btrfs_space_info *space_info = cache->space_info;
1724         int index = btrfs_bg_flags_to_raid_index(cache->flags);
1725
1726         down_write(&space_info->groups_sem);
1727         list_add_tail(&cache->list, &space_info->block_groups[index]);
1728         up_write(&space_info->groups_sem);
1729 }
1730
1731 static struct btrfs_block_group *btrfs_create_block_group_cache(
1732                 struct btrfs_fs_info *fs_info, u64 start)
1733 {
1734         struct btrfs_block_group *cache;
1735
1736         cache = kzalloc(sizeof(*cache), GFP_NOFS);
1737         if (!cache)
1738                 return NULL;
1739
1740         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1741                                         GFP_NOFS);
1742         if (!cache->free_space_ctl) {
1743                 kfree(cache);
1744                 return NULL;
1745         }
1746
1747         cache->start = start;
1748
1749         cache->fs_info = fs_info;
1750         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1751
1752         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1753
1754         refcount_set(&cache->refs, 1);
1755         spin_lock_init(&cache->lock);
1756         init_rwsem(&cache->data_rwsem);
1757         INIT_LIST_HEAD(&cache->list);
1758         INIT_LIST_HEAD(&cache->cluster_list);
1759         INIT_LIST_HEAD(&cache->bg_list);
1760         INIT_LIST_HEAD(&cache->ro_list);
1761         INIT_LIST_HEAD(&cache->discard_list);
1762         INIT_LIST_HEAD(&cache->dirty_list);
1763         INIT_LIST_HEAD(&cache->io_list);
1764         btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
1765         atomic_set(&cache->frozen, 0);
1766         mutex_init(&cache->free_space_lock);
1767         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1768
1769         return cache;
1770 }
1771
1772 /*
1773  * Iterate all chunks and verify that each of them has the corresponding block
1774  * group
1775  */
1776 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1777 {
1778         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1779         struct extent_map *em;
1780         struct btrfs_block_group *bg;
1781         u64 start = 0;
1782         int ret = 0;
1783
1784         while (1) {
1785                 read_lock(&map_tree->lock);
1786                 /*
1787                  * lookup_extent_mapping will return the first extent map
1788                  * intersecting the range, so setting @len to 1 is enough to
1789                  * get the first chunk.
1790                  */
1791                 em = lookup_extent_mapping(map_tree, start, 1);
1792                 read_unlock(&map_tree->lock);
1793                 if (!em)
1794                         break;
1795
1796                 bg = btrfs_lookup_block_group(fs_info, em->start);
1797                 if (!bg) {
1798                         btrfs_err(fs_info,
1799         "chunk start=%llu len=%llu doesn't have corresponding block group",
1800                                      em->start, em->len);
1801                         ret = -EUCLEAN;
1802                         free_extent_map(em);
1803                         break;
1804                 }
1805                 if (bg->start != em->start || bg->length != em->len ||
1806                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1807                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1808                         btrfs_err(fs_info,
1809 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1810                                 em->start, em->len,
1811                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1812                                 bg->start, bg->length,
1813                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1814                         ret = -EUCLEAN;
1815                         free_extent_map(em);
1816                         btrfs_put_block_group(bg);
1817                         break;
1818                 }
1819                 start = em->start + em->len;
1820                 free_extent_map(em);
1821                 btrfs_put_block_group(bg);
1822         }
1823         return ret;
1824 }
1825
1826 static void read_block_group_item(struct btrfs_block_group *cache,
1827                                  struct btrfs_path *path,
1828                                  const struct btrfs_key *key)
1829 {
1830         struct extent_buffer *leaf = path->nodes[0];
1831         struct btrfs_block_group_item bgi;
1832         int slot = path->slots[0];
1833
1834         cache->length = key->offset;
1835
1836         read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
1837                            sizeof(bgi));
1838         cache->used = btrfs_stack_block_group_used(&bgi);
1839         cache->flags = btrfs_stack_block_group_flags(&bgi);
1840 }
1841
1842 static int read_one_block_group(struct btrfs_fs_info *info,
1843                                 struct btrfs_path *path,
1844                                 const struct btrfs_key *key,
1845                                 int need_clear)
1846 {
1847         struct btrfs_block_group *cache;
1848         struct btrfs_space_info *space_info;
1849         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
1850         int ret;
1851
1852         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
1853
1854         cache = btrfs_create_block_group_cache(info, key->objectid);
1855         if (!cache)
1856                 return -ENOMEM;
1857
1858         read_block_group_item(cache, path, key);
1859
1860         set_free_space_tree_thresholds(cache);
1861
1862         if (need_clear) {
1863                 /*
1864                  * When we mount with old space cache, we need to
1865                  * set BTRFS_DC_CLEAR and set dirty flag.
1866                  *
1867                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
1868                  *    truncate the old free space cache inode and
1869                  *    setup a new one.
1870                  * b) Setting 'dirty flag' makes sure that we flush
1871                  *    the new space cache info onto disk.
1872                  */
1873                 if (btrfs_test_opt(info, SPACE_CACHE))
1874                         cache->disk_cache_state = BTRFS_DC_CLEAR;
1875         }
1876         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
1877             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
1878                         btrfs_err(info,
1879 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
1880                                   cache->start);
1881                         ret = -EINVAL;
1882                         goto error;
1883         }
1884
1885         /*
1886          * We need to exclude the super stripes now so that the space info has
1887          * super bytes accounted for, otherwise we'll think we have more space
1888          * than we actually do.
1889          */
1890         ret = exclude_super_stripes(cache);
1891         if (ret) {
1892                 /* We may have excluded something, so call this just in case. */
1893                 btrfs_free_excluded_extents(cache);
1894                 goto error;
1895         }
1896
1897         /*
1898          * Check for two cases, either we are full, and therefore don't need
1899          * to bother with the caching work since we won't find any space, or we
1900          * are empty, and we can just add all the space in and be done with it.
1901          * This saves us _a_lot_ of time, particularly in the full case.
1902          */
1903         if (cache->length == cache->used) {
1904                 cache->last_byte_to_unpin = (u64)-1;
1905                 cache->cached = BTRFS_CACHE_FINISHED;
1906                 btrfs_free_excluded_extents(cache);
1907         } else if (cache->used == 0) {
1908                 cache->last_byte_to_unpin = (u64)-1;
1909                 cache->cached = BTRFS_CACHE_FINISHED;
1910                 add_new_free_space(cache, cache->start,
1911                                    cache->start + cache->length);
1912                 btrfs_free_excluded_extents(cache);
1913         }
1914
1915         ret = btrfs_add_block_group_cache(info, cache);
1916         if (ret) {
1917                 btrfs_remove_free_space_cache(cache);
1918                 goto error;
1919         }
1920         trace_btrfs_add_block_group(info, cache, 0);
1921         btrfs_update_space_info(info, cache->flags, cache->length,
1922                                 cache->used, cache->bytes_super, &space_info);
1923
1924         cache->space_info = space_info;
1925
1926         link_block_group(cache);
1927
1928         set_avail_alloc_bits(info, cache->flags);
1929         if (btrfs_chunk_readonly(info, cache->start)) {
1930                 inc_block_group_ro(cache, 1);
1931         } else if (cache->used == 0) {
1932                 ASSERT(list_empty(&cache->bg_list));
1933                 if (btrfs_test_opt(info, DISCARD_ASYNC))
1934                         btrfs_discard_queue_work(&info->discard_ctl, cache);
1935                 else
1936                         btrfs_mark_bg_unused(cache);
1937         }
1938         return 0;
1939 error:
1940         btrfs_put_block_group(cache);
1941         return ret;
1942 }
1943
1944 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
1945 {
1946         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1947         struct btrfs_space_info *space_info;
1948         struct rb_node *node;
1949         int ret = 0;
1950
1951         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
1952                 struct extent_map *em;
1953                 struct map_lookup *map;
1954                 struct btrfs_block_group *bg;
1955
1956                 em = rb_entry(node, struct extent_map, rb_node);
1957                 map = em->map_lookup;
1958                 bg = btrfs_create_block_group_cache(fs_info, em->start);
1959                 if (!bg) {
1960                         ret = -ENOMEM;
1961                         break;
1962                 }
1963
1964                 /* Fill dummy cache as FULL */
1965                 bg->length = em->len;
1966                 bg->flags = map->type;
1967                 bg->last_byte_to_unpin = (u64)-1;
1968                 bg->cached = BTRFS_CACHE_FINISHED;
1969                 bg->used = em->len;
1970                 bg->flags = map->type;
1971                 ret = btrfs_add_block_group_cache(fs_info, bg);
1972                 if (ret) {
1973                         btrfs_remove_free_space_cache(bg);
1974                         btrfs_put_block_group(bg);
1975                         break;
1976                 }
1977                 btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
1978                                         0, &space_info);
1979                 bg->space_info = space_info;
1980                 link_block_group(bg);
1981
1982                 set_avail_alloc_bits(fs_info, bg->flags);
1983         }
1984         if (!ret)
1985                 btrfs_init_global_block_rsv(fs_info);
1986         return ret;
1987 }
1988
1989 int btrfs_read_block_groups(struct btrfs_fs_info *info)
1990 {
1991         struct btrfs_path *path;
1992         int ret;
1993         struct btrfs_block_group *cache;
1994         struct btrfs_space_info *space_info;
1995         struct btrfs_key key;
1996         int need_clear = 0;
1997         u64 cache_gen;
1998
1999         if (!info->extent_root)
2000                 return fill_dummy_bgs(info);
2001
2002         key.objectid = 0;
2003         key.offset = 0;
2004         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2005         path = btrfs_alloc_path();
2006         if (!path)
2007                 return -ENOMEM;
2008
2009         cache_gen = btrfs_super_cache_generation(info->super_copy);
2010         if (btrfs_test_opt(info, SPACE_CACHE) &&
2011             btrfs_super_generation(info->super_copy) != cache_gen)
2012                 need_clear = 1;
2013         if (btrfs_test_opt(info, CLEAR_CACHE))
2014                 need_clear = 1;
2015
2016         while (1) {
2017                 ret = find_first_block_group(info, path, &key);
2018                 if (ret > 0)
2019                         break;
2020                 if (ret != 0)
2021                         goto error;
2022
2023                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2024                 ret = read_one_block_group(info, path, &key, need_clear);
2025                 if (ret < 0)
2026                         goto error;
2027                 key.objectid += key.offset;
2028                 key.offset = 0;
2029                 btrfs_release_path(path);
2030         }
2031         btrfs_release_path(path);
2032
2033         list_for_each_entry(space_info, &info->space_info, list) {
2034                 int i;
2035
2036                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2037                         if (list_empty(&space_info->block_groups[i]))
2038                                 continue;
2039                         cache = list_first_entry(&space_info->block_groups[i],
2040                                                  struct btrfs_block_group,
2041                                                  list);
2042                         btrfs_sysfs_add_block_group_type(cache);
2043                 }
2044
2045                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2046                       (BTRFS_BLOCK_GROUP_RAID10 |
2047                        BTRFS_BLOCK_GROUP_RAID1_MASK |
2048                        BTRFS_BLOCK_GROUP_RAID56_MASK |
2049                        BTRFS_BLOCK_GROUP_DUP)))
2050                         continue;
2051                 /*
2052                  * Avoid allocating from un-mirrored block group if there are
2053                  * mirrored block groups.
2054                  */
2055                 list_for_each_entry(cache,
2056                                 &space_info->block_groups[BTRFS_RAID_RAID0],
2057                                 list)
2058                         inc_block_group_ro(cache, 1);
2059                 list_for_each_entry(cache,
2060                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
2061                                 list)
2062                         inc_block_group_ro(cache, 1);
2063         }
2064
2065         btrfs_init_global_block_rsv(info);
2066         ret = check_chunk_block_group_mappings(info);
2067 error:
2068         btrfs_free_path(path);
2069         return ret;
2070 }
2071
2072 static int insert_block_group_item(struct btrfs_trans_handle *trans,
2073                                    struct btrfs_block_group *block_group)
2074 {
2075         struct btrfs_fs_info *fs_info = trans->fs_info;
2076         struct btrfs_block_group_item bgi;
2077         struct btrfs_root *root;
2078         struct btrfs_key key;
2079
2080         spin_lock(&block_group->lock);
2081         btrfs_set_stack_block_group_used(&bgi, block_group->used);
2082         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2083                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2084         btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2085         key.objectid = block_group->start;
2086         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2087         key.offset = block_group->length;
2088         spin_unlock(&block_group->lock);
2089
2090         root = fs_info->extent_root;
2091         return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2092 }
2093
2094 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2095 {
2096         struct btrfs_fs_info *fs_info = trans->fs_info;
2097         struct btrfs_block_group *block_group;
2098         int ret = 0;
2099
2100         if (!trans->can_flush_pending_bgs)
2101                 return;
2102
2103         while (!list_empty(&trans->new_bgs)) {
2104                 int index;
2105
2106                 block_group = list_first_entry(&trans->new_bgs,
2107                                                struct btrfs_block_group,
2108                                                bg_list);
2109                 if (ret)
2110                         goto next;
2111
2112                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
2113
2114                 ret = insert_block_group_item(trans, block_group);
2115                 if (ret)
2116                         btrfs_abort_transaction(trans, ret);
2117                 ret = btrfs_finish_chunk_alloc(trans, block_group->start,
2118                                         block_group->length);
2119                 if (ret)
2120                         btrfs_abort_transaction(trans, ret);
2121                 add_block_group_free_space(trans, block_group);
2122
2123                 /*
2124                  * If we restriped during balance, we may have added a new raid
2125                  * type, so now add the sysfs entries when it is safe to do so.
2126                  * We don't have to worry about locking here as it's handled in
2127                  * btrfs_sysfs_add_block_group_type.
2128                  */
2129                 if (block_group->space_info->block_group_kobjs[index] == NULL)
2130                         btrfs_sysfs_add_block_group_type(block_group);
2131
2132                 /* Already aborted the transaction if it failed. */
2133 next:
2134                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2135                 list_del_init(&block_group->bg_list);
2136         }
2137         btrfs_trans_release_chunk_metadata(trans);
2138 }
2139
2140 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
2141                            u64 type, u64 chunk_offset, u64 size)
2142 {
2143         struct btrfs_fs_info *fs_info = trans->fs_info;
2144         struct btrfs_block_group *cache;
2145         int ret;
2146
2147         btrfs_set_log_full_commit(trans);
2148
2149         cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2150         if (!cache)
2151                 return -ENOMEM;
2152
2153         cache->length = size;
2154         set_free_space_tree_thresholds(cache);
2155         cache->used = bytes_used;
2156         cache->flags = type;
2157         cache->last_byte_to_unpin = (u64)-1;
2158         cache->cached = BTRFS_CACHE_FINISHED;
2159         cache->needs_free_space = 1;
2160         ret = exclude_super_stripes(cache);
2161         if (ret) {
2162                 /* We may have excluded something, so call this just in case */
2163                 btrfs_free_excluded_extents(cache);
2164                 btrfs_put_block_group(cache);
2165                 return ret;
2166         }
2167
2168         add_new_free_space(cache, chunk_offset, chunk_offset + size);
2169
2170         btrfs_free_excluded_extents(cache);
2171
2172 #ifdef CONFIG_BTRFS_DEBUG
2173         if (btrfs_should_fragment_free_space(cache)) {
2174                 u64 new_bytes_used = size - bytes_used;
2175
2176                 bytes_used += new_bytes_used >> 1;
2177                 fragment_free_space(cache);
2178         }
2179 #endif
2180         /*
2181          * Ensure the corresponding space_info object is created and
2182          * assigned to our block group. We want our bg to be added to the rbtree
2183          * with its ->space_info set.
2184          */
2185         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2186         ASSERT(cache->space_info);
2187
2188         ret = btrfs_add_block_group_cache(fs_info, cache);
2189         if (ret) {
2190                 btrfs_remove_free_space_cache(cache);
2191                 btrfs_put_block_group(cache);
2192                 return ret;
2193         }
2194
2195         /*
2196          * Now that our block group has its ->space_info set and is inserted in
2197          * the rbtree, update the space info's counters.
2198          */
2199         trace_btrfs_add_block_group(fs_info, cache, 1);
2200         btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
2201                                 cache->bytes_super, &cache->space_info);
2202         btrfs_update_global_block_rsv(fs_info);
2203
2204         link_block_group(cache);
2205
2206         list_add_tail(&cache->bg_list, &trans->new_bgs);
2207         trans->delayed_ref_updates++;
2208         btrfs_update_delayed_refs_rsv(trans);
2209
2210         set_avail_alloc_bits(fs_info, type);
2211         return 0;
2212 }
2213
2214 /*
2215  * Mark one block group RO, can be called several times for the same block
2216  * group.
2217  *
2218  * @cache:              the destination block group
2219  * @do_chunk_alloc:     whether need to do chunk pre-allocation, this is to
2220  *                      ensure we still have some free space after marking this
2221  *                      block group RO.
2222  */
2223 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2224                              bool do_chunk_alloc)
2225 {
2226         struct btrfs_fs_info *fs_info = cache->fs_info;
2227         struct btrfs_trans_handle *trans;
2228         u64 alloc_flags;
2229         int ret;
2230
2231 again:
2232         trans = btrfs_join_transaction(fs_info->extent_root);
2233         if (IS_ERR(trans))
2234                 return PTR_ERR(trans);
2235
2236         /*
2237          * we're not allowed to set block groups readonly after the dirty
2238          * block groups cache has started writing.  If it already started,
2239          * back off and let this transaction commit
2240          */
2241         mutex_lock(&fs_info->ro_block_group_mutex);
2242         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2243                 u64 transid = trans->transid;
2244
2245                 mutex_unlock(&fs_info->ro_block_group_mutex);
2246                 btrfs_end_transaction(trans);
2247
2248                 ret = btrfs_wait_for_commit(fs_info, transid);
2249                 if (ret)
2250                         return ret;
2251                 goto again;
2252         }
2253
2254         if (do_chunk_alloc) {
2255                 /*
2256                  * If we are changing raid levels, try to allocate a
2257                  * corresponding block group with the new raid level.
2258                  */
2259                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2260                 if (alloc_flags != cache->flags) {
2261                         ret = btrfs_chunk_alloc(trans, alloc_flags,
2262                                                 CHUNK_ALLOC_FORCE);
2263                         /*
2264                          * ENOSPC is allowed here, we may have enough space
2265                          * already allocated at the new raid level to carry on
2266                          */
2267                         if (ret == -ENOSPC)
2268                                 ret = 0;
2269                         if (ret < 0)
2270                                 goto out;
2271                 }
2272         }
2273
2274         ret = inc_block_group_ro(cache, 0);
2275         if (!do_chunk_alloc)
2276                 goto unlock_out;
2277         if (!ret)
2278                 goto out;
2279         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2280         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2281         if (ret < 0)
2282                 goto out;
2283         ret = inc_block_group_ro(cache, 0);
2284 out:
2285         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2286                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2287                 mutex_lock(&fs_info->chunk_mutex);
2288                 check_system_chunk(trans, alloc_flags);
2289                 mutex_unlock(&fs_info->chunk_mutex);
2290         }
2291 unlock_out:
2292         mutex_unlock(&fs_info->ro_block_group_mutex);
2293
2294         btrfs_end_transaction(trans);
2295         return ret;
2296 }
2297
2298 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
2299 {
2300         struct btrfs_space_info *sinfo = cache->space_info;
2301         u64 num_bytes;
2302
2303         BUG_ON(!cache->ro);
2304
2305         spin_lock(&sinfo->lock);
2306         spin_lock(&cache->lock);
2307         if (!--cache->ro) {
2308                 num_bytes = cache->length - cache->reserved -
2309                             cache->pinned - cache->bytes_super - cache->used;
2310                 sinfo->bytes_readonly -= num_bytes;
2311                 list_del_init(&cache->ro_list);
2312         }
2313         spin_unlock(&cache->lock);
2314         spin_unlock(&sinfo->lock);
2315 }
2316
2317 static int update_block_group_item(struct btrfs_trans_handle *trans,
2318                                    struct btrfs_path *path,
2319                                    struct btrfs_block_group *cache)
2320 {
2321         struct btrfs_fs_info *fs_info = trans->fs_info;
2322         int ret;
2323         struct btrfs_root *root = fs_info->extent_root;
2324         unsigned long bi;
2325         struct extent_buffer *leaf;
2326         struct btrfs_block_group_item bgi;
2327         struct btrfs_key key;
2328
2329         key.objectid = cache->start;
2330         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2331         key.offset = cache->length;
2332
2333         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2334         if (ret) {
2335                 if (ret > 0)
2336                         ret = -ENOENT;
2337                 goto fail;
2338         }
2339
2340         leaf = path->nodes[0];
2341         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2342         btrfs_set_stack_block_group_used(&bgi, cache->used);
2343         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2344                         BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2345         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
2346         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
2347         btrfs_mark_buffer_dirty(leaf);
2348 fail:
2349         btrfs_release_path(path);
2350         return ret;
2351
2352 }
2353
2354 static int cache_save_setup(struct btrfs_block_group *block_group,
2355                             struct btrfs_trans_handle *trans,
2356                             struct btrfs_path *path)
2357 {
2358         struct btrfs_fs_info *fs_info = block_group->fs_info;
2359         struct btrfs_root *root = fs_info->tree_root;
2360         struct inode *inode = NULL;
2361         struct extent_changeset *data_reserved = NULL;
2362         u64 alloc_hint = 0;
2363         int dcs = BTRFS_DC_ERROR;
2364         u64 num_pages = 0;
2365         int retries = 0;
2366         int ret = 0;
2367
2368         /*
2369          * If this block group is smaller than 100 megs don't bother caching the
2370          * block group.
2371          */
2372         if (block_group->length < (100 * SZ_1M)) {
2373                 spin_lock(&block_group->lock);
2374                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2375                 spin_unlock(&block_group->lock);
2376                 return 0;
2377         }
2378
2379         if (TRANS_ABORTED(trans))
2380                 return 0;
2381 again:
2382         inode = lookup_free_space_inode(block_group, path);
2383         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2384                 ret = PTR_ERR(inode);
2385                 btrfs_release_path(path);
2386                 goto out;
2387         }
2388
2389         if (IS_ERR(inode)) {
2390                 BUG_ON(retries);
2391                 retries++;
2392
2393                 if (block_group->ro)
2394                         goto out_free;
2395
2396                 ret = create_free_space_inode(trans, block_group, path);
2397                 if (ret)
2398                         goto out_free;
2399                 goto again;
2400         }
2401
2402         /*
2403          * We want to set the generation to 0, that way if anything goes wrong
2404          * from here on out we know not to trust this cache when we load up next
2405          * time.
2406          */
2407         BTRFS_I(inode)->generation = 0;
2408         ret = btrfs_update_inode(trans, root, inode);
2409         if (ret) {
2410                 /*
2411                  * So theoretically we could recover from this, simply set the
2412                  * super cache generation to 0 so we know to invalidate the
2413                  * cache, but then we'd have to keep track of the block groups
2414                  * that fail this way so we know we _have_ to reset this cache
2415                  * before the next commit or risk reading stale cache.  So to
2416                  * limit our exposure to horrible edge cases lets just abort the
2417                  * transaction, this only happens in really bad situations
2418                  * anyway.
2419                  */
2420                 btrfs_abort_transaction(trans, ret);
2421                 goto out_put;
2422         }
2423         WARN_ON(ret);
2424
2425         /* We've already setup this transaction, go ahead and exit */
2426         if (block_group->cache_generation == trans->transid &&
2427             i_size_read(inode)) {
2428                 dcs = BTRFS_DC_SETUP;
2429                 goto out_put;
2430         }
2431
2432         if (i_size_read(inode) > 0) {
2433                 ret = btrfs_check_trunc_cache_free_space(fs_info,
2434                                         &fs_info->global_block_rsv);
2435                 if (ret)
2436                         goto out_put;
2437
2438                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2439                 if (ret)
2440                         goto out_put;
2441         }
2442
2443         spin_lock(&block_group->lock);
2444         if (block_group->cached != BTRFS_CACHE_FINISHED ||
2445             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2446                 /*
2447                  * don't bother trying to write stuff out _if_
2448                  * a) we're not cached,
2449                  * b) we're with nospace_cache mount option,
2450                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
2451                  */
2452                 dcs = BTRFS_DC_WRITTEN;
2453                 spin_unlock(&block_group->lock);
2454                 goto out_put;
2455         }
2456         spin_unlock(&block_group->lock);
2457
2458         /*
2459          * We hit an ENOSPC when setting up the cache in this transaction, just
2460          * skip doing the setup, we've already cleared the cache so we're safe.
2461          */
2462         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2463                 ret = -ENOSPC;
2464                 goto out_put;
2465         }
2466
2467         /*
2468          * Try to preallocate enough space based on how big the block group is.
2469          * Keep in mind this has to include any pinned space which could end up
2470          * taking up quite a bit since it's not folded into the other space
2471          * cache.
2472          */
2473         num_pages = div_u64(block_group->length, SZ_256M);
2474         if (!num_pages)
2475                 num_pages = 1;
2476
2477         num_pages *= 16;
2478         num_pages *= PAGE_SIZE;
2479
2480         ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
2481                                           num_pages);
2482         if (ret)
2483                 goto out_put;
2484
2485         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2486                                               num_pages, num_pages,
2487                                               &alloc_hint);
2488         /*
2489          * Our cache requires contiguous chunks so that we don't modify a bunch
2490          * of metadata or split extents when writing the cache out, which means
2491          * we can enospc if we are heavily fragmented in addition to just normal
2492          * out of space conditions.  So if we hit this just skip setting up any
2493          * other block groups for this transaction, maybe we'll unpin enough
2494          * space the next time around.
2495          */
2496         if (!ret)
2497                 dcs = BTRFS_DC_SETUP;
2498         else if (ret == -ENOSPC)
2499                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2500
2501 out_put:
2502         iput(inode);
2503 out_free:
2504         btrfs_release_path(path);
2505 out:
2506         spin_lock(&block_group->lock);
2507         if (!ret && dcs == BTRFS_DC_SETUP)
2508                 block_group->cache_generation = trans->transid;
2509         block_group->disk_cache_state = dcs;
2510         spin_unlock(&block_group->lock);
2511
2512         extent_changeset_free(data_reserved);
2513         return ret;
2514 }
2515
2516 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2517 {
2518         struct btrfs_fs_info *fs_info = trans->fs_info;
2519         struct btrfs_block_group *cache, *tmp;
2520         struct btrfs_transaction *cur_trans = trans->transaction;
2521         struct btrfs_path *path;
2522
2523         if (list_empty(&cur_trans->dirty_bgs) ||
2524             !btrfs_test_opt(fs_info, SPACE_CACHE))
2525                 return 0;
2526
2527         path = btrfs_alloc_path();
2528         if (!path)
2529                 return -ENOMEM;
2530
2531         /* Could add new block groups, use _safe just in case */
2532         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2533                                  dirty_list) {
2534                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2535                         cache_save_setup(cache, trans, path);
2536         }
2537
2538         btrfs_free_path(path);
2539         return 0;
2540 }
2541
2542 /*
2543  * Transaction commit does final block group cache writeback during a critical
2544  * section where nothing is allowed to change the FS.  This is required in
2545  * order for the cache to actually match the block group, but can introduce a
2546  * lot of latency into the commit.
2547  *
2548  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2549  * There's a chance we'll have to redo some of it if the block group changes
2550  * again during the commit, but it greatly reduces the commit latency by
2551  * getting rid of the easy block groups while we're still allowing others to
2552  * join the commit.
2553  */
2554 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2555 {
2556         struct btrfs_fs_info *fs_info = trans->fs_info;
2557         struct btrfs_block_group *cache;
2558         struct btrfs_transaction *cur_trans = trans->transaction;
2559         int ret = 0;
2560         int should_put;
2561         struct btrfs_path *path = NULL;
2562         LIST_HEAD(dirty);
2563         struct list_head *io = &cur_trans->io_bgs;
2564         int num_started = 0;
2565         int loops = 0;
2566
2567         spin_lock(&cur_trans->dirty_bgs_lock);
2568         if (list_empty(&cur_trans->dirty_bgs)) {
2569                 spin_unlock(&cur_trans->dirty_bgs_lock);
2570                 return 0;
2571         }
2572         list_splice_init(&cur_trans->dirty_bgs, &dirty);
2573         spin_unlock(&cur_trans->dirty_bgs_lock);
2574
2575 again:
2576         /* Make sure all the block groups on our dirty list actually exist */
2577         btrfs_create_pending_block_groups(trans);
2578
2579         if (!path) {
2580                 path = btrfs_alloc_path();
2581                 if (!path)
2582                         return -ENOMEM;
2583         }
2584
2585         /*
2586          * cache_write_mutex is here only to save us from balance or automatic
2587          * removal of empty block groups deleting this block group while we are
2588          * writing out the cache
2589          */
2590         mutex_lock(&trans->transaction->cache_write_mutex);
2591         while (!list_empty(&dirty)) {
2592                 bool drop_reserve = true;
2593
2594                 cache = list_first_entry(&dirty, struct btrfs_block_group,
2595                                          dirty_list);
2596                 /*
2597                  * This can happen if something re-dirties a block group that
2598                  * is already under IO.  Just wait for it to finish and then do
2599                  * it all again
2600                  */
2601                 if (!list_empty(&cache->io_list)) {
2602                         list_del_init(&cache->io_list);
2603                         btrfs_wait_cache_io(trans, cache, path);
2604                         btrfs_put_block_group(cache);
2605                 }
2606
2607
2608                 /*
2609                  * btrfs_wait_cache_io uses the cache->dirty_list to decide if
2610                  * it should update the cache_state.  Don't delete until after
2611                  * we wait.
2612                  *
2613                  * Since we're not running in the commit critical section
2614                  * we need the dirty_bgs_lock to protect from update_block_group
2615                  */
2616                 spin_lock(&cur_trans->dirty_bgs_lock);
2617                 list_del_init(&cache->dirty_list);
2618                 spin_unlock(&cur_trans->dirty_bgs_lock);
2619
2620                 should_put = 1;
2621
2622                 cache_save_setup(cache, trans, path);
2623
2624                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
2625                         cache->io_ctl.inode = NULL;
2626                         ret = btrfs_write_out_cache(trans, cache, path);
2627                         if (ret == 0 && cache->io_ctl.inode) {
2628                                 num_started++;
2629                                 should_put = 0;
2630
2631                                 /*
2632                                  * The cache_write_mutex is protecting the
2633                                  * io_list, also refer to the definition of
2634                                  * btrfs_transaction::io_bgs for more details
2635                                  */
2636                                 list_add_tail(&cache->io_list, io);
2637                         } else {
2638                                 /*
2639                                  * If we failed to write the cache, the
2640                                  * generation will be bad and life goes on
2641                                  */
2642                                 ret = 0;
2643                         }
2644                 }
2645                 if (!ret) {
2646                         ret = update_block_group_item(trans, path, cache);
2647                         /*
2648                          * Our block group might still be attached to the list
2649                          * of new block groups in the transaction handle of some
2650                          * other task (struct btrfs_trans_handle->new_bgs). This
2651                          * means its block group item isn't yet in the extent
2652                          * tree. If this happens ignore the error, as we will
2653                          * try again later in the critical section of the
2654                          * transaction commit.
2655                          */
2656                         if (ret == -ENOENT) {
2657                                 ret = 0;
2658                                 spin_lock(&cur_trans->dirty_bgs_lock);
2659                                 if (list_empty(&cache->dirty_list)) {
2660                                         list_add_tail(&cache->dirty_list,
2661                                                       &cur_trans->dirty_bgs);
2662                                         btrfs_get_block_group(cache);
2663                                         drop_reserve = false;
2664                                 }
2665                                 spin_unlock(&cur_trans->dirty_bgs_lock);
2666                         } else if (ret) {
2667                                 btrfs_abort_transaction(trans, ret);
2668                         }
2669                 }
2670
2671                 /* If it's not on the io list, we need to put the block group */
2672                 if (should_put)
2673                         btrfs_put_block_group(cache);
2674                 if (drop_reserve)
2675                         btrfs_delayed_refs_rsv_release(fs_info, 1);
2676
2677                 if (ret)
2678                         break;
2679
2680                 /*
2681                  * Avoid blocking other tasks for too long. It might even save
2682                  * us from writing caches for block groups that are going to be
2683                  * removed.
2684                  */
2685                 mutex_unlock(&trans->transaction->cache_write_mutex);
2686                 mutex_lock(&trans->transaction->cache_write_mutex);
2687         }
2688         mutex_unlock(&trans->transaction->cache_write_mutex);
2689
2690         /*
2691          * Go through delayed refs for all the stuff we've just kicked off
2692          * and then loop back (just once)
2693          */
2694         ret = btrfs_run_delayed_refs(trans, 0);
2695         if (!ret && loops == 0) {
2696                 loops++;
2697                 spin_lock(&cur_trans->dirty_bgs_lock);
2698                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
2699                 /*
2700                  * dirty_bgs_lock protects us from concurrent block group
2701                  * deletes too (not just cache_write_mutex).
2702                  */
2703                 if (!list_empty(&dirty)) {
2704                         spin_unlock(&cur_trans->dirty_bgs_lock);
2705                         goto again;
2706                 }
2707                 spin_unlock(&cur_trans->dirty_bgs_lock);
2708         } else if (ret < 0) {
2709                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
2710         }
2711
2712         btrfs_free_path(path);
2713         return ret;
2714 }
2715
2716 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
2717 {
2718         struct btrfs_fs_info *fs_info = trans->fs_info;
2719         struct btrfs_block_group *cache;
2720         struct btrfs_transaction *cur_trans = trans->transaction;
2721         int ret = 0;
2722         int should_put;
2723         struct btrfs_path *path;
2724         struct list_head *io = &cur_trans->io_bgs;
2725         int num_started = 0;
2726
2727         path = btrfs_alloc_path();
2728         if (!path)
2729                 return -ENOMEM;
2730
2731         /*
2732          * Even though we are in the critical section of the transaction commit,
2733          * we can still have concurrent tasks adding elements to this
2734          * transaction's list of dirty block groups. These tasks correspond to
2735          * endio free space workers started when writeback finishes for a
2736          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
2737          * allocate new block groups as a result of COWing nodes of the root
2738          * tree when updating the free space inode. The writeback for the space
2739          * caches is triggered by an earlier call to
2740          * btrfs_start_dirty_block_groups() and iterations of the following
2741          * loop.
2742          * Also we want to do the cache_save_setup first and then run the
2743          * delayed refs to make sure we have the best chance at doing this all
2744          * in one shot.
2745          */
2746         spin_lock(&cur_trans->dirty_bgs_lock);
2747         while (!list_empty(&cur_trans->dirty_bgs)) {
2748                 cache = list_first_entry(&cur_trans->dirty_bgs,
2749                                          struct btrfs_block_group,
2750                                          dirty_list);
2751
2752                 /*
2753                  * This can happen if cache_save_setup re-dirties a block group
2754                  * that is already under IO.  Just wait for it to finish and
2755                  * then do it all again
2756                  */
2757                 if (!list_empty(&cache->io_list)) {
2758                         spin_unlock(&cur_trans->dirty_bgs_lock);
2759                         list_del_init(&cache->io_list);
2760                         btrfs_wait_cache_io(trans, cache, path);
2761                         btrfs_put_block_group(cache);
2762                         spin_lock(&cur_trans->dirty_bgs_lock);
2763                 }
2764
2765                 /*
2766                  * Don't remove from the dirty list until after we've waited on
2767                  * any pending IO
2768                  */
2769                 list_del_init(&cache->dirty_list);
2770                 spin_unlock(&cur_trans->dirty_bgs_lock);
2771                 should_put = 1;
2772
2773                 cache_save_setup(cache, trans, path);
2774
2775                 if (!ret)
2776                         ret = btrfs_run_delayed_refs(trans,
2777                                                      (unsigned long) -1);
2778
2779                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
2780                         cache->io_ctl.inode = NULL;
2781                         ret = btrfs_write_out_cache(trans, cache, path);
2782                         if (ret == 0 && cache->io_ctl.inode) {
2783                                 num_started++;
2784                                 should_put = 0;
2785                                 list_add_tail(&cache->io_list, io);
2786                         } else {
2787                                 /*
2788                                  * If we failed to write the cache, the
2789                                  * generation will be bad and life goes on
2790                                  */
2791                                 ret = 0;
2792                         }
2793                 }
2794                 if (!ret) {
2795                         ret = update_block_group_item(trans, path, cache);
2796                         /*
2797                          * One of the free space endio workers might have
2798                          * created a new block group while updating a free space
2799                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
2800                          * and hasn't released its transaction handle yet, in
2801                          * which case the new block group is still attached to
2802                          * its transaction handle and its creation has not
2803                          * finished yet (no block group item in the extent tree
2804                          * yet, etc). If this is the case, wait for all free
2805                          * space endio workers to finish and retry. This is a
2806                          * very rare case so no need for a more efficient and
2807                          * complex approach.
2808                          */
2809                         if (ret == -ENOENT) {
2810                                 wait_event(cur_trans->writer_wait,
2811                                    atomic_read(&cur_trans->num_writers) == 1);
2812                                 ret = update_block_group_item(trans, path, cache);
2813                         }
2814                         if (ret)
2815                                 btrfs_abort_transaction(trans, ret);
2816                 }
2817
2818                 /* If its not on the io list, we need to put the block group */
2819                 if (should_put)
2820                         btrfs_put_block_group(cache);
2821                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2822                 spin_lock(&cur_trans->dirty_bgs_lock);
2823         }
2824         spin_unlock(&cur_trans->dirty_bgs_lock);
2825
2826         /*
2827          * Refer to the definition of io_bgs member for details why it's safe
2828          * to use it without any locking
2829          */
2830         while (!list_empty(io)) {
2831                 cache = list_first_entry(io, struct btrfs_block_group,
2832                                          io_list);
2833                 list_del_init(&cache->io_list);
2834                 btrfs_wait_cache_io(trans, cache, path);
2835                 btrfs_put_block_group(cache);
2836         }
2837
2838         btrfs_free_path(path);
2839         return ret;
2840 }
2841
2842 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
2843                              u64 bytenr, u64 num_bytes, int alloc)
2844 {
2845         struct btrfs_fs_info *info = trans->fs_info;
2846         struct btrfs_block_group *cache = NULL;
2847         u64 total = num_bytes;
2848         u64 old_val;
2849         u64 byte_in_group;
2850         int factor;
2851         int ret = 0;
2852
2853         /* Block accounting for super block */
2854         spin_lock(&info->delalloc_root_lock);
2855         old_val = btrfs_super_bytes_used(info->super_copy);
2856         if (alloc)
2857                 old_val += num_bytes;
2858         else
2859                 old_val -= num_bytes;
2860         btrfs_set_super_bytes_used(info->super_copy, old_val);
2861         spin_unlock(&info->delalloc_root_lock);
2862
2863         while (total) {
2864                 cache = btrfs_lookup_block_group(info, bytenr);
2865                 if (!cache) {
2866                         ret = -ENOENT;
2867                         break;
2868                 }
2869                 factor = btrfs_bg_type_to_factor(cache->flags);
2870
2871                 /*
2872                  * If this block group has free space cache written out, we
2873                  * need to make sure to load it if we are removing space.  This
2874                  * is because we need the unpinning stage to actually add the
2875                  * space back to the block group, otherwise we will leak space.
2876                  */
2877                 if (!alloc && !btrfs_block_group_done(cache))
2878                         btrfs_cache_block_group(cache, 1);
2879
2880                 byte_in_group = bytenr - cache->start;
2881                 WARN_ON(byte_in_group > cache->length);
2882
2883                 spin_lock(&cache->space_info->lock);
2884                 spin_lock(&cache->lock);
2885
2886                 if (btrfs_test_opt(info, SPACE_CACHE) &&
2887                     cache->disk_cache_state < BTRFS_DC_CLEAR)
2888                         cache->disk_cache_state = BTRFS_DC_CLEAR;
2889
2890                 old_val = cache->used;
2891                 num_bytes = min(total, cache->length - byte_in_group);
2892                 if (alloc) {
2893                         old_val += num_bytes;
2894                         cache->used = old_val;
2895                         cache->reserved -= num_bytes;
2896                         cache->space_info->bytes_reserved -= num_bytes;
2897                         cache->space_info->bytes_used += num_bytes;
2898                         cache->space_info->disk_used += num_bytes * factor;
2899                         spin_unlock(&cache->lock);
2900                         spin_unlock(&cache->space_info->lock);
2901                 } else {
2902                         old_val -= num_bytes;
2903                         cache->used = old_val;
2904                         cache->pinned += num_bytes;
2905                         btrfs_space_info_update_bytes_pinned(info,
2906                                         cache->space_info, num_bytes);
2907                         cache->space_info->bytes_used -= num_bytes;
2908                         cache->space_info->disk_used -= num_bytes * factor;
2909                         spin_unlock(&cache->lock);
2910                         spin_unlock(&cache->space_info->lock);
2911
2912                         percpu_counter_add_batch(
2913                                         &cache->space_info->total_bytes_pinned,
2914                                         num_bytes,
2915                                         BTRFS_TOTAL_BYTES_PINNED_BATCH);
2916                         set_extent_dirty(&trans->transaction->pinned_extents,
2917                                          bytenr, bytenr + num_bytes - 1,
2918                                          GFP_NOFS | __GFP_NOFAIL);
2919                 }
2920
2921                 spin_lock(&trans->transaction->dirty_bgs_lock);
2922                 if (list_empty(&cache->dirty_list)) {
2923                         list_add_tail(&cache->dirty_list,
2924                                       &trans->transaction->dirty_bgs);
2925                         trans->delayed_ref_updates++;
2926                         btrfs_get_block_group(cache);
2927                 }
2928                 spin_unlock(&trans->transaction->dirty_bgs_lock);
2929
2930                 /*
2931                  * No longer have used bytes in this block group, queue it for
2932                  * deletion. We do this after adding the block group to the
2933                  * dirty list to avoid races between cleaner kthread and space
2934                  * cache writeout.
2935                  */
2936                 if (!alloc && old_val == 0) {
2937                         if (!btrfs_test_opt(info, DISCARD_ASYNC))
2938                                 btrfs_mark_bg_unused(cache);
2939                 }
2940
2941                 btrfs_put_block_group(cache);
2942                 total -= num_bytes;
2943                 bytenr += num_bytes;
2944         }
2945
2946         /* Modified block groups are accounted for in the delayed_refs_rsv. */
2947         btrfs_update_delayed_refs_rsv(trans);
2948         return ret;
2949 }
2950
2951 /**
2952  * btrfs_add_reserved_bytes - update the block_group and space info counters
2953  * @cache:      The cache we are manipulating
2954  * @ram_bytes:  The number of bytes of file content, and will be same to
2955  *              @num_bytes except for the compress path.
2956  * @num_bytes:  The number of bytes in question
2957  * @delalloc:   The blocks are allocated for the delalloc write
2958  *
2959  * This is called by the allocator when it reserves space. If this is a
2960  * reservation and the block group has become read only we cannot make the
2961  * reservation and return -EAGAIN, otherwise this function always succeeds.
2962  */
2963 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
2964                              u64 ram_bytes, u64 num_bytes, int delalloc)
2965 {
2966         struct btrfs_space_info *space_info = cache->space_info;
2967         int ret = 0;
2968
2969         spin_lock(&space_info->lock);
2970         spin_lock(&cache->lock);
2971         if (cache->ro) {
2972                 ret = -EAGAIN;
2973         } else {
2974                 cache->reserved += num_bytes;
2975                 space_info->bytes_reserved += num_bytes;
2976                 trace_btrfs_space_reservation(cache->fs_info, "space_info",
2977                                               space_info->flags, num_bytes, 1);
2978                 btrfs_space_info_update_bytes_may_use(cache->fs_info,
2979                                                       space_info, -ram_bytes);
2980                 if (delalloc)
2981                         cache->delalloc_bytes += num_bytes;
2982
2983                 /*
2984                  * Compression can use less space than we reserved, so wake
2985                  * tickets if that happens
2986                  */
2987                 if (num_bytes < ram_bytes)
2988                         btrfs_try_granting_tickets(cache->fs_info, space_info);
2989         }
2990         spin_unlock(&cache->lock);
2991         spin_unlock(&space_info->lock);
2992         return ret;
2993 }
2994
2995 /**
2996  * btrfs_free_reserved_bytes - update the block_group and space info counters
2997  * @cache:      The cache we are manipulating
2998  * @num_bytes:  The number of bytes in question
2999  * @delalloc:   The blocks are allocated for the delalloc write
3000  *
3001  * This is called by somebody who is freeing space that was never actually used
3002  * on disk.  For example if you reserve some space for a new leaf in transaction
3003  * A and before transaction A commits you free that leaf, you call this with
3004  * reserve set to 0 in order to clear the reservation.
3005  */
3006 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
3007                                u64 num_bytes, int delalloc)
3008 {
3009         struct btrfs_space_info *space_info = cache->space_info;
3010
3011         spin_lock(&space_info->lock);
3012         spin_lock(&cache->lock);
3013         if (cache->ro)
3014                 space_info->bytes_readonly += num_bytes;
3015         cache->reserved -= num_bytes;
3016         space_info->bytes_reserved -= num_bytes;
3017         space_info->max_extent_size = 0;
3018
3019         if (delalloc)
3020                 cache->delalloc_bytes -= num_bytes;
3021         spin_unlock(&cache->lock);
3022
3023         btrfs_try_granting_tickets(cache->fs_info, space_info);
3024         spin_unlock(&space_info->lock);
3025 }
3026
3027 static void force_metadata_allocation(struct btrfs_fs_info *info)
3028 {
3029         struct list_head *head = &info->space_info;
3030         struct btrfs_space_info *found;
3031
3032         list_for_each_entry(found, head, list) {
3033                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3034                         found->force_alloc = CHUNK_ALLOC_FORCE;
3035         }
3036 }
3037
3038 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3039                               struct btrfs_space_info *sinfo, int force)
3040 {
3041         u64 bytes_used = btrfs_space_info_used(sinfo, false);
3042         u64 thresh;
3043
3044         if (force == CHUNK_ALLOC_FORCE)
3045                 return 1;
3046
3047         /*
3048          * in limited mode, we want to have some free space up to
3049          * about 1% of the FS size.
3050          */
3051         if (force == CHUNK_ALLOC_LIMITED) {
3052                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3053                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3054
3055                 if (sinfo->total_bytes - bytes_used < thresh)
3056                         return 1;
3057         }
3058
3059         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3060                 return 0;
3061         return 1;
3062 }
3063
3064 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3065 {
3066         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3067
3068         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3069 }
3070
3071 /*
3072  * If force is CHUNK_ALLOC_FORCE:
3073  *    - return 1 if it successfully allocates a chunk,
3074  *    - return errors including -ENOSPC otherwise.
3075  * If force is NOT CHUNK_ALLOC_FORCE:
3076  *    - return 0 if it doesn't need to allocate a new chunk,
3077  *    - return 1 if it successfully allocates a chunk,
3078  *    - return errors including -ENOSPC otherwise.
3079  */
3080 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3081                       enum btrfs_chunk_alloc_enum force)
3082 {
3083         struct btrfs_fs_info *fs_info = trans->fs_info;
3084         struct btrfs_space_info *space_info;
3085         bool wait_for_alloc = false;
3086         bool should_alloc = false;
3087         int ret = 0;
3088
3089         /* Don't re-enter if we're already allocating a chunk */
3090         if (trans->allocating_chunk)
3091                 return -ENOSPC;
3092
3093         space_info = btrfs_find_space_info(fs_info, flags);
3094         ASSERT(space_info);
3095
3096         do {
3097                 spin_lock(&space_info->lock);
3098                 if (force < space_info->force_alloc)
3099                         force = space_info->force_alloc;
3100                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
3101                 if (space_info->full) {
3102                         /* No more free physical space */
3103                         if (should_alloc)
3104                                 ret = -ENOSPC;
3105                         else
3106                                 ret = 0;
3107                         spin_unlock(&space_info->lock);
3108                         return ret;
3109                 } else if (!should_alloc) {
3110                         spin_unlock(&space_info->lock);
3111                         return 0;
3112                 } else if (space_info->chunk_alloc) {
3113                         /*
3114                          * Someone is already allocating, so we need to block
3115                          * until this someone is finished and then loop to
3116                          * recheck if we should continue with our allocation
3117                          * attempt.
3118                          */
3119                         wait_for_alloc = true;
3120                         spin_unlock(&space_info->lock);
3121                         mutex_lock(&fs_info->chunk_mutex);
3122                         mutex_unlock(&fs_info->chunk_mutex);
3123                 } else {
3124                         /* Proceed with allocation */
3125                         space_info->chunk_alloc = 1;
3126                         wait_for_alloc = false;
3127                         spin_unlock(&space_info->lock);
3128                 }
3129
3130                 cond_resched();
3131         } while (wait_for_alloc);
3132
3133         mutex_lock(&fs_info->chunk_mutex);
3134         trans->allocating_chunk = true;
3135
3136         /*
3137          * If we have mixed data/metadata chunks we want to make sure we keep
3138          * allocating mixed chunks instead of individual chunks.
3139          */
3140         if (btrfs_mixed_space_info(space_info))
3141                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3142
3143         /*
3144          * if we're doing a data chunk, go ahead and make sure that
3145          * we keep a reasonable number of metadata chunks allocated in the
3146          * FS as well.
3147          */
3148         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3149                 fs_info->data_chunk_allocations++;
3150                 if (!(fs_info->data_chunk_allocations %
3151                       fs_info->metadata_ratio))
3152                         force_metadata_allocation(fs_info);
3153         }
3154
3155         /*
3156          * Check if we have enough space in SYSTEM chunk because we may need
3157          * to update devices.
3158          */
3159         check_system_chunk(trans, flags);
3160
3161         ret = btrfs_alloc_chunk(trans, flags);
3162         trans->allocating_chunk = false;
3163
3164         spin_lock(&space_info->lock);
3165         if (ret < 0) {
3166                 if (ret == -ENOSPC)
3167                         space_info->full = 1;
3168                 else
3169                         goto out;
3170         } else {
3171                 ret = 1;
3172                 space_info->max_extent_size = 0;
3173         }
3174
3175         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3176 out:
3177         space_info->chunk_alloc = 0;
3178         spin_unlock(&space_info->lock);
3179         mutex_unlock(&fs_info->chunk_mutex);
3180         /*
3181          * When we allocate a new chunk we reserve space in the chunk block
3182          * reserve to make sure we can COW nodes/leafs in the chunk tree or
3183          * add new nodes/leafs to it if we end up needing to do it when
3184          * inserting the chunk item and updating device items as part of the
3185          * second phase of chunk allocation, performed by
3186          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
3187          * large number of new block groups to create in our transaction
3188          * handle's new_bgs list to avoid exhausting the chunk block reserve
3189          * in extreme cases - like having a single transaction create many new
3190          * block groups when starting to write out the free space caches of all
3191          * the block groups that were made dirty during the lifetime of the
3192          * transaction.
3193          */
3194         if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
3195                 btrfs_create_pending_block_groups(trans);
3196
3197         return ret;
3198 }
3199
3200 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3201 {
3202         u64 num_dev;
3203
3204         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3205         if (!num_dev)
3206                 num_dev = fs_info->fs_devices->rw_devices;
3207
3208         return num_dev;
3209 }
3210
3211 /*
3212  * Reserve space in the system space for allocating or removing a chunk
3213  */
3214 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3215 {
3216         struct btrfs_fs_info *fs_info = trans->fs_info;
3217         struct btrfs_space_info *info;
3218         u64 left;
3219         u64 thresh;
3220         int ret = 0;
3221         u64 num_devs;
3222
3223         /*
3224          * Needed because we can end up allocating a system chunk and for an
3225          * atomic and race free space reservation in the chunk block reserve.
3226          */
3227         lockdep_assert_held(&fs_info->chunk_mutex);
3228
3229         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3230         spin_lock(&info->lock);
3231         left = info->total_bytes - btrfs_space_info_used(info, true);
3232         spin_unlock(&info->lock);
3233
3234         num_devs = get_profile_num_devs(fs_info, type);
3235
3236         /* num_devs device items to update and 1 chunk item to add or remove */
3237         thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
3238                 btrfs_calc_insert_metadata_size(fs_info, 1);
3239
3240         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3241                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3242                            left, thresh, type);
3243                 btrfs_dump_space_info(fs_info, info, 0, 0);
3244         }
3245
3246         if (left < thresh) {
3247                 u64 flags = btrfs_system_alloc_profile(fs_info);
3248
3249                 /*
3250                  * Ignore failure to create system chunk. We might end up not
3251                  * needing it, as we might not need to COW all nodes/leafs from
3252                  * the paths we visit in the chunk tree (they were already COWed
3253                  * or created in the current transaction for example).
3254                  */
3255                 ret = btrfs_alloc_chunk(trans, flags);
3256         }
3257
3258         if (!ret) {
3259                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
3260                                           &fs_info->chunk_block_rsv,
3261                                           thresh, BTRFS_RESERVE_NO_FLUSH);
3262                 if (!ret)
3263                         trans->chunk_bytes_reserved += thresh;
3264         }
3265 }
3266
3267 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3268 {
3269         struct btrfs_block_group *block_group;
3270         u64 last = 0;
3271
3272         while (1) {
3273                 struct inode *inode;
3274
3275                 block_group = btrfs_lookup_first_block_group(info, last);
3276                 while (block_group) {
3277                         btrfs_wait_block_group_cache_done(block_group);
3278                         spin_lock(&block_group->lock);
3279                         if (block_group->iref)
3280                                 break;
3281                         spin_unlock(&block_group->lock);
3282                         block_group = btrfs_next_block_group(block_group);
3283                 }
3284                 if (!block_group) {
3285                         if (last == 0)
3286                                 break;
3287                         last = 0;
3288                         continue;
3289                 }
3290
3291                 inode = block_group->inode;
3292                 block_group->iref = 0;
3293                 block_group->inode = NULL;
3294                 spin_unlock(&block_group->lock);
3295                 ASSERT(block_group->io_ctl.inode == NULL);
3296                 iput(inode);
3297                 last = block_group->start + block_group->length;
3298                 btrfs_put_block_group(block_group);
3299         }
3300 }
3301
3302 /*
3303  * Must be called only after stopping all workers, since we could have block
3304  * group caching kthreads running, and therefore they could race with us if we
3305  * freed the block groups before stopping them.
3306  */
3307 int btrfs_free_block_groups(struct btrfs_fs_info *info)
3308 {
3309         struct btrfs_block_group *block_group;
3310         struct btrfs_space_info *space_info;
3311         struct btrfs_caching_control *caching_ctl;
3312         struct rb_node *n;
3313
3314         spin_lock(&info->block_group_cache_lock);
3315         while (!list_empty(&info->caching_block_groups)) {
3316                 caching_ctl = list_entry(info->caching_block_groups.next,
3317                                          struct btrfs_caching_control, list);
3318                 list_del(&caching_ctl->list);
3319                 btrfs_put_caching_control(caching_ctl);
3320         }
3321         spin_unlock(&info->block_group_cache_lock);
3322
3323         spin_lock(&info->unused_bgs_lock);
3324         while (!list_empty(&info->unused_bgs)) {
3325                 block_group = list_first_entry(&info->unused_bgs,
3326                                                struct btrfs_block_group,
3327                                                bg_list);
3328                 list_del_init(&block_group->bg_list);
3329                 btrfs_put_block_group(block_group);
3330         }
3331         spin_unlock(&info->unused_bgs_lock);
3332
3333         spin_lock(&info->block_group_cache_lock);
3334         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
3335                 block_group = rb_entry(n, struct btrfs_block_group,
3336                                        cache_node);
3337                 rb_erase(&block_group->cache_node,
3338                          &info->block_group_cache_tree);
3339                 RB_CLEAR_NODE(&block_group->cache_node);
3340                 spin_unlock(&info->block_group_cache_lock);
3341
3342                 down_write(&block_group->space_info->groups_sem);
3343                 list_del(&block_group->list);
3344                 up_write(&block_group->space_info->groups_sem);
3345
3346                 /*
3347                  * We haven't cached this block group, which means we could
3348                  * possibly have excluded extents on this block group.
3349                  */
3350                 if (block_group->cached == BTRFS_CACHE_NO ||
3351                     block_group->cached == BTRFS_CACHE_ERROR)
3352                         btrfs_free_excluded_extents(block_group);
3353
3354                 btrfs_remove_free_space_cache(block_group);
3355                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
3356                 ASSERT(list_empty(&block_group->dirty_list));
3357                 ASSERT(list_empty(&block_group->io_list));
3358                 ASSERT(list_empty(&block_group->bg_list));
3359                 ASSERT(refcount_read(&block_group->refs) == 1);
3360                 btrfs_put_block_group(block_group);
3361
3362                 spin_lock(&info->block_group_cache_lock);
3363         }
3364         spin_unlock(&info->block_group_cache_lock);
3365
3366         btrfs_release_global_block_rsv(info);
3367
3368         while (!list_empty(&info->space_info)) {
3369                 space_info = list_entry(info->space_info.next,
3370                                         struct btrfs_space_info,
3371                                         list);
3372
3373                 /*
3374                  * Do not hide this behind enospc_debug, this is actually
3375                  * important and indicates a real bug if this happens.
3376                  */
3377                 if (WARN_ON(space_info->bytes_pinned > 0 ||
3378                             space_info->bytes_reserved > 0 ||
3379                             space_info->bytes_may_use > 0))
3380                         btrfs_dump_space_info(info, space_info, 0, 0);
3381                 WARN_ON(space_info->reclaim_size > 0);
3382                 list_del(&space_info->list);
3383                 btrfs_sysfs_remove_space_info(space_info);
3384         }
3385         return 0;
3386 }
3387
3388 void btrfs_freeze_block_group(struct btrfs_block_group *cache)
3389 {
3390         atomic_inc(&cache->frozen);
3391 }
3392
3393 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
3394 {
3395         struct btrfs_fs_info *fs_info = block_group->fs_info;
3396         struct extent_map_tree *em_tree;
3397         struct extent_map *em;
3398         bool cleanup;
3399
3400         spin_lock(&block_group->lock);
3401         cleanup = (atomic_dec_and_test(&block_group->frozen) &&
3402                    block_group->removed);
3403         spin_unlock(&block_group->lock);
3404
3405         if (cleanup) {
3406                 em_tree = &fs_info->mapping_tree;
3407                 write_lock(&em_tree->lock);
3408                 em = lookup_extent_mapping(em_tree, block_group->start,
3409                                            1);
3410                 BUG_ON(!em); /* logic error, can't happen */
3411                 remove_extent_mapping(em_tree, em);
3412                 write_unlock(&em_tree->lock);
3413
3414                 /* once for us and once for the tree */
3415                 free_extent_map(em);
3416                 free_extent_map(em);
3417
3418                 /*
3419                  * We may have left one free space entry and other possible
3420                  * tasks trimming this block group have left 1 entry each one.
3421                  * Free them if any.
3422                  */
3423                 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3424         }
3425 }