fs/btrfs/extent-tree.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/sched/signal.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/writeback.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/sort.h>
  12 #include <linux/rcupdate.h>
  13 #include <linux/kthread.h>
  14 #include <linux/slab.h>
  15 #include <linux/ratelimit.h>
  16 #include <linux/percpu_counter.h>
  17 #include <linux/lockdep.h>
  18 #include <linux/crc32c.h>
  19 #include "tree-log.h"
  20 #include "disk-io.h"
  21 #include "print-tree.h"
  22 #include "volumes.h"
  23 #include "raid56.h"
  24 #include "locking.h"
  25 #include "free-space-cache.h"
  26 #include "free-space-tree.h"
  27 #include "math.h"
  28 #include "sysfs.h"
  29 #include "qgroup.h"
  30 #include "ref-verify.h"
  31 #include "space-info.h"
  32
  33 #undef SCRAMBLE_DELAYED_REFS
  34
  35
  36 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  37                                struct btrfs_delayed_ref_node *node, u64 parent,
  38                                u64 root_objectid, u64 owner_objectid,
  39                                u64 owner_offset, int refs_to_drop,
  40                                struct btrfs_delayed_extent_op *extra_op);
  41 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  42                                     struct extent_buffer *leaf,
  43                                     struct btrfs_extent_item *ei);
  44 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  45                                       u64 parent, u64 root_objectid,
  46                                       u64 flags, u64 owner, u64 offset,
  47                                       struct btrfs_key *ins, int ref_mod);
  48 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  49                                      struct btrfs_delayed_ref_node *node,
  50                                      struct btrfs_delayed_extent_op *extent_op);
  51 static int find_next_key(struct btrfs_path *path, int level,
  52                          struct btrfs_key *key);
  53 static void dump_space_info(struct btrfs_fs_info *fs_info,
  54                             struct btrfs_space_info *info, u64 bytes,
  55                             int dump_block_groups);
  56 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
  57                                u64 num_bytes);
  58
  59 static noinline int
  60 block_group_cache_done(struct btrfs_block_group_cache *cache)
  61 {
  62         smp_mb();
  63         return cache->cached == BTRFS_CACHE_FINISHED ||
  64                 cache->cached == BTRFS_CACHE_ERROR;
  65 }
  66
  67 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  68 {
  69         return (cache->flags & bits) == bits;
  70 }
  71
  72 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
  73 {
  74         atomic_inc(&cache->count);
  75 }
  76
  77 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
  78 {
  79         if (atomic_dec_and_test(&cache->count)) {
  80                 WARN_ON(cache->pinned > 0);
  81                 WARN_ON(cache->reserved > 0);
  82
  83                 /*
  84                  * If not empty, someone is still holding mutex of
  85                  * full_stripe_lock, which can only be released by caller.
  86                  * And it will definitely cause use-after-free when caller
  87                  * tries to release full stripe lock.
  88                  *
  89                  * No better way to resolve, but only to warn.
  90                  */
  91                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
  92                 kfree(cache->free_space_ctl);
  93                 kfree(cache);
  94         }
  95 }
  96
  97 /*
  98  * this adds the block group to the fs_info rb tree for the block group
  99  * cache
 100  */
 101 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 102                                 struct btrfs_block_group_cache *block_group)
 103 {
 104         struct rb_node **p;
 105         struct rb_node *parent = NULL;
 106         struct btrfs_block_group_cache *cache;
 107
 108         spin_lock(&info->block_group_cache_lock);
 109         p = &info->block_group_cache_tree.rb_node;
 110
 111         while (*p) {
 112                 parent = *p;
 113                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 114                                  cache_node);
 115                 if (block_group->key.objectid < cache->key.objectid) {
 116                         p = &(*p)->rb_left;
 117                 } else if (block_group->key.objectid > cache->key.objectid) {
 118                         p = &(*p)->rb_right;
 119                 } else {
 120                         spin_unlock(&info->block_group_cache_lock);
 121                         return -EEXIST;
 122                 }
 123         }
 124
 125         rb_link_node(&block_group->cache_node, parent, p);
 126         rb_insert_color(&block_group->cache_node,
 127                         &info->block_group_cache_tree);
 128
 129         if (info->first_logical_byte > block_group->key.objectid)
 130                 info->first_logical_byte = block_group->key.objectid;
 131
 132         spin_unlock(&info->block_group_cache_lock);
 133
 134         return 0;
 135 }
 136
 137 /*
 138  * This will return the block group at or after bytenr if contains is 0, else
 139  * it will return the block group that contains the bytenr
 140  */
 141 static struct btrfs_block_group_cache *
 142 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 143                               int contains)
 144 {
 145         struct btrfs_block_group_cache *cache, *ret = NULL;
 146         struct rb_node *n;
 147         u64 end, start;
 148
 149         spin_lock(&info->block_group_cache_lock);
 150         n = info->block_group_cache_tree.rb_node;
 151
 152         while (n) {
 153                 cache = rb_entry(n, struct btrfs_block_group_cache,
 154                                  cache_node);
 155                 end = cache->key.objectid + cache->key.offset - 1;
 156                 start = cache->key.objectid;
 157
 158                 if (bytenr < start) {
 159                         if (!contains && (!ret || start < ret->key.objectid))
 160                                 ret = cache;
 161                         n = n->rb_left;
 162                 } else if (bytenr > start) {
 163                         if (contains && bytenr <= end) {
 164                                 ret = cache;
 165                                 break;
 166                         }
 167                         n = n->rb_right;
 168                 } else {
 169                         ret = cache;
 170                         break;
 171                 }
 172         }
 173         if (ret) {
 174                 btrfs_get_block_group(ret);
 175                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 176                         info->first_logical_byte = ret->key.objectid;
 177         }
 178         spin_unlock(&info->block_group_cache_lock);
 179
 180         return ret;
 181 }
 182
 183 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
 184                                u64 start, u64 num_bytes)
 185 {
 186         u64 end = start + num_bytes - 1;
 187         set_extent_bits(&fs_info->freed_extents[0],
 188                         start, end, EXTENT_UPTODATE);
 189         set_extent_bits(&fs_info->freed_extents[1],
 190                         start, end, EXTENT_UPTODATE);
 191         return 0;
 192 }
 193
 194 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
 195 {
 196         struct btrfs_fs_info *fs_info = cache->fs_info;
 197         u64 start, end;
 198
 199         start = cache->key.objectid;
 200         end = start + cache->key.offset - 1;
 201
 202         clear_extent_bits(&fs_info->freed_extents[0],
 203                           start, end, EXTENT_UPTODATE);
 204         clear_extent_bits(&fs_info->freed_extents[1],
 205                           start, end, EXTENT_UPTODATE);
 206 }
 207
 208 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
 209 {
 210         struct btrfs_fs_info *fs_info = cache->fs_info;
 211         u64 bytenr;
 212         u64 *logical;
 213         int stripe_len;
 214         int i, nr, ret;
 215
 216         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 217                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 218                 cache->bytes_super += stripe_len;
 219                 ret = add_excluded_extent(fs_info, cache->key.objectid,
 220                                           stripe_len);
 221                 if (ret)
 222                         return ret;
 223         }
 224
 225         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 226                 bytenr = btrfs_sb_offset(i);
 227                 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
 228                                        bytenr, &logical, &nr, &stripe_len);
 229                 if (ret)
 230                         return ret;
 231
 232                 while (nr--) {
 233                         u64 start, len;
 234
 235                         if (logical[nr] > cache->key.objectid +
 236                             cache->key.offset)
 237                                 continue;
 238
 239                         if (logical[nr] + stripe_len <= cache->key.objectid)
 240                                 continue;
 241
 242                         start = logical[nr];
 243                         if (start < cache->key.objectid) {
 244                                 start = cache->key.objectid;
 245                                 len = (logical[nr] + stripe_len) - start;
 246                         } else {
 247                                 len = min_t(u64, stripe_len,
 248                                             cache->key.objectid +
 249                                             cache->key.offset - start);
 250                         }
 251
 252                         cache->bytes_super += len;
 253                         ret = add_excluded_extent(fs_info, start, len);
 254                         if (ret) {
 255                                 kfree(logical);
 256                                 return ret;
 257                         }
 258                 }
 259
 260                 kfree(logical);
 261         }
 262         return 0;
 263 }
 264
 265 static struct btrfs_caching_control *
 266 get_caching_control(struct btrfs_block_group_cache *cache)
 267 {
 268         struct btrfs_caching_control *ctl;
 269
 270         spin_lock(&cache->lock);
 271         if (!cache->caching_ctl) {
 272                 spin_unlock(&cache->lock);
 273                 return NULL;
 274         }
 275
 276         ctl = cache->caching_ctl;
 277         refcount_inc(&ctl->count);
 278         spin_unlock(&cache->lock);
 279         return ctl;
 280 }
 281
 282 static void put_caching_control(struct btrfs_caching_control *ctl)
 283 {
 284         if (refcount_dec_and_test(&ctl->count))
 285                 kfree(ctl);
 286 }
 287
 288 #ifdef CONFIG_BTRFS_DEBUG
 289 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
 290 {
 291         struct btrfs_fs_info *fs_info = block_group->fs_info;
 292         u64 start = block_group->key.objectid;
 293         u64 len = block_group->key.offset;
 294         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 295                 fs_info->nodesize : fs_info->sectorsize;
 296         u64 step = chunk << 1;
 297
 298         while (len > chunk) {
 299                 btrfs_remove_free_space(block_group, start, chunk);
 300                 start += step;
 301                 if (len < step)
 302                         len = 0;
 303                 else
 304                         len -= step;
 305         }
 306 }
 307 #endif
 308
 309 /*
 310  * this is only called by cache_block_group, since we could have freed extents
 311  * we need to check the pinned_extents for any extents that can't be used yet
 312  * since their free space will be released as soon as the transaction commits.
 313  */
 314 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 315                        u64 start, u64 end)
 316 {
 317         struct btrfs_fs_info *info = block_group->fs_info;
 318         u64 extent_start, extent_end, size, total_added = 0;
 319         int ret;
 320
 321         while (start < end) {
 322                 ret = find_first_extent_bit(info->pinned_extents, start,
 323                                             &extent_start, &extent_end,
 324                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 325                                             NULL);
 326                 if (ret)
 327                         break;
 328
 329                 if (extent_start <= start) {
 330                         start = extent_end + 1;
 331                 } else if (extent_start > start && extent_start < end) {
 332                         size = extent_start - start;
 333                         total_added += size;
 334                         ret = btrfs_add_free_space(block_group, start,
 335                                                    size);
 336                         BUG_ON(ret); /* -ENOMEM or logic error */
 337                         start = extent_end + 1;
 338                 } else {
 339                         break;
 340                 }
 341         }
 342
 343         if (start < end) {
 344                 size = end - start;
 345                 total_added += size;
 346                 ret = btrfs_add_free_space(block_group, start, size);
 347                 BUG_ON(ret); /* -ENOMEM or logic error */
 348         }
 349
 350         return total_added;
 351 }
 352
 353 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 354 {
 355         struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
 356         struct btrfs_fs_info *fs_info = block_group->fs_info;
 357         struct btrfs_root *extent_root = fs_info->extent_root;
 358         struct btrfs_path *path;
 359         struct extent_buffer *leaf;
 360         struct btrfs_key key;
 361         u64 total_found = 0;
 362         u64 last = 0;
 363         u32 nritems;
 364         int ret;
 365         bool wakeup = true;
 366
 367         path = btrfs_alloc_path();
 368         if (!path)
 369                 return -ENOMEM;
 370
 371         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 372
 373 #ifdef CONFIG_BTRFS_DEBUG
 374         /*
 375          * If we're fragmenting we don't want to make anybody think we can
 376          * allocate from this block group until we've had a chance to fragment
 377          * the free space.
 378          */
 379         if (btrfs_should_fragment_free_space(block_group))
 380                 wakeup = false;
 381 #endif
 382         /*
 383          * We don't want to deadlock with somebody trying to allocate a new
 384          * extent for the extent root while also trying to search the extent
 385          * root to add free space.  So we skip locking and search the commit
 386          * root, since its read-only
 387          */
 388         path->skip_locking = 1;
 389         path->search_commit_root = 1;
 390         path->reada = READA_FORWARD;
 391
 392         key.objectid = last;
 393         key.offset = 0;
 394         key.type = BTRFS_EXTENT_ITEM_KEY;
 395
 396 next:
 397         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 398         if (ret < 0)
 399                 goto out;
 400
 401         leaf = path->nodes[0];
 402         nritems = btrfs_header_nritems(leaf);
 403
 404         while (1) {
 405                 if (btrfs_fs_closing(fs_info) > 1) {
 406                         last = (u64)-1;
 407                         break;
 408                 }
 409
 410                 if (path->slots[0] < nritems) {
 411                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 412                 } else {
 413                         ret = find_next_key(path, 0, &key);
 414                         if (ret)
 415                                 break;
 416
 417                         if (need_resched() ||
 418                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 419                                 if (wakeup)
 420                                         caching_ctl->progress = last;
 421                                 btrfs_release_path(path);
 422                                 up_read(&fs_info->commit_root_sem);
 423                                 mutex_unlock(&caching_ctl->mutex);
 424                                 cond_resched();
 425                                 mutex_lock(&caching_ctl->mutex);
 426                                 down_read(&fs_info->commit_root_sem);
 427                                 goto next;
 428                         }
 429
 430                         ret = btrfs_next_leaf(extent_root, path);
 431                         if (ret < 0)
 432                                 goto out;
 433                         if (ret)
 434                                 break;
 435                         leaf = path->nodes[0];
 436                         nritems = btrfs_header_nritems(leaf);
 437                         continue;
 438                 }
 439
 440                 if (key.objectid < last) {
 441                         key.objectid = last;
 442                         key.offset = 0;
 443                         key.type = BTRFS_EXTENT_ITEM_KEY;
 444
 445                         if (wakeup)
 446                                 caching_ctl->progress = last;
 447                         btrfs_release_path(path);
 448                         goto next;
 449                 }
 450
 451                 if (key.objectid < block_group->key.objectid) {
 452                         path->slots[0]++;
 453                         continue;
 454                 }
 455
 456                 if (key.objectid >= block_group->key.objectid +
 457                     block_group->key.offset)
 458                         break;
 459
 460                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 461                     key.type == BTRFS_METADATA_ITEM_KEY) {
 462                         total_found += add_new_free_space(block_group, last,
 463                                                           key.objectid);
 464                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 465                                 last = key.objectid +
 466                                         fs_info->nodesize;
 467                         else
 468                                 last = key.objectid + key.offset;
 469
 470                         if (total_found > CACHING_CTL_WAKE_UP) {
 471                                 total_found = 0;
 472                                 if (wakeup)
 473                                         wake_up(&caching_ctl->wait);
 474                         }
 475                 }
 476                 path->slots[0]++;
 477         }
 478         ret = 0;
 479
 480         total_found += add_new_free_space(block_group, last,
 481                                           block_group->key.objectid +
 482                                           block_group->key.offset);
 483         caching_ctl->progress = (u64)-1;
 484
 485 out:
 486         btrfs_free_path(path);
 487         return ret;
 488 }
 489
 490 static noinline void caching_thread(struct btrfs_work *work)
 491 {
 492         struct btrfs_block_group_cache *block_group;
 493         struct btrfs_fs_info *fs_info;
 494         struct btrfs_caching_control *caching_ctl;
 495         int ret;
 496
 497         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 498         block_group = caching_ctl->block_group;
 499         fs_info = block_group->fs_info;
 500
 501         mutex_lock(&caching_ctl->mutex);
 502         down_read(&fs_info->commit_root_sem);
 503
 504         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 505                 ret = load_free_space_tree(caching_ctl);
 506         else
 507                 ret = load_extent_tree_free(caching_ctl);
 508
 509         spin_lock(&block_group->lock);
 510         block_group->caching_ctl = NULL;
 511         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 512         spin_unlock(&block_group->lock);
 513
 514 #ifdef CONFIG_BTRFS_DEBUG
 515         if (btrfs_should_fragment_free_space(block_group)) {
 516                 u64 bytes_used;
 517
 518                 spin_lock(&block_group->space_info->lock);
 519                 spin_lock(&block_group->lock);
 520                 bytes_used = block_group->key.offset -
 521                         btrfs_block_group_used(&block_group->item);
 522                 block_group->space_info->bytes_used += bytes_used >> 1;
 523                 spin_unlock(&block_group->lock);
 524                 spin_unlock(&block_group->space_info->lock);
 525                 fragment_free_space(block_group);
 526         }
 527 #endif
 528
 529         caching_ctl->progress = (u64)-1;
 530
 531         up_read(&fs_info->commit_root_sem);
 532         free_excluded_extents(block_group);
 533         mutex_unlock(&caching_ctl->mutex);
 534
 535         wake_up(&caching_ctl->wait);
 536
 537         put_caching_control(caching_ctl);
 538         btrfs_put_block_group(block_group);
 539 }
 540
 541 static int cache_block_group(struct btrfs_block_group_cache *cache,
 542                              int load_cache_only)
 543 {
 544         DEFINE_WAIT(wait);
 545         struct btrfs_fs_info *fs_info = cache->fs_info;
 546         struct btrfs_caching_control *caching_ctl;
 547         int ret = 0;
 548
 549         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 550         if (!caching_ctl)
 551                 return -ENOMEM;
 552
 553         INIT_LIST_HEAD(&caching_ctl->list);
 554         mutex_init(&caching_ctl->mutex);
 555         init_waitqueue_head(&caching_ctl->wait);
 556         caching_ctl->block_group = cache;
 557         caching_ctl->progress = cache->key.objectid;
 558         refcount_set(&caching_ctl->count, 1);
 559         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 560                         caching_thread, NULL, NULL);
 561
 562         spin_lock(&cache->lock);
 563         /*
 564          * This should be a rare occasion, but this could happen I think in the
 565          * case where one thread starts to load the space cache info, and then
 566          * some other thread starts a transaction commit which tries to do an
 567          * allocation while the other thread is still loading the space cache
 568          * info.  The previous loop should have kept us from choosing this block
 569          * group, but if we've moved to the state where we will wait on caching
 570          * block groups we need to first check if we're doing a fast load here,
 571          * so we can wait for it to finish, otherwise we could end up allocating
 572          * from a block group who's cache gets evicted for one reason or
 573          * another.
 574          */
 575         while (cache->cached == BTRFS_CACHE_FAST) {
 576                 struct btrfs_caching_control *ctl;
 577
 578                 ctl = cache->caching_ctl;
 579                 refcount_inc(&ctl->count);
 580                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 581                 spin_unlock(&cache->lock);
 582
 583                 schedule();
 584
 585                 finish_wait(&ctl->wait, &wait);
 586                 put_caching_control(ctl);
 587                 spin_lock(&cache->lock);
 588         }
 589
 590         if (cache->cached != BTRFS_CACHE_NO) {
 591                 spin_unlock(&cache->lock);
 592                 kfree(caching_ctl);
 593                 return 0;
 594         }
 595         WARN_ON(cache->caching_ctl);
 596         cache->caching_ctl = caching_ctl;
 597         cache->cached = BTRFS_CACHE_FAST;
 598         spin_unlock(&cache->lock);
 599
 600         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 601                 mutex_lock(&caching_ctl->mutex);
 602                 ret = load_free_space_cache(cache);
 603
 604                 spin_lock(&cache->lock);
 605                 if (ret == 1) {
 606                         cache->caching_ctl = NULL;
 607                         cache->cached = BTRFS_CACHE_FINISHED;
 608                         cache->last_byte_to_unpin = (u64)-1;
 609                         caching_ctl->progress = (u64)-1;
 610                 } else {
 611                         if (load_cache_only) {
 612                                 cache->caching_ctl = NULL;
 613                                 cache->cached = BTRFS_CACHE_NO;
 614                         } else {
 615                                 cache->cached = BTRFS_CACHE_STARTED;
 616                                 cache->has_caching_ctl = 1;
 617                         }
 618                 }
 619                 spin_unlock(&cache->lock);
 620 #ifdef CONFIG_BTRFS_DEBUG
 621                 if (ret == 1 &&
 622                     btrfs_should_fragment_free_space(cache)) {
 623                         u64 bytes_used;
 624
 625                         spin_lock(&cache->space_info->lock);
 626                         spin_lock(&cache->lock);
 627                         bytes_used = cache->key.offset -
 628                                 btrfs_block_group_used(&cache->item);
 629                         cache->space_info->bytes_used += bytes_used >> 1;
 630                         spin_unlock(&cache->lock);
 631                         spin_unlock(&cache->space_info->lock);
 632                         fragment_free_space(cache);
 633                 }
 634 #endif
 635                 mutex_unlock(&caching_ctl->mutex);
 636
 637                 wake_up(&caching_ctl->wait);
 638                 if (ret == 1) {
 639                         put_caching_control(caching_ctl);
 640                         free_excluded_extents(cache);
 641                         return 0;
 642                 }
 643         } else {
 644                 /*
 645                  * We're either using the free space tree or no caching at all.
 646                  * Set cached to the appropriate value and wakeup any waiters.
 647                  */
 648                 spin_lock(&cache->lock);
 649                 if (load_cache_only) {
 650                         cache->caching_ctl = NULL;
 651                         cache->cached = BTRFS_CACHE_NO;
 652                 } else {
 653                         cache->cached = BTRFS_CACHE_STARTED;
 654                         cache->has_caching_ctl = 1;
 655                 }
 656                 spin_unlock(&cache->lock);
 657                 wake_up(&caching_ctl->wait);
 658         }
 659
 660         if (load_cache_only) {
 661                 put_caching_control(caching_ctl);
 662                 return 0;
 663         }
 664
 665         down_write(&fs_info->commit_root_sem);
 666         refcount_inc(&caching_ctl->count);
 667         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 668         up_write(&fs_info->commit_root_sem);
 669
 670         btrfs_get_block_group(cache);
 671
 672         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 673
 674         return ret;
 675 }
 676
 677 /*
 678  * return the block group that starts at or after bytenr
 679  */
 680 static struct btrfs_block_group_cache *
 681 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 682 {
 683         return block_group_cache_tree_search(info, bytenr, 0);
 684 }
 685
 686 /*
 687  * return the block group that contains the given bytenr
 688  */
 689 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 690                                                  struct btrfs_fs_info *info,
 691                                                  u64 bytenr)
 692 {
 693         return block_group_cache_tree_search(info, bytenr, 1);
 694 }
 695
 696 static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
 697 {
 698         if (ref->type == BTRFS_REF_METADATA) {
 699                 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
 700                         return BTRFS_BLOCK_GROUP_SYSTEM;
 701                 else
 702                         return BTRFS_BLOCK_GROUP_METADATA;
 703         }
 704         return BTRFS_BLOCK_GROUP_DATA;
 705 }
 706
 707 static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
 708                              struct btrfs_ref *ref)
 709 {
 710         struct btrfs_space_info *space_info;
 711         u64 flags = generic_ref_to_space_flags(ref);
 712
 713         space_info = btrfs_find_space_info(fs_info, flags);
 714         ASSERT(space_info);
 715         percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
 716                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
 717 }
 718
 719 static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
 720                              struct btrfs_ref *ref)
 721 {
 722         struct btrfs_space_info *space_info;
 723         u64 flags = generic_ref_to_space_flags(ref);
 724
 725         space_info = btrfs_find_space_info(fs_info, flags);
 726         ASSERT(space_info);
 727         percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
 728                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
 729 }
 730
 731 /* simple helper to search for an existing data extent at a given offset */
 732 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 733 {
 734         int ret;
 735         struct btrfs_key key;
 736         struct btrfs_path *path;
 737
 738         path = btrfs_alloc_path();
 739         if (!path)
 740                 return -ENOMEM;
 741
 742         key.objectid = start;
 743         key.offset = len;
 744         key.type = BTRFS_EXTENT_ITEM_KEY;
 745         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 746         btrfs_free_path(path);
 747         return ret;
 748 }
 749
 750 /*
 751  * helper function to lookup reference count and flags of a tree block.
 752  *
 753  * the head node for delayed ref is used to store the sum of all the
 754  * reference count modifications queued up in the rbtree. the head
 755  * node may also store the extent flags to set. This way you can check
 756  * to see what the reference count and extent flags would be if all of
 757  * the delayed refs are not processed.
 758  */
 759 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 760                              struct btrfs_fs_info *fs_info, u64 bytenr,
 761                              u64 offset, int metadata, u64 *refs, u64 *flags)
 762 {
 763         struct btrfs_delayed_ref_head *head;
 764         struct btrfs_delayed_ref_root *delayed_refs;
 765         struct btrfs_path *path;
 766         struct btrfs_extent_item *ei;
 767         struct extent_buffer *leaf;
 768         struct btrfs_key key;
 769         u32 item_size;
 770         u64 num_refs;
 771         u64 extent_flags;
 772         int ret;
 773
 774         /*
 775          * If we don't have skinny metadata, don't bother doing anything
 776          * different
 777          */
 778         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
 779                 offset = fs_info->nodesize;
 780                 metadata = 0;
 781         }
 782
 783         path = btrfs_alloc_path();
 784         if (!path)
 785                 return -ENOMEM;
 786
 787         if (!trans) {
 788                 path->skip_locking = 1;
 789                 path->search_commit_root = 1;
 790         }
 791
 792 search_again:
 793         key.objectid = bytenr;
 794         key.offset = offset;
 795         if (metadata)
 796                 key.type = BTRFS_METADATA_ITEM_KEY;
 797         else
 798                 key.type = BTRFS_EXTENT_ITEM_KEY;
 799
 800         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
 801         if (ret < 0)
 802                 goto out_free;
 803
 804         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 805                 if (path->slots[0]) {
 806                         path->slots[0]--;
 807                         btrfs_item_key_to_cpu(path->nodes[0], &key,
 808                                               path->slots[0]);
 809                         if (key.objectid == bytenr &&
 810                             key.type == BTRFS_EXTENT_ITEM_KEY &&
 811                             key.offset == fs_info->nodesize)
 812                                 ret = 0;
 813                 }
 814         }
 815
 816         if (ret == 0) {
 817                 leaf = path->nodes[0];
 818                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 819                 if (item_size >= sizeof(*ei)) {
 820                         ei = btrfs_item_ptr(leaf, path->slots[0],
 821                                             struct btrfs_extent_item);
 822                         num_refs = btrfs_extent_refs(leaf, ei);
 823                         extent_flags = btrfs_extent_flags(leaf, ei);
 824                 } else {
 825                         ret = -EINVAL;
 826                         btrfs_print_v0_err(fs_info);
 827                         if (trans)
 828                                 btrfs_abort_transaction(trans, ret);
 829                         else
 830                                 btrfs_handle_fs_error(fs_info, ret, NULL);
 831
 832                         goto out_free;
 833                 }
 834
 835                 BUG_ON(num_refs == 0);
 836         } else {
 837                 num_refs = 0;
 838                 extent_flags = 0;
 839                 ret = 0;
 840         }
 841
 842         if (!trans)
 843                 goto out;
 844
 845         delayed_refs = &trans->transaction->delayed_refs;
 846         spin_lock(&delayed_refs->lock);
 847         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 848         if (head) {
 849                 if (!mutex_trylock(&head->mutex)) {
 850                         refcount_inc(&head->refs);
 851                         spin_unlock(&delayed_refs->lock);
 852
 853                         btrfs_release_path(path);
 854
 855                         /*
 856                          * Mutex was contended, block until it's released and try
 857                          * again
 858                          */
 859                         mutex_lock(&head->mutex);
 860                         mutex_unlock(&head->mutex);
 861                         btrfs_put_delayed_ref_head(head);
 862                         goto search_again;
 863                 }
 864                 spin_lock(&head->lock);
 865                 if (head->extent_op && head->extent_op->update_flags)
 866                         extent_flags |= head->extent_op->flags_to_set;
 867                 else
 868                         BUG_ON(num_refs == 0);
 869
 870                 num_refs += head->ref_mod;
 871                 spin_unlock(&head->lock);
 872                 mutex_unlock(&head->mutex);
 873         }
 874         spin_unlock(&delayed_refs->lock);
 875 out:
 876         WARN_ON(num_refs == 0);
 877         if (refs)
 878                 *refs = num_refs;
 879         if (flags)
 880                 *flags = extent_flags;
 881 out_free:
 882         btrfs_free_path(path);
 883         return ret;
 884 }
 885
 886 /*
 887  * Back reference rules.  Back refs have three main goals:
 888  *
 889  * 1) differentiate between all holders of references to an extent so that
 890  *    when a reference is dropped we can make sure it was a valid reference
 891  *    before freeing the extent.
 892  *
 893  * 2) Provide enough information to quickly find the holders of an extent
 894  *    if we notice a given block is corrupted or bad.
 895  *
 896  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 897  *    maintenance.  This is actually the same as #2, but with a slightly
 898  *    different use case.
 899  *
 900  * There are two kinds of back refs. The implicit back refs is optimized
 901  * for pointers in non-shared tree blocks. For a given pointer in a block,
 902  * back refs of this kind provide information about the block's owner tree
 903  * and the pointer's key. These information allow us to find the block by
 904  * b-tree searching. The full back refs is for pointers in tree blocks not
 905  * referenced by their owner trees. The location of tree block is recorded
 906  * in the back refs. Actually the full back refs is generic, and can be
 907  * used in all cases the implicit back refs is used. The major shortcoming
 908  * of the full back refs is its overhead. Every time a tree block gets
 909  * COWed, we have to update back refs entry for all pointers in it.
 910  *
 911  * For a newly allocated tree block, we use implicit back refs for
 912  * pointers in it. This means most tree related operations only involve
 913  * implicit back refs. For a tree block created in old transaction, the
 914  * only way to drop a reference to it is COW it. So we can detect the
 915  * event that tree block loses its owner tree's reference and do the
 916  * back refs conversion.
 917  *
 918  * When a tree block is COWed through a tree, there are four cases:
 919  *
 920  * The reference count of the block is one and the tree is the block's
 921  * owner tree. Nothing to do in this case.
 922  *
 923  * The reference count of the block is one and the tree is not the
 924  * block's owner tree. In this case, full back refs is used for pointers
 925  * in the block. Remove these full back refs, add implicit back refs for
 926  * every pointers in the new block.
 927  *
 928  * The reference count of the block is greater than one and the tree is
 929  * the block's owner tree. In this case, implicit back refs is used for
 930  * pointers in the block. Add full back refs for every pointers in the
 931  * block, increase lower level extents' reference counts. The original
 932  * implicit back refs are entailed to the new block.
 933  *
 934  * The reference count of the block is greater than one and the tree is
 935  * not the block's owner tree. Add implicit back refs for every pointer in
 936  * the new block, increase lower level extents' reference count.
 937  *
 938  * Back Reference Key composing:
 939  *
 940  * The key objectid corresponds to the first byte in the extent,
 941  * The key type is used to differentiate between types of back refs.
 942  * There are different meanings of the key offset for different types
 943  * of back refs.
 944  *
 945  * File extents can be referenced by:
 946  *
 947  * - multiple snapshots, subvolumes, or different generations in one subvol
 948  * - different files inside a single subvolume
 949  * - different offsets inside a file (bookend extents in file.c)
 950  *
 951  * The extent ref structure for the implicit back refs has fields for:
 952  *
 953  * - Objectid of the subvolume root
 954  * - objectid of the file holding the reference
 955  * - original offset in the file
 956  * - how many bookend extents
 957  *
 958  * The key offset for the implicit back refs is hash of the first
 959  * three fields.
 960  *
 961  * The extent ref structure for the full back refs has field for:
 962  *
 963  * - number of pointers in the tree leaf
 964  *
 965  * The key offset for the implicit back refs is the first byte of
 966  * the tree leaf
 967  *
 968  * When a file extent is allocated, The implicit back refs is used.
 969  * the fields are filled in:
 970  *
 971  *     (root_key.objectid, inode objectid, offset in file, 1)
 972  *
 973  * When a file extent is removed file truncation, we find the
 974  * corresponding implicit back refs and check the following fields:
 975  *
 976  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 977  *
 978  * Btree extents can be referenced by:
 979  *
 980  * - Different subvolumes
 981  *
 982  * Both the implicit back refs and the full back refs for tree blocks
 983  * only consist of key. The key offset for the implicit back refs is
 984  * objectid of block's owner tree. The key offset for the full back refs
 985  * is the first byte of parent block.
 986  *
 987  * When implicit back refs is used, information about the lowest key and
 988  * level of the tree block are required. These information are stored in
 989  * tree block info structure.
 990  */
 991
 992 /*
 993  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 994  * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
 995  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 996  */
 997 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 998                                      struct btrfs_extent_inline_ref *iref,
 999                                      enum btrfs_inline_ref_type is_data)
1000 {
1001         int type = btrfs_extent_inline_ref_type(eb, iref);
1002         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1003
1004         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1005             type == BTRFS_SHARED_BLOCK_REF_KEY ||
1006             type == BTRFS_SHARED_DATA_REF_KEY ||
1007             type == BTRFS_EXTENT_DATA_REF_KEY) {
1008                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1009                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
1010                                 return type;
1011                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1012                                 ASSERT(eb->fs_info);
1013                                 /*
1014                                  * Every shared one has parent tree
1015                                  * block, which must be aligned to
1016                                  * nodesize.
1017                                  */
1018                                 if (offset &&
1019                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1020                                         return type;
1021                         }
1022                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1023                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
1024                                 return type;
1025                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
1026                                 ASSERT(eb->fs_info);
1027                                 /*
1028                                  * Every shared one has parent tree
1029                                  * block, which must be aligned to
1030                                  * nodesize.
1031                                  */
1032                                 if (offset &&
1033                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1034                                         return type;
1035                         }
1036                 } else {
1037                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1038                         return type;
1039                 }
1040         }
1041
1042         btrfs_print_leaf((struct extent_buffer *)eb);
1043         btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1044                   eb->start, type);
1045         WARN_ON(1);
1046
1047         return BTRFS_REF_TYPE_INVALID;
1048 }
1049
1050 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1051 {
1052         u32 high_crc = ~(u32)0;
1053         u32 low_crc = ~(u32)0;
1054         __le64 lenum;
1055
1056         lenum = cpu_to_le64(root_objectid);
1057         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1058         lenum = cpu_to_le64(owner);
1059         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1060         lenum = cpu_to_le64(offset);
1061         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1062
1063         return ((u64)high_crc << 31) ^ (u64)low_crc;
1064 }
1065
1066 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1067                                      struct btrfs_extent_data_ref *ref)
1068 {
1069         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1070                                     btrfs_extent_data_ref_objectid(leaf, ref),
1071                                     btrfs_extent_data_ref_offset(leaf, ref));
1072 }
1073
1074 static int match_extent_data_ref(struct extent_buffer *leaf,
1075                                  struct btrfs_extent_data_ref *ref,
1076                                  u64 root_objectid, u64 owner, u64 offset)
1077 {
1078         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1079             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1080             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1081                 return 0;
1082         return 1;
1083 }
1084
1085 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1086                                            struct btrfs_path *path,
1087                                            u64 bytenr, u64 parent,
1088                                            u64 root_objectid,
1089                                            u64 owner, u64 offset)
1090 {
1091         struct btrfs_root *root = trans->fs_info->extent_root;
1092         struct btrfs_key key;
1093         struct btrfs_extent_data_ref *ref;
1094         struct extent_buffer *leaf;
1095         u32 nritems;
1096         int ret;
1097         int recow;
1098         int err = -ENOENT;
1099
1100         key.objectid = bytenr;
1101         if (parent) {
1102                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1103                 key.offset = parent;
1104         } else {
1105                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1106                 key.offset = hash_extent_data_ref(root_objectid,
1107                                                   owner, offset);
1108         }
1109 again:
1110         recow = 0;
1111         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1112         if (ret < 0) {
1113                 err = ret;
1114                 goto fail;
1115         }
1116
1117         if (parent) {
1118                 if (!ret)
1119                         return 0;
1120                 goto fail;
1121         }
1122
1123         leaf = path->nodes[0];
1124         nritems = btrfs_header_nritems(leaf);
1125         while (1) {
1126                 if (path->slots[0] >= nritems) {
1127                         ret = btrfs_next_leaf(root, path);
1128                         if (ret < 0)
1129                                 err = ret;
1130                         if (ret)
1131                                 goto fail;
1132
1133                         leaf = path->nodes[0];
1134                         nritems = btrfs_header_nritems(leaf);
1135                         recow = 1;
1136                 }
1137
1138                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1139                 if (key.objectid != bytenr ||
1140                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1141                         goto fail;
1142
1143                 ref = btrfs_item_ptr(leaf, path->slots[0],
1144                                      struct btrfs_extent_data_ref);
1145
1146                 if (match_extent_data_ref(leaf, ref, root_objectid,
1147                                           owner, offset)) {
1148                         if (recow) {
1149                                 btrfs_release_path(path);
1150                                 goto again;
1151                         }
1152                         err = 0;
1153                         break;
1154                 }
1155                 path->slots[0]++;
1156         }
1157 fail:
1158         return err;
1159 }
1160
1161 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1162                                            struct btrfs_path *path,
1163                                            u64 bytenr, u64 parent,
1164                                            u64 root_objectid, u64 owner,
1165                                            u64 offset, int refs_to_add)
1166 {
1167         struct btrfs_root *root = trans->fs_info->extent_root;
1168         struct btrfs_key key;
1169         struct extent_buffer *leaf;
1170         u32 size;
1171         u32 num_refs;
1172         int ret;
1173
1174         key.objectid = bytenr;
1175         if (parent) {
1176                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1177                 key.offset = parent;
1178                 size = sizeof(struct btrfs_shared_data_ref);
1179         } else {
1180                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1181                 key.offset = hash_extent_data_ref(root_objectid,
1182                                                   owner, offset);
1183                 size = sizeof(struct btrfs_extent_data_ref);
1184         }
1185
1186         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1187         if (ret && ret != -EEXIST)
1188                 goto fail;
1189
1190         leaf = path->nodes[0];
1191         if (parent) {
1192                 struct btrfs_shared_data_ref *ref;
1193                 ref = btrfs_item_ptr(leaf, path->slots[0],
1194                                      struct btrfs_shared_data_ref);
1195                 if (ret == 0) {
1196                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1197                 } else {
1198                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1199                         num_refs += refs_to_add;
1200                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1201                 }
1202         } else {
1203                 struct btrfs_extent_data_ref *ref;
1204                 while (ret == -EEXIST) {
1205                         ref = btrfs_item_ptr(leaf, path->slots[0],
1206                                              struct btrfs_extent_data_ref);
1207                         if (match_extent_data_ref(leaf, ref, root_objectid,
1208                                                   owner, offset))
1209                                 break;
1210                         btrfs_release_path(path);
1211                         key.offset++;
1212                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1213                                                       size);
1214                         if (ret && ret != -EEXIST)
1215                                 goto fail;
1216
1217                         leaf = path->nodes[0];
1218                 }
1219                 ref = btrfs_item_ptr(leaf, path->slots[0],
1220                                      struct btrfs_extent_data_ref);
1221                 if (ret == 0) {
1222                         btrfs_set_extent_data_ref_root(leaf, ref,
1223                                                        root_objectid);
1224                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1225                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1226                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1227                 } else {
1228                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1229                         num_refs += refs_to_add;
1230                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1231                 }
1232         }
1233         btrfs_mark_buffer_dirty(leaf);
1234         ret = 0;
1235 fail:
1236         btrfs_release_path(path);
1237         return ret;
1238 }
1239
1240 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1241                                            struct btrfs_path *path,
1242                                            int refs_to_drop, int *last_ref)
1243 {
1244         struct btrfs_key key;
1245         struct btrfs_extent_data_ref *ref1 = NULL;
1246         struct btrfs_shared_data_ref *ref2 = NULL;
1247         struct extent_buffer *leaf;
1248         u32 num_refs = 0;
1249         int ret = 0;
1250
1251         leaf = path->nodes[0];
1252         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1253
1254         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1255                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1256                                       struct btrfs_extent_data_ref);
1257                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1258         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1259                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1260                                       struct btrfs_shared_data_ref);
1261                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1262         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1263                 btrfs_print_v0_err(trans->fs_info);
1264                 btrfs_abort_transaction(trans, -EINVAL);
1265                 return -EINVAL;
1266         } else {
1267                 BUG();
1268         }
1269
1270         BUG_ON(num_refs < refs_to_drop);
1271         num_refs -= refs_to_drop;
1272
1273         if (num_refs == 0) {
1274                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1275                 *last_ref = 1;
1276         } else {
1277                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1278                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1279                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1280                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1281                 btrfs_mark_buffer_dirty(leaf);
1282         }
1283         return ret;
1284 }
1285
1286 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1287                                           struct btrfs_extent_inline_ref *iref)
1288 {
1289         struct btrfs_key key;
1290         struct extent_buffer *leaf;
1291         struct btrfs_extent_data_ref *ref1;
1292         struct btrfs_shared_data_ref *ref2;
1293         u32 num_refs = 0;
1294         int type;
1295
1296         leaf = path->nodes[0];
1297         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1298
1299         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1300         if (iref) {
1301                 /*
1302                  * If type is invalid, we should have bailed out earlier than
1303                  * this call.
1304                  */
1305                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1306                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1307                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1308                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1309                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1310                 } else {
1311                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1312                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1313                 }
1314         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1315                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1316                                       struct btrfs_extent_data_ref);
1317                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1318         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1319                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1320                                       struct btrfs_shared_data_ref);
1321                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1322         } else {
1323                 WARN_ON(1);
1324         }
1325         return num_refs;
1326 }
1327
1328 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1329                                           struct btrfs_path *path,
1330                                           u64 bytenr, u64 parent,
1331                                           u64 root_objectid)
1332 {
1333         struct btrfs_root *root = trans->fs_info->extent_root;
1334         struct btrfs_key key;
1335         int ret;
1336
1337         key.objectid = bytenr;
1338         if (parent) {
1339                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1340                 key.offset = parent;
1341         } else {
1342                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1343                 key.offset = root_objectid;
1344         }
1345
1346         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1347         if (ret > 0)
1348                 ret = -ENOENT;
1349         return ret;
1350 }
1351
1352 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1353                                           struct btrfs_path *path,
1354                                           u64 bytenr, u64 parent,
1355                                           u64 root_objectid)
1356 {
1357         struct btrfs_key key;
1358         int ret;
1359
1360         key.objectid = bytenr;
1361         if (parent) {
1362                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1363                 key.offset = parent;
1364         } else {
1365                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1366                 key.offset = root_objectid;
1367         }
1368
1369         ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1370                                       path, &key, 0);
1371         btrfs_release_path(path);
1372         return ret;
1373 }
1374
1375 static inline int extent_ref_type(u64 parent, u64 owner)
1376 {
1377         int type;
1378         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1379                 if (parent > 0)
1380                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1381                 else
1382                         type = BTRFS_TREE_BLOCK_REF_KEY;
1383         } else {
1384                 if (parent > 0)
1385                         type = BTRFS_SHARED_DATA_REF_KEY;
1386                 else
1387                         type = BTRFS_EXTENT_DATA_REF_KEY;
1388         }
1389         return type;
1390 }
1391
1392 static int find_next_key(struct btrfs_path *path, int level,
1393                          struct btrfs_key *key)
1394
1395 {
1396         for (; level < BTRFS_MAX_LEVEL; level++) {
1397                 if (!path->nodes[level])
1398                         break;
1399                 if (path->slots[level] + 1 >=
1400                     btrfs_header_nritems(path->nodes[level]))
1401                         continue;
1402                 if (level == 0)
1403                         btrfs_item_key_to_cpu(path->nodes[level], key,
1404                                               path->slots[level] + 1);
1405                 else
1406                         btrfs_node_key_to_cpu(path->nodes[level], key,
1407                                               path->slots[level] + 1);
1408                 return 0;
1409         }
1410         return 1;
1411 }
1412
1413 /*
1414  * look for inline back ref. if back ref is found, *ref_ret is set
1415  * to the address of inline back ref, and 0 is returned.
1416  *
1417  * if back ref isn't found, *ref_ret is set to the address where it
1418  * should be inserted, and -ENOENT is returned.
1419  *
1420  * if insert is true and there are too many inline back refs, the path
1421  * points to the extent item, and -EAGAIN is returned.
1422  *
1423  * NOTE: inline back refs are ordered in the same way that back ref
1424  *       items in the tree are ordered.
1425  */
1426 static noinline_for_stack
1427 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1428                                  struct btrfs_path *path,
1429                                  struct btrfs_extent_inline_ref **ref_ret,
1430                                  u64 bytenr, u64 num_bytes,
1431                                  u64 parent, u64 root_objectid,
1432                                  u64 owner, u64 offset, int insert)
1433 {
1434         struct btrfs_fs_info *fs_info = trans->fs_info;
1435         struct btrfs_root *root = fs_info->extent_root;
1436         struct btrfs_key key;
1437         struct extent_buffer *leaf;
1438         struct btrfs_extent_item *ei;
1439         struct btrfs_extent_inline_ref *iref;
1440         u64 flags;
1441         u64 item_size;
1442         unsigned long ptr;
1443         unsigned long end;
1444         int extra_size;
1445         int type;
1446         int want;
1447         int ret;
1448         int err = 0;
1449         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1450         int needed;
1451
1452         key.objectid = bytenr;
1453         key.type = BTRFS_EXTENT_ITEM_KEY;
1454         key.offset = num_bytes;
1455
1456         want = extent_ref_type(parent, owner);
1457         if (insert) {
1458                 extra_size = btrfs_extent_inline_ref_size(want);
1459                 path->keep_locks = 1;
1460         } else
1461                 extra_size = -1;
1462
1463         /*
1464          * Owner is our level, so we can just add one to get the level for the
1465          * block we are interested in.
1466          */
1467         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1468                 key.type = BTRFS_METADATA_ITEM_KEY;
1469                 key.offset = owner;
1470         }
1471
1472 again:
1473         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1474         if (ret < 0) {
1475                 err = ret;
1476                 goto out;
1477         }
1478
1479         /*
1480          * We may be a newly converted file system which still has the old fat
1481          * extent entries for metadata, so try and see if we have one of those.
1482          */
1483         if (ret > 0 && skinny_metadata) {
1484                 skinny_metadata = false;
1485                 if (path->slots[0]) {
1486                         path->slots[0]--;
1487                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1488                                               path->slots[0]);
1489                         if (key.objectid == bytenr &&
1490                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1491                             key.offset == num_bytes)
1492                                 ret = 0;
1493                 }
1494                 if (ret) {
1495                         key.objectid = bytenr;
1496                         key.type = BTRFS_EXTENT_ITEM_KEY;
1497                         key.offset = num_bytes;
1498                         btrfs_release_path(path);
1499                         goto again;
1500                 }
1501         }
1502
1503         if (ret && !insert) {
1504                 err = -ENOENT;
1505                 goto out;
1506         } else if (WARN_ON(ret)) {
1507                 err = -EIO;
1508                 goto out;
1509         }
1510
1511         leaf = path->nodes[0];
1512         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1513         if (unlikely(item_size < sizeof(*ei))) {
1514                 err = -EINVAL;
1515                 btrfs_print_v0_err(fs_info);
1516                 btrfs_abort_transaction(trans, err);
1517                 goto out;
1518         }
1519
1520         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1521         flags = btrfs_extent_flags(leaf, ei);
1522
1523         ptr = (unsigned long)(ei + 1);
1524         end = (unsigned long)ei + item_size;
1525
1526         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1527                 ptr += sizeof(struct btrfs_tree_block_info);
1528                 BUG_ON(ptr > end);
1529         }
1530
1531         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1532                 needed = BTRFS_REF_TYPE_DATA;
1533         else
1534                 needed = BTRFS_REF_TYPE_BLOCK;
1535
1536         err = -ENOENT;
1537         while (1) {
1538                 if (ptr >= end) {
1539                         WARN_ON(ptr > end);
1540                         break;
1541                 }
1542                 iref = (struct btrfs_extent_inline_ref *)ptr;
1543                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1544                 if (type == BTRFS_REF_TYPE_INVALID) {
1545                         err = -EUCLEAN;
1546                         goto out;
1547                 }
1548
1549                 if (want < type)
1550                         break;
1551                 if (want > type) {
1552                         ptr += btrfs_extent_inline_ref_size(type);
1553                         continue;
1554                 }
1555
1556                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1557                         struct btrfs_extent_data_ref *dref;
1558                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1559                         if (match_extent_data_ref(leaf, dref, root_objectid,
1560                                                   owner, offset)) {
1561                                 err = 0;
1562                                 break;
1563                         }
1564                         if (hash_extent_data_ref_item(leaf, dref) <
1565                             hash_extent_data_ref(root_objectid, owner, offset))
1566                                 break;
1567                 } else {
1568                         u64 ref_offset;
1569                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1570                         if (parent > 0) {
1571                                 if (parent == ref_offset) {
1572                                         err = 0;
1573                                         break;
1574                                 }
1575                                 if (ref_offset < parent)
1576                                         break;
1577                         } else {
1578                                 if (root_objectid == ref_offset) {
1579                                         err = 0;
1580                                         break;
1581                                 }
1582                                 if (ref_offset < root_objectid)
1583                                         break;
1584                         }
1585                 }
1586                 ptr += btrfs_extent_inline_ref_size(type);
1587         }
1588         if (err == -ENOENT && insert) {
1589                 if (item_size + extra_size >=
1590                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1591                         err = -EAGAIN;
1592                         goto out;
1593                 }
1594                 /*
1595                  * To add new inline back ref, we have to make sure
1596                  * there is no corresponding back ref item.
1597                  * For simplicity, we just do not add new inline back
1598                  * ref if there is any kind of item for this block
1599                  */
1600                 if (find_next_key(path, 0, &key) == 0 &&
1601                     key.objectid == bytenr &&
1602                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1603                         err = -EAGAIN;
1604                         goto out;
1605                 }
1606         }
1607         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1608 out:
1609         if (insert) {
1610                 path->keep_locks = 0;
1611                 btrfs_unlock_up_safe(path, 1);
1612         }
1613         return err;
1614 }
1615
1616 /*
1617  * helper to add new inline back ref
1618  */
1619 static noinline_for_stack
1620 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1621                                  struct btrfs_path *path,
1622                                  struct btrfs_extent_inline_ref *iref,
1623                                  u64 parent, u64 root_objectid,
1624                                  u64 owner, u64 offset, int refs_to_add,
1625                                  struct btrfs_delayed_extent_op *extent_op)
1626 {
1627         struct extent_buffer *leaf;
1628         struct btrfs_extent_item *ei;
1629         unsigned long ptr;
1630         unsigned long end;
1631         unsigned long item_offset;
1632         u64 refs;
1633         int size;
1634         int type;
1635
1636         leaf = path->nodes[0];
1637         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1638         item_offset = (unsigned long)iref - (unsigned long)ei;
1639
1640         type = extent_ref_type(parent, owner);
1641         size = btrfs_extent_inline_ref_size(type);
1642
1643         btrfs_extend_item(path, size);
1644
1645         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1646         refs = btrfs_extent_refs(leaf, ei);
1647         refs += refs_to_add;
1648         btrfs_set_extent_refs(leaf, ei, refs);
1649         if (extent_op)
1650                 __run_delayed_extent_op(extent_op, leaf, ei);
1651
1652         ptr = (unsigned long)ei + item_offset;
1653         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1654         if (ptr < end - size)
1655                 memmove_extent_buffer(leaf, ptr + size, ptr,
1656                                       end - size - ptr);
1657
1658         iref = (struct btrfs_extent_inline_ref *)ptr;
1659         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1660         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1661                 struct btrfs_extent_data_ref *dref;
1662                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1663                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1664                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1665                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1666                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1667         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1668                 struct btrfs_shared_data_ref *sref;
1669                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1670                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1671                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1672         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1673                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1674         } else {
1675                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1676         }
1677         btrfs_mark_buffer_dirty(leaf);
1678 }
1679
1680 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1681                                  struct btrfs_path *path,
1682                                  struct btrfs_extent_inline_ref **ref_ret,
1683                                  u64 bytenr, u64 num_bytes, u64 parent,
1684                                  u64 root_objectid, u64 owner, u64 offset)
1685 {
1686         int ret;
1687
1688         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1689                                            num_bytes, parent, root_objectid,
1690                                            owner, offset, 0);
1691         if (ret != -ENOENT)
1692                 return ret;
1693
1694         btrfs_release_path(path);
1695         *ref_ret = NULL;
1696
1697         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1698                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1699                                             root_objectid);
1700         } else {
1701                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1702                                              root_objectid, owner, offset);
1703         }
1704         return ret;
1705 }
1706
1707 /*
1708  * helper to update/remove inline back ref
1709  */
1710 static noinline_for_stack
1711 void update_inline_extent_backref(struct btrfs_path *path,
1712                                   struct btrfs_extent_inline_ref *iref,
1713                                   int refs_to_mod,
1714                                   struct btrfs_delayed_extent_op *extent_op,
1715                                   int *last_ref)
1716 {
1717         struct extent_buffer *leaf = path->nodes[0];
1718         struct btrfs_extent_item *ei;
1719         struct btrfs_extent_data_ref *dref = NULL;
1720         struct btrfs_shared_data_ref *sref = NULL;
1721         unsigned long ptr;
1722         unsigned long end;
1723         u32 item_size;
1724         int size;
1725         int type;
1726         u64 refs;
1727
1728         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1729         refs = btrfs_extent_refs(leaf, ei);
1730         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1731         refs += refs_to_mod;
1732         btrfs_set_extent_refs(leaf, ei, refs);
1733         if (extent_op)
1734                 __run_delayed_extent_op(extent_op, leaf, ei);
1735
1736         /*
1737          * If type is invalid, we should have bailed out after
1738          * lookup_inline_extent_backref().
1739          */
1740         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1741         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1742
1743         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1744                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1745                 refs = btrfs_extent_data_ref_count(leaf, dref);
1746         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1747                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1748                 refs = btrfs_shared_data_ref_count(leaf, sref);
1749         } else {
1750                 refs = 1;
1751                 BUG_ON(refs_to_mod != -1);
1752         }
1753
1754         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1755         refs += refs_to_mod;
1756
1757         if (refs > 0) {
1758                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1759                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1760                 else
1761                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1762         } else {
1763                 *last_ref = 1;
1764                 size =  btrfs_extent_inline_ref_size(type);
1765                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1766                 ptr = (unsigned long)iref;
1767                 end = (unsigned long)ei + item_size;
1768                 if (ptr + size < end)
1769                         memmove_extent_buffer(leaf, ptr, ptr + size,
1770                                               end - ptr - size);
1771                 item_size -= size;
1772                 btrfs_truncate_item(path, item_size, 1);
1773         }
1774         btrfs_mark_buffer_dirty(leaf);
1775 }
1776
1777 static noinline_for_stack
1778 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1779                                  struct btrfs_path *path,
1780                                  u64 bytenr, u64 num_bytes, u64 parent,
1781                                  u64 root_objectid, u64 owner,
1782                                  u64 offset, int refs_to_add,
1783                                  struct btrfs_delayed_extent_op *extent_op)
1784 {
1785         struct btrfs_extent_inline_ref *iref;
1786         int ret;
1787
1788         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1789                                            num_bytes, parent, root_objectid,
1790                                            owner, offset, 1);
1791         if (ret == 0) {
1792                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1793                 update_inline_extent_backref(path, iref, refs_to_add,
1794                                              extent_op, NULL);
1795         } else if (ret == -ENOENT) {
1796                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1797                                             root_objectid, owner, offset,
1798                                             refs_to_add, extent_op);
1799                 ret = 0;
1800         }
1801         return ret;
1802 }
1803
1804 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1805                                  struct btrfs_path *path,
1806                                  u64 bytenr, u64 parent, u64 root_objectid,
1807                                  u64 owner, u64 offset, int refs_to_add)
1808 {
1809         int ret;
1810         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1811                 BUG_ON(refs_to_add != 1);
1812                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1813                                             root_objectid);
1814         } else {
1815                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1816                                              root_objectid, owner, offset,
1817                                              refs_to_add);
1818         }
1819         return ret;
1820 }
1821
1822 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1823                                  struct btrfs_path *path,
1824                                  struct btrfs_extent_inline_ref *iref,
1825                                  int refs_to_drop, int is_data, int *last_ref)
1826 {
1827         int ret = 0;
1828
1829         BUG_ON(!is_data && refs_to_drop != 1);
1830         if (iref) {
1831                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1832                                              last_ref);
1833         } else if (is_data) {
1834                 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1835                                              last_ref);
1836         } else {
1837                 *last_ref = 1;
1838                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1839         }
1840         return ret;
1841 }
1842
1843 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1844                                u64 *discarded_bytes)
1845 {
1846         int j, ret = 0;
1847         u64 bytes_left, end;
1848         u64 aligned_start = ALIGN(start, 1 << 9);
1849
1850         if (WARN_ON(start != aligned_start)) {
1851                 len -= aligned_start - start;
1852                 len = round_down(len, 1 << 9);
1853                 start = aligned_start;
1854         }
1855
1856         *discarded_bytes = 0;
1857
1858         if (!len)
1859                 return 0;
1860
1861         end = start + len;
1862         bytes_left = len;
1863
1864         /* Skip any superblocks on this device. */
1865         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1866                 u64 sb_start = btrfs_sb_offset(j);
1867                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1868                 u64 size = sb_start - start;
1869
1870                 if (!in_range(sb_start, start, bytes_left) &&
1871                     !in_range(sb_end, start, bytes_left) &&
1872                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1873                         continue;
1874
1875                 /*
1876                  * Superblock spans beginning of range.  Adjust start and
1877                  * try again.
1878                  */
1879                 if (sb_start <= start) {
1880                         start += sb_end - start;
1881                         if (start > end) {
1882                                 bytes_left = 0;
1883                                 break;
1884                         }
1885                         bytes_left = end - start;
1886                         continue;
1887                 }
1888
1889                 if (size) {
1890                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1891                                                    GFP_NOFS, 0);
1892                         if (!ret)
1893                                 *discarded_bytes += size;
1894                         else if (ret != -EOPNOTSUPP)
1895                                 return ret;
1896                 }
1897
1898                 start = sb_end;
1899                 if (start > end) {
1900                         bytes_left = 0;
1901                         break;
1902                 }
1903                 bytes_left = end - start;
1904         }
1905
1906         if (bytes_left) {
1907                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1908                                            GFP_NOFS, 0);
1909                 if (!ret)
1910                         *discarded_bytes += bytes_left;
1911         }
1912         return ret;
1913 }
1914
1915 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1916                          u64 num_bytes, u64 *actual_bytes)
1917 {
1918         int ret;
1919         u64 discarded_bytes = 0;
1920         struct btrfs_bio *bbio = NULL;
1921
1922
1923         /*
1924          * Avoid races with device replace and make sure our bbio has devices
1925          * associated to its stripes that don't go away while we are discarding.
1926          */
1927         btrfs_bio_counter_inc_blocked(fs_info);
1928         /* Tell the block device(s) that the sectors can be discarded */
1929         ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1930                               &bbio, 0);
1931         /* Error condition is -ENOMEM */
1932         if (!ret) {
1933                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1934                 int i;
1935
1936
1937                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1938                         u64 bytes;
1939                         struct request_queue *req_q;
1940
1941                         if (!stripe->dev->bdev) {
1942                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1943                                 continue;
1944                         }
1945                         req_q = bdev_get_queue(stripe->dev->bdev);
1946                         if (!blk_queue_discard(req_q))
1947                                 continue;
1948
1949                         ret = btrfs_issue_discard(stripe->dev->bdev,
1950                                                   stripe->physical,
1951                                                   stripe->length,
1952                                                   &bytes);
1953                         if (!ret)
1954                                 discarded_bytes += bytes;
1955                         else if (ret != -EOPNOTSUPP)
1956                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1957
1958                         /*
1959                          * Just in case we get back EOPNOTSUPP for some reason,
1960                          * just ignore the return value so we don't screw up
1961                          * people calling discard_extent.
1962                          */
1963                         ret = 0;
1964                 }
1965                 btrfs_put_bbio(bbio);
1966         }
1967         btrfs_bio_counter_dec(fs_info);
1968
1969         if (actual_bytes)
1970                 *actual_bytes = discarded_bytes;
1971
1972
1973         if (ret == -EOPNOTSUPP)
1974                 ret = 0;
1975         return ret;
1976 }
1977
1978 /* Can return -ENOMEM */
1979 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1980                          struct btrfs_ref *generic_ref)
1981 {
1982         struct btrfs_fs_info *fs_info = trans->fs_info;
1983         int old_ref_mod, new_ref_mod;
1984         int ret;
1985
1986         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1987                generic_ref->action);
1988         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1989                generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
1990
1991         if (generic_ref->type == BTRFS_REF_METADATA)
1992                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
1993                                 NULL, &old_ref_mod, &new_ref_mod);
1994         else
1995                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
1996                                                  &old_ref_mod, &new_ref_mod);
1997
1998         btrfs_ref_tree_mod(fs_info, generic_ref);
1999
2000         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2001                 sub_pinned_bytes(fs_info, generic_ref);
2002
2003         return ret;
2004 }
2005
2006 /*
2007  * __btrfs_inc_extent_ref - insert backreference for a given extent
2008  *
2009  * @trans:          Handle of transaction
2010  *
2011  * @node:           The delayed ref node used to get the bytenr/length for
2012  *                  extent whose references are incremented.
2013  *
2014  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2015  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2016  *                  bytenr of the parent block. Since new extents are always
2017  *                  created with indirect references, this will only be the case
2018  *                  when relocating a shared extent. In that case, root_objectid
2019  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
2020  *                  be 0
2021  *
2022  * @root_objectid:  The id of the root where this modification has originated,
2023  *                  this can be either one of the well-known metadata trees or
2024  *                  the subvolume id which references this extent.
2025  *
2026  * @owner:          For data extents it is the inode number of the owning file.
2027  *                  For metadata extents this parameter holds the level in the
2028  *                  tree of the extent.
2029  *
2030  * @offset:         For metadata extents the offset is ignored and is currently
2031  *                  always passed as 0. For data extents it is the fileoffset
2032  *                  this extent belongs to.
2033  *
2034  * @refs_to_add     Number of references to add
2035  *
2036  * @extent_op       Pointer to a structure, holding information necessary when
2037  *                  updating a tree block's flags
2038  *
2039  */
2040 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2041                                   struct btrfs_delayed_ref_node *node,
2042                                   u64 parent, u64 root_objectid,
2043                                   u64 owner, u64 offset, int refs_to_add,
2044                                   struct btrfs_delayed_extent_op *extent_op)
2045 {
2046         struct btrfs_path *path;
2047         struct extent_buffer *leaf;
2048         struct btrfs_extent_item *item;
2049         struct btrfs_key key;
2050         u64 bytenr = node->bytenr;
2051         u64 num_bytes = node->num_bytes;
2052         u64 refs;
2053         int ret;
2054
2055         path = btrfs_alloc_path();
2056         if (!path)
2057                 return -ENOMEM;
2058
2059         path->reada = READA_FORWARD;
2060         path->leave_spinning = 1;
2061         /* this will setup the path even if it fails to insert the back ref */
2062         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2063                                            parent, root_objectid, owner,
2064                                            offset, refs_to_add, extent_op);
2065         if ((ret < 0 && ret != -EAGAIN) || !ret)
2066                 goto out;
2067
2068         /*
2069          * Ok we had -EAGAIN which means we didn't have space to insert and
2070          * inline extent ref, so just update the reference count and add a
2071          * normal backref.
2072          */
2073         leaf = path->nodes[0];
2074         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2075         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2076         refs = btrfs_extent_refs(leaf, item);
2077         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2078         if (extent_op)
2079                 __run_delayed_extent_op(extent_op, leaf, item);
2080
2081         btrfs_mark_buffer_dirty(leaf);
2082         btrfs_release_path(path);
2083
2084         path->reada = READA_FORWARD;
2085         path->leave_spinning = 1;
2086         /* now insert the actual backref */
2087         ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2088                                     owner, offset, refs_to_add);
2089         if (ret)
2090                 btrfs_abort_transaction(trans, ret);
2091 out:
2092         btrfs_free_path(path);
2093         return ret;
2094 }
2095
2096 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2097                                 struct btrfs_delayed_ref_node *node,
2098                                 struct btrfs_delayed_extent_op *extent_op,
2099                                 int insert_reserved)
2100 {
2101         int ret = 0;
2102         struct btrfs_delayed_data_ref *ref;
2103         struct btrfs_key ins;
2104         u64 parent = 0;
2105         u64 ref_root = 0;
2106         u64 flags = 0;
2107
2108         ins.objectid = node->bytenr;
2109         ins.offset = node->num_bytes;
2110         ins.type = BTRFS_EXTENT_ITEM_KEY;
2111
2112         ref = btrfs_delayed_node_to_data_ref(node);
2113         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2114
2115         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2116                 parent = ref->parent;
2117         ref_root = ref->root;
2118
2119         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2120                 if (extent_op)
2121                         flags |= extent_op->flags_to_set;
2122                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2123                                                  flags, ref->objectid,
2124                                                  ref->offset, &ins,
2125                                                  node->ref_mod);
2126         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2127                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2128                                              ref->objectid, ref->offset,
2129                                              node->ref_mod, extent_op);
2130         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2131                 ret = __btrfs_free_extent(trans, node, parent,
2132                                           ref_root, ref->objectid,
2133                                           ref->offset, node->ref_mod,
2134                                           extent_op);
2135         } else {
2136                 BUG();
2137         }
2138         return ret;
2139 }
2140
2141 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2142                                     struct extent_buffer *leaf,
2143                                     struct btrfs_extent_item *ei)
2144 {
2145         u64 flags = btrfs_extent_flags(leaf, ei);
2146         if (extent_op->update_flags) {
2147                 flags |= extent_op->flags_to_set;
2148                 btrfs_set_extent_flags(leaf, ei, flags);
2149         }
2150
2151         if (extent_op->update_key) {
2152                 struct btrfs_tree_block_info *bi;
2153                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2154                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2155                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2156         }
2157 }
2158
2159 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2160                                  struct btrfs_delayed_ref_head *head,
2161                                  struct btrfs_delayed_extent_op *extent_op)
2162 {
2163         struct btrfs_fs_info *fs_info = trans->fs_info;
2164         struct btrfs_key key;
2165         struct btrfs_path *path;
2166         struct btrfs_extent_item *ei;
2167         struct extent_buffer *leaf;
2168         u32 item_size;
2169         int ret;
2170         int err = 0;
2171         int metadata = !extent_op->is_data;
2172
2173         if (trans->aborted)
2174                 return 0;
2175
2176         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2177                 metadata = 0;
2178
2179         path = btrfs_alloc_path();
2180         if (!path)
2181                 return -ENOMEM;
2182
2183         key.objectid = head->bytenr;
2184
2185         if (metadata) {
2186                 key.type = BTRFS_METADATA_ITEM_KEY;
2187                 key.offset = extent_op->level;
2188         } else {
2189                 key.type = BTRFS_EXTENT_ITEM_KEY;
2190                 key.offset = head->num_bytes;
2191         }
2192
2193 again:
2194         path->reada = READA_FORWARD;
2195         path->leave_spinning = 1;
2196         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2197         if (ret < 0) {
2198                 err = ret;
2199                 goto out;
2200         }
2201         if (ret > 0) {
2202                 if (metadata) {
2203                         if (path->slots[0] > 0) {
2204                                 path->slots[0]--;
2205                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2206                                                       path->slots[0]);
2207                                 if (key.objectid == head->bytenr &&
2208                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2209                                     key.offset == head->num_bytes)
2210                                         ret = 0;
2211                         }
2212                         if (ret > 0) {
2213                                 btrfs_release_path(path);
2214                                 metadata = 0;
2215
2216                                 key.objectid = head->bytenr;
2217                                 key.offset = head->num_bytes;
2218                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2219                                 goto again;
2220                         }
2221                 } else {
2222                         err = -EIO;
2223                         goto out;
2224                 }
2225         }
2226
2227         leaf = path->nodes[0];
2228         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2229
2230         if (unlikely(item_size < sizeof(*ei))) {
2231                 err = -EINVAL;
2232                 btrfs_print_v0_err(fs_info);
2233                 btrfs_abort_transaction(trans, err);
2234                 goto out;
2235         }
2236
2237         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2238         __run_delayed_extent_op(extent_op, leaf, ei);
2239
2240         btrfs_mark_buffer_dirty(leaf);
2241 out:
2242         btrfs_free_path(path);
2243         return err;
2244 }
2245
2246 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2247                                 struct btrfs_delayed_ref_node *node,
2248                                 struct btrfs_delayed_extent_op *extent_op,
2249                                 int insert_reserved)
2250 {
2251         int ret = 0;
2252         struct btrfs_delayed_tree_ref *ref;
2253         u64 parent = 0;
2254         u64 ref_root = 0;
2255
2256         ref = btrfs_delayed_node_to_tree_ref(node);
2257         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2258
2259         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2260                 parent = ref->parent;
2261         ref_root = ref->root;
2262
2263         if (node->ref_mod != 1) {
2264                 btrfs_err(trans->fs_info,
2265         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2266                           node->bytenr, node->ref_mod, node->action, ref_root,
2267                           parent);
2268                 return -EIO;
2269         }
2270         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2271                 BUG_ON(!extent_op || !extent_op->update_flags);
2272                 ret = alloc_reserved_tree_block(trans, node, extent_op);
2273         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2274                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2275                                              ref->level, 0, 1, extent_op);
2276         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2277                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2278                                           ref->level, 0, 1, extent_op);
2279         } else {
2280                 BUG();
2281         }
2282         return ret;
2283 }
2284
2285 /* helper function to actually process a single delayed ref entry */
2286 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2287                                struct btrfs_delayed_ref_node *node,
2288                                struct btrfs_delayed_extent_op *extent_op,
2289                                int insert_reserved)
2290 {
2291         int ret = 0;
2292
2293         if (trans->aborted) {
2294                 if (insert_reserved)
2295                         btrfs_pin_extent(trans->fs_info, node->bytenr,
2296                                          node->num_bytes, 1);
2297                 return 0;
2298         }
2299
2300         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2301             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2302                 ret = run_delayed_tree_ref(trans, node, extent_op,
2303                                            insert_reserved);
2304         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2305                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2306                 ret = run_delayed_data_ref(trans, node, extent_op,
2307                                            insert_reserved);
2308         else
2309                 BUG();
2310         if (ret && insert_reserved)
2311                 btrfs_pin_extent(trans->fs_info, node->bytenr,
2312                                  node->num_bytes, 1);
2313         return ret;
2314 }
2315
2316 static inline struct btrfs_delayed_ref_node *
2317 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2318 {
2319         struct btrfs_delayed_ref_node *ref;
2320
2321         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2322                 return NULL;
2323
2324         /*
2325          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2326          * This is to prevent a ref count from going down to zero, which deletes
2327          * the extent item from the extent tree, when there still are references
2328          * to add, which would fail because they would not find the extent item.
2329          */
2330         if (!list_empty(&head->ref_add_list))
2331                 return list_first_entry(&head->ref_add_list,
2332                                 struct btrfs_delayed_ref_node, add_list);
2333
2334         ref = rb_entry(rb_first_cached(&head->ref_tree),
2335                        struct btrfs_delayed_ref_node, ref_node);
2336         ASSERT(list_empty(&ref->add_list));
2337         return ref;
2338 }
2339
2340 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2341                                       struct btrfs_delayed_ref_head *head)
2342 {
2343         spin_lock(&delayed_refs->lock);
2344         head->processing = 0;
2345         delayed_refs->num_heads_ready++;
2346         spin_unlock(&delayed_refs->lock);
2347         btrfs_delayed_ref_unlock(head);
2348 }
2349
2350 static struct btrfs_delayed_extent_op *cleanup_extent_op(
2351                                 struct btrfs_delayed_ref_head *head)
2352 {
2353         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2354
2355         if (!extent_op)
2356                 return NULL;
2357
2358         if (head->must_insert_reserved) {
2359                 head->extent_op = NULL;
2360                 btrfs_free_delayed_extent_op(extent_op);
2361                 return NULL;
2362         }
2363         return extent_op;
2364 }
2365
2366 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2367                                      struct btrfs_delayed_ref_head *head)
2368 {
2369         struct btrfs_delayed_extent_op *extent_op;
2370         int ret;
2371
2372         extent_op = cleanup_extent_op(head);
2373         if (!extent_op)
2374                 return 0;
2375         head->extent_op = NULL;
2376         spin_unlock(&head->lock);
2377         ret = run_delayed_extent_op(trans, head, extent_op);
2378         btrfs_free_delayed_extent_op(extent_op);
2379         return ret ? ret : 1;
2380 }
2381
2382 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2383                                   struct btrfs_delayed_ref_root *delayed_refs,
2384                                   struct btrfs_delayed_ref_head *head)
2385 {
2386         int nr_items = 1;       /* Dropping this ref head update. */
2387
2388         if (head->total_ref_mod < 0) {
2389                 struct btrfs_space_info *space_info;
2390                 u64 flags;
2391
2392                 if (head->is_data)
2393                         flags = BTRFS_BLOCK_GROUP_DATA;
2394                 else if (head->is_system)
2395                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
2396                 else
2397                         flags = BTRFS_BLOCK_GROUP_METADATA;
2398                 space_info = btrfs_find_space_info(fs_info, flags);
2399                 ASSERT(space_info);
2400                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2401                                    -head->num_bytes,
2402                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
2403
2404                 /*
2405                  * We had csum deletions accounted for in our delayed refs rsv,
2406                  * we need to drop the csum leaves for this update from our
2407                  * delayed_refs_rsv.
2408                  */
2409                 if (head->is_data) {
2410                         spin_lock(&delayed_refs->lock);
2411                         delayed_refs->pending_csums -= head->num_bytes;
2412                         spin_unlock(&delayed_refs->lock);
2413                         nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2414                                 head->num_bytes);
2415                 }
2416         }
2417
2418         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2419 }
2420
2421 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2422                             struct btrfs_delayed_ref_head *head)
2423 {
2424
2425         struct btrfs_fs_info *fs_info = trans->fs_info;
2426         struct btrfs_delayed_ref_root *delayed_refs;
2427         int ret;
2428
2429         delayed_refs = &trans->transaction->delayed_refs;
2430
2431         ret = run_and_cleanup_extent_op(trans, head);
2432         if (ret < 0) {
2433                 unselect_delayed_ref_head(delayed_refs, head);
2434                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2435                 return ret;
2436         } else if (ret) {
2437                 return ret;
2438         }
2439
2440         /*
2441          * Need to drop our head ref lock and re-acquire the delayed ref lock
2442          * and then re-check to make sure nobody got added.
2443          */
2444         spin_unlock(&head->lock);
2445         spin_lock(&delayed_refs->lock);
2446         spin_lock(&head->lock);
2447         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2448                 spin_unlock(&head->lock);
2449                 spin_unlock(&delayed_refs->lock);
2450                 return 1;
2451         }
2452         btrfs_delete_ref_head(delayed_refs, head);
2453         spin_unlock(&head->lock);
2454         spin_unlock(&delayed_refs->lock);
2455
2456         if (head->must_insert_reserved) {
2457                 btrfs_pin_extent(fs_info, head->bytenr,
2458                                  head->num_bytes, 1);
2459                 if (head->is_data) {
2460                         ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2461                                               head->num_bytes);
2462                 }
2463         }
2464
2465         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2466
2467         trace_run_delayed_ref_head(fs_info, head, 0);
2468         btrfs_delayed_ref_unlock(head);
2469         btrfs_put_delayed_ref_head(head);
2470         return 0;
2471 }
2472
2473 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2474                                         struct btrfs_trans_handle *trans)
2475 {
2476         struct btrfs_delayed_ref_root *delayed_refs =
2477                 &trans->transaction->delayed_refs;
2478         struct btrfs_delayed_ref_head *head = NULL;
2479         int ret;
2480
2481         spin_lock(&delayed_refs->lock);
2482         head = btrfs_select_ref_head(delayed_refs);
2483         if (!head) {
2484                 spin_unlock(&delayed_refs->lock);
2485                 return head;
2486         }
2487
2488         /*
2489          * Grab the lock that says we are going to process all the refs for
2490          * this head
2491          */
2492         ret = btrfs_delayed_ref_lock(delayed_refs, head);
2493         spin_unlock(&delayed_refs->lock);
2494
2495         /*
2496          * We may have dropped the spin lock to get the head mutex lock, and
2497          * that might have given someone else time to free the head.  If that's
2498          * true, it has been removed from our list and we can move on.
2499          */
2500         if (ret == -EAGAIN)
2501                 head = ERR_PTR(-EAGAIN);
2502
2503         return head;
2504 }
2505
2506 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2507                                     struct btrfs_delayed_ref_head *locked_ref,
2508                                     unsigned long *run_refs)
2509 {
2510         struct btrfs_fs_info *fs_info = trans->fs_info;
2511         struct btrfs_delayed_ref_root *delayed_refs;
2512         struct btrfs_delayed_extent_op *extent_op;
2513         struct btrfs_delayed_ref_node *ref;
2514         int must_insert_reserved = 0;
2515         int ret;
2516
2517         delayed_refs = &trans->transaction->delayed_refs;
2518
2519         lockdep_assert_held(&locked_ref->mutex);
2520         lockdep_assert_held(&locked_ref->lock);
2521
2522         while ((ref = select_delayed_ref(locked_ref))) {
2523                 if (ref->seq &&
2524                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
2525                         spin_unlock(&locked_ref->lock);
2526                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2527                         return -EAGAIN;
2528                 }
2529
2530                 (*run_refs)++;
2531                 ref->in_tree = 0;
2532                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2533                 RB_CLEAR_NODE(&ref->ref_node);
2534                 if (!list_empty(&ref->add_list))
2535                         list_del(&ref->add_list);
2536                 /*
2537                  * When we play the delayed ref, also correct the ref_mod on
2538                  * head
2539                  */
2540                 switch (ref->action) {
2541                 case BTRFS_ADD_DELAYED_REF:
2542                 case BTRFS_ADD_DELAYED_EXTENT:
2543                         locked_ref->ref_mod -= ref->ref_mod;
2544                         break;
2545                 case BTRFS_DROP_DELAYED_REF:
2546                         locked_ref->ref_mod += ref->ref_mod;
2547                         break;
2548                 default:
2549                         WARN_ON(1);
2550                 }
2551                 atomic_dec(&delayed_refs->num_entries);
2552
2553                 /*
2554                  * Record the must_insert_reserved flag before we drop the
2555                  * spin lock.
2556                  */
2557                 must_insert_reserved = locked_ref->must_insert_reserved;
2558                 locked_ref->must_insert_reserved = 0;
2559
2560                 extent_op = locked_ref->extent_op;
2561                 locked_ref->extent_op = NULL;
2562                 spin_unlock(&locked_ref->lock);
2563
2564                 ret = run_one_delayed_ref(trans, ref, extent_op,
2565                                           must_insert_reserved);
2566
2567                 btrfs_free_delayed_extent_op(extent_op);
2568                 if (ret) {
2569                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2570                         btrfs_put_delayed_ref(ref);
2571                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2572                                     ret);
2573                         return ret;
2574                 }
2575
2576                 btrfs_put_delayed_ref(ref);
2577                 cond_resched();
2578
2579                 spin_lock(&locked_ref->lock);
2580                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2581         }
2582
2583         return 0;
2584 }
2585
2586 /*
2587  * Returns 0 on success or if called with an already aborted transaction.
2588  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2589  */
2590 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2591                                              unsigned long nr)
2592 {
2593         struct btrfs_fs_info *fs_info = trans->fs_info;
2594         struct btrfs_delayed_ref_root *delayed_refs;
2595         struct btrfs_delayed_ref_head *locked_ref = NULL;
2596         ktime_t start = ktime_get();
2597         int ret;
2598         unsigned long count = 0;
2599         unsigned long actual_count = 0;
2600
2601         delayed_refs = &trans->transaction->delayed_refs;
2602         do {
2603                 if (!locked_ref) {
2604                         locked_ref = btrfs_obtain_ref_head(trans);
2605                         if (IS_ERR_OR_NULL(locked_ref)) {
2606                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2607                                         continue;
2608                                 } else {
2609                                         break;
2610                                 }
2611                         }
2612                         count++;
2613                 }
2614                 /*
2615                  * We need to try and merge add/drops of the same ref since we
2616                  * can run into issues with relocate dropping the implicit ref
2617                  * and then it being added back again before the drop can
2618                  * finish.  If we merged anything we need to re-loop so we can
2619                  * get a good ref.
2620                  * Or we can get node references of the same type that weren't
2621                  * merged when created due to bumps in the tree mod seq, and
2622                  * we need to merge them to prevent adding an inline extent
2623                  * backref before dropping it (triggering a BUG_ON at
2624                  * insert_inline_extent_backref()).
2625                  */
2626                 spin_lock(&locked_ref->lock);
2627                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2628
2629                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2630                                                       &actual_count);
2631                 if (ret < 0 && ret != -EAGAIN) {
2632                         /*
2633                          * Error, btrfs_run_delayed_refs_for_head already
2634                          * unlocked everything so just bail out
2635                          */
2636                         return ret;
2637                 } else if (!ret) {
2638                         /*
2639                          * Success, perform the usual cleanup of a processed
2640                          * head
2641                          */
2642                         ret = cleanup_ref_head(trans, locked_ref);
2643                         if (ret > 0 ) {
2644                                 /* We dropped our lock, we need to loop. */
2645                                 ret = 0;
2646                                 continue;
2647                         } else if (ret) {
2648                                 return ret;
2649                         }
2650                 }
2651
2652                 /*
2653                  * Either success case or btrfs_run_delayed_refs_for_head
2654                  * returned -EAGAIN, meaning we need to select another head
2655                  */
2656
2657                 locked_ref = NULL;
2658                 cond_resched();
2659         } while ((nr != -1 && count < nr) || locked_ref);
2660
2661         /*
2662          * We don't want to include ref heads since we can have empty ref heads
2663          * and those will drastically skew our runtime down since we just do
2664          * accounting, no actual extent tree updates.
2665          */
2666         if (actual_count > 0) {
2667                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2668                 u64 avg;
2669
2670                 /*
2671                  * We weigh the current average higher than our current runtime
2672                  * to avoid large swings in the average.
2673                  */
2674                 spin_lock(&delayed_refs->lock);
2675                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2676                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2677                 spin_unlock(&delayed_refs->lock);
2678         }
2679         return 0;
2680 }
2681
2682 #ifdef SCRAMBLE_DELAYED_REFS
2683 /*
2684  * Normally delayed refs get processed in ascending bytenr order. This
2685  * correlates in most cases to the order added. To expose dependencies on this
2686  * order, we start to process the tree in the middle instead of the beginning
2687  */
2688 static u64 find_middle(struct rb_root *root)
2689 {
2690         struct rb_node *n = root->rb_node;
2691         struct btrfs_delayed_ref_node *entry;
2692         int alt = 1;
2693         u64 middle;
2694         u64 first = 0, last = 0;
2695
2696         n = rb_first(root);
2697         if (n) {
2698                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2699                 first = entry->bytenr;
2700         }
2701         n = rb_last(root);
2702         if (n) {
2703                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2704                 last = entry->bytenr;
2705         }
2706         n = root->rb_node;
2707
2708         while (n) {
2709                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2710                 WARN_ON(!entry->in_tree);
2711
2712                 middle = entry->bytenr;
2713
2714                 if (alt)
2715                         n = n->rb_left;
2716                 else
2717                         n = n->rb_right;
2718
2719                 alt = 1 - alt;
2720         }
2721         return middle;
2722 }
2723 #endif
2724
2725 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2726 {
2727         u64 num_bytes;
2728
2729         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2730                              sizeof(struct btrfs_extent_inline_ref));
2731         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2732                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2733
2734         /*
2735          * We don't ever fill up leaves all the way so multiply by 2 just to be
2736          * closer to what we're really going to want to use.
2737          */
2738         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2739 }
2740
2741 /*
2742  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2743  * would require to store the csums for that many bytes.
2744  */
2745 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2746 {
2747         u64 csum_size;
2748         u64 num_csums_per_leaf;
2749         u64 num_csums;
2750
2751         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2752         num_csums_per_leaf = div64_u64(csum_size,
2753                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2754         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2755         num_csums += num_csums_per_leaf - 1;
2756         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2757         return num_csums;
2758 }
2759
2760 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2761 {
2762         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2763         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2764         bool ret = false;
2765         u64 reserved;
2766
2767         spin_lock(&global_rsv->lock);
2768         reserved = global_rsv->reserved;
2769         spin_unlock(&global_rsv->lock);
2770
2771         /*
2772          * Since the global reserve is just kind of magic we don't really want
2773          * to rely on it to save our bacon, so if our size is more than the
2774          * delayed_refs_rsv and the global rsv then it's time to think about
2775          * bailing.
2776          */
2777         spin_lock(&delayed_refs_rsv->lock);
2778         reserved += delayed_refs_rsv->reserved;
2779         if (delayed_refs_rsv->size >= reserved)
2780                 ret = true;
2781         spin_unlock(&delayed_refs_rsv->lock);
2782         return ret;
2783 }
2784
2785 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2786 {
2787         u64 num_entries =
2788                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2789         u64 avg_runtime;
2790         u64 val;
2791
2792         smp_mb();
2793         avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2794         val = num_entries * avg_runtime;
2795         if (val >= NSEC_PER_SEC)
2796                 return 1;
2797         if (val >= NSEC_PER_SEC / 2)
2798                 return 2;
2799
2800         return btrfs_check_space_for_delayed_refs(trans->fs_info);
2801 }
2802
2803 /*
2804  * this starts processing the delayed reference count updates and
2805  * extent insertions we have queued up so far.  count can be
2806  * 0, which means to process everything in the tree at the start
2807  * of the run (but not newly added entries), or it can be some target
2808  * number you'd like to process.
2809  *
2810  * Returns 0 on success or if called with an aborted transaction
2811  * Returns <0 on error and aborts the transaction
2812  */
2813 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2814                            unsigned long count)
2815 {
2816         struct btrfs_fs_info *fs_info = trans->fs_info;
2817         struct rb_node *node;
2818         struct btrfs_delayed_ref_root *delayed_refs;
2819         struct btrfs_delayed_ref_head *head;
2820         int ret;
2821         int run_all = count == (unsigned long)-1;
2822
2823         /* We'll clean this up in btrfs_cleanup_transaction */
2824         if (trans->aborted)
2825                 return 0;
2826
2827         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2828                 return 0;
2829
2830         delayed_refs = &trans->transaction->delayed_refs;
2831         if (count == 0)
2832                 count = atomic_read(&delayed_refs->num_entries) * 2;
2833
2834 again:
2835 #ifdef SCRAMBLE_DELAYED_REFS
2836         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2837 #endif
2838         ret = __btrfs_run_delayed_refs(trans, count);
2839         if (ret < 0) {
2840                 btrfs_abort_transaction(trans, ret);
2841                 return ret;
2842         }
2843
2844         if (run_all) {
2845                 btrfs_create_pending_block_groups(trans);
2846
2847                 spin_lock(&delayed_refs->lock);
2848                 node = rb_first_cached(&delayed_refs->href_root);
2849                 if (!node) {
2850                         spin_unlock(&delayed_refs->lock);
2851                         goto out;
2852                 }
2853                 head = rb_entry(node, struct btrfs_delayed_ref_head,
2854                                 href_node);
2855                 refcount_inc(&head->refs);
2856                 spin_unlock(&delayed_refs->lock);
2857
2858                 /* Mutex was contended, block until it's released and retry. */
2859                 mutex_lock(&head->mutex);
2860                 mutex_unlock(&head->mutex);
2861
2862                 btrfs_put_delayed_ref_head(head);
2863                 cond_resched();
2864                 goto again;
2865         }
2866 out:
2867         return 0;
2868 }
2869
2870 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2871                                 u64 bytenr, u64 num_bytes, u64 flags,
2872                                 int level, int is_data)
2873 {
2874         struct btrfs_delayed_extent_op *extent_op;
2875         int ret;
2876
2877         extent_op = btrfs_alloc_delayed_extent_op();
2878         if (!extent_op)
2879                 return -ENOMEM;
2880
2881         extent_op->flags_to_set = flags;
2882         extent_op->update_flags = true;
2883         extent_op->update_key = false;
2884         extent_op->is_data = is_data ? true : false;
2885         extent_op->level = level;
2886
2887         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2888         if (ret)
2889                 btrfs_free_delayed_extent_op(extent_op);
2890         return ret;
2891 }
2892
2893 static noinline int check_delayed_ref(struct btrfs_root *root,
2894                                       struct btrfs_path *path,
2895                                       u64 objectid, u64 offset, u64 bytenr)
2896 {
2897         struct btrfs_delayed_ref_head *head;
2898         struct btrfs_delayed_ref_node *ref;
2899         struct btrfs_delayed_data_ref *data_ref;
2900         struct btrfs_delayed_ref_root *delayed_refs;
2901         struct btrfs_transaction *cur_trans;
2902         struct rb_node *node;
2903         int ret = 0;
2904
2905         spin_lock(&root->fs_info->trans_lock);
2906         cur_trans = root->fs_info->running_transaction;
2907         if (cur_trans)
2908                 refcount_inc(&cur_trans->use_count);
2909         spin_unlock(&root->fs_info->trans_lock);
2910         if (!cur_trans)
2911                 return 0;
2912
2913         delayed_refs = &cur_trans->delayed_refs;
2914         spin_lock(&delayed_refs->lock);
2915         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2916         if (!head) {
2917                 spin_unlock(&delayed_refs->lock);
2918                 btrfs_put_transaction(cur_trans);
2919                 return 0;
2920         }
2921
2922         if (!mutex_trylock(&head->mutex)) {
2923                 refcount_inc(&head->refs);
2924                 spin_unlock(&delayed_refs->lock);
2925
2926                 btrfs_release_path(path);
2927
2928                 /*
2929                  * Mutex was contended, block until it's released and let
2930                  * caller try again
2931                  */
2932                 mutex_lock(&head->mutex);
2933                 mutex_unlock(&head->mutex);
2934                 btrfs_put_delayed_ref_head(head);
2935                 btrfs_put_transaction(cur_trans);
2936                 return -EAGAIN;
2937         }
2938         spin_unlock(&delayed_refs->lock);
2939
2940         spin_lock(&head->lock);
2941         /*
2942          * XXX: We should replace this with a proper search function in the
2943          * future.
2944          */
2945         for (node = rb_first_cached(&head->ref_tree); node;
2946              node = rb_next(node)) {
2947                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2948                 /* If it's a shared ref we know a cross reference exists */
2949                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2950                         ret = 1;
2951                         break;
2952                 }
2953
2954                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2955
2956                 /*
2957                  * If our ref doesn't match the one we're currently looking at
2958                  * then we have a cross reference.
2959                  */
2960                 if (data_ref->root != root->root_key.objectid ||
2961                     data_ref->objectid != objectid ||
2962                     data_ref->offset != offset) {
2963                         ret = 1;
2964                         break;
2965                 }
2966         }
2967         spin_unlock(&head->lock);
2968         mutex_unlock(&head->mutex);
2969         btrfs_put_transaction(cur_trans);
2970         return ret;
2971 }
2972
2973 static noinline int check_committed_ref(struct btrfs_root *root,
2974                                         struct btrfs_path *path,
2975                                         u64 objectid, u64 offset, u64 bytenr)
2976 {
2977         struct btrfs_fs_info *fs_info = root->fs_info;
2978         struct btrfs_root *extent_root = fs_info->extent_root;
2979         struct extent_buffer *leaf;
2980         struct btrfs_extent_data_ref *ref;
2981         struct btrfs_extent_inline_ref *iref;
2982         struct btrfs_extent_item *ei;
2983         struct btrfs_key key;
2984         u32 item_size;
2985         int type;
2986         int ret;
2987
2988         key.objectid = bytenr;
2989         key.offset = (u64)-1;
2990         key.type = BTRFS_EXTENT_ITEM_KEY;
2991
2992         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2993         if (ret < 0)
2994                 goto out;
2995         BUG_ON(ret == 0); /* Corruption */
2996
2997         ret = -ENOENT;
2998         if (path->slots[0] == 0)
2999                 goto out;
3000
3001         path->slots[0]--;
3002         leaf = path->nodes[0];
3003         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3004
3005         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3006                 goto out;
3007
3008         ret = 1;
3009         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3010         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3011
3012         if (item_size != sizeof(*ei) +
3013             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3014                 goto out;
3015
3016         if (btrfs_extent_generation(leaf, ei) <=
3017             btrfs_root_last_snapshot(&root->root_item))
3018                 goto out;
3019
3020         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3021
3022         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3023         if (type != BTRFS_EXTENT_DATA_REF_KEY)
3024                 goto out;
3025
3026         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3027         if (btrfs_extent_refs(leaf, ei) !=
3028             btrfs_extent_data_ref_count(leaf, ref) ||
3029             btrfs_extent_data_ref_root(leaf, ref) !=
3030             root->root_key.objectid ||
3031             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3032             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3033                 goto out;
3034
3035         ret = 0;
3036 out:
3037         return ret;
3038 }
3039
3040 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3041                           u64 bytenr)
3042 {
3043         struct btrfs_path *path;
3044         int ret;
3045
3046         path = btrfs_alloc_path();
3047         if (!path)
3048                 return -ENOMEM;
3049
3050         do {
3051                 ret = check_committed_ref(root, path, objectid,
3052                                           offset, bytenr);
3053                 if (ret && ret != -ENOENT)
3054                         goto out;
3055
3056                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3057         } while (ret == -EAGAIN);
3058
3059 out:
3060         btrfs_free_path(path);
3061         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3062                 WARN_ON(ret > 0);
3063         return ret;
3064 }
3065
3066 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3067                            struct btrfs_root *root,
3068                            struct extent_buffer *buf,
3069                            int full_backref, int inc)
3070 {
3071         struct btrfs_fs_info *fs_info = root->fs_info;
3072         u64 bytenr;
3073         u64 num_bytes;
3074         u64 parent;
3075         u64 ref_root;
3076         u32 nritems;
3077         struct btrfs_key key;
3078         struct btrfs_file_extent_item *fi;
3079         struct btrfs_ref generic_ref = { 0 };
3080         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
3081         int i;
3082         int action;
3083         int level;
3084         int ret = 0;
3085
3086         if (btrfs_is_testing(fs_info))
3087                 return 0;
3088
3089         ref_root = btrfs_header_owner(buf);
3090         nritems = btrfs_header_nritems(buf);
3091         level = btrfs_header_level(buf);
3092
3093         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3094                 return 0;
3095
3096         if (full_backref)
3097                 parent = buf->start;
3098         else
3099                 parent = 0;
3100         if (inc)
3101                 action = BTRFS_ADD_DELAYED_REF;
3102         else
3103                 action = BTRFS_DROP_DELAYED_REF;
3104
3105         for (i = 0; i < nritems; i++) {
3106                 if (level == 0) {
3107                         btrfs_item_key_to_cpu(buf, &key, i);
3108                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3109                                 continue;
3110                         fi = btrfs_item_ptr(buf, i,
3111                                             struct btrfs_file_extent_item);
3112                         if (btrfs_file_extent_type(buf, fi) ==
3113                             BTRFS_FILE_EXTENT_INLINE)
3114                                 continue;
3115                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3116                         if (bytenr == 0)
3117                                 continue;
3118
3119                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3120                         key.offset -= btrfs_file_extent_offset(buf, fi);
3121                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
3122                                                num_bytes, parent);
3123                         generic_ref.real_root = root->root_key.objectid;
3124                         btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
3125                                             key.offset);
3126                         generic_ref.skip_qgroup = for_reloc;
3127                         if (inc)
3128                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3129                         else
3130                                 ret = btrfs_free_extent(trans, &generic_ref);
3131                         if (ret)
3132                                 goto fail;
3133                 } else {
3134                         bytenr = btrfs_node_blockptr(buf, i);
3135                         num_bytes = fs_info->nodesize;
3136                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
3137                                                num_bytes, parent);
3138                         generic_ref.real_root = root->root_key.objectid;
3139                         btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
3140                         generic_ref.skip_qgroup = for_reloc;
3141                         if (inc)
3142                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3143                         else
3144                                 ret = btrfs_free_extent(trans, &generic_ref);
3145                         if (ret)
3146                                 goto fail;
3147                 }
3148         }
3149         return 0;
3150 fail:
3151         return ret;
3152 }
3153
3154 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3155                   struct extent_buffer *buf, int full_backref)
3156 {
3157         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3158 }
3159
3160 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3161                   struct extent_buffer *buf, int full_backref)
3162 {
3163         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3164 }
3165
3166 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3167                                  struct btrfs_path *path,
3168                                  struct btrfs_block_group_cache *cache)
3169 {
3170         struct btrfs_fs_info *fs_info = trans->fs_info;
3171         int ret;
3172         struct btrfs_root *extent_root = fs_info->extent_root;
3173         unsigned long bi;
3174         struct extent_buffer *leaf;
3175
3176         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3177         if (ret) {
3178                 if (ret > 0)
3179                         ret = -ENOENT;
3180                 goto fail;
3181         }
3182
3183         leaf = path->nodes[0];
3184         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3185         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3186         btrfs_mark_buffer_dirty(leaf);
3187 fail:
3188         btrfs_release_path(path);
3189         return ret;
3190
3191 }
3192
3193 static struct btrfs_block_group_cache *next_block_group(
3194                 struct btrfs_block_group_cache *cache)
3195 {
3196         struct btrfs_fs_info *fs_info = cache->fs_info;
3197         struct rb_node *node;
3198
3199         spin_lock(&fs_info->block_group_cache_lock);
3200
3201         /* If our block group was removed, we need a full search. */
3202         if (RB_EMPTY_NODE(&cache->cache_node)) {
3203                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3204
3205                 spin_unlock(&fs_info->block_group_cache_lock);
3206                 btrfs_put_block_group(cache);
3207                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
3208         }
3209         node = rb_next(&cache->cache_node);
3210         btrfs_put_block_group(cache);
3211         if (node) {
3212                 cache = rb_entry(node, struct btrfs_block_group_cache,
3213                                  cache_node);
3214                 btrfs_get_block_group(cache);
3215         } else
3216                 cache = NULL;
3217         spin_unlock(&fs_info->block_group_cache_lock);
3218         return cache;
3219 }
3220
3221 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3222                             struct btrfs_trans_handle *trans,
3223                             struct btrfs_path *path)
3224 {
3225         struct btrfs_fs_info *fs_info = block_group->fs_info;
3226         struct btrfs_root *root = fs_info->tree_root;
3227         struct inode *inode = NULL;
3228         struct extent_changeset *data_reserved = NULL;
3229         u64 alloc_hint = 0;
3230         int dcs = BTRFS_DC_ERROR;
3231         u64 num_pages = 0;
3232         int retries = 0;
3233         int ret = 0;
3234
3235         /*
3236          * If this block group is smaller than 100 megs don't bother caching the
3237          * block group.
3238          */
3239         if (block_group->key.offset < (100 * SZ_1M)) {
3240                 spin_lock(&block_group->lock);
3241                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3242                 spin_unlock(&block_group->lock);
3243                 return 0;
3244         }
3245
3246         if (trans->aborted)
3247                 return 0;
3248 again:
3249         inode = lookup_free_space_inode(block_group, path);
3250         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3251                 ret = PTR_ERR(inode);
3252                 btrfs_release_path(path);
3253                 goto out;
3254         }
3255
3256         if (IS_ERR(inode)) {
3257                 BUG_ON(retries);
3258                 retries++;
3259
3260                 if (block_group->ro)
3261                         goto out_free;
3262
3263                 ret = create_free_space_inode(trans, block_group, path);
3264                 if (ret)
3265                         goto out_free;
3266                 goto again;
3267         }
3268
3269         /*
3270          * We want to set the generation to 0, that way if anything goes wrong
3271          * from here on out we know not to trust this cache when we load up next
3272          * time.
3273          */
3274         BTRFS_I(inode)->generation = 0;
3275         ret = btrfs_update_inode(trans, root, inode);
3276         if (ret) {
3277                 /*
3278                  * So theoretically we could recover from this, simply set the
3279                  * super cache generation to 0 so we know to invalidate the
3280                  * cache, but then we'd have to keep track of the block groups
3281                  * that fail this way so we know we _have_ to reset this cache
3282                  * before the next commit or risk reading stale cache.  So to
3283                  * limit our exposure to horrible edge cases lets just abort the
3284                  * transaction, this only happens in really bad situations
3285                  * anyway.
3286                  */
3287                 btrfs_abort_transaction(trans, ret);
3288                 goto out_put;
3289         }
3290         WARN_ON(ret);
3291
3292         /* We've already setup this transaction, go ahead and exit */
3293         if (block_group->cache_generation == trans->transid &&
3294             i_size_read(inode)) {
3295                 dcs = BTRFS_DC_SETUP;
3296                 goto out_put;
3297         }
3298
3299         if (i_size_read(inode) > 0) {
3300                 ret = btrfs_check_trunc_cache_free_space(fs_info,
3301                                         &fs_info->global_block_rsv);
3302                 if (ret)
3303                         goto out_put;
3304
3305                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3306                 if (ret)
3307                         goto out_put;
3308         }
3309
3310         spin_lock(&block_group->lock);
3311         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3312             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3313                 /*
3314                  * don't bother trying to write stuff out _if_
3315                  * a) we're not cached,
3316                  * b) we're with nospace_cache mount option,
3317                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
3318                  */
3319                 dcs = BTRFS_DC_WRITTEN;
3320                 spin_unlock(&block_group->lock);
3321                 goto out_put;
3322         }
3323         spin_unlock(&block_group->lock);
3324
3325         /*
3326          * We hit an ENOSPC when setting up the cache in this transaction, just
3327          * skip doing the setup, we've already cleared the cache so we're safe.
3328          */
3329         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3330                 ret = -ENOSPC;
3331                 goto out_put;
3332         }
3333
3334         /*
3335          * Try to preallocate enough space based on how big the block group is.
3336          * Keep in mind this has to include any pinned space which could end up
3337          * taking up quite a bit since it's not folded into the other space
3338          * cache.
3339          */
3340         num_pages = div_u64(block_group->key.offset, SZ_256M);
3341         if (!num_pages)
3342                 num_pages = 1;
3343
3344         num_pages *= 16;
3345         num_pages *= PAGE_SIZE;
3346
3347         ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3348         if (ret)
3349                 goto out_put;
3350
3351         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3352                                               num_pages, num_pages,
3353                                               &alloc_hint);
3354         /*
3355          * Our cache requires contiguous chunks so that we don't modify a bunch
3356          * of metadata or split extents when writing the cache out, which means
3357          * we can enospc if we are heavily fragmented in addition to just normal
3358          * out of space conditions.  So if we hit this just skip setting up any
3359          * other block groups for this transaction, maybe we'll unpin enough
3360          * space the next time around.
3361          */
3362         if (!ret)
3363                 dcs = BTRFS_DC_SETUP;
3364         else if (ret == -ENOSPC)
3365                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3366
3367 out_put:
3368         iput(inode);
3369 out_free:
3370         btrfs_release_path(path);
3371 out:
3372         spin_lock(&block_group->lock);
3373         if (!ret && dcs == BTRFS_DC_SETUP)
3374                 block_group->cache_generation = trans->transid;
3375         block_group->disk_cache_state = dcs;
3376         spin_unlock(&block_group->lock);
3377
3378         extent_changeset_free(data_reserved);
3379         return ret;
3380 }
3381
3382 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3383 {
3384         struct btrfs_fs_info *fs_info = trans->fs_info;
3385         struct btrfs_block_group_cache *cache, *tmp;
3386         struct btrfs_transaction *cur_trans = trans->transaction;
3387         struct btrfs_path *path;
3388
3389         if (list_empty(&cur_trans->dirty_bgs) ||
3390             !btrfs_test_opt(fs_info, SPACE_CACHE))
3391                 return 0;
3392
3393         path = btrfs_alloc_path();
3394         if (!path)
3395                 return -ENOMEM;
3396
3397         /* Could add new block groups, use _safe just in case */
3398         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3399                                  dirty_list) {
3400                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3401                         cache_save_setup(cache, trans, path);
3402         }
3403
3404         btrfs_free_path(path);
3405         return 0;
3406 }
3407
3408 /*
3409  * transaction commit does final block group cache writeback during a
3410  * critical section where nothing is allowed to change the FS.  This is
3411  * required in order for the cache to actually match the block group,
3412  * but can introduce a lot of latency into the commit.
3413  *
3414  * So, btrfs_start_dirty_block_groups is here to kick off block group
3415  * cache IO.  There's a chance we'll have to redo some of it if the
3416  * block group changes again during the commit, but it greatly reduces
3417  * the commit latency by getting rid of the easy block groups while
3418  * we're still allowing others to join the commit.
3419  */
3420 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3421 {
3422         struct btrfs_fs_info *fs_info = trans->fs_info;
3423         struct btrfs_block_group_cache *cache;
3424         struct btrfs_transaction *cur_trans = trans->transaction;
3425         int ret = 0;
3426         int should_put;
3427         struct btrfs_path *path = NULL;
3428         LIST_HEAD(dirty);
3429         struct list_head *io = &cur_trans->io_bgs;
3430         int num_started = 0;
3431         int loops = 0;
3432
3433         spin_lock(&cur_trans->dirty_bgs_lock);
3434         if (list_empty(&cur_trans->dirty_bgs)) {
3435                 spin_unlock(&cur_trans->dirty_bgs_lock);
3436                 return 0;
3437         }
3438         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3439         spin_unlock(&cur_trans->dirty_bgs_lock);
3440
3441 again:
3442         /*
3443          * make sure all the block groups on our dirty list actually
3444          * exist
3445          */
3446         btrfs_create_pending_block_groups(trans);
3447
3448         if (!path) {
3449                 path = btrfs_alloc_path();
3450                 if (!path)
3451                         return -ENOMEM;
3452         }
3453
3454         /*
3455          * cache_write_mutex is here only to save us from balance or automatic
3456          * removal of empty block groups deleting this block group while we are
3457          * writing out the cache
3458          */
3459         mutex_lock(&trans->transaction->cache_write_mutex);
3460         while (!list_empty(&dirty)) {
3461                 bool drop_reserve = true;
3462
3463                 cache = list_first_entry(&dirty,
3464                                          struct btrfs_block_group_cache,
3465                                          dirty_list);
3466                 /*
3467                  * this can happen if something re-dirties a block
3468                  * group that is already under IO.  Just wait for it to
3469                  * finish and then do it all again
3470                  */
3471                 if (!list_empty(&cache->io_list)) {
3472                         list_del_init(&cache->io_list);
3473                         btrfs_wait_cache_io(trans, cache, path);
3474                         btrfs_put_block_group(cache);
3475                 }
3476
3477
3478                 /*
3479                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3480                  * if it should update the cache_state.  Don't delete
3481                  * until after we wait.
3482                  *
3483                  * Since we're not running in the commit critical section
3484                  * we need the dirty_bgs_lock to protect from update_block_group
3485                  */
3486                 spin_lock(&cur_trans->dirty_bgs_lock);
3487                 list_del_init(&cache->dirty_list);
3488                 spin_unlock(&cur_trans->dirty_bgs_lock);
3489
3490                 should_put = 1;
3491
3492                 cache_save_setup(cache, trans, path);
3493
3494                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3495                         cache->io_ctl.inode = NULL;
3496                         ret = btrfs_write_out_cache(trans, cache, path);
3497                         if (ret == 0 && cache->io_ctl.inode) {
3498                                 num_started++;
3499                                 should_put = 0;
3500
3501                                 /*
3502                                  * The cache_write_mutex is protecting the
3503                                  * io_list, also refer to the definition of
3504                                  * btrfs_transaction::io_bgs for more details
3505                                  */
3506                                 list_add_tail(&cache->io_list, io);
3507                         } else {
3508                                 /*
3509                                  * if we failed to write the cache, the
3510                                  * generation will be bad and life goes on
3511                                  */
3512                                 ret = 0;
3513                         }
3514                 }
3515                 if (!ret) {
3516                         ret = write_one_cache_group(trans, path, cache);
3517                         /*
3518                          * Our block group might still be attached to the list
3519                          * of new block groups in the transaction handle of some
3520                          * other task (struct btrfs_trans_handle->new_bgs). This
3521                          * means its block group item isn't yet in the extent
3522                          * tree. If this happens ignore the error, as we will
3523                          * try again later in the critical section of the
3524                          * transaction commit.
3525                          */
3526                         if (ret == -ENOENT) {
3527                                 ret = 0;
3528                                 spin_lock(&cur_trans->dirty_bgs_lock);
3529                                 if (list_empty(&cache->dirty_list)) {
3530                                         list_add_tail(&cache->dirty_list,
3531                                                       &cur_trans->dirty_bgs);
3532                                         btrfs_get_block_group(cache);
3533                                         drop_reserve = false;
3534                                 }
3535                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3536                         } else if (ret) {
3537                                 btrfs_abort_transaction(trans, ret);
3538                         }
3539                 }
3540
3541                 /* if it's not on the io list, we need to put the block group */
3542                 if (should_put)
3543                         btrfs_put_block_group(cache);
3544                 if (drop_reserve)
3545                         btrfs_delayed_refs_rsv_release(fs_info, 1);
3546
3547                 if (ret)
3548                         break;
3549
3550                 /*
3551                  * Avoid blocking other tasks for too long. It might even save
3552                  * us from writing caches for block groups that are going to be
3553                  * removed.
3554                  */
3555                 mutex_unlock(&trans->transaction->cache_write_mutex);
3556                 mutex_lock(&trans->transaction->cache_write_mutex);
3557         }
3558         mutex_unlock(&trans->transaction->cache_write_mutex);
3559
3560         /*
3561          * go through delayed refs for all the stuff we've just kicked off
3562          * and then loop back (just once)
3563          */
3564         ret = btrfs_run_delayed_refs(trans, 0);
3565         if (!ret && loops == 0) {
3566                 loops++;
3567                 spin_lock(&cur_trans->dirty_bgs_lock);
3568                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3569                 /*
3570                  * dirty_bgs_lock protects us from concurrent block group
3571                  * deletes too (not just cache_write_mutex).
3572                  */
3573                 if (!list_empty(&dirty)) {
3574                         spin_unlock(&cur_trans->dirty_bgs_lock);
3575                         goto again;
3576                 }
3577                 spin_unlock(&cur_trans->dirty_bgs_lock);
3578         } else if (ret < 0) {
3579                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3580         }
3581
3582         btrfs_free_path(path);
3583         return ret;
3584 }
3585
3586 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3587 {
3588         struct btrfs_fs_info *fs_info = trans->fs_info;
3589         struct btrfs_block_group_cache *cache;
3590         struct btrfs_transaction *cur_trans = trans->transaction;
3591         int ret = 0;
3592         int should_put;
3593         struct btrfs_path *path;
3594         struct list_head *io = &cur_trans->io_bgs;
3595         int num_started = 0;
3596
3597         path = btrfs_alloc_path();
3598         if (!path)
3599                 return -ENOMEM;
3600
3601         /*
3602          * Even though we are in the critical section of the transaction commit,
3603          * we can still have concurrent tasks adding elements to this
3604          * transaction's list of dirty block groups. These tasks correspond to
3605          * endio free space workers started when writeback finishes for a
3606          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3607          * allocate new block groups as a result of COWing nodes of the root
3608          * tree when updating the free space inode. The writeback for the space
3609          * caches is triggered by an earlier call to
3610          * btrfs_start_dirty_block_groups() and iterations of the following
3611          * loop.
3612          * Also we want to do the cache_save_setup first and then run the
3613          * delayed refs to make sure we have the best chance at doing this all
3614          * in one shot.
3615          */
3616         spin_lock(&cur_trans->dirty_bgs_lock);
3617         while (!list_empty(&cur_trans->dirty_bgs)) {
3618                 cache = list_first_entry(&cur_trans->dirty_bgs,
3619                                          struct btrfs_block_group_cache,
3620                                          dirty_list);
3621
3622                 /*
3623                  * this can happen if cache_save_setup re-dirties a block
3624                  * group that is already under IO.  Just wait for it to
3625                  * finish and then do it all again
3626                  */
3627                 if (!list_empty(&cache->io_list)) {
3628                         spin_unlock(&cur_trans->dirty_bgs_lock);
3629                         list_del_init(&cache->io_list);
3630                         btrfs_wait_cache_io(trans, cache, path);
3631                         btrfs_put_block_group(cache);
3632                         spin_lock(&cur_trans->dirty_bgs_lock);
3633                 }
3634
3635                 /*
3636                  * don't remove from the dirty list until after we've waited
3637                  * on any pending IO
3638                  */
3639                 list_del_init(&cache->dirty_list);
3640                 spin_unlock(&cur_trans->dirty_bgs_lock);
3641                 should_put = 1;
3642
3643                 cache_save_setup(cache, trans, path);
3644
3645                 if (!ret)
3646                         ret = btrfs_run_delayed_refs(trans,
3647                                                      (unsigned long) -1);
3648
3649                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3650                         cache->io_ctl.inode = NULL;
3651                         ret = btrfs_write_out_cache(trans, cache, path);
3652                         if (ret == 0 && cache->io_ctl.inode) {
3653                                 num_started++;
3654                                 should_put = 0;
3655                                 list_add_tail(&cache->io_list, io);
3656                         } else {
3657                                 /*
3658                                  * if we failed to write the cache, the
3659                                  * generation will be bad and life goes on
3660                                  */
3661                                 ret = 0;
3662                         }
3663                 }
3664                 if (!ret) {
3665                         ret = write_one_cache_group(trans, path, cache);
3666                         /*
3667                          * One of the free space endio workers might have
3668                          * created a new block group while updating a free space
3669                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
3670                          * and hasn't released its transaction handle yet, in
3671                          * which case the new block group is still attached to
3672                          * its transaction handle and its creation has not
3673                          * finished yet (no block group item in the extent tree
3674                          * yet, etc). If this is the case, wait for all free
3675                          * space endio workers to finish and retry. This is a
3676                          * a very rare case so no need for a more efficient and
3677                          * complex approach.
3678                          */
3679                         if (ret == -ENOENT) {
3680                                 wait_event(cur_trans->writer_wait,
3681                                    atomic_read(&cur_trans->num_writers) == 1);
3682                                 ret = write_one_cache_group(trans, path, cache);
3683                         }
3684                         if (ret)
3685                                 btrfs_abort_transaction(trans, ret);
3686                 }
3687
3688                 /* if its not on the io list, we need to put the block group */
3689                 if (should_put)
3690                         btrfs_put_block_group(cache);
3691                 btrfs_delayed_refs_rsv_release(fs_info, 1);
3692                 spin_lock(&cur_trans->dirty_bgs_lock);
3693         }
3694         spin_unlock(&cur_trans->dirty_bgs_lock);
3695
3696         /*
3697          * Refer to the definition of io_bgs member for details why it's safe
3698          * to use it without any locking
3699          */
3700         while (!list_empty(io)) {
3701                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3702                                          io_list);
3703                 list_del_init(&cache->io_list);
3704                 btrfs_wait_cache_io(trans, cache, path);
3705                 btrfs_put_block_group(cache);
3706         }
3707
3708         btrfs_free_path(path);
3709         return ret;
3710 }
3711
3712 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3713 {
3714         struct btrfs_block_group_cache *block_group;
3715         int readonly = 0;
3716
3717         block_group = btrfs_lookup_block_group(fs_info, bytenr);
3718         if (!block_group || block_group->ro)
3719                 readonly = 1;
3720         if (block_group)
3721                 btrfs_put_block_group(block_group);
3722         return readonly;
3723 }
3724
3725 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3726 {
3727         struct btrfs_block_group_cache *bg;
3728         bool ret = true;
3729
3730         bg = btrfs_lookup_block_group(fs_info, bytenr);
3731         if (!bg)
3732                 return false;
3733
3734         spin_lock(&bg->lock);
3735         if (bg->ro)
3736                 ret = false;
3737         else
3738                 atomic_inc(&bg->nocow_writers);
3739         spin_unlock(&bg->lock);
3740
3741         /* no put on block group, done by btrfs_dec_nocow_writers */
3742         if (!ret)
3743                 btrfs_put_block_group(bg);
3744
3745         return ret;
3746
3747 }
3748
3749 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3750 {
3751         struct btrfs_block_group_cache *bg;
3752
3753         bg = btrfs_lookup_block_group(fs_info, bytenr);
3754         ASSERT(bg);
3755         if (atomic_dec_and_test(&bg->nocow_writers))
3756                 wake_up_var(&bg->nocow_writers);
3757         /*
3758          * Once for our lookup and once for the lookup done by a previous call
3759          * to btrfs_inc_nocow_writers()
3760          */
3761         btrfs_put_block_group(bg);
3762         btrfs_put_block_group(bg);
3763 }
3764
3765 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3766 {
3767         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3768 }
3769
3770 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3771 {
3772         u64 extra_flags = chunk_to_extended(flags) &
3773                                 BTRFS_EXTENDED_PROFILE_MASK;
3774
3775         write_seqlock(&fs_info->profiles_lock);
3776         if (flags & BTRFS_BLOCK_GROUP_DATA)
3777                 fs_info->avail_data_alloc_bits |= extra_flags;
3778         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3779                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3780         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3781                 fs_info->avail_system_alloc_bits |= extra_flags;
3782         write_sequnlock(&fs_info->profiles_lock);
3783 }
3784
3785 /*
3786  * returns target flags in extended format or 0 if restripe for this
3787  * chunk_type is not in progress
3788  *
3789  * should be called with balance_lock held
3790  */
3791 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3792 {
3793         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3794         u64 target = 0;
3795
3796         if (!bctl)
3797                 return 0;
3798
3799         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3800             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3801                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3802         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3803                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3804                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3805         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3806                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3807                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3808         }
3809
3810         return target;
3811 }
3812
3813 /*
3814  * @flags: available profiles in extended format (see ctree.h)
3815  *
3816  * Returns reduced profile in chunk format.  If profile changing is in
3817  * progress (either running or paused) picks the target profile (if it's
3818  * already available), otherwise falls back to plain reducing.
3819  */
3820 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
3821 {
3822         u64 num_devices = fs_info->fs_devices->rw_devices;
3823         u64 target;
3824         u64 raid_type;
3825         u64 allowed = 0;
3826
3827         /*
3828          * see if restripe for this chunk_type is in progress, if so
3829          * try to reduce to the target profile
3830          */
3831         spin_lock(&fs_info->balance_lock);
3832         target = get_restripe_target(fs_info, flags);
3833         if (target) {
3834                 /* pick target profile only if it's already available */
3835                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3836                         spin_unlock(&fs_info->balance_lock);
3837                         return extended_to_chunk(target);
3838                 }
3839         }
3840         spin_unlock(&fs_info->balance_lock);
3841
3842         /* First, mask out the RAID levels which aren't possible */
3843         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3844                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
3845                         allowed |= btrfs_raid_array[raid_type].bg_flag;
3846         }
3847         allowed &= flags;
3848
3849         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3850                 allowed = BTRFS_BLOCK_GROUP_RAID6;
3851         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3852                 allowed = BTRFS_BLOCK_GROUP_RAID5;
3853         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3854                 allowed = BTRFS_BLOCK_GROUP_RAID10;
3855         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3856                 allowed = BTRFS_BLOCK_GROUP_RAID1;
3857         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3858                 allowed = BTRFS_BLOCK_GROUP_RAID0;
3859
3860         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3861
3862         return extended_to_chunk(flags | allowed);
3863 }
3864
3865 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
3866 {
3867         unsigned seq;
3868         u64 flags;
3869
3870         do {
3871                 flags = orig_flags;
3872                 seq = read_seqbegin(&fs_info->profiles_lock);
3873
3874                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3875                         flags |= fs_info->avail_data_alloc_bits;
3876                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3877                         flags |= fs_info->avail_system_alloc_bits;
3878                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3879                         flags |= fs_info->avail_metadata_alloc_bits;
3880         } while (read_seqretry(&fs_info->profiles_lock, seq));
3881
3882         return btrfs_reduce_alloc_profile(fs_info, flags);
3883 }
3884
3885 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
3886 {
3887         struct btrfs_fs_info *fs_info = root->fs_info;
3888         u64 flags;
3889         u64 ret;
3890
3891         if (data)
3892                 flags = BTRFS_BLOCK_GROUP_DATA;
3893         else if (root == fs_info->chunk_root)
3894                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3895         else
3896                 flags = BTRFS_BLOCK_GROUP_METADATA;
3897
3898         ret = get_alloc_profile(fs_info, flags);
3899         return ret;
3900 }
3901
3902 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
3903 {
3904         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
3905 }
3906
3907 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
3908 {
3909         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3910 }
3911
3912 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
3913 {
3914         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3915 }
3916
3917 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
3918 {
3919         struct btrfs_root *root = inode->root;
3920         struct btrfs_fs_info *fs_info = root->fs_info;
3921         struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
3922         u64 used;
3923         int ret = 0;
3924         int need_commit = 2;
3925         int have_pinned_space;
3926
3927         /* make sure bytes are sectorsize aligned */
3928         bytes = ALIGN(bytes, fs_info->sectorsize);
3929
3930         if (btrfs_is_free_space_inode(inode)) {
3931                 need_commit = 0;
3932                 ASSERT(current->journal_info);
3933         }
3934
3935 again:
3936         /* make sure we have enough space to handle the data first */
3937         spin_lock(&data_sinfo->lock);
3938         used = btrfs_space_info_used(data_sinfo, true);
3939
3940         if (used + bytes > data_sinfo->total_bytes) {
3941                 struct btrfs_trans_handle *trans;
3942
3943                 /*
3944                  * if we don't have enough free bytes in this space then we need
3945                  * to alloc a new chunk.
3946                  */
3947                 if (!data_sinfo->full) {
3948                         u64 alloc_target;
3949
3950                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3951                         spin_unlock(&data_sinfo->lock);
3952
3953                         alloc_target = btrfs_data_alloc_profile(fs_info);
3954                         /*
3955                          * It is ugly that we don't call nolock join
3956                          * transaction for the free space inode case here.
3957                          * But it is safe because we only do the data space
3958                          * reservation for the free space cache in the
3959                          * transaction context, the common join transaction
3960                          * just increase the counter of the current transaction
3961                          * handler, doesn't try to acquire the trans_lock of
3962                          * the fs.
3963                          */
3964                         trans = btrfs_join_transaction(root);
3965                         if (IS_ERR(trans))
3966                                 return PTR_ERR(trans);
3967
3968                         ret = btrfs_chunk_alloc(trans, alloc_target,
3969                                                 CHUNK_ALLOC_NO_FORCE);
3970                         btrfs_end_transaction(trans);
3971                         if (ret < 0) {
3972                                 if (ret != -ENOSPC)
3973                                         return ret;
3974                                 else {
3975                                         have_pinned_space = 1;
3976                                         goto commit_trans;
3977                                 }
3978                         }
3979
3980                         goto again;
3981                 }
3982
3983                 /*
3984                  * If we don't have enough pinned space to deal with this
3985                  * allocation, and no removed chunk in current transaction,
3986                  * don't bother committing the transaction.
3987                  */
3988                 have_pinned_space = __percpu_counter_compare(
3989                         &data_sinfo->total_bytes_pinned,
3990                         used + bytes - data_sinfo->total_bytes,
3991                         BTRFS_TOTAL_BYTES_PINNED_BATCH);
3992                 spin_unlock(&data_sinfo->lock);
3993
3994                 /* commit the current transaction and try again */
3995 commit_trans:
3996                 if (need_commit) {
3997                         need_commit--;
3998
3999                         if (need_commit > 0) {
4000                                 btrfs_start_delalloc_roots(fs_info, -1);
4001                                 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4002                                                          (u64)-1);
4003                         }
4004
4005                         trans = btrfs_join_transaction(root);
4006                         if (IS_ERR(trans))
4007                                 return PTR_ERR(trans);
4008                         if (have_pinned_space >= 0 ||
4009                             test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4010                                      &trans->transaction->flags) ||
4011                             need_commit > 0) {
4012                                 ret = btrfs_commit_transaction(trans);
4013                                 if (ret)
4014                                         return ret;
4015                                 /*
4016                                  * The cleaner kthread might still be doing iput
4017                                  * operations. Wait for it to finish so that
4018                                  * more space is released.  We don't need to
4019                                  * explicitly run the delayed iputs here because
4020                                  * the commit_transaction would have woken up
4021                                  * the cleaner.
4022                                  */
4023                                 ret = btrfs_wait_on_delayed_iputs(fs_info);
4024                                 if (ret)
4025                                         return ret;
4026                                 goto again;
4027                         } else {
4028                                 btrfs_end_transaction(trans);
4029                         }
4030                 }
4031
4032                 trace_btrfs_space_reservation(fs_info,
4033                                               "space_info:enospc",
4034                                               data_sinfo->flags, bytes, 1);
4035                 return -ENOSPC;
4036         }
4037         btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
4038         trace_btrfs_space_reservation(fs_info, "space_info",
4039                                       data_sinfo->flags, bytes, 1);
4040         spin_unlock(&data_sinfo->lock);
4041
4042         return 0;
4043 }
4044
4045 int btrfs_check_data_free_space(struct inode *inode,
4046                         struct extent_changeset **reserved, u64 start, u64 len)
4047 {
4048         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4049         int ret;
4050
4051         /* align the range */
4052         len = round_up(start + len, fs_info->sectorsize) -
4053               round_down(start, fs_info->sectorsize);
4054         start = round_down(start, fs_info->sectorsize);
4055
4056         ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4057         if (ret < 0)
4058                 return ret;
4059
4060         /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4061         ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4062         if (ret < 0)
4063                 btrfs_free_reserved_data_space_noquota(inode, start, len);
4064         else
4065                 ret = 0;
4066         return ret;
4067 }
4068
4069 /*
4070  * Called if we need to clear a data reservation for this inode
4071  * Normally in a error case.
4072  *
4073  * This one will *NOT* use accurate qgroup reserved space API, just for case
4074  * which we can't sleep and is sure it won't affect qgroup reserved space.
4075  * Like clear_bit_hook().
4076  */
4077 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4078                                             u64 len)
4079 {
4080         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4081         struct btrfs_space_info *data_sinfo;
4082
4083         /* Make sure the range is aligned to sectorsize */
4084         len = round_up(start + len, fs_info->sectorsize) -
4085               round_down(start, fs_info->sectorsize);
4086         start = round_down(start, fs_info->sectorsize);
4087
4088         data_sinfo = fs_info->data_sinfo;
4089         spin_lock(&data_sinfo->lock);
4090         btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
4091         trace_btrfs_space_reservation(fs_info, "space_info",
4092                                       data_sinfo->flags, len, 0);
4093         spin_unlock(&data_sinfo->lock);
4094 }
4095
4096 /*
4097  * Called if we need to clear a data reservation for this inode
4098  * Normally in a error case.
4099  *
4100  * This one will handle the per-inode data rsv map for accurate reserved
4101  * space framework.
4102  */
4103 void btrfs_free_reserved_data_space(struct inode *inode,
4104                         struct extent_changeset *reserved, u64 start, u64 len)
4105 {
4106         struct btrfs_root *root = BTRFS_I(inode)->root;
4107
4108         /* Make sure the range is aligned to sectorsize */
4109         len = round_up(start + len, root->fs_info->sectorsize) -
4110               round_down(start, root->fs_info->sectorsize);
4111         start = round_down(start, root->fs_info->sectorsize);
4112
4113         btrfs_free_reserved_data_space_noquota(inode, start, len);
4114         btrfs_qgroup_free_data(inode, reserved, start, len);
4115 }
4116
4117 static void force_metadata_allocation(struct btrfs_fs_info *info)
4118 {
4119         struct list_head *head = &info->space_info;
4120         struct btrfs_space_info *found;
4121
4122         rcu_read_lock();
4123         list_for_each_entry_rcu(found, head, list) {
4124                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4125                         found->force_alloc = CHUNK_ALLOC_FORCE;
4126         }
4127         rcu_read_unlock();
4128 }
4129
4130 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4131                               struct btrfs_space_info *sinfo, int force)
4132 {
4133         u64 bytes_used = btrfs_space_info_used(sinfo, false);
4134         u64 thresh;
4135
4136         if (force == CHUNK_ALLOC_FORCE)
4137                 return 1;
4138
4139         /*
4140          * in limited mode, we want to have some free space up to
4141          * about 1% of the FS size.
4142          */
4143         if (force == CHUNK_ALLOC_LIMITED) {
4144                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
4145                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4146
4147                 if (sinfo->total_bytes - bytes_used < thresh)
4148                         return 1;
4149         }
4150
4151         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4152                 return 0;
4153         return 1;
4154 }
4155
4156 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4157 {
4158         u64 num_dev;
4159
4160         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4161         if (!num_dev)
4162                 num_dev = fs_info->fs_devices->rw_devices;
4163
4164         return num_dev;
4165 }
4166
4167 /*
4168  * If @is_allocation is true, reserve space in the system space info necessary
4169  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4170  * removing a chunk.
4171  */
4172 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4173 {
4174         struct btrfs_fs_info *fs_info = trans->fs_info;
4175         struct btrfs_space_info *info;
4176         u64 left;
4177         u64 thresh;
4178         int ret = 0;
4179         u64 num_devs;
4180
4181         /*
4182          * Needed because we can end up allocating a system chunk and for an
4183          * atomic and race free space reservation in the chunk block reserve.
4184          */
4185         lockdep_assert_held(&fs_info->chunk_mutex);
4186
4187         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4188         spin_lock(&info->lock);
4189         left = info->total_bytes - btrfs_space_info_used(info, true);
4190         spin_unlock(&info->lock);
4191
4192         num_devs = get_profile_num_devs(fs_info, type);
4193
4194         /* num_devs device items to update and 1 chunk item to add or remove */
4195         thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4196                 btrfs_calc_trans_metadata_size(fs_info, 1);
4197
4198         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4199                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4200                            left, thresh, type);
4201                 dump_space_info(fs_info, info, 0, 0);
4202         }
4203
4204         if (left < thresh) {
4205                 u64 flags = btrfs_system_alloc_profile(fs_info);
4206
4207                 /*
4208                  * Ignore failure to create system chunk. We might end up not
4209                  * needing it, as we might not need to COW all nodes/leafs from
4210                  * the paths we visit in the chunk tree (they were already COWed
4211                  * or created in the current transaction for example).
4212                  */
4213                 ret = btrfs_alloc_chunk(trans, flags);
4214         }
4215
4216         if (!ret) {
4217                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4218                                           &fs_info->chunk_block_rsv,
4219                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4220                 if (!ret)
4221                         trans->chunk_bytes_reserved += thresh;
4222         }
4223 }
4224
4225 /*
4226  * If force is CHUNK_ALLOC_FORCE:
4227  *    - return 1 if it successfully allocates a chunk,
4228  *    - return errors including -ENOSPC otherwise.
4229  * If force is NOT CHUNK_ALLOC_FORCE:
4230  *    - return 0 if it doesn't need to allocate a new chunk,
4231  *    - return 1 if it successfully allocates a chunk,
4232  *    - return errors including -ENOSPC otherwise.
4233  */
4234 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4235                       enum btrfs_chunk_alloc_enum force)
4236 {
4237         struct btrfs_fs_info *fs_info = trans->fs_info;
4238         struct btrfs_space_info *space_info;
4239         bool wait_for_alloc = false;
4240         bool should_alloc = false;
4241         int ret = 0;
4242
4243         /* Don't re-enter if we're already allocating a chunk */
4244         if (trans->allocating_chunk)
4245                 return -ENOSPC;
4246
4247         space_info = btrfs_find_space_info(fs_info, flags);
4248         ASSERT(space_info);
4249
4250         do {
4251                 spin_lock(&space_info->lock);
4252                 if (force < space_info->force_alloc)
4253                         force = space_info->force_alloc;
4254                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4255                 if (space_info->full) {
4256                         /* No more free physical space */
4257                         if (should_alloc)
4258                                 ret = -ENOSPC;
4259                         else
4260                                 ret = 0;
4261                         spin_unlock(&space_info->lock);
4262                         return ret;
4263                 } else if (!should_alloc) {
4264                         spin_unlock(&space_info->lock);
4265                         return 0;
4266                 } else if (space_info->chunk_alloc) {
4267                         /*
4268                          * Someone is already allocating, so we need to block
4269                          * until this someone is finished and then loop to
4270                          * recheck if we should continue with our allocation
4271                          * attempt.
4272                          */
4273                         wait_for_alloc = true;
4274                         spin_unlock(&space_info->lock);
4275                         mutex_lock(&fs_info->chunk_mutex);
4276                         mutex_unlock(&fs_info->chunk_mutex);
4277                 } else {
4278                         /* Proceed with allocation */
4279                         space_info->chunk_alloc = 1;
4280                         wait_for_alloc = false;
4281                         spin_unlock(&space_info->lock);
4282                 }
4283
4284                 cond_resched();
4285         } while (wait_for_alloc);
4286
4287         mutex_lock(&fs_info->chunk_mutex);
4288         trans->allocating_chunk = true;
4289
4290         /*
4291          * If we have mixed data/metadata chunks we want to make sure we keep
4292          * allocating mixed chunks instead of individual chunks.
4293          */
4294         if (btrfs_mixed_space_info(space_info))
4295                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4296
4297         /*
4298          * if we're doing a data chunk, go ahead and make sure that
4299          * we keep a reasonable number of metadata chunks allocated in the
4300          * FS as well.
4301          */
4302         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4303                 fs_info->data_chunk_allocations++;
4304                 if (!(fs_info->data_chunk_allocations %
4305                       fs_info->metadata_ratio))
4306                         force_metadata_allocation(fs_info);
4307         }
4308
4309         /*
4310          * Check if we have enough space in SYSTEM chunk because we may need
4311          * to update devices.
4312          */
4313         check_system_chunk(trans, flags);
4314
4315         ret = btrfs_alloc_chunk(trans, flags);
4316         trans->allocating_chunk = false;
4317
4318         spin_lock(&space_info->lock);
4319         if (ret < 0) {
4320                 if (ret == -ENOSPC)
4321                         space_info->full = 1;
4322                 else
4323                         goto out;
4324         } else {
4325                 ret = 1;
4326                 space_info->max_extent_size = 0;
4327         }
4328
4329         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4330 out:
4331         space_info->chunk_alloc = 0;
4332         spin_unlock(&space_info->lock);
4333         mutex_unlock(&fs_info->chunk_mutex);
4334         /*
4335          * When we allocate a new chunk we reserve space in the chunk block
4336          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4337          * add new nodes/leafs to it if we end up needing to do it when
4338          * inserting the chunk item and updating device items as part of the
4339          * second phase of chunk allocation, performed by
4340          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4341          * large number of new block groups to create in our transaction
4342          * handle's new_bgs list to avoid exhausting the chunk block reserve
4343          * in extreme cases - like having a single transaction create many new
4344          * block groups when starting to write out the free space caches of all
4345          * the block groups that were made dirty during the lifetime of the
4346          * transaction.
4347          */
4348         if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4349                 btrfs_create_pending_block_groups(trans);
4350
4351         return ret;
4352 }
4353
4354 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4355                                          unsigned long nr_pages, int nr_items)
4356 {
4357         struct super_block *sb = fs_info->sb;
4358
4359         if (down_read_trylock(&sb->s_umount)) {
4360                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4361                 up_read(&sb->s_umount);
4362         } else {
4363                 /*
4364                  * We needn't worry the filesystem going from r/w to r/o though
4365                  * we don't acquire ->s_umount mutex, because the filesystem
4366                  * should guarantee the delalloc inodes list be empty after
4367                  * the filesystem is readonly(all dirty pages are written to
4368                  * the disk).
4369                  */
4370                 btrfs_start_delalloc_roots(fs_info, nr_items);
4371                 if (!current->journal_info)
4372                         btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4373         }
4374 }
4375
4376 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4377                                         u64 to_reclaim)
4378 {
4379         u64 bytes;
4380         u64 nr;
4381
4382         bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4383         nr = div64_u64(to_reclaim, bytes);
4384         if (!nr)
4385                 nr = 1;
4386         return nr;
4387 }
4388
4389 #define EXTENT_SIZE_PER_ITEM    SZ_256K
4390
4391 /*
4392  * shrink metadata reservation for delalloc
4393  */
4394 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4395                             u64 orig, bool wait_ordered)
4396 {
4397         struct btrfs_space_info *space_info;
4398         struct btrfs_trans_handle *trans;
4399         u64 delalloc_bytes;
4400         u64 dio_bytes;
4401         u64 async_pages;
4402         u64 items;
4403         long time_left;
4404         unsigned long nr_pages;
4405         int loops;
4406
4407         /* Calc the number of the pages we need flush for space reservation */
4408         items = calc_reclaim_items_nr(fs_info, to_reclaim);
4409         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4410
4411         trans = (struct btrfs_trans_handle *)current->journal_info;
4412         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4413
4414         delalloc_bytes = percpu_counter_sum_positive(
4415                                                 &fs_info->delalloc_bytes);
4416         dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
4417         if (delalloc_bytes == 0 && dio_bytes == 0) {
4418                 if (trans)
4419                         return;
4420                 if (wait_ordered)
4421                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4422                 return;
4423         }
4424
4425         /*
4426          * If we are doing more ordered than delalloc we need to just wait on
4427          * ordered extents, otherwise we'll waste time trying to flush delalloc
4428          * that likely won't give us the space back we need.
4429          */
4430         if (dio_bytes > delalloc_bytes)
4431                 wait_ordered = true;
4432
4433         loops = 0;
4434         while ((delalloc_bytes || dio_bytes) && loops < 3) {
4435                 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
4436
4437                 /*
4438                  * Triggers inode writeback for up to nr_pages. This will invoke
4439                  * ->writepages callback and trigger delalloc filling
4440                  *  (btrfs_run_delalloc_range()).
4441                  */
4442                 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4443
4444                 /*
4445                  * We need to wait for the compressed pages to start before
4446                  * we continue.
4447                  */
4448                 async_pages = atomic_read(&fs_info->async_delalloc_pages);
4449                 if (!async_pages)
4450                         goto skip_async;
4451
4452                 /*
4453                  * Calculate how many compressed pages we want to be written
4454                  * before we continue. I.e if there are more async pages than we
4455                  * require wait_event will wait until nr_pages are written.
4456                  */
4457                 if (async_pages <= nr_pages)
4458                         async_pages = 0;
4459                 else
4460                         async_pages -= nr_pages;
4461
4462                 wait_event(fs_info->async_submit_wait,
4463                            atomic_read(&fs_info->async_delalloc_pages) <=
4464                            (int)async_pages);
4465 skip_async:
4466                 spin_lock(&space_info->lock);
4467                 if (list_empty(&space_info->tickets) &&
4468                     list_empty(&space_info->priority_tickets)) {
4469                         spin_unlock(&space_info->lock);
4470                         break;
4471                 }
4472                 spin_unlock(&space_info->lock);
4473
4474                 loops++;
4475                 if (wait_ordered && !trans) {
4476                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4477                 } else {
4478                         time_left = schedule_timeout_killable(1);
4479                         if (time_left)
4480                                 break;
4481                 }
4482                 delalloc_bytes = percpu_counter_sum_positive(
4483                                                 &fs_info->delalloc_bytes);
4484                 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
4485         }
4486 }
4487
4488 /**
4489  * maybe_commit_transaction - possibly commit the transaction if its ok to
4490  * @root - the root we're allocating for
4491  * @bytes - the number of bytes we want to reserve
4492  * @force - force the commit
4493  *
4494  * This will check to make sure that committing the transaction will actually
4495  * get us somewhere and then commit the transaction if it does.  Otherwise it
4496  * will return -ENOSPC.
4497  */
4498 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4499                                   struct btrfs_space_info *space_info)
4500 {
4501         struct reserve_ticket *ticket = NULL;
4502         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4503         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4504         struct btrfs_trans_handle *trans;
4505         u64 bytes_needed;
4506         u64 reclaim_bytes = 0;
4507
4508         trans = (struct btrfs_trans_handle *)current->journal_info;
4509         if (trans)
4510                 return -EAGAIN;
4511
4512         spin_lock(&space_info->lock);
4513         if (!list_empty(&space_info->priority_tickets))
4514                 ticket = list_first_entry(&space_info->priority_tickets,
4515                                           struct reserve_ticket, list);
4516         else if (!list_empty(&space_info->tickets))
4517                 ticket = list_first_entry(&space_info->tickets,
4518                                           struct reserve_ticket, list);
4519         bytes_needed = (ticket) ? ticket->bytes : 0;
4520         spin_unlock(&space_info->lock);
4521
4522         if (!bytes_needed)
4523                 return 0;
4524
4525         trans = btrfs_join_transaction(fs_info->extent_root);
4526         if (IS_ERR(trans))
4527                 return PTR_ERR(trans);
4528
4529         /*
4530          * See if there is enough pinned space to make this reservation, or if
4531          * we have block groups that are going to be freed, allowing us to
4532          * possibly do a chunk allocation the next loop through.
4533          */
4534         if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
4535             __percpu_counter_compare(&space_info->total_bytes_pinned,
4536                                      bytes_needed,
4537                                      BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4538                 goto commit;
4539
4540         /*
4541          * See if there is some space in the delayed insertion reservation for
4542          * this reservation.
4543          */
4544         if (space_info != delayed_rsv->space_info)
4545                 goto enospc;
4546
4547         spin_lock(&delayed_rsv->lock);
4548         reclaim_bytes += delayed_rsv->reserved;
4549         spin_unlock(&delayed_rsv->lock);
4550
4551         spin_lock(&delayed_refs_rsv->lock);
4552         reclaim_bytes += delayed_refs_rsv->reserved;
4553         spin_unlock(&delayed_refs_rsv->lock);
4554         if (reclaim_bytes >= bytes_needed)
4555                 goto commit;
4556         bytes_needed -= reclaim_bytes;
4557
4558         if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4559                                    bytes_needed,
4560                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
4561                 goto enospc;
4562
4563 commit:
4564         return btrfs_commit_transaction(trans);
4565 enospc:
4566         btrfs_end_transaction(trans);
4567         return -ENOSPC;
4568 }
4569
4570 /*
4571  * Try to flush some data based on policy set by @state. This is only advisory
4572  * and may fail for various reasons. The caller is supposed to examine the
4573  * state of @space_info to detect the outcome.
4574  */
4575 static void flush_space(struct btrfs_fs_info *fs_info,
4576                        struct btrfs_space_info *space_info, u64 num_bytes,
4577                        int state)
4578 {
4579         struct btrfs_root *root = fs_info->extent_root;
4580         struct btrfs_trans_handle *trans;
4581         int nr;
4582         int ret = 0;
4583
4584         switch (state) {
4585         case FLUSH_DELAYED_ITEMS_NR:
4586         case FLUSH_DELAYED_ITEMS:
4587                 if (state == FLUSH_DELAYED_ITEMS_NR)
4588                         nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4589                 else
4590                         nr = -1;
4591
4592                 trans = btrfs_join_transaction(root);
4593                 if (IS_ERR(trans)) {
4594                         ret = PTR_ERR(trans);
4595                         break;
4596                 }
4597                 ret = btrfs_run_delayed_items_nr(trans, nr);
4598                 btrfs_end_transaction(trans);
4599                 break;
4600         case FLUSH_DELALLOC:
4601         case FLUSH_DELALLOC_WAIT:
4602                 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4603                                 state == FLUSH_DELALLOC_WAIT);
4604                 break;
4605         case FLUSH_DELAYED_REFS_NR:
4606         case FLUSH_DELAYED_REFS:
4607                 trans = btrfs_join_transaction(root);
4608                 if (IS_ERR(trans)) {
4609                         ret = PTR_ERR(trans);
4610                         break;
4611                 }
4612                 if (state == FLUSH_DELAYED_REFS_NR)
4613                         nr = calc_reclaim_items_nr(fs_info, num_bytes);
4614                 else
4615                         nr = 0;
4616                 btrfs_run_delayed_refs(trans, nr);
4617                 btrfs_end_transaction(trans);
4618                 break;
4619         case ALLOC_CHUNK:
4620         case ALLOC_CHUNK_FORCE:
4621                 trans = btrfs_join_transaction(root);
4622                 if (IS_ERR(trans)) {
4623                         ret = PTR_ERR(trans);
4624                         break;
4625                 }
4626                 ret = btrfs_chunk_alloc(trans,
4627                                 btrfs_metadata_alloc_profile(fs_info),
4628                                 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
4629                                         CHUNK_ALLOC_FORCE);
4630                 btrfs_end_transaction(trans);
4631                 if (ret > 0 || ret == -ENOSPC)
4632                         ret = 0;
4633                 break;
4634         case COMMIT_TRANS:
4635                 /*
4636                  * If we have pending delayed iputs then we could free up a
4637                  * bunch of pinned space, so make sure we run the iputs before
4638                  * we do our pinned bytes check below.
4639                  */
4640                 btrfs_run_delayed_iputs(fs_info);
4641                 btrfs_wait_on_delayed_iputs(fs_info);
4642
4643                 ret = may_commit_transaction(fs_info, space_info);
4644                 break;
4645         default:
4646                 ret = -ENOSPC;
4647                 break;
4648         }
4649
4650         trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4651                                 ret);
4652         return;
4653 }
4654
4655 static inline u64
4656 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4657                                  struct btrfs_space_info *space_info,
4658                                  bool system_chunk)
4659 {
4660         struct reserve_ticket *ticket;
4661         u64 used;
4662         u64 expected;
4663         u64 to_reclaim = 0;
4664
4665         list_for_each_entry(ticket, &space_info->tickets, list)
4666                 to_reclaim += ticket->bytes;
4667         list_for_each_entry(ticket, &space_info->priority_tickets, list)
4668                 to_reclaim += ticket->bytes;
4669         if (to_reclaim)
4670                 return to_reclaim;
4671
4672         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4673         if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
4674                                  BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4675                 return 0;
4676
4677         used = btrfs_space_info_used(space_info, true);
4678
4679         if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
4680                                  BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4681                 expected = div_factor_fine(space_info->total_bytes, 95);
4682         else
4683                 expected = div_factor_fine(space_info->total_bytes, 90);
4684
4685         if (used > expected)
4686                 to_reclaim = used - expected;
4687         else
4688                 to_reclaim = 0;
4689         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4690                                      space_info->bytes_reserved);
4691         return to_reclaim;
4692 }
4693
4694 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4695                                         struct btrfs_space_info *space_info,
4696                                         u64 used, bool system_chunk)
4697 {
4698         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4699
4700         /* If we're just plain full then async reclaim just slows us down. */
4701         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4702                 return 0;
4703
4704         if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4705                                               system_chunk))
4706                 return 0;
4707
4708         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4709                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4710 }
4711
4712 static bool wake_all_tickets(struct list_head *head)
4713 {
4714         struct reserve_ticket *ticket;
4715
4716         while (!list_empty(head)) {
4717                 ticket = list_first_entry(head, struct reserve_ticket, list);
4718                 list_del_init(&ticket->list);
4719                 ticket->error = -ENOSPC;
4720                 wake_up(&ticket->wait);
4721                 if (ticket->bytes != ticket->orig_bytes)
4722                         return true;
4723         }
4724         return false;
4725 }
4726
4727 /*
4728  * This is for normal flushers, we can wait all goddamned day if we want to.  We
4729  * will loop and continuously try to flush as long as we are making progress.
4730  * We count progress as clearing off tickets each time we have to loop.
4731  */
4732 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4733 {
4734         struct btrfs_fs_info *fs_info;
4735         struct btrfs_space_info *space_info;
4736         u64 to_reclaim;
4737         int flush_state;
4738         int commit_cycles = 0;
4739         u64 last_tickets_id;
4740
4741         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4742         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4743
4744         spin_lock(&space_info->lock);
4745         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4746                                                       false);
4747         if (!to_reclaim) {
4748                 space_info->flush = 0;
4749                 spin_unlock(&space_info->lock);
4750                 return;
4751         }
4752         last_tickets_id = space_info->tickets_id;
4753         spin_unlock(&space_info->lock);
4754
4755         flush_state = FLUSH_DELAYED_ITEMS_NR;
4756         do {
4757                 flush_space(fs_info, space_info, to_reclaim, flush_state);
4758                 spin_lock(&space_info->lock);
4759                 if (list_empty(&space_info->tickets)) {
4760                         space_info->flush = 0;
4761                         spin_unlock(&space_info->lock);
4762                         return;
4763                 }
4764                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
4765                                                               space_info,
4766                                                               false);
4767                 if (last_tickets_id == space_info->tickets_id) {
4768                         flush_state++;
4769                 } else {
4770                         last_tickets_id = space_info->tickets_id;
4771                         flush_state = FLUSH_DELAYED_ITEMS_NR;
4772                         if (commit_cycles)
4773                                 commit_cycles--;
4774                 }
4775
4776                 /*
4777                  * We don't want to force a chunk allocation until we've tried
4778                  * pretty hard to reclaim space.  Think of the case where we
4779                  * freed up a bunch of space and so have a lot of pinned space
4780                  * to reclaim.  We would rather use that than possibly create a
4781                  * underutilized metadata chunk.  So if this is our first run
4782                  * through the flushing state machine skip ALLOC_CHUNK_FORCE and
4783                  * commit the transaction.  If nothing has changed the next go
4784                  * around then we can force a chunk allocation.
4785                  */
4786                 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
4787                         flush_state++;
4788
4789                 if (flush_state > COMMIT_TRANS) {
4790                         commit_cycles++;
4791                         if (commit_cycles > 2) {
4792                                 if (wake_all_tickets(&space_info->tickets)) {
4793                                         flush_state = FLUSH_DELAYED_ITEMS_NR;
4794                                         commit_cycles--;
4795                                 } else {
4796                                         space_info->flush = 0;
4797                                 }
4798                         } else {
4799                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
4800                         }
4801                 }
4802                 spin_unlock(&space_info->lock);
4803         } while (flush_state <= COMMIT_TRANS);
4804 }
4805
4806 void btrfs_init_async_reclaim_work(struct work_struct *work)
4807 {
4808         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4809 }
4810
4811 static const enum btrfs_flush_state priority_flush_states[] = {
4812         FLUSH_DELAYED_ITEMS_NR,
4813         FLUSH_DELAYED_ITEMS,
4814         ALLOC_CHUNK,
4815 };
4816
4817 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
4818                                             struct btrfs_space_info *space_info,
4819                                             struct reserve_ticket *ticket)
4820 {
4821         u64 to_reclaim;
4822         int flush_state;
4823
4824         spin_lock(&space_info->lock);
4825         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4826                                                       false);
4827         if (!to_reclaim) {
4828                 spin_unlock(&space_info->lock);
4829                 return;
4830         }
4831         spin_unlock(&space_info->lock);
4832
4833         flush_state = 0;
4834         do {
4835                 flush_space(fs_info, space_info, to_reclaim,
4836                             priority_flush_states[flush_state]);
4837                 flush_state++;
4838                 spin_lock(&space_info->lock);
4839                 if (ticket->bytes == 0) {
4840                         spin_unlock(&space_info->lock);
4841                         return;
4842                 }
4843                 spin_unlock(&space_info->lock);
4844         } while (flush_state < ARRAY_SIZE(priority_flush_states));
4845 }
4846
4847 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
4848                                struct btrfs_space_info *space_info,
4849                                struct reserve_ticket *ticket)
4850
4851 {
4852         DEFINE_WAIT(wait);
4853         u64 reclaim_bytes = 0;
4854         int ret = 0;
4855
4856         spin_lock(&space_info->lock);
4857         while (ticket->bytes > 0 && ticket->error == 0) {
4858                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
4859                 if (ret) {
4860                         ret = -EINTR;
4861                         break;
4862                 }
4863                 spin_unlock(&space_info->lock);
4864
4865                 schedule();
4866
4867                 finish_wait(&ticket->wait, &wait);
4868                 spin_lock(&space_info->lock);
4869         }
4870         if (!ret)
4871                 ret = ticket->error;
4872         if (!list_empty(&ticket->list))
4873                 list_del_init(&ticket->list);
4874         if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
4875                 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
4876         spin_unlock(&space_info->lock);
4877
4878         if (reclaim_bytes)
4879                 btrfs_space_info_add_old_bytes(fs_info, space_info,
4880                                                reclaim_bytes);
4881         return ret;
4882 }
4883
4884 /**
4885  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4886  * @root - the root we're allocating for
4887  * @space_info - the space info we want to allocate from
4888  * @orig_bytes - the number of bytes we want
4889  * @flush - whether or not we can flush to make our reservation
4890  *
4891  * This will reserve orig_bytes number of bytes from the space info associated
4892  * with the block_rsv.  If there is not enough space it will make an attempt to
4893  * flush out space to make room.  It will do this by flushing delalloc if
4894  * possible or committing the transaction.  If flush is 0 then no attempts to
4895  * regain reservations will be made and this will fail if there is not enough
4896  * space already.
4897  */
4898 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
4899                                     struct btrfs_space_info *space_info,
4900                                     u64 orig_bytes,
4901                                     enum btrfs_reserve_flush_enum flush,
4902                                     bool system_chunk)
4903 {
4904         struct reserve_ticket ticket;
4905         u64 used;
4906         u64 reclaim_bytes = 0;
4907         int ret = 0;
4908
4909         ASSERT(orig_bytes);
4910         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
4911
4912         spin_lock(&space_info->lock);
4913         ret = -ENOSPC;
4914         used = btrfs_space_info_used(space_info, true);
4915
4916         /*
4917          * If we have enough space then hooray, make our reservation and carry
4918          * on.  If not see if we can overcommit, and if we can, hooray carry on.
4919          * If not things get more complicated.
4920          */
4921         if (used + orig_bytes <= space_info->total_bytes) {
4922                 btrfs_space_info_update_bytes_may_use(fs_info, space_info,
4923                                                       orig_bytes);
4924                 trace_btrfs_space_reservation(fs_info, "space_info",
4925                                               space_info->flags, orig_bytes, 1);
4926                 ret = 0;
4927         } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush,
4928                                         system_chunk)) {
4929                 btrfs_space_info_update_bytes_may_use(fs_info, space_info,
4930                                                       orig_bytes);
4931                 trace_btrfs_space_reservation(fs_info, "space_info",
4932                                               space_info->flags, orig_bytes, 1);
4933                 ret = 0;
4934         }
4935
4936         /*
4937          * If we couldn't make a reservation then setup our reservation ticket
4938          * and kick the async worker if it's not already running.
4939          *
4940          * If we are a priority flusher then we just need to add our ticket to
4941          * the list and we will do our own flushing further down.
4942          */
4943         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4944                 ticket.orig_bytes = orig_bytes;
4945                 ticket.bytes = orig_bytes;
4946                 ticket.error = 0;
4947                 init_waitqueue_head(&ticket.wait);
4948                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
4949                         list_add_tail(&ticket.list, &space_info->tickets);
4950                         if (!space_info->flush) {
4951                                 space_info->flush = 1;
4952                                 trace_btrfs_trigger_flush(fs_info,
4953                                                           space_info->flags,
4954                                                           orig_bytes, flush,
4955                                                           "enospc");
4956                                 queue_work(system_unbound_wq,
4957                                            &fs_info->async_reclaim_work);
4958                         }
4959                 } else {
4960                         list_add_tail(&ticket.list,
4961                                       &space_info->priority_tickets);
4962                 }
4963         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4964                 used += orig_bytes;
4965                 /*
4966                  * We will do the space reservation dance during log replay,
4967                  * which means we won't have fs_info->fs_root set, so don't do
4968                  * the async reclaim as we will panic.
4969                  */
4970                 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
4971                     need_do_async_reclaim(fs_info, space_info,
4972                                           used, system_chunk) &&
4973                     !work_busy(&fs_info->async_reclaim_work)) {
4974                         trace_btrfs_trigger_flush(fs_info, space_info->flags,
4975                                                   orig_bytes, flush, "preempt");
4976                         queue_work(system_unbound_wq,
4977                                    &fs_info->async_reclaim_work);
4978                 }
4979         }
4980         spin_unlock(&space_info->lock);
4981         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4982                 return ret;
4983
4984         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4985                 return wait_reserve_ticket(fs_info, space_info, &ticket);
4986
4987         ret = 0;
4988         priority_reclaim_metadata_space(fs_info, space_info, &ticket);
4989         spin_lock(&space_info->lock);
4990         if (ticket.bytes) {
4991                 if (ticket.bytes < orig_bytes)
4992                         reclaim_bytes = orig_bytes - ticket.bytes;
4993                 list_del_init(&ticket.list);
4994                 ret = -ENOSPC;
4995         }
4996         spin_unlock(&space_info->lock);
4997
4998         if (reclaim_bytes)
4999                 btrfs_space_info_add_old_bytes(fs_info, space_info,
5000                                                reclaim_bytes);
5001         ASSERT(list_empty(&ticket.list));
5002         return ret;
5003 }
5004
5005 /**
5006  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5007  * @root - the root we're allocating for
5008  * @block_rsv - the block_rsv we're allocating for
5009  * @orig_bytes - the number of bytes we want
5010  * @flush - whether or not we can flush to make our reservation
5011  *
5012  * This will reserve orig_bytes number of bytes from the space info associated
5013  * with the block_rsv.  If there is not enough space it will make an attempt to
5014  * flush out space to make room.  It will do this by flushing delalloc if
5015  * possible or committing the transaction.  If flush is 0 then no attempts to
5016  * regain reservations will be made and this will fail if there is not enough
5017  * space already.
5018  */
5019 static int reserve_metadata_bytes(struct btrfs_root *root,
5020                                   struct btrfs_block_rsv *block_rsv,
5021                                   u64 orig_bytes,
5022                                   enum btrfs_reserve_flush_enum flush)
5023 {
5024         struct btrfs_fs_info *fs_info = root->fs_info;
5025         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5026         int ret;
5027         bool system_chunk = (root == fs_info->chunk_root);
5028
5029         ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5030                                        orig_bytes, flush, system_chunk);
5031         if (ret == -ENOSPC &&
5032             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5033                 if (block_rsv != global_rsv &&
5034                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5035                         ret = 0;
5036         }
5037         if (ret == -ENOSPC) {
5038                 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5039                                               block_rsv->space_info->flags,
5040                                               orig_bytes, 1);
5041
5042                 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5043                         dump_space_info(fs_info, block_rsv->space_info,
5044                                         orig_bytes, 0);
5045         }
5046         return ret;
5047 }
5048
5049 static struct btrfs_block_rsv *get_block_rsv(
5050                                         const struct btrfs_trans_handle *trans,
5051                                         const struct btrfs_root *root)
5052 {
5053         struct btrfs_fs_info *fs_info = root->fs_info;
5054         struct btrfs_block_rsv *block_rsv = NULL;
5055
5056         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5057             (root == fs_info->csum_root && trans->adding_csums) ||
5058             (root == fs_info->uuid_root))
5059                 block_rsv = trans->block_rsv;
5060
5061         if (!block_rsv)
5062                 block_rsv = root->block_rsv;
5063
5064         if (!block_rsv)
5065                 block_rsv = &fs_info->empty_block_rsv;
5066
5067         return block_rsv;
5068 }
5069
5070 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5071                                u64 num_bytes)
5072 {
5073         int ret = -ENOSPC;
5074         spin_lock(&block_rsv->lock);
5075         if (block_rsv->reserved >= num_bytes) {
5076                 block_rsv->reserved -= num_bytes;
5077                 if (block_rsv->reserved < block_rsv->size)
5078                         block_rsv->full = 0;
5079                 ret = 0;
5080         }
5081         spin_unlock(&block_rsv->lock);
5082         return ret;
5083 }
5084
5085 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5086                                 u64 num_bytes, bool update_size)
5087 {
5088         spin_lock(&block_rsv->lock);
5089         block_rsv->reserved += num_bytes;
5090         if (update_size)
5091                 block_rsv->size += num_bytes;
5092         else if (block_rsv->reserved >= block_rsv->size)
5093                 block_rsv->full = 1;
5094         spin_unlock(&block_rsv->lock);
5095 }
5096
5097 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5098                              struct btrfs_block_rsv *dest, u64 num_bytes,
5099                              int min_factor)
5100 {
5101         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5102         u64 min_bytes;
5103
5104         if (global_rsv->space_info != dest->space_info)
5105                 return -ENOSPC;
5106
5107         spin_lock(&global_rsv->lock);
5108         min_bytes = div_factor(global_rsv->size, min_factor);
5109         if (global_rsv->reserved < min_bytes + num_bytes) {
5110                 spin_unlock(&global_rsv->lock);
5111                 return -ENOSPC;
5112         }
5113         global_rsv->reserved -= num_bytes;
5114         if (global_rsv->reserved < global_rsv->size)
5115                 global_rsv->full = 0;
5116         spin_unlock(&global_rsv->lock);
5117
5118         block_rsv_add_bytes(dest, num_bytes, true);
5119         return 0;
5120 }
5121
5122 /**
5123  * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
5124  * @fs_info - the fs info for our fs.
5125  * @src - the source block rsv to transfer from.
5126  * @num_bytes - the number of bytes to transfer.
5127  *
5128  * This transfers up to the num_bytes amount from the src rsv to the
5129  * delayed_refs_rsv.  Any extra bytes are returned to the space info.
5130  */
5131 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
5132                                        struct btrfs_block_rsv *src,
5133                                        u64 num_bytes)
5134 {
5135         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5136         u64 to_free = 0;
5137
5138         spin_lock(&src->lock);
5139         src->reserved -= num_bytes;
5140         src->size -= num_bytes;
5141         spin_unlock(&src->lock);
5142
5143         spin_lock(&delayed_refs_rsv->lock);
5144         if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
5145                 u64 delta = delayed_refs_rsv->size -
5146                         delayed_refs_rsv->reserved;
5147                 if (num_bytes > delta) {
5148                         to_free = num_bytes - delta;
5149                         num_bytes = delta;
5150                 }
5151         } else {
5152                 to_free = num_bytes;
5153                 num_bytes = 0;
5154         }
5155
5156         if (num_bytes)
5157                 delayed_refs_rsv->reserved += num_bytes;
5158         if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
5159                 delayed_refs_rsv->full = 1;
5160         spin_unlock(&delayed_refs_rsv->lock);
5161
5162         if (num_bytes)
5163                 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5164                                               0, num_bytes, 1);
5165         if (to_free)
5166                 btrfs_space_info_add_old_bytes(fs_info,
5167                                 delayed_refs_rsv->space_info, to_free);
5168 }
5169
5170 /**
5171  * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
5172  * @fs_info - the fs_info for our fs.
5173  * @flush - control how we can flush for this reservation.
5174  *
5175  * This will refill the delayed block_rsv up to 1 items size worth of space and
5176  * will return -ENOSPC if we can't make the reservation.
5177  */
5178 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
5179                                   enum btrfs_reserve_flush_enum flush)
5180 {
5181         struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5182         u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
5183         u64 num_bytes = 0;
5184         int ret = -ENOSPC;
5185
5186         spin_lock(&block_rsv->lock);
5187         if (block_rsv->reserved < block_rsv->size) {
5188                 num_bytes = block_rsv->size - block_rsv->reserved;
5189                 num_bytes = min(num_bytes, limit);
5190         }
5191         spin_unlock(&block_rsv->lock);
5192
5193         if (!num_bytes)
5194                 return 0;
5195
5196         ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
5197                                      num_bytes, flush);
5198         if (ret)
5199                 return ret;
5200         block_rsv_add_bytes(block_rsv, num_bytes, 0);
5201         trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5202                                       0, num_bytes, 1);
5203         return 0;
5204 }
5205
5206 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5207                                     struct btrfs_block_rsv *block_rsv,
5208                                     struct btrfs_block_rsv *dest, u64 num_bytes,
5209                                     u64 *qgroup_to_release_ret)
5210 {
5211         struct btrfs_space_info *space_info = block_rsv->space_info;
5212         u64 qgroup_to_release = 0;
5213         u64 ret;
5214
5215         spin_lock(&block_rsv->lock);
5216         if (num_bytes == (u64)-1) {
5217                 num_bytes = block_rsv->size;
5218                 qgroup_to_release = block_rsv->qgroup_rsv_size;
5219         }
5220         block_rsv->size -= num_bytes;
5221         if (block_rsv->reserved >= block_rsv->size) {
5222                 num_bytes = block_rsv->reserved - block_rsv->size;
5223                 block_rsv->reserved = block_rsv->size;
5224                 block_rsv->full = 1;
5225         } else {
5226                 num_bytes = 0;
5227         }
5228         if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5229                 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5230                                     block_rsv->qgroup_rsv_size;
5231                 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5232         } else {
5233                 qgroup_to_release = 0;
5234         }
5235         spin_unlock(&block_rsv->lock);
5236
5237         ret = num_bytes;
5238         if (num_bytes > 0) {
5239                 if (dest) {
5240                         spin_lock(&dest->lock);
5241                         if (!dest->full) {
5242                                 u64 bytes_to_add;
5243
5244                                 bytes_to_add = dest->size - dest->reserved;
5245                                 bytes_to_add = min(num_bytes, bytes_to_add);
5246                                 dest->reserved += bytes_to_add;
5247                                 if (dest->reserved >= dest->size)
5248                                         dest->full = 1;
5249                                 num_bytes -= bytes_to_add;
5250                         }
5251                         spin_unlock(&dest->lock);
5252                 }
5253                 if (num_bytes)
5254                         btrfs_space_info_add_old_bytes(fs_info, space_info,
5255                                                        num_bytes);
5256         }
5257         if (qgroup_to_release_ret)
5258                 *qgroup_to_release_ret = qgroup_to_release;
5259         return ret;
5260 }
5261
5262 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5263                             struct btrfs_block_rsv *dst, u64 num_bytes,
5264                             bool update_size)
5265 {
5266         int ret;
5267
5268         ret = block_rsv_use_bytes(src, num_bytes);
5269         if (ret)
5270                 return ret;
5271
5272         block_rsv_add_bytes(dst, num_bytes, update_size);
5273         return 0;
5274 }
5275
5276 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5277 {
5278         memset(rsv, 0, sizeof(*rsv));
5279         spin_lock_init(&rsv->lock);
5280         rsv->type = type;
5281 }
5282
5283 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5284                                    struct btrfs_block_rsv *rsv,
5285                                    unsigned short type)
5286 {
5287         btrfs_init_block_rsv(rsv, type);
5288         rsv->space_info = btrfs_find_space_info(fs_info,
5289                                             BTRFS_BLOCK_GROUP_METADATA);
5290 }
5291
5292 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5293                                               unsigned short type)
5294 {
5295         struct btrfs_block_rsv *block_rsv;
5296
5297         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5298         if (!block_rsv)
5299                 return NULL;
5300
5301         btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5302         return block_rsv;
5303 }
5304
5305 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5306                           struct btrfs_block_rsv *rsv)
5307 {
5308         if (!rsv)
5309                 return;
5310         btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5311         kfree(rsv);
5312 }
5313
5314 int btrfs_block_rsv_add(struct btrfs_root *root,
5315                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5316                         enum btrfs_reserve_flush_enum flush)
5317 {
5318         int ret;
5319
5320         if (num_bytes == 0)
5321                 return 0;
5322
5323         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5324         if (!ret)
5325                 block_rsv_add_bytes(block_rsv, num_bytes, true);
5326
5327         return ret;
5328 }
5329
5330 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5331 {
5332         u64 num_bytes = 0;
5333         int ret = -ENOSPC;
5334
5335         if (!block_rsv)
5336                 return 0;
5337
5338         spin_lock(&block_rsv->lock);
5339         num_bytes = div_factor(block_rsv->size, min_factor);
5340         if (block_rsv->reserved >= num_bytes)
5341                 ret = 0;
5342         spin_unlock(&block_rsv->lock);
5343
5344         return ret;
5345 }
5346
5347 int btrfs_block_rsv_refill(struct btrfs_root *root,
5348                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5349                            enum btrfs_reserve_flush_enum flush)
5350 {
5351         u64 num_bytes = 0;
5352         int ret = -ENOSPC;
5353
5354         if (!block_rsv)
5355                 return 0;
5356
5357         spin_lock(&block_rsv->lock);
5358         num_bytes = min_reserved;
5359         if (block_rsv->reserved >= num_bytes)
5360                 ret = 0;
5361         else
5362                 num_bytes -= block_rsv->reserved;
5363         spin_unlock(&block_rsv->lock);
5364
5365         if (!ret)
5366                 return 0;
5367
5368         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5369         if (!ret) {
5370                 block_rsv_add_bytes(block_rsv, num_bytes, false);
5371                 return 0;
5372         }
5373
5374         return ret;
5375 }
5376
5377 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5378                                      struct btrfs_block_rsv *block_rsv,
5379                                      u64 num_bytes, u64 *qgroup_to_release)
5380 {
5381         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5382         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5383         struct btrfs_block_rsv *target = delayed_rsv;
5384
5385         if (target->full || target == block_rsv)
5386                 target = global_rsv;
5387
5388         if (block_rsv->space_info != target->space_info)
5389                 target = NULL;
5390
5391         return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
5392                                        qgroup_to_release);
5393 }
5394
5395 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5396                              struct btrfs_block_rsv *block_rsv,
5397                              u64 num_bytes)
5398 {
5399         __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
5400 }
5401
5402 /**
5403  * btrfs_inode_rsv_release - release any excessive reservation.
5404  * @inode - the inode we need to release from.
5405  * @qgroup_free - free or convert qgroup meta.
5406  *   Unlike normal operation, qgroup meta reservation needs to know if we are
5407  *   freeing qgroup reservation or just converting it into per-trans.  Normally
5408  *   @qgroup_free is true for error handling, and false for normal release.
5409  *
5410  * This is the same as btrfs_block_rsv_release, except that it handles the
5411  * tracepoint for the reservation.
5412  */
5413 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5414 {
5415         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5416         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5417         u64 released = 0;
5418         u64 qgroup_to_release = 0;
5419
5420         /*
5421          * Since we statically set the block_rsv->size we just want to say we
5422          * are releasing 0 bytes, and then we'll just get the reservation over
5423          * the size free'd.
5424          */
5425         released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
5426                                              &qgroup_to_release);
5427         if (released > 0)
5428                 trace_btrfs_space_reservation(fs_info, "delalloc",
5429                                               btrfs_ino(inode), released, 0);
5430         if (qgroup_free)
5431                 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5432         else
5433                 btrfs_qgroup_convert_reserved_meta(inode->root,
5434                                                    qgroup_to_release);
5435 }
5436
5437 /**
5438  * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
5439  * @fs_info - the fs_info for our fs.
5440  * @nr - the number of items to drop.
5441  *
5442  * This drops the delayed ref head's count from the delayed refs rsv and frees
5443  * any excess reservation we had.
5444  */
5445 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
5446 {
5447         struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5448         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5449         u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
5450         u64 released = 0;
5451
5452         released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
5453                                            num_bytes, NULL);
5454         if (released)
5455                 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5456                                               0, released, 0);
5457 }
5458
5459 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5460 {
5461         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5462         struct btrfs_space_info *sinfo = block_rsv->space_info;
5463         u64 num_bytes;
5464
5465         /*
5466          * The global block rsv is based on the size of the extent tree, the
5467          * checksum tree and the root tree.  If the fs is empty we want to set
5468          * it to a minimal amount for safety.
5469          */
5470         num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5471                 btrfs_root_used(&fs_info->csum_root->root_item) +
5472                 btrfs_root_used(&fs_info->tree_root->root_item);
5473         num_bytes = max_t(u64, num_bytes, SZ_16M);
5474
5475         spin_lock(&sinfo->lock);
5476         spin_lock(&block_rsv->lock);
5477
5478         block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5479
5480         if (block_rsv->reserved < block_rsv->size) {
5481                 num_bytes = btrfs_space_info_used(sinfo, true);
5482                 if (sinfo->total_bytes > num_bytes) {
5483                         num_bytes = sinfo->total_bytes - num_bytes;
5484                         num_bytes = min(num_bytes,
5485                                         block_rsv->size - block_rsv->reserved);
5486                         block_rsv->reserved += num_bytes;
5487                         btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
5488                                                               num_bytes);
5489                         trace_btrfs_space_reservation(fs_info, "space_info",
5490                                                       sinfo->flags, num_bytes,
5491                                                       1);
5492                 }
5493         } else if (block_rsv->reserved > block_rsv->size) {
5494                 num_bytes = block_rsv->reserved - block_rsv->size;
5495                 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
5496                                                       -num_bytes);
5497                 trace_btrfs_space_reservation(fs_info, "space_info",
5498                                       sinfo->flags, num_bytes, 0);
5499                 block_rsv->reserved = block_rsv->size;
5500         }
5501
5502         if (block_rsv->reserved == block_rsv->size)
5503                 block_rsv->full = 1;
5504         else
5505                 block_rsv->full = 0;
5506
5507         spin_unlock(&block_rsv->lock);
5508         spin_unlock(&sinfo->lock);
5509 }
5510
5511 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5512 {
5513         struct btrfs_space_info *space_info;
5514
5515         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5516         fs_info->chunk_block_rsv.space_info = space_info;
5517
5518         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5519         fs_info->global_block_rsv.space_info = space_info;
5520         fs_info->trans_block_rsv.space_info = space_info;
5521         fs_info->empty_block_rsv.space_info = space_info;
5522         fs_info->delayed_block_rsv.space_info = space_info;
5523         fs_info->delayed_refs_rsv.space_info = space_info;
5524
5525         fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
5526         fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
5527         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5528         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5529         if (fs_info->quota_root)
5530                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5531         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5532
5533         update_global_block_rsv(fs_info);
5534 }
5535
5536 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5537 {
5538         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5539                                 (u64)-1, NULL);
5540         WARN_ON(fs_info->trans_block_rsv.size > 0);
5541         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5542         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5543         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5544         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5545         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5546         WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
5547         WARN_ON(fs_info->delayed_refs_rsv.size > 0);
5548 }
5549
5550 /*
5551  * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
5552  * @trans - the trans that may have generated delayed refs
5553  *
5554  * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
5555  * it'll calculate the additional size and add it to the delayed_refs_rsv.
5556  */
5557 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
5558 {
5559         struct btrfs_fs_info *fs_info = trans->fs_info;
5560         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5561         u64 num_bytes;
5562
5563         if (!trans->delayed_ref_updates)
5564                 return;
5565
5566         num_bytes = btrfs_calc_trans_metadata_size(fs_info,
5567                                                    trans->delayed_ref_updates);
5568         spin_lock(&delayed_rsv->lock);
5569         delayed_rsv->size += num_bytes;
5570         delayed_rsv->full = 0;
5571         spin_unlock(&delayed_rsv->lock);
5572         trans->delayed_ref_updates = 0;
5573 }
5574
5575 /*
5576  * To be called after all the new block groups attached to the transaction
5577  * handle have been created (btrfs_create_pending_block_groups()).
5578  */
5579 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5580 {
5581         struct btrfs_fs_info *fs_info = trans->fs_info;
5582
5583         if (!trans->chunk_bytes_reserved)
5584                 return;
5585
5586         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5587
5588         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5589                                 trans->chunk_bytes_reserved, NULL);
5590         trans->chunk_bytes_reserved = 0;
5591 }
5592
5593 /*
5594  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5595  * root: the root of the parent directory
5596  * rsv: block reservation
5597  * items: the number of items that we need do reservation
5598  * use_global_rsv: allow fallback to the global block reservation
5599  *
5600  * This function is used to reserve the space for snapshot/subvolume
5601  * creation and deletion. Those operations are different with the
5602  * common file/directory operations, they change two fs/file trees
5603  * and root tree, the number of items that the qgroup reserves is
5604  * different with the free space reservation. So we can not use
5605  * the space reservation mechanism in start_transaction().
5606  */
5607 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5608                                      struct btrfs_block_rsv *rsv, int items,
5609                                      bool use_global_rsv)
5610 {
5611         u64 qgroup_num_bytes = 0;
5612         u64 num_bytes;
5613         int ret;
5614         struct btrfs_fs_info *fs_info = root->fs_info;
5615         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5616
5617         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5618                 /* One for parent inode, two for dir entries */
5619                 qgroup_num_bytes = 3 * fs_info->nodesize;
5620                 ret = btrfs_qgroup_reserve_meta_prealloc(root,
5621                                 qgroup_num_bytes, true);
5622                 if (ret)
5623                         return ret;
5624         }
5625
5626         num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5627         rsv->space_info = btrfs_find_space_info(fs_info,
5628                                             BTRFS_BLOCK_GROUP_METADATA);
5629         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5630                                   BTRFS_RESERVE_FLUSH_ALL);
5631
5632         if (ret == -ENOSPC && use_global_rsv)
5633                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
5634
5635         if (ret && qgroup_num_bytes)
5636                 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5637
5638         return ret;
5639 }
5640
5641 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5642                                       struct btrfs_block_rsv *rsv)
5643 {
5644         btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5645 }
5646
5647 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5648                                                  struct btrfs_inode *inode)
5649 {
5650         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5651         u64 reserve_size = 0;
5652         u64 qgroup_rsv_size = 0;
5653         u64 csum_leaves;
5654         unsigned outstanding_extents;
5655
5656         lockdep_assert_held(&inode->lock);
5657         outstanding_extents = inode->outstanding_extents;
5658         if (outstanding_extents)
5659                 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
5660                                                 outstanding_extents + 1);
5661         csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
5662                                                  inode->csum_bytes);
5663         reserve_size += btrfs_calc_trans_metadata_size(fs_info,
5664                                                        csum_leaves);
5665         /*
5666          * For qgroup rsv, the calculation is very simple:
5667          * account one nodesize for each outstanding extent
5668          *
5669          * This is overestimating in most cases.
5670          */
5671         qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
5672
5673         spin_lock(&block_rsv->lock);
5674         block_rsv->size = reserve_size;
5675         block_rsv->qgroup_rsv_size = qgroup_rsv_size;
5676         spin_unlock(&block_rsv->lock);
5677 }
5678
5679 static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
5680                                     u64 num_bytes, u64 *meta_reserve,
5681                                     u64 *qgroup_reserve)
5682 {
5683         u64 nr_extents = count_max_extents(num_bytes);
5684         u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
5685
5686         /* We add one for the inode update at finish ordered time */
5687         *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
5688                                                 nr_extents + csum_leaves + 1);
5689         *qgroup_reserve = nr_extents * fs_info->nodesize;
5690 }
5691
5692 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
5693 {
5694         struct btrfs_root *root = inode->root;
5695         struct btrfs_fs_info *fs_info = root->fs_info;
5696         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5697         u64 meta_reserve, qgroup_reserve;
5698         unsigned nr_extents;
5699         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5700         int ret = 0;
5701         bool delalloc_lock = true;
5702
5703         /* If we are a free space inode we need to not flush since we will be in
5704          * the middle of a transaction commit.  We also don't need the delalloc
5705          * mutex since we won't race with anybody.  We need this mostly to make
5706          * lockdep shut its filthy mouth.
5707          *
5708          * If we have a transaction open (can happen if we call truncate_block
5709          * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5710          */
5711         if (btrfs_is_free_space_inode(inode)) {
5712                 flush = BTRFS_RESERVE_NO_FLUSH;
5713                 delalloc_lock = false;
5714         } else {
5715                 if (current->journal_info)
5716                         flush = BTRFS_RESERVE_FLUSH_LIMIT;
5717
5718                 if (btrfs_transaction_in_commit(fs_info))
5719                         schedule_timeout(1);
5720         }
5721
5722         if (delalloc_lock)
5723                 mutex_lock(&inode->delalloc_mutex);
5724
5725         num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5726
5727         /*
5728          * We always want to do it this way, every other way is wrong and ends
5729          * in tears.  Pre-reserving the amount we are going to add will always
5730          * be the right way, because otherwise if we have enough parallelism we
5731          * could end up with thousands of inodes all holding little bits of
5732          * reservations they were able to make previously and the only way to
5733          * reclaim that space is to ENOSPC out the operations and clear
5734          * everything out and try again, which is bad.  This way we just
5735          * over-reserve slightly, and clean up the mess when we are done.
5736          */
5737         calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
5738                                 &qgroup_reserve);
5739         ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
5740         if (ret)
5741                 goto out_fail;
5742         ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
5743         if (ret)
5744                 goto out_qgroup;
5745
5746         /*
5747          * Now we need to update our outstanding extents and csum bytes _first_
5748          * and then add the reservation to the block_rsv.  This keeps us from
5749          * racing with an ordered completion or some such that would think it
5750          * needs to free the reservation we just made.
5751          */
5752         spin_lock(&inode->lock);
5753         nr_extents = count_max_extents(num_bytes);
5754         btrfs_mod_outstanding_extents(inode, nr_extents);
5755         inode->csum_bytes += num_bytes;
5756         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5757         spin_unlock(&inode->lock);
5758
5759         /* Now we can safely add our space to our block rsv */
5760         block_rsv_add_bytes(block_rsv, meta_reserve, false);
5761         trace_btrfs_space_reservation(root->fs_info, "delalloc",
5762                                       btrfs_ino(inode), meta_reserve, 1);
5763
5764         spin_lock(&block_rsv->lock);
5765         block_rsv->qgroup_rsv_reserved += qgroup_reserve;
5766         spin_unlock(&block_rsv->lock);
5767
5768         if (delalloc_lock)
5769                 mutex_unlock(&inode->delalloc_mutex);
5770         return 0;
5771 out_qgroup:
5772         btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
5773 out_fail:
5774         btrfs_inode_rsv_release(inode, true);
5775         if (delalloc_lock)
5776                 mutex_unlock(&inode->delalloc_mutex);
5777         return ret;
5778 }
5779
5780 /**
5781  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5782  * @inode: the inode to release the reservation for.
5783  * @num_bytes: the number of bytes we are releasing.
5784  * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
5785  *
5786  * This will release the metadata reservation for an inode.  This can be called
5787  * once we complete IO for a given set of bytes to release their metadata
5788  * reservations, or on error for the same reason.
5789  */
5790 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
5791                                      bool qgroup_free)
5792 {
5793         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5794
5795         num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5796         spin_lock(&inode->lock);
5797         inode->csum_bytes -= num_bytes;
5798         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5799         spin_unlock(&inode->lock);
5800
5801         if (btrfs_is_testing(fs_info))
5802                 return;
5803
5804         btrfs_inode_rsv_release(inode, qgroup_free);
5805 }
5806
5807 /**
5808  * btrfs_delalloc_release_extents - release our outstanding_extents
5809  * @inode: the inode to balance the reservation for.
5810  * @num_bytes: the number of bytes we originally reserved with
5811  * @qgroup_free: do we need to free qgroup meta reservation or convert them.
5812  *
5813  * When we reserve space we increase outstanding_extents for the extents we may
5814  * add.  Once we've set the range as delalloc or created our ordered extents we
5815  * have outstanding_extents to track the real usage, so we use this to free our
5816  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
5817  * with btrfs_delalloc_reserve_metadata.
5818  */
5819 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
5820                                     bool qgroup_free)
5821 {
5822         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5823         unsigned num_extents;
5824
5825         spin_lock(&inode->lock);
5826         num_extents = count_max_extents(num_bytes);
5827         btrfs_mod_outstanding_extents(inode, -num_extents);
5828         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5829         spin_unlock(&inode->lock);
5830
5831         if (btrfs_is_testing(fs_info))
5832                 return;
5833
5834         btrfs_inode_rsv_release(inode, qgroup_free);
5835 }
5836
5837 /**
5838  * btrfs_delalloc_reserve_space - reserve data and metadata space for
5839  * delalloc
5840  * @inode: inode we're writing to
5841  * @start: start range we are writing to
5842  * @len: how long the range we are writing to
5843  * @reserved: mandatory parameter, record actually reserved qgroup ranges of
5844  *            current reservation.
5845  *
5846  * This will do the following things
5847  *
5848  * o reserve space in data space info for num bytes
5849  *   and reserve precious corresponding qgroup space
5850  *   (Done in check_data_free_space)
5851  *
5852  * o reserve space for metadata space, based on the number of outstanding
5853  *   extents and how much csums will be needed
5854  *   also reserve metadata space in a per root over-reserve method.
5855  * o add to the inodes->delalloc_bytes
5856  * o add it to the fs_info's delalloc inodes list.
5857  *   (Above 3 all done in delalloc_reserve_metadata)
5858  *
5859  * Return 0 for success
5860  * Return <0 for error(-ENOSPC or -EQUOT)
5861  */
5862 int btrfs_delalloc_reserve_space(struct inode *inode,
5863                         struct extent_changeset **reserved, u64 start, u64 len)
5864 {
5865         int ret;
5866
5867         ret = btrfs_check_data_free_space(inode, reserved, start, len);
5868         if (ret < 0)
5869                 return ret;
5870         ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
5871         if (ret < 0)
5872                 btrfs_free_reserved_data_space(inode, *reserved, start, len);
5873         return ret;
5874 }
5875
5876 /**
5877  * btrfs_delalloc_release_space - release data and metadata space for delalloc
5878  * @inode: inode we're releasing space for
5879  * @start: start position of the space already reserved
5880  * @len: the len of the space already reserved
5881  * @release_bytes: the len of the space we consumed or didn't use
5882  *
5883  * This function will release the metadata space that was not used and will
5884  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5885  * list if there are no delalloc bytes left.
5886  * Also it will handle the qgroup reserved space.
5887  */
5888 void btrfs_delalloc_release_space(struct inode *inode,
5889                                   struct extent_changeset *reserved,
5890                                   u64 start, u64 len, bool qgroup_free)
5891 {
5892         btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
5893         btrfs_free_reserved_data_space(inode, reserved, start, len);
5894 }
5895
5896 static int update_block_group(struct btrfs_trans_handle *trans,
5897                               u64 bytenr, u64 num_bytes, int alloc)
5898 {
5899         struct btrfs_fs_info *info = trans->fs_info;
5900         struct btrfs_block_group_cache *cache = NULL;
5901         u64 total = num_bytes;
5902         u64 old_val;
5903         u64 byte_in_group;
5904         int factor;
5905         int ret = 0;
5906
5907         /* block accounting for super block */
5908         spin_lock(&info->delalloc_root_lock);
5909         old_val = btrfs_super_bytes_used(info->super_copy);
5910         if (alloc)
5911                 old_val += num_bytes;
5912         else
5913                 old_val -= num_bytes;
5914         btrfs_set_super_bytes_used(info->super_copy, old_val);
5915         spin_unlock(&info->delalloc_root_lock);
5916
5917         while (total) {
5918                 cache = btrfs_lookup_block_group(info, bytenr);
5919                 if (!cache) {
5920                         ret = -ENOENT;
5921                         break;
5922                 }
5923                 factor = btrfs_bg_type_to_factor(cache->flags);
5924
5925                 /*
5926                  * If this block group has free space cache written out, we
5927                  * need to make sure to load it if we are removing space.  This
5928                  * is because we need the unpinning stage to actually add the
5929                  * space back to the block group, otherwise we will leak space.
5930                  */
5931                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5932                         cache_block_group(cache, 1);
5933
5934                 byte_in_group = bytenr - cache->key.objectid;
5935                 WARN_ON(byte_in_group > cache->key.offset);
5936
5937                 spin_lock(&cache->space_info->lock);
5938                 spin_lock(&cache->lock);
5939
5940                 if (btrfs_test_opt(info, SPACE_CACHE) &&
5941                     cache->disk_cache_state < BTRFS_DC_CLEAR)
5942                         cache->disk_cache_state = BTRFS_DC_CLEAR;
5943
5944                 old_val = btrfs_block_group_used(&cache->item);
5945                 num_bytes = min(total, cache->key.offset - byte_in_group);
5946                 if (alloc) {
5947                         old_val += num_bytes;
5948                         btrfs_set_block_group_used(&cache->item, old_val);
5949                         cache->reserved -= num_bytes;
5950                         cache->space_info->bytes_reserved -= num_bytes;
5951                         cache->space_info->bytes_used += num_bytes;
5952                         cache->space_info->disk_used += num_bytes * factor;
5953                         spin_unlock(&cache->lock);
5954                         spin_unlock(&cache->space_info->lock);
5955                 } else {
5956                         old_val -= num_bytes;
5957                         btrfs_set_block_group_used(&cache->item, old_val);
5958                         cache->pinned += num_bytes;
5959                         btrfs_space_info_update_bytes_pinned(info,
5960                                         cache->space_info, num_bytes);
5961                         cache->space_info->bytes_used -= num_bytes;
5962                         cache->space_info->disk_used -= num_bytes * factor;
5963                         spin_unlock(&cache->lock);
5964                         spin_unlock(&cache->space_info->lock);
5965
5966                         trace_btrfs_space_reservation(info, "pinned",
5967                                                       cache->space_info->flags,
5968                                                       num_bytes, 1);
5969                         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
5970                                            num_bytes,
5971                                            BTRFS_TOTAL_BYTES_PINNED_BATCH);
5972                         set_extent_dirty(info->pinned_extents,
5973                                          bytenr, bytenr + num_bytes - 1,
5974                                          GFP_NOFS | __GFP_NOFAIL);
5975                 }
5976
5977                 spin_lock(&trans->transaction->dirty_bgs_lock);
5978                 if (list_empty(&cache->dirty_list)) {
5979                         list_add_tail(&cache->dirty_list,
5980                                       &trans->transaction->dirty_bgs);
5981                         trans->delayed_ref_updates++;
5982                         btrfs_get_block_group(cache);
5983                 }
5984                 spin_unlock(&trans->transaction->dirty_bgs_lock);
5985
5986                 /*
5987                  * No longer have used bytes in this block group, queue it for
5988                  * deletion. We do this after adding the block group to the
5989                  * dirty list to avoid races between cleaner kthread and space
5990                  * cache writeout.
5991                  */
5992                 if (!alloc && old_val == 0)
5993                         btrfs_mark_bg_unused(cache);
5994
5995                 btrfs_put_block_group(cache);
5996                 total -= num_bytes;
5997                 bytenr += num_bytes;
5998         }
5999
6000         /* Modified block groups are accounted for in the delayed_refs_rsv. */
6001         btrfs_update_delayed_refs_rsv(trans);
6002         return ret;
6003 }
6004
6005 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6006 {
6007         struct btrfs_block_group_cache *cache;
6008         u64 bytenr;
6009
6010         spin_lock(&fs_info->block_group_cache_lock);
6011         bytenr = fs_info->first_logical_byte;
6012         spin_unlock(&fs_info->block_group_cache_lock);
6013
6014         if (bytenr < (u64)-1)
6015                 return bytenr;
6016
6017         cache = btrfs_lookup_first_block_group(fs_info, search_start);
6018         if (!cache)
6019                 return 0;
6020
6021         bytenr = cache->key.objectid;
6022         btrfs_put_block_group(cache);
6023
6024         return bytenr;
6025 }
6026
6027 static int pin_down_extent(struct btrfs_block_group_cache *cache,
6028                            u64 bytenr, u64 num_bytes, int reserved)
6029 {
6030         struct btrfs_fs_info *fs_info = cache->fs_info;
6031
6032         spin_lock(&cache->space_info->lock);
6033         spin_lock(&cache->lock);
6034         cache->pinned += num_bytes;
6035         btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
6036                                              num_bytes);
6037         if (reserved) {
6038                 cache->reserved -= num_bytes;
6039                 cache->space_info->bytes_reserved -= num_bytes;
6040         }
6041         spin_unlock(&cache->lock);
6042         spin_unlock(&cache->space_info->lock);
6043
6044         trace_btrfs_space_reservation(fs_info, "pinned",
6045                                       cache->space_info->flags, num_bytes, 1);
6046         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6047                     num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6048         set_extent_dirty(fs_info->pinned_extents, bytenr,
6049                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6050         return 0;
6051 }
6052
6053 /*
6054  * this function must be called within transaction
6055  */
6056 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6057                      u64 bytenr, u64 num_bytes, int reserved)
6058 {
6059         struct btrfs_block_group_cache *cache;
6060
6061         cache = btrfs_lookup_block_group(fs_info, bytenr);
6062         BUG_ON(!cache); /* Logic error */
6063
6064         pin_down_extent(cache, bytenr, num_bytes, reserved);
6065
6066         btrfs_put_block_group(cache);
6067         return 0;
6068 }
6069
6070 /*
6071  * this function must be called within transaction
6072  */
6073 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6074                                     u64 bytenr, u64 num_bytes)
6075 {
6076         struct btrfs_block_group_cache *cache;
6077         int ret;
6078
6079         cache = btrfs_lookup_block_group(fs_info, bytenr);
6080         if (!cache)
6081                 return -EINVAL;
6082
6083         /*
6084          * pull in the free space cache (if any) so that our pin
6085          * removes the free space from the cache.  We have load_only set
6086          * to one because the slow code to read in the free extents does check
6087          * the pinned extents.
6088          */
6089         cache_block_group(cache, 1);
6090
6091         pin_down_extent(cache, bytenr, num_bytes, 0);
6092
6093         /* remove us from the free space cache (if we're there at all) */
6094         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6095         btrfs_put_block_group(cache);
6096         return ret;
6097 }
6098
6099 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6100                                    u64 start, u64 num_bytes)
6101 {
6102         int ret;
6103         struct btrfs_block_group_cache *block_group;
6104         struct btrfs_caching_control *caching_ctl;
6105
6106         block_group = btrfs_lookup_block_group(fs_info, start);
6107         if (!block_group)
6108                 return -EINVAL;
6109
6110         cache_block_group(block_group, 0);
6111         caching_ctl = get_caching_control(block_group);
6112
6113         if (!caching_ctl) {
6114                 /* Logic error */
6115                 BUG_ON(!block_group_cache_done(block_group));
6116                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6117         } else {
6118                 mutex_lock(&caching_ctl->mutex);
6119
6120                 if (start >= caching_ctl->progress) {
6121                         ret = add_excluded_extent(fs_info, start, num_bytes);
6122                 } else if (start + num_bytes <= caching_ctl->progress) {
6123                         ret = btrfs_remove_free_space(block_group,
6124                                                       start, num_bytes);
6125                 } else {
6126                         num_bytes = caching_ctl->progress - start;
6127                         ret = btrfs_remove_free_space(block_group,
6128                                                       start, num_bytes);
6129                         if (ret)
6130                                 goto out_lock;
6131
6132                         num_bytes = (start + num_bytes) -
6133                                 caching_ctl->progress;
6134                         start = caching_ctl->progress;
6135                         ret = add_excluded_extent(fs_info, start, num_bytes);
6136                 }
6137 out_lock:
6138                 mutex_unlock(&caching_ctl->mutex);
6139                 put_caching_control(caching_ctl);
6140         }
6141         btrfs_put_block_group(block_group);
6142         return ret;
6143 }
6144
6145 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
6146 {
6147         struct btrfs_fs_info *fs_info = eb->fs_info;
6148         struct btrfs_file_extent_item *item;
6149         struct btrfs_key key;
6150         int found_type;
6151         int i;
6152         int ret = 0;
6153
6154         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6155                 return 0;
6156
6157         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6158                 btrfs_item_key_to_cpu(eb, &key, i);
6159                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6160                         continue;
6161                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6162                 found_type = btrfs_file_extent_type(eb, item);
6163                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6164                         continue;
6165                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6166                         continue;
6167                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6168                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6169                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
6170                 if (ret)
6171                         break;
6172         }
6173
6174         return ret;
6175 }
6176
6177 static void
6178 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6179 {
6180         atomic_inc(&bg->reservations);
6181 }
6182
6183 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6184                                         const u64 start)
6185 {
6186         struct btrfs_block_group_cache *bg;
6187
6188         bg = btrfs_lookup_block_group(fs_info, start);
6189         ASSERT(bg);
6190         if (atomic_dec_and_test(&bg->reservations))
6191                 wake_up_var(&bg->reservations);
6192         btrfs_put_block_group(bg);
6193 }
6194
6195 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6196 {
6197         struct btrfs_space_info *space_info = bg->space_info;
6198
6199         ASSERT(bg->ro);
6200
6201         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6202                 return;
6203
6204         /*
6205          * Our block group is read only but before we set it to read only,
6206          * some task might have had allocated an extent from it already, but it
6207          * has not yet created a respective ordered extent (and added it to a
6208          * root's list of ordered extents).
6209          * Therefore wait for any task currently allocating extents, since the
6210          * block group's reservations counter is incremented while a read lock
6211          * on the groups' semaphore is held and decremented after releasing
6212          * the read access on that semaphore and creating the ordered extent.
6213          */
6214         down_write(&space_info->groups_sem);
6215         up_write(&space_info->groups_sem);
6216
6217         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6218 }
6219
6220 /**
6221  * btrfs_add_reserved_bytes - update the block_group and space info counters
6222  * @cache:      The cache we are manipulating
6223  * @ram_bytes:  The number of bytes of file content, and will be same to
6224  *              @num_bytes except for the compress path.
6225  * @num_bytes:  The number of bytes in question
6226  * @delalloc:   The blocks are allocated for the delalloc write
6227  *
6228  * This is called by the allocator when it reserves space. If this is a
6229  * reservation and the block group has become read only we cannot make the
6230  * reservation and return -EAGAIN, otherwise this function always succeeds.
6231  */
6232 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6233                                     u64 ram_bytes, u64 num_bytes, int delalloc)
6234 {
6235         struct btrfs_space_info *space_info = cache->space_info;
6236         int ret = 0;
6237
6238         spin_lock(&space_info->lock);
6239         spin_lock(&cache->lock);
6240         if (cache->ro) {
6241                 ret = -EAGAIN;
6242         } else {
6243                 cache->reserved += num_bytes;
6244                 space_info->bytes_reserved += num_bytes;
6245                 btrfs_space_info_update_bytes_may_use(cache->fs_info,
6246                                                       space_info, -ram_bytes);
6247                 if (delalloc)
6248                         cache->delalloc_bytes += num_bytes;
6249         }
6250         spin_unlock(&cache->lock);
6251         spin_unlock(&space_info->lock);
6252         return ret;
6253 }
6254
6255 /**
6256  * btrfs_free_reserved_bytes - update the block_group and space info counters
6257  * @cache:      The cache we are manipulating
6258  * @num_bytes:  The number of bytes in question
6259  * @delalloc:   The blocks are allocated for the delalloc write
6260  *
6261  * This is called by somebody who is freeing space that was never actually used
6262  * on disk.  For example if you reserve some space for a new leaf in transaction
6263  * A and before transaction A commits you free that leaf, you call this with
6264  * reserve set to 0 in order to clear the reservation.
6265  */
6266
6267 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6268                                       u64 num_bytes, int delalloc)
6269 {
6270         struct btrfs_space_info *space_info = cache->space_info;
6271
6272         spin_lock(&space_info->lock);
6273         spin_lock(&cache->lock);
6274         if (cache->ro)
6275                 space_info->bytes_readonly += num_bytes;
6276         cache->reserved -= num_bytes;
6277         space_info->bytes_reserved -= num_bytes;
6278         space_info->max_extent_size = 0;
6279
6280         if (delalloc)
6281                 cache->delalloc_bytes -= num_bytes;
6282         spin_unlock(&cache->lock);
6283         spin_unlock(&space_info->lock);
6284 }
6285 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6286 {
6287         struct btrfs_caching_control *next;
6288         struct btrfs_caching_control *caching_ctl;
6289         struct btrfs_block_group_cache *cache;
6290
6291         down_write(&fs_info->commit_root_sem);
6292
6293         list_for_each_entry_safe(caching_ctl, next,
6294                                  &fs_info->caching_block_groups, list) {
6295                 cache = caching_ctl->block_group;
6296                 if (block_group_cache_done(cache)) {
6297                         cache->last_byte_to_unpin = (u64)-1;
6298                         list_del_init(&caching_ctl->list);
6299                         put_caching_control(caching_ctl);
6300                 } else {
6301                         cache->last_byte_to_unpin = caching_ctl->progress;
6302                 }
6303         }
6304
6305         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6306                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6307         else
6308                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6309
6310         up_write(&fs_info->commit_root_sem);
6311
6312         update_global_block_rsv(fs_info);
6313 }
6314
6315 /*
6316  * Returns the free cluster for the given space info and sets empty_cluster to
6317  * what it should be based on the mount options.
6318  */
6319 static struct btrfs_free_cluster *
6320 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6321                    struct btrfs_space_info *space_info, u64 *empty_cluster)
6322 {
6323         struct btrfs_free_cluster *ret = NULL;
6324
6325         *empty_cluster = 0;
6326         if (btrfs_mixed_space_info(space_info))
6327                 return ret;
6328
6329         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6330                 ret = &fs_info->meta_alloc_cluster;
6331                 if (btrfs_test_opt(fs_info, SSD))
6332                         *empty_cluster = SZ_2M;
6333                 else
6334                         *empty_cluster = SZ_64K;
6335         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6336                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
6337                 *empty_cluster = SZ_2M;
6338                 ret = &fs_info->data_alloc_cluster;
6339         }
6340
6341         return ret;
6342 }
6343
6344 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6345                               u64 start, u64 end,
6346                               const bool return_free_space)
6347 {
6348         struct btrfs_block_group_cache *cache = NULL;
6349         struct btrfs_space_info *space_info;
6350         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6351         struct btrfs_free_cluster *cluster = NULL;
6352         u64 len;
6353         u64 total_unpinned = 0;
6354         u64 empty_cluster = 0;
6355         bool readonly;
6356
6357         while (start <= end) {
6358                 readonly = false;
6359                 if (!cache ||
6360                     start >= cache->key.objectid + cache->key.offset) {
6361                         if (cache)
6362                                 btrfs_put_block_group(cache);
6363                         total_unpinned = 0;
6364                         cache = btrfs_lookup_block_group(fs_info, start);
6365                         BUG_ON(!cache); /* Logic error */
6366
6367                         cluster = fetch_cluster_info(fs_info,
6368                                                      cache->space_info,
6369                                                      &empty_cluster);
6370                         empty_cluster <<= 1;
6371                 }
6372
6373                 len = cache->key.objectid + cache->key.offset - start;
6374                 len = min(len, end + 1 - start);
6375
6376                 if (start < cache->last_byte_to_unpin) {
6377                         len = min(len, cache->last_byte_to_unpin - start);
6378                         if (return_free_space)
6379                                 btrfs_add_free_space(cache, start, len);
6380                 }
6381
6382                 start += len;
6383                 total_unpinned += len;
6384                 space_info = cache->space_info;
6385
6386                 /*
6387                  * If this space cluster has been marked as fragmented and we've
6388                  * unpinned enough in this block group to potentially allow a
6389                  * cluster to be created inside of it go ahead and clear the
6390                  * fragmented check.
6391                  */
6392                 if (cluster && cluster->fragmented &&
6393                     total_unpinned > empty_cluster) {
6394                         spin_lock(&cluster->lock);
6395                         cluster->fragmented = 0;
6396                         spin_unlock(&cluster->lock);
6397                 }
6398
6399                 spin_lock(&space_info->lock);
6400                 spin_lock(&cache->lock);
6401                 cache->pinned -= len;
6402                 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
6403
6404                 trace_btrfs_space_reservation(fs_info, "pinned",
6405                                               space_info->flags, len, 0);
6406                 space_info->max_extent_size = 0;
6407                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
6408                             -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6409                 if (cache->ro) {
6410                         space_info->bytes_readonly += len;
6411                         readonly = true;
6412                 }
6413                 spin_unlock(&cache->lock);
6414                 if (!readonly && return_free_space &&
6415                     global_rsv->space_info == space_info) {
6416                         u64 to_add = len;
6417
6418                         spin_lock(&global_rsv->lock);
6419                         if (!global_rsv->full) {
6420                                 to_add = min(len, global_rsv->size -
6421                                              global_rsv->reserved);
6422                                 global_rsv->reserved += to_add;
6423                                 btrfs_space_info_update_bytes_may_use(fs_info,
6424                                                 space_info, to_add);
6425                                 if (global_rsv->reserved >= global_rsv->size)
6426                                         global_rsv->full = 1;
6427                                 trace_btrfs_space_reservation(fs_info,
6428                                                               "space_info",
6429                                                               space_info->flags,
6430                                                               to_add, 1);
6431                                 len -= to_add;
6432                         }
6433                         spin_unlock(&global_rsv->lock);
6434                         /* Add to any tickets we may have */
6435                         if (len)
6436                                 btrfs_space_info_add_new_bytes(fs_info,
6437                                                 space_info, len);
6438                 }
6439                 spin_unlock(&space_info->lock);
6440         }
6441
6442         if (cache)
6443                 btrfs_put_block_group(cache);
6444         return 0;
6445 }
6446
6447 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6448 {
6449         struct btrfs_fs_info *fs_info = trans->fs_info;
6450         struct btrfs_block_group_cache *block_group, *tmp;
6451         struct list_head *deleted_bgs;
6452         struct extent_io_tree *unpin;
6453         u64 start;
6454         u64 end;
6455         int ret;
6456
6457         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6458                 unpin = &fs_info->freed_extents[1];
6459         else
6460                 unpin = &fs_info->freed_extents[0];
6461
6462         while (!trans->aborted) {
6463                 struct extent_state *cached_state = NULL;
6464
6465                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6466                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6467                                             EXTENT_DIRTY, &cached_state);
6468                 if (ret) {
6469                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6470                         break;
6471                 }
6472
6473                 if (btrfs_test_opt(fs_info, DISCARD))
6474                         ret = btrfs_discard_extent(fs_info, start,
6475                                                    end + 1 - start, NULL);
6476
6477                 clear_extent_dirty(unpin, start, end, &cached_state);
6478                 unpin_extent_range(fs_info, start, end, true);
6479                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6480                 free_extent_state(cached_state);
6481                 cond_resched();
6482         }
6483
6484         /*
6485          * Transaction is finished.  We don't need the lock anymore.  We
6486          * do need to clean up the block groups in case of a transaction
6487          * abort.
6488          */
6489         deleted_bgs = &trans->transaction->deleted_bgs;
6490         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6491                 u64 trimmed = 0;
6492
6493                 ret = -EROFS;
6494                 if (!trans->aborted)
6495                         ret = btrfs_discard_extent(fs_info,
6496                                                    block_group->key.objectid,
6497                                                    block_group->key.offset,
6498                                                    &trimmed);
6499
6500                 list_del_init(&block_group->bg_list);
6501                 btrfs_put_block_group_trimming(block_group);
6502                 btrfs_put_block_group(block_group);
6503
6504                 if (ret) {
6505                         const char *errstr = btrfs_decode_error(ret);
6506                         btrfs_warn(fs_info,
6507                            "discard failed while removing blockgroup: errno=%d %s",
6508                                    ret, errstr);
6509                 }
6510         }
6511
6512         return 0;
6513 }
6514
6515 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6516                                struct btrfs_delayed_ref_node *node, u64 parent,
6517                                u64 root_objectid, u64 owner_objectid,
6518                                u64 owner_offset, int refs_to_drop,
6519                                struct btrfs_delayed_extent_op *extent_op)
6520 {
6521         struct btrfs_fs_info *info = trans->fs_info;
6522         struct btrfs_key key;
6523         struct btrfs_path *path;
6524         struct btrfs_root *extent_root = info->extent_root;
6525         struct extent_buffer *leaf;
6526         struct btrfs_extent_item *ei;
6527         struct btrfs_extent_inline_ref *iref;
6528         int ret;
6529         int is_data;
6530         int extent_slot = 0;
6531         int found_extent = 0;
6532         int num_to_del = 1;
6533         u32 item_size;
6534         u64 refs;
6535         u64 bytenr = node->bytenr;
6536         u64 num_bytes = node->num_bytes;
6537         int last_ref = 0;
6538         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6539
6540         path = btrfs_alloc_path();
6541         if (!path)
6542                 return -ENOMEM;
6543
6544         path->reada = READA_FORWARD;
6545         path->leave_spinning = 1;
6546
6547         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6548         BUG_ON(!is_data && refs_to_drop != 1);
6549
6550         if (is_data)
6551                 skinny_metadata = false;
6552
6553         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
6554                                     parent, root_objectid, owner_objectid,
6555                                     owner_offset);
6556         if (ret == 0) {
6557                 extent_slot = path->slots[0];
6558                 while (extent_slot >= 0) {
6559                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6560                                               extent_slot);
6561                         if (key.objectid != bytenr)
6562                                 break;
6563                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6564                             key.offset == num_bytes) {
6565                                 found_extent = 1;
6566                                 break;
6567                         }
6568                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6569                             key.offset == owner_objectid) {
6570                                 found_extent = 1;
6571                                 break;
6572                         }
6573                         if (path->slots[0] - extent_slot > 5)
6574                                 break;
6575                         extent_slot--;
6576                 }
6577
6578                 if (!found_extent) {
6579                         BUG_ON(iref);
6580                         ret = remove_extent_backref(trans, path, NULL,
6581                                                     refs_to_drop,
6582                                                     is_data, &last_ref);
6583                         if (ret) {
6584                                 btrfs_abort_transaction(trans, ret);
6585                                 goto out;
6586                         }
6587                         btrfs_release_path(path);
6588                         path->leave_spinning = 1;
6589
6590                         key.objectid = bytenr;
6591                         key.type = BTRFS_EXTENT_ITEM_KEY;
6592                         key.offset = num_bytes;
6593
6594                         if (!is_data && skinny_metadata) {
6595                                 key.type = BTRFS_METADATA_ITEM_KEY;
6596                                 key.offset = owner_objectid;
6597                         }
6598
6599                         ret = btrfs_search_slot(trans, extent_root,
6600                                                 &key, path, -1, 1);
6601                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6602                                 /*
6603                                  * Couldn't find our skinny metadata item,
6604                                  * see if we have ye olde extent item.
6605                                  */
6606                                 path->slots[0]--;
6607                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6608                                                       path->slots[0]);
6609                                 if (key.objectid == bytenr &&
6610                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6611                                     key.offset == num_bytes)
6612                                         ret = 0;
6613                         }
6614
6615                         if (ret > 0 && skinny_metadata) {
6616                                 skinny_metadata = false;
6617                                 key.objectid = bytenr;
6618                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6619                                 key.offset = num_bytes;
6620                                 btrfs_release_path(path);
6621                                 ret = btrfs_search_slot(trans, extent_root,
6622                                                         &key, path, -1, 1);
6623                         }
6624
6625                         if (ret) {
6626                                 btrfs_err(info,
6627                                           "umm, got %d back from search, was looking for %llu",
6628                                           ret, bytenr);
6629                                 if (ret > 0)
6630                                         btrfs_print_leaf(path->nodes[0]);
6631                         }
6632                         if (ret < 0) {
6633                                 btrfs_abort_transaction(trans, ret);
6634                                 goto out;
6635                         }
6636                         extent_slot = path->slots[0];
6637                 }
6638         } else if (WARN_ON(ret == -ENOENT)) {
6639                 btrfs_print_leaf(path->nodes[0]);
6640                 btrfs_err(info,
6641                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6642                         bytenr, parent, root_objectid, owner_objectid,
6643                         owner_offset);
6644                 btrfs_abort_transaction(trans, ret);
6645                 goto out;
6646         } else {
6647                 btrfs_abort_transaction(trans, ret);
6648                 goto out;
6649         }
6650
6651         leaf = path->nodes[0];
6652         item_size = btrfs_item_size_nr(leaf, extent_slot);
6653         if (unlikely(item_size < sizeof(*ei))) {
6654                 ret = -EINVAL;
6655                 btrfs_print_v0_err(info);
6656                 btrfs_abort_transaction(trans, ret);
6657                 goto out;
6658         }
6659         ei = btrfs_item_ptr(leaf, extent_slot,
6660                             struct btrfs_extent_item);
6661         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6662             key.type == BTRFS_EXTENT_ITEM_KEY) {
6663                 struct btrfs_tree_block_info *bi;
6664                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6665                 bi = (struct btrfs_tree_block_info *)(ei + 1);
6666                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6667         }
6668
6669         refs = btrfs_extent_refs(leaf, ei);
6670         if (refs < refs_to_drop) {
6671                 btrfs_err(info,
6672                           "trying to drop %d refs but we only have %Lu for bytenr %Lu",
6673                           refs_to_drop, refs, bytenr);
6674                 ret = -EINVAL;
6675                 btrfs_abort_transaction(trans, ret);
6676                 goto out;
6677         }
6678         refs -= refs_to_drop;
6679
6680         if (refs > 0) {
6681                 if (extent_op)
6682                         __run_delayed_extent_op(extent_op, leaf, ei);
6683                 /*
6684                  * In the case of inline back ref, reference count will
6685                  * be updated by remove_extent_backref
6686                  */
6687                 if (iref) {
6688                         BUG_ON(!found_extent);
6689                 } else {
6690                         btrfs_set_extent_refs(leaf, ei, refs);
6691                         btrfs_mark_buffer_dirty(leaf);
6692                 }
6693                 if (found_extent) {
6694                         ret = remove_extent_backref(trans, path, iref,
6695                                                     refs_to_drop, is_data,
6696                                                     &last_ref);
6697                         if (ret) {
6698                                 btrfs_abort_transaction(trans, ret);
6699                                 goto out;
6700                         }
6701                 }
6702         } else {
6703                 if (found_extent) {
6704                         BUG_ON(is_data && refs_to_drop !=
6705                                extent_data_ref_count(path, iref));
6706                         if (iref) {
6707                                 BUG_ON(path->slots[0] != extent_slot);
6708                         } else {
6709                                 BUG_ON(path->slots[0] != extent_slot + 1);
6710                                 path->slots[0] = extent_slot;
6711                                 num_to_del = 2;
6712                         }
6713                 }
6714
6715                 last_ref = 1;
6716                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6717                                       num_to_del);
6718                 if (ret) {
6719                         btrfs_abort_transaction(trans, ret);
6720                         goto out;
6721                 }
6722                 btrfs_release_path(path);
6723
6724                 if (is_data) {
6725                         ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
6726                         if (ret) {
6727                                 btrfs_abort_transaction(trans, ret);
6728                                 goto out;
6729                         }
6730                 }
6731
6732                 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
6733                 if (ret) {
6734                         btrfs_abort_transaction(trans, ret);
6735                         goto out;
6736                 }
6737
6738                 ret = update_block_group(trans, bytenr, num_bytes, 0);
6739                 if (ret) {
6740                         btrfs_abort_transaction(trans, ret);
6741                         goto out;
6742                 }
6743         }
6744         btrfs_release_path(path);
6745
6746 out:
6747         btrfs_free_path(path);
6748         return ret;
6749 }
6750
6751 /*
6752  * when we free an block, it is possible (and likely) that we free the last
6753  * delayed ref for that extent as well.  This searches the delayed ref tree for
6754  * a given extent, and if there are no other delayed refs to be processed, it
6755  * removes it from the tree.
6756  */
6757 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6758                                       u64 bytenr)
6759 {
6760         struct btrfs_delayed_ref_head *head;
6761         struct btrfs_delayed_ref_root *delayed_refs;
6762         int ret = 0;
6763
6764         delayed_refs = &trans->transaction->delayed_refs;
6765         spin_lock(&delayed_refs->lock);
6766         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
6767         if (!head)
6768                 goto out_delayed_unlock;
6769
6770         spin_lock(&head->lock);
6771         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
6772                 goto out;
6773
6774         if (cleanup_extent_op(head) != NULL)
6775                 goto out;
6776
6777         /*
6778          * waiting for the lock here would deadlock.  If someone else has it
6779          * locked they are already in the process of dropping it anyway
6780          */
6781         if (!mutex_trylock(&head->mutex))
6782                 goto out;
6783
6784         btrfs_delete_ref_head(delayed_refs, head);
6785         head->processing = 0;
6786
6787         spin_unlock(&head->lock);
6788         spin_unlock(&delayed_refs->lock);
6789
6790         BUG_ON(head->extent_op);
6791         if (head->must_insert_reserved)
6792                 ret = 1;
6793
6794         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
6795         mutex_unlock(&head->mutex);
6796         btrfs_put_delayed_ref_head(head);
6797         return ret;
6798 out:
6799         spin_unlock(&head->lock);
6800
6801 out_delayed_unlock:
6802         spin_unlock(&delayed_refs->lock);
6803         return 0;
6804 }
6805
6806 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6807                            struct btrfs_root *root,
6808                            struct extent_buffer *buf,
6809                            u64 parent, int last_ref)
6810 {
6811         struct btrfs_fs_info *fs_info = root->fs_info;
6812         struct btrfs_ref generic_ref = { 0 };
6813         int pin = 1;
6814         int ret;
6815
6816         btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
6817                                buf->start, buf->len, parent);
6818         btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
6819                             root->root_key.objectid);
6820
6821         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6822                 int old_ref_mod, new_ref_mod;
6823
6824                 btrfs_ref_tree_mod(fs_info, &generic_ref);
6825                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
6826                                                  &old_ref_mod, &new_ref_mod);
6827                 BUG_ON(ret); /* -ENOMEM */
6828                 pin = old_ref_mod >= 0 && new_ref_mod < 0;
6829         }
6830
6831         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
6832                 struct btrfs_block_group_cache *cache;
6833
6834                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6835                         ret = check_ref_cleanup(trans, buf->start);
6836                         if (!ret)
6837                                 goto out;
6838                 }
6839
6840                 pin = 0;
6841                 cache = btrfs_lookup_block_group(fs_info, buf->start);
6842
6843                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6844                         pin_down_extent(cache, buf->start, buf->len, 1);
6845                         btrfs_put_block_group(cache);
6846                         goto out;
6847                 }
6848
6849                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6850
6851                 btrfs_add_free_space(cache, buf->start, buf->len);
6852                 btrfs_free_reserved_bytes(cache, buf->len, 0);
6853                 btrfs_put_block_group(cache);
6854                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
6855         }
6856 out:
6857         if (pin)
6858                 add_pinned_bytes(fs_info, &generic_ref);
6859
6860         if (last_ref) {
6861                 /*
6862                  * Deleting the buffer, clear the corrupt flag since it doesn't
6863                  * matter anymore.
6864                  */
6865                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6866         }
6867 }
6868
6869 /* Can return -ENOMEM */
6870 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
6871 {
6872         struct btrfs_fs_info *fs_info = trans->fs_info;
6873         int old_ref_mod, new_ref_mod;
6874         int ret;
6875
6876         if (btrfs_is_testing(fs_info))
6877                 return 0;
6878
6879         /*
6880          * tree log blocks never actually go into the extent allocation
6881          * tree, just update pinning info and exit early.
6882          */
6883         if ((ref->type == BTRFS_REF_METADATA &&
6884              ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
6885             (ref->type == BTRFS_REF_DATA &&
6886              ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
6887                 /* unlocks the pinned mutex */
6888                 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
6889                 old_ref_mod = new_ref_mod = 0;
6890                 ret = 0;
6891         } else if (ref->type == BTRFS_REF_METADATA) {
6892                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
6893                                                  &old_ref_mod, &new_ref_mod);
6894         } else {
6895                 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
6896                                                  &old_ref_mod, &new_ref_mod);
6897         }
6898
6899         if (!((ref->type == BTRFS_REF_METADATA &&
6900                ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
6901               (ref->type == BTRFS_REF_DATA &&
6902                ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
6903                 btrfs_ref_tree_mod(fs_info, ref);
6904
6905         if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
6906                 add_pinned_bytes(fs_info, ref);
6907
6908         return ret;
6909 }
6910
6911 /*
6912  * when we wait for progress in the block group caching, its because
6913  * our allocation attempt failed at least once.  So, we must sleep
6914  * and let some progress happen before we try again.
6915  *
6916  * This function will sleep at least once waiting for new free space to
6917  * show up, and then it will check the block group free space numbers
6918  * for our min num_bytes.  Another option is to have it go ahead
6919  * and look in the rbtree for a free extent of a given size, but this
6920  * is a good start.
6921  *
6922  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6923  * any of the information in this block group.
6924  */
6925 static noinline void
6926 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6927                                 u64 num_bytes)
6928 {
6929         struct btrfs_caching_control *caching_ctl;
6930
6931         caching_ctl = get_caching_control(cache);
6932         if (!caching_ctl)
6933                 return;
6934
6935         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6936                    (cache->free_space_ctl->free_space >= num_bytes));
6937
6938         put_caching_control(caching_ctl);
6939 }
6940
6941 static noinline int
6942 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6943 {
6944         struct btrfs_caching_control *caching_ctl;
6945         int ret = 0;
6946
6947         caching_ctl = get_caching_control(cache);
6948         if (!caching_ctl)
6949                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6950
6951         wait_event(caching_ctl->wait, block_group_cache_done(cache));
6952         if (cache->cached == BTRFS_CACHE_ERROR)
6953                 ret = -EIO;
6954         put_caching_control(caching_ctl);
6955         return ret;
6956 }
6957
6958 enum btrfs_loop_type {
6959         LOOP_CACHING_NOWAIT,
6960         LOOP_CACHING_WAIT,
6961         LOOP_ALLOC_CHUNK,
6962         LOOP_NO_EMPTY_SIZE,
6963 };
6964
6965 static inline void
6966 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6967                        int delalloc)
6968 {
6969         if (delalloc)
6970                 down_read(&cache->data_rwsem);
6971 }
6972
6973 static inline void
6974 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6975                        int delalloc)
6976 {
6977         btrfs_get_block_group(cache);
6978         if (delalloc)
6979                 down_read(&cache->data_rwsem);
6980 }
6981
6982 static struct btrfs_block_group_cache *
6983 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6984                    struct btrfs_free_cluster *cluster,
6985                    int delalloc)
6986 {
6987         struct btrfs_block_group_cache *used_bg = NULL;
6988
6989         spin_lock(&cluster->refill_lock);
6990         while (1) {
6991                 used_bg = cluster->block_group;
6992                 if (!used_bg)
6993                         return NULL;
6994
6995                 if (used_bg == block_group)
6996                         return used_bg;
6997
6998                 btrfs_get_block_group(used_bg);
6999
7000                 if (!delalloc)
7001                         return used_bg;
7002
7003                 if (down_read_trylock(&used_bg->data_rwsem))
7004                         return used_bg;
7005
7006                 spin_unlock(&cluster->refill_lock);
7007
7008                 /* We should only have one-level nested. */
7009                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7010
7011                 spin_lock(&cluster->refill_lock);
7012                 if (used_bg == cluster->block_group)
7013                         return used_bg;
7014
7015                 up_read(&used_bg->data_rwsem);
7016                 btrfs_put_block_group(used_bg);
7017         }
7018 }
7019
7020 static inline void
7021 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7022                          int delalloc)
7023 {
7024         if (delalloc)
7025                 up_read(&cache->data_rwsem);
7026         btrfs_put_block_group(cache);
7027 }
7028
7029 /*
7030  * Structure used internally for find_free_extent() function.  Wraps needed
7031  * parameters.
7032  */
7033 struct find_free_extent_ctl {
7034         /* Basic allocation info */
7035         u64 ram_bytes;
7036         u64 num_bytes;
7037         u64 empty_size;
7038         u64 flags;
7039         int delalloc;
7040
7041         /* Where to start the search inside the bg */
7042         u64 search_start;
7043
7044         /* For clustered allocation */
7045         u64 empty_cluster;
7046
7047         bool have_caching_bg;
7048         bool orig_have_caching_bg;
7049
7050         /* RAID index, converted from flags */
7051         int index;
7052
7053         /*
7054          * Current loop number, check find_free_extent_update_loop() for details
7055          */
7056         int loop;
7057
7058         /*
7059          * Whether we're refilling a cluster, if true we need to re-search
7060          * current block group but don't try to refill the cluster again.
7061          */
7062         bool retry_clustered;
7063
7064         /*
7065          * Whether we're updating free space cache, if true we need to re-search
7066          * current block group but don't try updating free space cache again.
7067          */
7068         bool retry_unclustered;
7069
7070         /* If current block group is cached */
7071         int cached;
7072
7073         /* Max contiguous hole found */
7074         u64 max_extent_size;
7075
7076         /* Total free space from free space cache, not always contiguous */
7077         u64 total_free_space;
7078
7079         /* Found result */
7080         u64 found_offset;
7081 };
7082
7083
7084 /*
7085  * Helper function for find_free_extent().
7086  *
7087  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
7088  * Return -EAGAIN to inform caller that we need to re-search this block group
7089  * Return >0 to inform caller that we find nothing
7090  * Return 0 means we have found a location and set ffe_ctl->found_offset.
7091  */
7092 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
7093                 struct btrfs_free_cluster *last_ptr,
7094                 struct find_free_extent_ctl *ffe_ctl,
7095                 struct btrfs_block_group_cache **cluster_bg_ret)
7096 {
7097         struct btrfs_block_group_cache *cluster_bg;
7098         u64 aligned_cluster;
7099         u64 offset;
7100         int ret;
7101
7102         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
7103         if (!cluster_bg)
7104                 goto refill_cluster;
7105         if (cluster_bg != bg && (cluster_bg->ro ||
7106             !block_group_bits(cluster_bg, ffe_ctl->flags)))
7107                 goto release_cluster;
7108
7109         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
7110                         ffe_ctl->num_bytes, cluster_bg->key.objectid,
7111                         &ffe_ctl->max_extent_size);
7112         if (offset) {
7113                 /* We have a block, we're done */
7114                 spin_unlock(&last_ptr->refill_lock);
7115                 trace_btrfs_reserve_extent_cluster(cluster_bg,
7116                                 ffe_ctl->search_start, ffe_ctl->num_bytes);
7117                 *cluster_bg_ret = cluster_bg;
7118                 ffe_ctl->found_offset = offset;
7119                 return 0;
7120         }
7121         WARN_ON(last_ptr->block_group != cluster_bg);
7122
7123 release_cluster:
7124         /*
7125          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
7126          * lets just skip it and let the allocator find whatever block it can
7127          * find. If we reach this point, we will have tried the cluster
7128          * allocator plenty of times and not have found anything, so we are
7129          * likely way too fragmented for the clustering stuff to find anything.
7130          *
7131          * However, if the cluster is taken from the current block group,
7132          * release the cluster first, so that we stand a better chance of
7133          * succeeding in the unclustered allocation.
7134          */
7135         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
7136                 spin_unlock(&last_ptr->refill_lock);
7137                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7138                 return -ENOENT;
7139         }
7140
7141         /* This cluster didn't work out, free it and start over */
7142         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7143
7144         if (cluster_bg != bg)
7145                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7146
7147 refill_cluster:
7148         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
7149                 spin_unlock(&last_ptr->refill_lock);
7150                 return -ENOENT;
7151         }
7152
7153         aligned_cluster = max_t(u64,
7154                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
7155                         bg->full_stripe_len);
7156         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
7157                         ffe_ctl->num_bytes, aligned_cluster);
7158         if (ret == 0) {
7159                 /* Now pull our allocation out of this cluster */
7160                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
7161                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
7162                                 &ffe_ctl->max_extent_size);
7163                 if (offset) {
7164                         /* We found one, proceed */
7165                         spin_unlock(&last_ptr->refill_lock);
7166                         trace_btrfs_reserve_extent_cluster(bg,
7167                                         ffe_ctl->search_start,
7168                                         ffe_ctl->num_bytes);
7169                         ffe_ctl->found_offset = offset;
7170                         return 0;
7171                 }
7172         } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
7173                    !ffe_ctl->retry_clustered) {
7174                 spin_unlock(&last_ptr->refill_lock);
7175
7176                 ffe_ctl->retry_clustered = true;
7177                 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7178                                 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
7179                 return -EAGAIN;
7180         }
7181         /*
7182          * At this point we either didn't find a cluster or we weren't able to
7183          * allocate a block from our cluster.  Free the cluster we've been
7184          * trying to use, and go to the next block group.
7185          */
7186         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7187         spin_unlock(&last_ptr->refill_lock);
7188         return 1;
7189 }
7190
7191 /*
7192  * Return >0 to inform caller that we find nothing
7193  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
7194  * Return -EAGAIN to inform caller that we need to re-search this block group
7195  */
7196 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
7197                 struct btrfs_free_cluster *last_ptr,
7198                 struct find_free_extent_ctl *ffe_ctl)
7199 {
7200         u64 offset;
7201
7202         /*
7203          * We are doing an unclustered allocation, set the fragmented flag so
7204          * we don't bother trying to setup a cluster again until we get more
7205          * space.
7206          */
7207         if (unlikely(last_ptr)) {
7208                 spin_lock(&last_ptr->lock);
7209                 last_ptr->fragmented = 1;
7210                 spin_unlock(&last_ptr->lock);
7211         }
7212         if (ffe_ctl->cached) {
7213                 struct btrfs_free_space_ctl *free_space_ctl;
7214
7215                 free_space_ctl = bg->free_space_ctl;
7216                 spin_lock(&free_space_ctl->tree_lock);
7217                 if (free_space_ctl->free_space <
7218                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
7219                     ffe_ctl->empty_size) {
7220                         ffe_ctl->total_free_space = max_t(u64,
7221                                         ffe_ctl->total_free_space,
7222                                         free_space_ctl->free_space);
7223                         spin_unlock(&free_space_ctl->tree_lock);
7224                         return 1;
7225                 }
7226                 spin_unlock(&free_space_ctl->tree_lock);
7227         }
7228
7229         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
7230                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
7231                         &ffe_ctl->max_extent_size);
7232
7233         /*
7234          * If we didn't find a chunk, and we haven't failed on this block group
7235          * before, and this block group is in the middle of caching and we are
7236          * ok with waiting, then go ahead and wait for progress to be made, and
7237          * set @retry_unclustered to true.
7238          *
7239          * If @retry_unclustered is true then we've already waited on this
7240          * block group once and should move on to the next block group.
7241          */
7242         if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
7243             ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
7244                 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7245                                                 ffe_ctl->empty_size);
7246                 ffe_ctl->retry_unclustered = true;
7247                 return -EAGAIN;
7248         } else if (!offset) {
7249                 return 1;
7250         }
7251         ffe_ctl->found_offset = offset;
7252         return 0;
7253 }
7254
7255 /*
7256  * Return >0 means caller needs to re-search for free extent
7257  * Return 0 means we have the needed free extent.
7258  * Return <0 means we failed to locate any free extent.
7259  */
7260 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
7261                                         struct btrfs_free_cluster *last_ptr,
7262                                         struct btrfs_key *ins,
7263                                         struct find_free_extent_ctl *ffe_ctl,
7264                                         int full_search, bool use_cluster)
7265 {
7266         struct btrfs_root *root = fs_info->extent_root;
7267         int ret;
7268
7269         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
7270             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
7271                 ffe_ctl->orig_have_caching_bg = true;
7272
7273         if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
7274             ffe_ctl->have_caching_bg)
7275                 return 1;
7276
7277         if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
7278                 return 1;
7279
7280         if (ins->objectid) {
7281                 if (!use_cluster && last_ptr) {
7282                         spin_lock(&last_ptr->lock);
7283                         last_ptr->window_start = ins->objectid;
7284                         spin_unlock(&last_ptr->lock);
7285                 }
7286                 return 0;
7287         }
7288
7289         /*
7290          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7291          *                      caching kthreads as we move along
7292          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7293          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7294          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7295          *                     again
7296          */
7297         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
7298                 ffe_ctl->index = 0;
7299                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
7300                         /*
7301                          * We want to skip the LOOP_CACHING_WAIT step if we
7302                          * don't have any uncached bgs and we've already done a
7303                          * full search through.
7304                          */
7305                         if (ffe_ctl->orig_have_caching_bg || !full_search)
7306                                 ffe_ctl->loop = LOOP_CACHING_WAIT;
7307                         else
7308                                 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
7309                 } else {
7310                         ffe_ctl->loop++;
7311                 }
7312
7313                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
7314                         struct btrfs_trans_handle *trans;
7315                         int exist = 0;
7316
7317                         trans = current->journal_info;
7318                         if (trans)
7319                                 exist = 1;
7320                         else
7321                                 trans = btrfs_join_transaction(root);
7322
7323                         if (IS_ERR(trans)) {
7324                                 ret = PTR_ERR(trans);
7325                                 return ret;
7326                         }
7327
7328                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
7329                                                 CHUNK_ALLOC_FORCE);
7330
7331                         /*
7332                          * If we can't allocate a new chunk we've already looped
7333                          * through at least once, move on to the NO_EMPTY_SIZE
7334                          * case.
7335                          */
7336                         if (ret == -ENOSPC)
7337                                 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
7338
7339                         /* Do not bail out on ENOSPC since we can do more. */
7340                         if (ret < 0 && ret != -ENOSPC)
7341                                 btrfs_abort_transaction(trans, ret);
7342                         else
7343                                 ret = 0;
7344                         if (!exist)
7345                                 btrfs_end_transaction(trans);
7346                         if (ret)
7347                                 return ret;
7348                 }
7349
7350                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
7351                         /*
7352                          * Don't loop again if we already have no empty_size and
7353                          * no empty_cluster.
7354                          */
7355                         if (ffe_ctl->empty_size == 0 &&
7356                             ffe_ctl->empty_cluster == 0)
7357                                 return -ENOSPC;
7358                         ffe_ctl->empty_size = 0;
7359                         ffe_ctl->empty_cluster = 0;
7360                 }
7361                 return 1;
7362         }
7363         return -ENOSPC;
7364 }
7365
7366 /*
7367  * walks the btree of allocated extents and find a hole of a given size.
7368  * The key ins is changed to record the hole:
7369  * ins->objectid == start position
7370  * ins->flags = BTRFS_EXTENT_ITEM_KEY
7371  * ins->offset == the size of the hole.
7372  * Any available blocks before search_start are skipped.
7373  *
7374  * If there is no suitable free space, we will record the max size of
7375  * the free space extent currently.
7376  *
7377  * The overall logic and call chain:
7378  *
7379  * find_free_extent()
7380  * |- Iterate through all block groups
7381  * |  |- Get a valid block group
7382  * |  |- Try to do clustered allocation in that block group
7383  * |  |- Try to do unclustered allocation in that block group
7384  * |  |- Check if the result is valid
7385  * |  |  |- If valid, then exit
7386  * |  |- Jump to next block group
7387  * |
7388  * |- Push harder to find free extents
7389  *    |- If not found, re-iterate all block groups
7390  */
7391 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7392                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7393                                 u64 hint_byte, struct btrfs_key *ins,
7394                                 u64 flags, int delalloc)
7395 {
7396         int ret = 0;
7397         struct btrfs_free_cluster *last_ptr = NULL;
7398         struct btrfs_block_group_cache *block_group = NULL;
7399         struct find_free_extent_ctl ffe_ctl = {0};
7400         struct btrfs_space_info *space_info;
7401         bool use_cluster = true;
7402         bool full_search = false;
7403
7404         WARN_ON(num_bytes < fs_info->sectorsize);
7405
7406         ffe_ctl.ram_bytes = ram_bytes;
7407         ffe_ctl.num_bytes = num_bytes;
7408         ffe_ctl.empty_size = empty_size;
7409         ffe_ctl.flags = flags;
7410         ffe_ctl.search_start = 0;
7411         ffe_ctl.retry_clustered = false;
7412         ffe_ctl.retry_unclustered = false;
7413         ffe_ctl.delalloc = delalloc;
7414         ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
7415         ffe_ctl.have_caching_bg = false;
7416         ffe_ctl.orig_have_caching_bg = false;
7417         ffe_ctl.found_offset = 0;
7418
7419         ins->type = BTRFS_EXTENT_ITEM_KEY;
7420         ins->objectid = 0;
7421         ins->offset = 0;
7422
7423         trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7424
7425         space_info = btrfs_find_space_info(fs_info, flags);
7426         if (!space_info) {
7427                 btrfs_err(fs_info, "No space info for %llu", flags);
7428                 return -ENOSPC;
7429         }
7430
7431         /*
7432          * If our free space is heavily fragmented we may not be able to make
7433          * big contiguous allocations, so instead of doing the expensive search
7434          * for free space, simply return ENOSPC with our max_extent_size so we
7435          * can go ahead and search for a more manageable chunk.
7436          *
7437          * If our max_extent_size is large enough for our allocation simply
7438          * disable clustering since we will likely not be able to find enough
7439          * space to create a cluster and induce latency trying.
7440          */
7441         if (unlikely(space_info->max_extent_size)) {
7442                 spin_lock(&space_info->lock);
7443                 if (space_info->max_extent_size &&
7444                     num_bytes > space_info->max_extent_size) {
7445                         ins->offset = space_info->max_extent_size;
7446                         spin_unlock(&space_info->lock);
7447                         return -ENOSPC;
7448                 } else if (space_info->max_extent_size) {
7449                         use_cluster = false;
7450                 }
7451                 spin_unlock(&space_info->lock);
7452         }
7453
7454         last_ptr = fetch_cluster_info(fs_info, space_info,
7455                                       &ffe_ctl.empty_cluster);
7456         if (last_ptr) {
7457                 spin_lock(&last_ptr->lock);
7458                 if (last_ptr->block_group)
7459                         hint_byte = last_ptr->window_start;
7460                 if (last_ptr->fragmented) {
7461                         /*
7462                          * We still set window_start so we can keep track of the
7463                          * last place we found an allocation to try and save
7464                          * some time.
7465                          */
7466                         hint_byte = last_ptr->window_start;
7467                         use_cluster = false;
7468                 }
7469                 spin_unlock(&last_ptr->lock);
7470         }
7471
7472         ffe_ctl.search_start = max(ffe_ctl.search_start,
7473                                    first_logical_byte(fs_info, 0));
7474         ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
7475         if (ffe_ctl.search_start == hint_byte) {
7476                 block_group = btrfs_lookup_block_group(fs_info,
7477                                                        ffe_ctl.search_start);
7478                 /*
7479                  * we don't want to use the block group if it doesn't match our
7480                  * allocation bits, or if its not cached.
7481                  *
7482                  * However if we are re-searching with an ideal block group
7483                  * picked out then we don't care that the block group is cached.
7484                  */
7485                 if (block_group && block_group_bits(block_group, flags) &&
7486                     block_group->cached != BTRFS_CACHE_NO) {
7487                         down_read(&space_info->groups_sem);
7488                         if (list_empty(&block_group->list) ||
7489                             block_group->ro) {
7490                                 /*
7491                                  * someone is removing this block group,
7492                                  * we can't jump into the have_block_group
7493                                  * target because our list pointers are not
7494                                  * valid
7495                                  */
7496                                 btrfs_put_block_group(block_group);
7497                                 up_read(&space_info->groups_sem);
7498                         } else {
7499                                 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
7500                                                 block_group->flags);
7501                                 btrfs_lock_block_group(block_group, delalloc);
7502                                 goto have_block_group;
7503                         }
7504                 } else if (block_group) {
7505                         btrfs_put_block_group(block_group);
7506                 }
7507         }
7508 search:
7509         ffe_ctl.have_caching_bg = false;
7510         if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
7511             ffe_ctl.index == 0)
7512                 full_search = true;
7513         down_read(&space_info->groups_sem);
7514         list_for_each_entry(block_group,
7515                             &space_info->block_groups[ffe_ctl.index], list) {
7516                 /* If the block group is read-only, we can skip it entirely. */
7517                 if (unlikely(block_group->ro))
7518                         continue;
7519
7520                 btrfs_grab_block_group(block_group, delalloc);
7521                 ffe_ctl.search_start = block_group->key.objectid;
7522
7523                 /*
7524                  * this can happen if we end up cycling through all the
7525                  * raid types, but we want to make sure we only allocate
7526                  * for the proper type.
7527                  */
7528                 if (!block_group_bits(block_group, flags)) {
7529                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
7530                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
7531                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
7532                                 BTRFS_BLOCK_GROUP_RAID10;
7533
7534                         /*
7535                          * if they asked for extra copies and this block group
7536                          * doesn't provide them, bail.  This does allow us to
7537                          * fill raid0 from raid1.
7538                          */
7539                         if ((flags & extra) && !(block_group->flags & extra))
7540                                 goto loop;
7541                 }
7542
7543 have_block_group:
7544                 ffe_ctl.cached = block_group_cache_done(block_group);
7545                 if (unlikely(!ffe_ctl.cached)) {
7546                         ffe_ctl.have_caching_bg = true;
7547                         ret = cache_block_group(block_group, 0);
7548                         BUG_ON(ret < 0);
7549                         ret = 0;
7550                 }
7551
7552                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7553                         goto loop;
7554
7555                 /*
7556                  * Ok we want to try and use the cluster allocator, so
7557                  * lets look there
7558                  */
7559                 if (last_ptr && use_cluster) {
7560                         struct btrfs_block_group_cache *cluster_bg = NULL;
7561
7562                         ret = find_free_extent_clustered(block_group, last_ptr,
7563                                                          &ffe_ctl, &cluster_bg);
7564
7565                         if (ret == 0) {
7566                                 if (cluster_bg && cluster_bg != block_group) {
7567                                         btrfs_release_block_group(block_group,
7568                                                                   delalloc);
7569                                         block_group = cluster_bg;
7570                                 }
7571                                 goto checks;
7572                         } else if (ret == -EAGAIN) {
7573                                 goto have_block_group;
7574                         } else if (ret > 0) {
7575                                 goto loop;
7576                         }
7577                         /* ret == -ENOENT case falls through */
7578                 }
7579
7580                 ret = find_free_extent_unclustered(block_group, last_ptr,
7581                                                    &ffe_ctl);
7582                 if (ret == -EAGAIN)
7583                         goto have_block_group;
7584                 else if (ret > 0)
7585                         goto loop;
7586                 /* ret == 0 case falls through */
7587 checks:
7588                 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
7589                                              fs_info->stripesize);
7590
7591                 /* move on to the next group */
7592                 if (ffe_ctl.search_start + num_bytes >
7593                     block_group->key.objectid + block_group->key.offset) {
7594                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7595                                              num_bytes);
7596                         goto loop;
7597                 }
7598
7599                 if (ffe_ctl.found_offset < ffe_ctl.search_start)
7600                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7601                                 ffe_ctl.search_start - ffe_ctl.found_offset);
7602
7603                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7604                                 num_bytes, delalloc);
7605                 if (ret == -EAGAIN) {
7606                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7607                                              num_bytes);
7608                         goto loop;
7609                 }
7610                 btrfs_inc_block_group_reservations(block_group);
7611
7612                 /* we are all good, lets return */
7613                 ins->objectid = ffe_ctl.search_start;
7614                 ins->offset = num_bytes;
7615
7616                 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
7617                                            num_bytes);
7618                 btrfs_release_block_group(block_group, delalloc);
7619                 break;
7620 loop:
7621                 ffe_ctl.retry_clustered = false;
7622                 ffe_ctl.retry_unclustered = false;
7623                 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7624                        ffe_ctl.index);
7625                 btrfs_release_block_group(block_group, delalloc);
7626                 cond_resched();
7627         }
7628         up_read(&space_info->groups_sem);
7629
7630         ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
7631                                            full_search, use_cluster);
7632         if (ret > 0)
7633                 goto search;
7634
7635         if (ret == -ENOSPC) {
7636                 /*
7637                  * Use ffe_ctl->total_free_space as fallback if we can't find
7638                  * any contiguous hole.
7639                  */
7640                 if (!ffe_ctl.max_extent_size)
7641                         ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
7642                 spin_lock(&space_info->lock);
7643                 space_info->max_extent_size = ffe_ctl.max_extent_size;
7644                 spin_unlock(&space_info->lock);
7645                 ins->offset = ffe_ctl.max_extent_size;
7646         }
7647         return ret;
7648 }
7649
7650 #define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
7651 do {                                                                    \
7652         struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
7653         spin_lock(&__rsv->lock);                                        \
7654         btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
7655                    __rsv->size, __rsv->reserved);                       \
7656         spin_unlock(&__rsv->lock);                                      \
7657 } while (0)
7658
7659 static void dump_space_info(struct btrfs_fs_info *fs_info,
7660                             struct btrfs_space_info *info, u64 bytes,
7661                             int dump_block_groups)
7662 {
7663         struct btrfs_block_group_cache *cache;
7664         int index = 0;
7665
7666         spin_lock(&info->lock);
7667         btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7668                    info->flags,
7669                    info->total_bytes - btrfs_space_info_used(info, true),
7670                    info->full ? "" : "not ");
7671         btrfs_info(fs_info,
7672                 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7673                 info->total_bytes, info->bytes_used, info->bytes_pinned,
7674                 info->bytes_reserved, info->bytes_may_use,
7675                 info->bytes_readonly);
7676         spin_unlock(&info->lock);
7677
7678         DUMP_BLOCK_RSV(fs_info, global_block_rsv);
7679         DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
7680         DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
7681         DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
7682         DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
7683
7684         if (!dump_block_groups)
7685                 return;
7686
7687         down_read(&info->groups_sem);
7688 again:
7689         list_for_each_entry(cache, &info->block_groups[index], list) {
7690                 spin_lock(&cache->lock);
7691                 btrfs_info(fs_info,
7692                         "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7693                         cache->key.objectid, cache->key.offset,
7694                         btrfs_block_group_used(&cache->item), cache->pinned,
7695                         cache->reserved, cache->ro ? "[readonly]" : "");
7696                 btrfs_dump_free_space(cache, bytes);
7697                 spin_unlock(&cache->lock);
7698         }
7699         if (++index < BTRFS_NR_RAID_TYPES)
7700                 goto again;
7701         up_read(&info->groups_sem);
7702 }
7703
7704 /*
7705  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
7706  *                        hole that is at least as big as @num_bytes.
7707  *
7708  * @root           -    The root that will contain this extent
7709  *
7710  * @ram_bytes      -    The amount of space in ram that @num_bytes take. This
7711  *                      is used for accounting purposes. This value differs
7712  *                      from @num_bytes only in the case of compressed extents.
7713  *
7714  * @num_bytes      -    Number of bytes to allocate on-disk.
7715  *
7716  * @min_alloc_size -    Indicates the minimum amount of space that the
7717  *                      allocator should try to satisfy. In some cases
7718  *                      @num_bytes may be larger than what is required and if
7719  *                      the filesystem is fragmented then allocation fails.
7720  *                      However, the presence of @min_alloc_size gives a
7721  *                      chance to try and satisfy the smaller allocation.
7722  *
7723  * @empty_size     -    A hint that you plan on doing more COW. This is the
7724  *                      size in bytes the allocator should try to find free
7725  *                      next to the block it returns.  This is just a hint and
7726  *                      may be ignored by the allocator.
7727  *
7728  * @hint_byte      -    Hint to the allocator to start searching above the byte
7729  *                      address passed. It might be ignored.
7730  *
7731  * @ins            -    This key is modified to record the found hole. It will
7732  *                      have the following values:
7733  *                      ins->objectid == start position
7734  *                      ins->flags = BTRFS_EXTENT_ITEM_KEY
7735  *                      ins->offset == the size of the hole.
7736  *
7737  * @is_data        -    Boolean flag indicating whether an extent is
7738  *                      allocated for data (true) or metadata (false)
7739  *
7740  * @delalloc       -    Boolean flag indicating whether this allocation is for
7741  *                      delalloc or not. If 'true' data_rwsem of block groups
7742  *                      is going to be acquired.
7743  *
7744  *
7745  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
7746  * case -ENOSPC is returned then @ins->offset will contain the size of the
7747  * largest available hole the allocator managed to find.
7748  */
7749 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7750                          u64 num_bytes, u64 min_alloc_size,
7751                          u64 empty_size, u64 hint_byte,
7752                          struct btrfs_key *ins, int is_data, int delalloc)
7753 {
7754         struct btrfs_fs_info *fs_info = root->fs_info;
7755         bool final_tried = num_bytes == min_alloc_size;
7756         u64 flags;
7757         int ret;
7758
7759         flags = get_alloc_profile_by_root(root, is_data);
7760 again:
7761         WARN_ON(num_bytes < fs_info->sectorsize);
7762         ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
7763                                hint_byte, ins, flags, delalloc);
7764         if (!ret && !is_data) {
7765                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
7766         } else if (ret == -ENOSPC) {
7767                 if (!final_tried && ins->offset) {
7768                         num_bytes = min(num_bytes >> 1, ins->offset);
7769                         num_bytes = round_down(num_bytes,
7770                                                fs_info->sectorsize);
7771                         num_bytes = max(num_bytes, min_alloc_size);
7772                         ram_bytes = num_bytes;
7773                         if (num_bytes == min_alloc_size)
7774                                 final_tried = true;
7775                         goto again;
7776                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
7777                         struct btrfs_space_info *sinfo;
7778
7779                         sinfo = btrfs_find_space_info(fs_info, flags);
7780                         btrfs_err(fs_info,
7781                                   "allocation failed flags %llu, wanted %llu",
7782                                   flags, num_bytes);
7783                         if (sinfo)
7784                                 dump_space_info(fs_info, sinfo, num_bytes, 1);
7785                 }
7786         }
7787
7788         return ret;
7789 }
7790
7791 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7792                                         u64 start, u64 len,
7793                                         int pin, int delalloc)
7794 {
7795         struct btrfs_block_group_cache *cache;
7796         int ret = 0;
7797
7798         cache = btrfs_lookup_block_group(fs_info, start);
7799         if (!cache) {
7800                 btrfs_err(fs_info, "Unable to find block group for %llu",
7801                           start);
7802                 return -ENOSPC;
7803         }
7804
7805         if (pin)
7806                 pin_down_extent(cache, start, len, 1);
7807         else {
7808                 if (btrfs_test_opt(fs_info, DISCARD))
7809                         ret = btrfs_discard_extent(fs_info, start, len, NULL);
7810                 btrfs_add_free_space(cache, start, len);
7811                 btrfs_free_reserved_bytes(cache, len, delalloc);
7812                 trace_btrfs_reserved_extent_free(fs_info, start, len);
7813         }
7814
7815         btrfs_put_block_group(cache);
7816         return ret;
7817 }
7818
7819 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7820                                u64 start, u64 len, int delalloc)
7821 {
7822         return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
7823 }
7824
7825 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
7826                                        u64 start, u64 len)
7827 {
7828         return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
7829 }
7830
7831 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7832                                       u64 parent, u64 root_objectid,
7833                                       u64 flags, u64 owner, u64 offset,
7834                                       struct btrfs_key *ins, int ref_mod)
7835 {
7836         struct btrfs_fs_info *fs_info = trans->fs_info;
7837         int ret;
7838         struct btrfs_extent_item *extent_item;
7839         struct btrfs_extent_inline_ref *iref;
7840         struct btrfs_path *path;
7841         struct extent_buffer *leaf;
7842         int type;
7843         u32 size;
7844
7845         if (parent > 0)
7846                 type = BTRFS_SHARED_DATA_REF_KEY;
7847         else
7848                 type = BTRFS_EXTENT_DATA_REF_KEY;
7849
7850         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7851
7852         path = btrfs_alloc_path();
7853         if (!path)
7854                 return -ENOMEM;
7855
7856         path->leave_spinning = 1;
7857         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7858                                       ins, size);
7859         if (ret) {
7860                 btrfs_free_path(path);
7861                 return ret;
7862         }
7863
7864         leaf = path->nodes[0];
7865         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7866                                      struct btrfs_extent_item);
7867         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7868         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7869         btrfs_set_extent_flags(leaf, extent_item,
7870                                flags | BTRFS_EXTENT_FLAG_DATA);
7871
7872         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7873         btrfs_set_extent_inline_ref_type(leaf, iref, type);
7874         if (parent > 0) {
7875                 struct btrfs_shared_data_ref *ref;
7876                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7877                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7878                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7879         } else {
7880                 struct btrfs_extent_data_ref *ref;
7881                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7882                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7883                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7884                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7885                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7886         }
7887
7888         btrfs_mark_buffer_dirty(path->nodes[0]);
7889         btrfs_free_path(path);
7890
7891         ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
7892         if (ret)
7893                 return ret;
7894
7895         ret = update_block_group(trans, ins->objectid, ins->offset, 1);
7896         if (ret) { /* -ENOENT, logic error */
7897                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7898                         ins->objectid, ins->offset);
7899                 BUG();
7900         }
7901         trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
7902         return ret;
7903 }
7904
7905 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7906                                      struct btrfs_delayed_ref_node *node,
7907                                      struct btrfs_delayed_extent_op *extent_op)
7908 {
7909         struct btrfs_fs_info *fs_info = trans->fs_info;
7910         int ret;
7911         struct btrfs_extent_item *extent_item;
7912         struct btrfs_key extent_key;
7913         struct btrfs_tree_block_info *block_info;
7914         struct btrfs_extent_inline_ref *iref;
7915         struct btrfs_path *path;
7916         struct extent_buffer *leaf;
7917         struct btrfs_delayed_tree_ref *ref;
7918         u32 size = sizeof(*extent_item) + sizeof(*iref);
7919         u64 num_bytes;
7920         u64 flags = extent_op->flags_to_set;
7921         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
7922
7923         ref = btrfs_delayed_node_to_tree_ref(node);
7924
7925         extent_key.objectid = node->bytenr;
7926         if (skinny_metadata) {
7927                 extent_key.offset = ref->level;
7928                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
7929                 num_bytes = fs_info->nodesize;
7930         } else {
7931                 extent_key.offset = node->num_bytes;
7932                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
7933                 size += sizeof(*block_info);
7934                 num_bytes = node->num_bytes;
7935         }
7936
7937         path = btrfs_alloc_path();
7938         if (!path)
7939                 return -ENOMEM;
7940
7941         path->leave_spinning = 1;
7942         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7943                                       &extent_key, size);
7944         if (ret) {
7945                 btrfs_free_path(path);
7946                 return ret;
7947         }
7948
7949         leaf = path->nodes[0];
7950         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7951                                      struct btrfs_extent_item);
7952         btrfs_set_extent_refs(leaf, extent_item, 1);
7953         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7954         btrfs_set_extent_flags(leaf, extent_item,
7955                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7956
7957         if (skinny_metadata) {
7958                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7959         } else {
7960                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7961                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
7962                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
7963                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7964         }
7965
7966         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
7967                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7968                 btrfs_set_extent_inline_ref_type(leaf, iref,
7969                                                  BTRFS_SHARED_BLOCK_REF_KEY);
7970                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
7971         } else {
7972                 btrfs_set_extent_inline_ref_type(leaf, iref,
7973                                                  BTRFS_TREE_BLOCK_REF_KEY);
7974                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
7975         }
7976
7977         btrfs_mark_buffer_dirty(leaf);
7978         btrfs_free_path(path);
7979
7980         ret = remove_from_free_space_tree(trans, extent_key.objectid,
7981                                           num_bytes);
7982         if (ret)
7983                 return ret;
7984
7985         ret = update_block_group(trans, extent_key.objectid,
7986                                  fs_info->nodesize, 1);
7987         if (ret) { /* -ENOENT, logic error */
7988                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7989                         extent_key.objectid, extent_key.offset);
7990                 BUG();
7991         }
7992
7993         trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
7994                                           fs_info->nodesize);
7995         return ret;
7996 }
7997
7998 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7999                                      struct btrfs_root *root, u64 owner,
8000                                      u64 offset, u64 ram_bytes,
8001                                      struct btrfs_key *ins)
8002 {
8003         struct btrfs_ref generic_ref = { 0 };
8004         int ret;
8005
8006         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8007
8008         btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
8009                                ins->objectid, ins->offset, 0);
8010         btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
8011         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
8012         ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
8013                                          ram_bytes, NULL, NULL);
8014         return ret;
8015 }
8016
8017 /*
8018  * this is used by the tree logging recovery code.  It records that
8019  * an extent has been allocated and makes sure to clear the free
8020  * space cache bits as well
8021  */
8022 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8023                                    u64 root_objectid, u64 owner, u64 offset,
8024                                    struct btrfs_key *ins)
8025 {
8026         struct btrfs_fs_info *fs_info = trans->fs_info;
8027         int ret;
8028         struct btrfs_block_group_cache *block_group;
8029         struct btrfs_space_info *space_info;
8030
8031         /*
8032          * Mixed block groups will exclude before processing the log so we only
8033          * need to do the exclude dance if this fs isn't mixed.
8034          */
8035         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8036                 ret = __exclude_logged_extent(fs_info, ins->objectid,
8037                                               ins->offset);
8038                 if (ret)
8039                         return ret;
8040         }
8041
8042         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8043         if (!block_group)
8044                 return -EINVAL;
8045
8046         space_info = block_group->space_info;
8047         spin_lock(&space_info->lock);
8048         spin_lock(&block_group->lock);
8049         space_info->bytes_reserved += ins->offset;
8050         block_group->reserved += ins->offset;
8051         spin_unlock(&block_group->lock);
8052         spin_unlock(&space_info->lock);
8053
8054         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
8055                                          offset, ins, 1);
8056         btrfs_put_block_group(block_group);
8057         return ret;
8058 }
8059
8060 static struct extent_buffer *
8061 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8062                       u64 bytenr, int level, u64 owner)
8063 {
8064         struct btrfs_fs_info *fs_info = root->fs_info;
8065         struct extent_buffer *buf;
8066
8067         buf = btrfs_find_create_tree_block(fs_info, bytenr);
8068         if (IS_ERR(buf))
8069                 return buf;
8070
8071         /*
8072          * Extra safety check in case the extent tree is corrupted and extent
8073          * allocator chooses to use a tree block which is already used and
8074          * locked.
8075          */
8076         if (buf->lock_owner == current->pid) {
8077                 btrfs_err_rl(fs_info,
8078 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
8079                         buf->start, btrfs_header_owner(buf), current->pid);
8080                 free_extent_buffer(buf);
8081                 return ERR_PTR(-EUCLEAN);
8082         }
8083
8084         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8085         btrfs_tree_lock(buf);
8086         btrfs_clean_tree_block(buf);
8087         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8088
8089         btrfs_set_lock_blocking_write(buf);
8090         set_extent_buffer_uptodate(buf);
8091
8092         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
8093         btrfs_set_header_level(buf, level);
8094         btrfs_set_header_bytenr(buf, buf->start);
8095         btrfs_set_header_generation(buf, trans->transid);
8096         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
8097         btrfs_set_header_owner(buf, owner);
8098         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
8099         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
8100         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8101                 buf->log_index = root->log_transid % 2;
8102                 /*
8103                  * we allow two log transactions at a time, use different
8104                  * EXTENT bit to differentiate dirty pages.
8105                  */
8106                 if (buf->log_index == 0)
8107                         set_extent_dirty(&root->dirty_log_pages, buf->start,
8108                                         buf->start + buf->len - 1, GFP_NOFS);
8109                 else
8110                         set_extent_new(&root->dirty_log_pages, buf->start,
8111                                         buf->start + buf->len - 1);
8112         } else {
8113                 buf->log_index = -1;
8114                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8115                          buf->start + buf->len - 1, GFP_NOFS);
8116         }
8117         trans->dirty = true;
8118         /* this returns a buffer locked for blocking */
8119         return buf;
8120 }
8121
8122 static struct btrfs_block_rsv *
8123 use_block_rsv(struct btrfs_trans_handle *trans,
8124               struct btrfs_root *root, u32 blocksize)
8125 {
8126         struct btrfs_fs_info *fs_info = root->fs_info;
8127         struct btrfs_block_rsv *block_rsv;
8128         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8129         int ret;
8130         bool global_updated = false;
8131
8132         block_rsv = get_block_rsv(trans, root);
8133
8134         if (unlikely(block_rsv->size == 0))
8135                 goto try_reserve;
8136 again:
8137         ret = block_rsv_use_bytes(block_rsv, blocksize);
8138         if (!ret)
8139                 return block_rsv;
8140
8141         if (block_rsv->failfast)
8142                 return ERR_PTR(ret);
8143
8144         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8145                 global_updated = true;
8146                 update_global_block_rsv(fs_info);
8147                 goto again;
8148         }
8149
8150         /*
8151          * The global reserve still exists to save us from ourselves, so don't
8152          * warn_on if we are short on our delayed refs reserve.
8153          */
8154         if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
8155             btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8156                 static DEFINE_RATELIMIT_STATE(_rs,
8157                                 DEFAULT_RATELIMIT_INTERVAL * 10,
8158                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
8159                 if (__ratelimit(&_rs))
8160                         WARN(1, KERN_DEBUG
8161                                 "BTRFS: block rsv returned %d\n", ret);
8162         }
8163 try_reserve:
8164         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8165                                      BTRFS_RESERVE_NO_FLUSH);
8166         if (!ret)
8167                 return block_rsv;
8168         /*
8169          * If we couldn't reserve metadata bytes try and use some from
8170          * the global reserve if its space type is the same as the global
8171          * reservation.
8172          */
8173         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8174             block_rsv->space_info == global_rsv->space_info) {
8175                 ret = block_rsv_use_bytes(global_rsv, blocksize);
8176                 if (!ret)
8177                         return global_rsv;
8178         }
8179         return ERR_PTR(ret);
8180 }
8181
8182 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8183                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
8184 {
8185         block_rsv_add_bytes(block_rsv, blocksize, false);
8186         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8187 }
8188
8189 /*
8190  * finds a free extent and does all the dirty work required for allocation
8191  * returns the tree buffer or an ERR_PTR on error.
8192  */
8193 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8194                                              struct btrfs_root *root,
8195                                              u64 parent, u64 root_objectid,
8196                                              const struct btrfs_disk_key *key,
8197                                              int level, u64 hint,
8198                                              u64 empty_size)
8199 {
8200         struct btrfs_fs_info *fs_info = root->fs_info;
8201         struct btrfs_key ins;
8202         struct btrfs_block_rsv *block_rsv;
8203         struct extent_buffer *buf;
8204         struct btrfs_delayed_extent_op *extent_op;
8205         struct btrfs_ref generic_ref = { 0 };
8206         u64 flags = 0;
8207         int ret;
8208         u32 blocksize = fs_info->nodesize;
8209         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8210
8211 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8212         if (btrfs_is_testing(fs_info)) {
8213                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8214                                             level, root_objectid);
8215                 if (!IS_ERR(buf))
8216                         root->alloc_bytenr += blocksize;
8217                 return buf;
8218         }
8219 #endif
8220
8221         block_rsv = use_block_rsv(trans, root, blocksize);
8222         if (IS_ERR(block_rsv))
8223                 return ERR_CAST(block_rsv);
8224
8225         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8226                                    empty_size, hint, &ins, 0, 0);
8227         if (ret)
8228                 goto out_unuse;
8229
8230         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8231                                     root_objectid);
8232         if (IS_ERR(buf)) {
8233                 ret = PTR_ERR(buf);
8234                 goto out_free_reserved;
8235         }
8236
8237         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8238                 if (parent == 0)
8239                         parent = ins.objectid;
8240                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8241         } else
8242                 BUG_ON(parent > 0);
8243
8244         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8245                 extent_op = btrfs_alloc_delayed_extent_op();
8246                 if (!extent_op) {
8247                         ret = -ENOMEM;
8248                         goto out_free_buf;
8249                 }
8250                 if (key)
8251                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8252                 else
8253                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8254                 extent_op->flags_to_set = flags;
8255                 extent_op->update_key = skinny_metadata ? false : true;
8256                 extent_op->update_flags = true;
8257                 extent_op->is_data = false;
8258                 extent_op->level = level;
8259
8260                 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
8261                                        ins.objectid, ins.offset, parent);
8262                 generic_ref.real_root = root->root_key.objectid;
8263                 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
8264                 btrfs_ref_tree_mod(fs_info, &generic_ref);
8265                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
8266                                                  extent_op, NULL, NULL);
8267                 if (ret)
8268                         goto out_free_delayed;
8269         }
8270         return buf;
8271
8272 out_free_delayed:
8273         btrfs_free_delayed_extent_op(extent_op);
8274 out_free_buf:
8275         free_extent_buffer(buf);
8276 out_free_reserved:
8277         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8278 out_unuse:
8279         unuse_block_rsv(fs_info, block_rsv, blocksize);
8280         return ERR_PTR(ret);
8281 }
8282
8283 struct walk_control {
8284         u64 refs[BTRFS_MAX_LEVEL];
8285         u64 flags[BTRFS_MAX_LEVEL];
8286         struct btrfs_key update_progress;
8287         struct btrfs_key drop_progress;
8288         int drop_level;
8289         int stage;
8290         int level;
8291         int shared_level;
8292         int update_ref;
8293         int keep_locks;
8294         int reada_slot;
8295         int reada_count;
8296         int restarted;
8297 };
8298
8299 #define DROP_REFERENCE  1
8300 #define UPDATE_BACKREF  2
8301
8302 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8303                                      struct btrfs_root *root,
8304                                      struct walk_control *wc,
8305                                      struct btrfs_path *path)
8306 {
8307         struct btrfs_fs_info *fs_info = root->fs_info;
8308         u64 bytenr;
8309         u64 generation;
8310         u64 refs;
8311         u64 flags;
8312         u32 nritems;
8313         struct btrfs_key key;
8314         struct extent_buffer *eb;
8315         int ret;
8316         int slot;
8317         int nread = 0;
8318
8319         if (path->slots[wc->level] < wc->reada_slot) {
8320                 wc->reada_count = wc->reada_count * 2 / 3;
8321                 wc->reada_count = max(wc->reada_count, 2);
8322         } else {
8323                 wc->reada_count = wc->reada_count * 3 / 2;
8324                 wc->reada_count = min_t(int, wc->reada_count,
8325                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8326         }
8327
8328         eb = path->nodes[wc->level];
8329         nritems = btrfs_header_nritems(eb);
8330
8331         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8332                 if (nread >= wc->reada_count)
8333                         break;
8334
8335                 cond_resched();
8336                 bytenr = btrfs_node_blockptr(eb, slot);
8337                 generation = btrfs_node_ptr_generation(eb, slot);
8338
8339                 if (slot == path->slots[wc->level])
8340                         goto reada;
8341
8342                 if (wc->stage == UPDATE_BACKREF &&
8343                     generation <= root->root_key.offset)
8344                         continue;
8345
8346                 /* We don't lock the tree block, it's OK to be racy here */
8347                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8348                                                wc->level - 1, 1, &refs,
8349                                                &flags);
8350                 /* We don't care about errors in readahead. */
8351                 if (ret < 0)
8352                         continue;
8353                 BUG_ON(refs == 0);
8354
8355                 if (wc->stage == DROP_REFERENCE) {
8356                         if (refs == 1)
8357                                 goto reada;
8358
8359                         if (wc->level == 1 &&
8360                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8361                                 continue;
8362                         if (!wc->update_ref ||
8363                             generation <= root->root_key.offset)
8364                                 continue;
8365                         btrfs_node_key_to_cpu(eb, &key, slot);
8366                         ret = btrfs_comp_cpu_keys(&key,
8367                                                   &wc->update_progress);
8368                         if (ret < 0)
8369                                 continue;
8370                 } else {
8371                         if (wc->level == 1 &&
8372                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8373                                 continue;
8374                 }
8375 reada:
8376                 readahead_tree_block(fs_info, bytenr);
8377                 nread++;
8378         }
8379         wc->reada_slot = slot;
8380 }
8381
8382 /*
8383  * helper to process tree block while walking down the tree.
8384  *
8385  * when wc->stage == UPDATE_BACKREF, this function updates
8386  * back refs for pointers in the block.
8387  *
8388  * NOTE: return value 1 means we should stop walking down.
8389  */
8390 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8391                                    struct btrfs_root *root,
8392                                    struct btrfs_path *path,
8393                                    struct walk_control *wc, int lookup_info)
8394 {
8395         struct btrfs_fs_info *fs_info = root->fs_info;
8396         int level = wc->level;
8397         struct extent_buffer *eb = path->nodes[level];
8398         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8399         int ret;
8400
8401         if (wc->stage == UPDATE_BACKREF &&
8402             btrfs_header_owner(eb) != root->root_key.objectid)
8403                 return 1;
8404
8405         /*
8406          * when reference count of tree block is 1, it won't increase
8407          * again. once full backref flag is set, we never clear it.
8408          */
8409         if (lookup_info &&
8410             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8411              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8412                 BUG_ON(!path->locks[level]);
8413                 ret = btrfs_lookup_extent_info(trans, fs_info,
8414                                                eb->start, level, 1,
8415                                                &wc->refs[level],
8416                                                &wc->flags[level]);
8417                 BUG_ON(ret == -ENOMEM);
8418                 if (ret)
8419                         return ret;
8420                 BUG_ON(wc->refs[level] == 0);
8421         }
8422
8423         if (wc->stage == DROP_REFERENCE) {
8424                 if (wc->refs[level] > 1)
8425                         return 1;
8426
8427                 if (path->locks[level] && !wc->keep_locks) {
8428                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8429                         path->locks[level] = 0;
8430                 }
8431                 return 0;
8432         }
8433
8434         /* wc->stage == UPDATE_BACKREF */
8435         if (!(wc->flags[level] & flag)) {
8436                 BUG_ON(!path->locks[level]);
8437                 ret = btrfs_inc_ref(trans, root, eb, 1);
8438                 BUG_ON(ret); /* -ENOMEM */
8439                 ret = btrfs_dec_ref(trans, root, eb, 0);
8440                 BUG_ON(ret); /* -ENOMEM */
8441                 ret = btrfs_set_disk_extent_flags(trans, eb->start,
8442                                                   eb->len, flag,
8443                                                   btrfs_header_level(eb), 0);
8444                 BUG_ON(ret); /* -ENOMEM */
8445                 wc->flags[level] |= flag;
8446         }
8447
8448         /*
8449          * the block is shared by multiple trees, so it's not good to
8450          * keep the tree lock
8451          */
8452         if (path->locks[level] && level > 0) {
8453                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8454                 path->locks[level] = 0;
8455         }
8456         return 0;
8457 }
8458
8459 /*
8460  * This is used to verify a ref exists for this root to deal with a bug where we
8461  * would have a drop_progress key that hadn't been updated properly.
8462  */
8463 static int check_ref_exists(struct btrfs_trans_handle *trans,
8464                             struct btrfs_root *root, u64 bytenr, u64 parent,
8465                             int level)
8466 {
8467         struct btrfs_path *path;
8468         struct btrfs_extent_inline_ref *iref;
8469         int ret;
8470
8471         path = btrfs_alloc_path();
8472         if (!path)
8473                 return -ENOMEM;
8474
8475         ret = lookup_extent_backref(trans, path, &iref, bytenr,
8476                                     root->fs_info->nodesize, parent,
8477                                     root->root_key.objectid, level, 0);
8478         btrfs_free_path(path);
8479         if (ret == -ENOENT)
8480                 return 0;
8481         if (ret < 0)
8482                 return ret;
8483         return 1;
8484 }
8485
8486 /*
8487  * helper to process tree block pointer.
8488  *
8489  * when wc->stage == DROP_REFERENCE, this function checks
8490  * reference count of the block pointed to. if the block
8491  * is shared and we need update back refs for the subtree
8492  * rooted at the block, this function changes wc->stage to
8493  * UPDATE_BACKREF. if the block is shared and there is no
8494  * need to update back, this function drops the reference
8495  * to the block.
8496  *
8497  * NOTE: return value 1 means we should stop walking down.
8498  */
8499 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8500                                  struct btrfs_root *root,
8501                                  struct btrfs_path *path,
8502                                  struct walk_control *wc, int *lookup_info)
8503 {
8504         struct btrfs_fs_info *fs_info = root->fs_info;
8505         u64 bytenr;
8506         u64 generation;
8507         u64 parent;
8508         struct btrfs_key key;
8509         struct btrfs_key first_key;
8510         struct btrfs_ref ref = { 0 };
8511         struct extent_buffer *next;
8512         int level = wc->level;
8513         int reada = 0;
8514         int ret = 0;
8515         bool need_account = false;
8516
8517         generation = btrfs_node_ptr_generation(path->nodes[level],
8518                                                path->slots[level]);
8519         /*
8520          * if the lower level block was created before the snapshot
8521          * was created, we know there is no need to update back refs
8522          * for the subtree
8523          */
8524         if (wc->stage == UPDATE_BACKREF &&
8525             generation <= root->root_key.offset) {
8526                 *lookup_info = 1;
8527                 return 1;
8528         }
8529
8530         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8531         btrfs_node_key_to_cpu(path->nodes[level], &first_key,
8532                               path->slots[level]);
8533
8534         next = find_extent_buffer(fs_info, bytenr);
8535         if (!next) {
8536                 next = btrfs_find_create_tree_block(fs_info, bytenr);
8537                 if (IS_ERR(next))
8538                         return PTR_ERR(next);
8539
8540                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8541                                                level - 1);
8542                 reada = 1;
8543         }
8544         btrfs_tree_lock(next);
8545         btrfs_set_lock_blocking_write(next);
8546
8547         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8548                                        &wc->refs[level - 1],
8549                                        &wc->flags[level - 1]);
8550         if (ret < 0)
8551                 goto out_unlock;
8552
8553         if (unlikely(wc->refs[level - 1] == 0)) {
8554                 btrfs_err(fs_info, "Missing references.");
8555                 ret = -EIO;
8556                 goto out_unlock;
8557         }
8558         *lookup_info = 0;
8559
8560         if (wc->stage == DROP_REFERENCE) {
8561                 if (wc->refs[level - 1] > 1) {
8562                         need_account = true;
8563                         if (level == 1 &&
8564                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8565                                 goto skip;
8566
8567                         if (!wc->update_ref ||
8568                             generation <= root->root_key.offset)
8569                                 goto skip;
8570
8571                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8572                                               path->slots[level]);
8573                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8574                         if (ret < 0)
8575                                 goto skip;
8576
8577                         wc->stage = UPDATE_BACKREF;
8578                         wc->shared_level = level - 1;
8579                 }
8580         } else {
8581                 if (level == 1 &&
8582                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8583                         goto skip;
8584         }
8585
8586         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8587                 btrfs_tree_unlock(next);
8588                 free_extent_buffer(next);
8589                 next = NULL;
8590                 *lookup_info = 1;
8591         }
8592
8593         if (!next) {
8594                 if (reada && level == 1)
8595                         reada_walk_down(trans, root, wc, path);
8596                 next = read_tree_block(fs_info, bytenr, generation, level - 1,
8597                                        &first_key);
8598                 if (IS_ERR(next)) {
8599                         return PTR_ERR(next);
8600                 } else if (!extent_buffer_uptodate(next)) {
8601                         free_extent_buffer(next);
8602                         return -EIO;
8603                 }
8604                 btrfs_tree_lock(next);
8605                 btrfs_set_lock_blocking_write(next);
8606         }
8607
8608         level--;
8609         ASSERT(level == btrfs_header_level(next));
8610         if (level != btrfs_header_level(next)) {
8611                 btrfs_err(root->fs_info, "mismatched level");
8612                 ret = -EIO;
8613                 goto out_unlock;
8614         }
8615         path->nodes[level] = next;
8616         path->slots[level] = 0;
8617         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8618         wc->level = level;
8619         if (wc->level == 1)
8620                 wc->reada_slot = 0;
8621         return 0;
8622 skip:
8623         wc->refs[level - 1] = 0;
8624         wc->flags[level - 1] = 0;
8625         if (wc->stage == DROP_REFERENCE) {
8626                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8627                         parent = path->nodes[level]->start;
8628                 } else {
8629                         ASSERT(root->root_key.objectid ==
8630                                btrfs_header_owner(path->nodes[level]));
8631                         if (root->root_key.objectid !=
8632                             btrfs_header_owner(path->nodes[level])) {
8633                                 btrfs_err(root->fs_info,
8634                                                 "mismatched block owner");
8635                                 ret = -EIO;
8636                                 goto out_unlock;
8637                         }
8638                         parent = 0;
8639                 }
8640
8641                 /*
8642                  * If we had a drop_progress we need to verify the refs are set
8643                  * as expected.  If we find our ref then we know that from here
8644                  * on out everything should be correct, and we can clear the
8645                  * ->restarted flag.
8646                  */
8647                 if (wc->restarted) {
8648                         ret = check_ref_exists(trans, root, bytenr, parent,
8649                                                level - 1);
8650                         if (ret < 0)
8651                                 goto out_unlock;
8652                         if (ret == 0)
8653                                 goto no_delete;
8654                         ret = 0;
8655                         wc->restarted = 0;
8656                 }
8657
8658                 /*
8659                  * Reloc tree doesn't contribute to qgroup numbers, and we have
8660                  * already accounted them at merge time (replace_path),
8661                  * thus we could skip expensive subtree trace here.
8662                  */
8663                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
8664                     need_account) {
8665                         ret = btrfs_qgroup_trace_subtree(trans, next,
8666                                                          generation, level - 1);
8667                         if (ret) {
8668                                 btrfs_err_rl(fs_info,
8669                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8670                                              ret);
8671                         }
8672                 }
8673
8674                 /*
8675                  * We need to update the next key in our walk control so we can
8676                  * update the drop_progress key accordingly.  We don't care if
8677                  * find_next_key doesn't find a key because that means we're at
8678                  * the end and are going to clean up now.
8679                  */
8680                 wc->drop_level = level;
8681                 find_next_key(path, level, &wc->drop_progress);
8682
8683                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
8684                                        fs_info->nodesize, parent);
8685                 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
8686                 ret = btrfs_free_extent(trans, &ref);
8687                 if (ret)
8688                         goto out_unlock;
8689         }
8690 no_delete:
8691         *lookup_info = 1;
8692         ret = 1;
8693
8694 out_unlock:
8695         btrfs_tree_unlock(next);
8696         free_extent_buffer(next);
8697
8698         return ret;
8699 }
8700
8701 /*
8702  * helper to process tree block while walking up the tree.
8703  *
8704  * when wc->stage == DROP_REFERENCE, this function drops
8705  * reference count on the block.
8706  *
8707  * when wc->stage == UPDATE_BACKREF, this function changes
8708  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8709  * to UPDATE_BACKREF previously while processing the block.
8710  *
8711  * NOTE: return value 1 means we should stop walking up.
8712  */
8713 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8714                                  struct btrfs_root *root,
8715                                  struct btrfs_path *path,
8716                                  struct walk_control *wc)
8717 {
8718         struct btrfs_fs_info *fs_info = root->fs_info;
8719         int ret;
8720         int level = wc->level;
8721         struct extent_buffer *eb = path->nodes[level];
8722         u64 parent = 0;
8723
8724         if (wc->stage == UPDATE_BACKREF) {
8725                 BUG_ON(wc->shared_level < level);
8726                 if (level < wc->shared_level)
8727                         goto out;
8728
8729                 ret = find_next_key(path, level + 1, &wc->update_progress);
8730                 if (ret > 0)
8731                         wc->update_ref = 0;
8732
8733                 wc->stage = DROP_REFERENCE;
8734                 wc->shared_level = -1;
8735                 path->slots[level] = 0;
8736
8737                 /*
8738                  * check reference count again if the block isn't locked.
8739                  * we should start walking down the tree again if reference
8740                  * count is one.
8741                  */
8742                 if (!path->locks[level]) {
8743                         BUG_ON(level == 0);
8744                         btrfs_tree_lock(eb);
8745                         btrfs_set_lock_blocking_write(eb);
8746                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8747
8748                         ret = btrfs_lookup_extent_info(trans, fs_info,
8749                                                        eb->start, level, 1,
8750                                                        &wc->refs[level],
8751                                                        &wc->flags[level]);
8752                         if (ret < 0) {
8753                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8754                                 path->locks[level] = 0;
8755                                 return ret;
8756                         }
8757                         BUG_ON(wc->refs[level] == 0);
8758                         if (wc->refs[level] == 1) {
8759                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8760                                 path->locks[level] = 0;
8761                                 return 1;
8762                         }
8763                 }
8764         }
8765
8766         /* wc->stage == DROP_REFERENCE */
8767         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8768
8769         if (wc->refs[level] == 1) {
8770                 if (level == 0) {
8771                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8772                                 ret = btrfs_dec_ref(trans, root, eb, 1);
8773                         else
8774                                 ret = btrfs_dec_ref(trans, root, eb, 0);
8775                         BUG_ON(ret); /* -ENOMEM */
8776                         if (is_fstree(root->root_key.objectid)) {
8777                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
8778                                 if (ret) {
8779                                         btrfs_err_rl(fs_info,
8780         "error %d accounting leaf items, quota is out of sync, rescan required",
8781                                              ret);
8782                                 }
8783                         }
8784                 }
8785                 /* make block locked assertion in btrfs_clean_tree_block happy */
8786                 if (!path->locks[level] &&
8787                     btrfs_header_generation(eb) == trans->transid) {
8788                         btrfs_tree_lock(eb);
8789                         btrfs_set_lock_blocking_write(eb);
8790                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8791                 }
8792                 btrfs_clean_tree_block(eb);
8793         }
8794
8795         if (eb == root->node) {
8796                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8797                         parent = eb->start;
8798                 else if (root->root_key.objectid != btrfs_header_owner(eb))
8799                         goto owner_mismatch;
8800         } else {
8801                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8802                         parent = path->nodes[level + 1]->start;
8803                 else if (root->root_key.objectid !=
8804                          btrfs_header_owner(path->nodes[level + 1]))
8805                         goto owner_mismatch;
8806         }
8807
8808         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8809 out:
8810         wc->refs[level] = 0;
8811         wc->flags[level] = 0;
8812         return 0;
8813
8814 owner_mismatch:
8815         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
8816                      btrfs_header_owner(eb), root->root_key.objectid);
8817         return -EUCLEAN;
8818 }
8819
8820 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8821                                    struct btrfs_root *root,
8822                                    struct btrfs_path *path,
8823                                    struct walk_control *wc)
8824 {
8825         int level = wc->level;
8826         int lookup_info = 1;
8827         int ret;
8828
8829         while (level >= 0) {
8830                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8831                 if (ret > 0)
8832                         break;
8833
8834                 if (level == 0)
8835                         break;
8836
8837                 if (path->slots[level] >=
8838                     btrfs_header_nritems(path->nodes[level]))
8839                         break;
8840
8841                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8842                 if (ret > 0) {
8843                         path->slots[level]++;
8844                         continue;
8845                 } else if (ret < 0)
8846                         return ret;
8847                 level = wc->level;
8848         }
8849         return 0;
8850 }
8851
8852 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8853                                  struct btrfs_root *root,
8854                                  struct btrfs_path *path,
8855                                  struct walk_control *wc, int max_level)
8856 {
8857         int level = wc->level;
8858         int ret;
8859
8860         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8861         while (level < max_level && path->nodes[level]) {
8862                 wc->level = level;
8863                 if (path->slots[level] + 1 <
8864                     btrfs_header_nritems(path->nodes[level])) {
8865                         path->slots[level]++;
8866                         return 0;
8867                 } else {
8868                         ret = walk_up_proc(trans, root, path, wc);
8869                         if (ret > 0)
8870                                 return 0;
8871                         if (ret < 0)
8872                                 return ret;
8873
8874                         if (path->locks[level]) {
8875                                 btrfs_tree_unlock_rw(path->nodes[level],
8876                                                      path->locks[level]);
8877                                 path->locks[level] = 0;
8878                         }
8879                         free_extent_buffer(path->nodes[level]);
8880                         path->nodes[level] = NULL;
8881                         level++;
8882                 }
8883         }
8884         return 1;
8885 }
8886
8887 /*
8888  * drop a subvolume tree.
8889  *
8890  * this function traverses the tree freeing any blocks that only
8891  * referenced by the tree.
8892  *
8893  * when a shared tree block is found. this function decreases its
8894  * reference count by one. if update_ref is true, this function
8895  * also make sure backrefs for the shared block and all lower level
8896  * blocks are properly updated.
8897  *
8898  * If called with for_reloc == 0, may exit early with -EAGAIN
8899  */
8900 int btrfs_drop_snapshot(struct btrfs_root *root,
8901                          struct btrfs_block_rsv *block_rsv, int update_ref,
8902                          int for_reloc)
8903 {
8904         struct btrfs_fs_info *fs_info = root->fs_info;
8905         struct btrfs_path *path;
8906         struct btrfs_trans_handle *trans;
8907         struct btrfs_root *tree_root = fs_info->tree_root;
8908         struct btrfs_root_item *root_item = &root->root_item;
8909         struct walk_control *wc;
8910         struct btrfs_key key;
8911         int err = 0;
8912         int ret;
8913         int level;
8914         bool root_dropped = false;
8915
8916         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
8917
8918         path = btrfs_alloc_path();
8919         if (!path) {
8920                 err = -ENOMEM;
8921                 goto out;
8922         }
8923
8924         wc = kzalloc(sizeof(*wc), GFP_NOFS);
8925         if (!wc) {
8926                 btrfs_free_path(path);
8927                 err = -ENOMEM;
8928                 goto out;
8929         }
8930
8931         trans = btrfs_start_transaction(tree_root, 0);
8932         if (IS_ERR(trans)) {
8933                 err = PTR_ERR(trans);
8934                 goto out_free;
8935         }
8936
8937         err = btrfs_run_delayed_items(trans);
8938         if (err)
8939                 goto out_end_trans;
8940
8941         if (block_rsv)
8942                 trans->block_rsv = block_rsv;
8943
8944         /*
8945          * This will help us catch people modifying the fs tree while we're
8946          * dropping it.  It is unsafe to mess with the fs tree while it's being
8947          * dropped as we unlock the root node and parent nodes as we walk down
8948          * the tree, assuming nothing will change.  If something does change
8949          * then we'll have stale information and drop references to blocks we've
8950          * already dropped.
8951          */
8952         set_bit(BTRFS_ROOT_DELETING, &root->state);
8953         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
8954                 level = btrfs_header_level(root->node);
8955                 path->nodes[level] = btrfs_lock_root_node(root);
8956                 btrfs_set_lock_blocking_write(path->nodes[level]);
8957                 path->slots[level] = 0;
8958                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8959                 memset(&wc->update_progress, 0,
8960                        sizeof(wc->update_progress));
8961         } else {
8962                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
8963                 memcpy(&wc->update_progress, &key,
8964                        sizeof(wc->update_progress));
8965
8966                 level = root_item->drop_level;
8967                 BUG_ON(level == 0);
8968                 path->lowest_level = level;
8969                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8970                 path->lowest_level = 0;
8971                 if (ret < 0) {
8972                         err = ret;
8973                         goto out_end_trans;
8974                 }
8975                 WARN_ON(ret > 0);
8976
8977                 /*
8978                  * unlock our path, this is safe because only this
8979                  * function is allowed to delete this snapshot
8980                  */
8981                 btrfs_unlock_up_safe(path, 0);
8982
8983                 level = btrfs_header_level(root->node);
8984                 while (1) {
8985                         btrfs_tree_lock(path->nodes[level]);
8986                         btrfs_set_lock_blocking_write(path->nodes[level]);
8987                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8988
8989                         ret = btrfs_lookup_extent_info(trans, fs_info,
8990                                                 path->nodes[level]->start,
8991                                                 level, 1, &wc->refs[level],
8992                                                 &wc->flags[level]);
8993                         if (ret < 0) {
8994                                 err = ret;
8995                                 goto out_end_trans;
8996                         }
8997                         BUG_ON(wc->refs[level] == 0);
8998
8999                         if (level == root_item->drop_level)
9000                                 break;
9001
9002                         btrfs_tree_unlock(path->nodes[level]);
9003                         path->locks[level] = 0;
9004                         WARN_ON(wc->refs[level] != 1);
9005                         level--;
9006                 }
9007         }
9008
9009         wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
9010         wc->level = level;
9011         wc->shared_level = -1;
9012         wc->stage = DROP_REFERENCE;
9013         wc->update_ref = update_ref;
9014         wc->keep_locks = 0;
9015         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9016
9017         while (1) {
9018
9019                 ret = walk_down_tree(trans, root, path, wc);
9020                 if (ret < 0) {
9021                         err = ret;
9022                         break;
9023                 }
9024
9025                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9026                 if (ret < 0) {
9027                         err = ret;
9028                         break;
9029                 }
9030
9031                 if (ret > 0) {
9032                         BUG_ON(wc->stage != DROP_REFERENCE);
9033                         break;
9034                 }
9035
9036                 if (wc->stage == DROP_REFERENCE) {
9037                         wc->drop_level = wc->level;
9038                         btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
9039                                               &wc->drop_progress,
9040                                               path->slots[wc->drop_level]);
9041                 }
9042                 btrfs_cpu_key_to_disk(&root_item->drop_progress,
9043                                       &wc->drop_progress);
9044                 root_item->drop_level = wc->drop_level;
9045
9046                 BUG_ON(wc->level == 0);
9047                 if (btrfs_should_end_transaction(trans) ||
9048                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9049                         ret = btrfs_update_root(trans, tree_root,
9050                                                 &root->root_key,
9051                                                 root_item);
9052                         if (ret) {
9053                                 btrfs_abort_transaction(trans, ret);
9054                                 err = ret;
9055                                 goto out_end_trans;
9056                         }
9057
9058                         btrfs_end_transaction_throttle(trans);
9059                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9060                                 btrfs_debug(fs_info,
9061                                             "drop snapshot early exit");
9062                                 err = -EAGAIN;
9063                                 goto out_free;
9064                         }
9065
9066                         trans = btrfs_start_transaction(tree_root, 0);
9067                         if (IS_ERR(trans)) {
9068                                 err = PTR_ERR(trans);
9069                                 goto out_free;
9070                         }
9071                         if (block_rsv)
9072                                 trans->block_rsv = block_rsv;
9073                 }
9074         }
9075         btrfs_release_path(path);
9076         if (err)
9077                 goto out_end_trans;
9078
9079         ret = btrfs_del_root(trans, &root->root_key);
9080         if (ret) {
9081                 btrfs_abort_transaction(trans, ret);
9082                 err = ret;
9083                 goto out_end_trans;
9084         }
9085
9086         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9087                 ret = btrfs_find_root(tree_root, &root->root_key, path,
9088                                       NULL, NULL);
9089                 if (ret < 0) {
9090                         btrfs_abort_transaction(trans, ret);
9091                         err = ret;
9092                         goto out_end_trans;
9093                 } else if (ret > 0) {
9094                         /* if we fail to delete the orphan item this time
9095                          * around, it'll get picked up the next time.
9096                          *
9097                          * The most common failure here is just -ENOENT.
9098                          */
9099                         btrfs_del_orphan_item(trans, tree_root,
9100                                               root->root_key.objectid);
9101                 }
9102         }
9103
9104         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9105                 btrfs_add_dropped_root(trans, root);
9106         } else {
9107                 free_extent_buffer(root->node);
9108                 free_extent_buffer(root->commit_root);
9109                 btrfs_put_fs_root(root);
9110         }
9111         root_dropped = true;
9112 out_end_trans:
9113         btrfs_end_transaction_throttle(trans);
9114 out_free:
9115         kfree(wc);
9116         btrfs_free_path(path);
9117 out:
9118         /*
9119          * So if we need to stop dropping the snapshot for whatever reason we
9120          * need to make sure to add it back to the dead root list so that we
9121          * keep trying to do the work later.  This also cleans up roots if we
9122          * don't have it in the radix (like when we recover after a power fail
9123          * or unmount) so we don't leak memory.
9124          */
9125         if (!for_reloc && !root_dropped)
9126                 btrfs_add_dead_root(root);
9127         if (err && err != -EAGAIN)
9128                 btrfs_handle_fs_error(fs_info, err, NULL);
9129         return err;
9130 }
9131
9132 /*
9133  * drop subtree rooted at tree block 'node'.
9134  *
9135  * NOTE: this function will unlock and release tree block 'node'
9136  * only used by relocation code
9137  */
9138 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9139                         struct btrfs_root *root,
9140                         struct extent_buffer *node,
9141                         struct extent_buffer *parent)
9142 {
9143         struct btrfs_fs_info *fs_info = root->fs_info;
9144         struct btrfs_path *path;
9145         struct walk_control *wc;
9146         int level;
9147         int parent_level;
9148         int ret = 0;
9149         int wret;
9150
9151         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9152
9153         path = btrfs_alloc_path();
9154         if (!path)
9155                 return -ENOMEM;
9156
9157         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9158         if (!wc) {
9159                 btrfs_free_path(path);
9160                 return -ENOMEM;
9161         }
9162
9163         btrfs_assert_tree_locked(parent);
9164         parent_level = btrfs_header_level(parent);
9165         extent_buffer_get(parent);
9166         path->nodes[parent_level] = parent;
9167         path->slots[parent_level] = btrfs_header_nritems(parent);
9168
9169         btrfs_assert_tree_locked(node);
9170         level = btrfs_header_level(node);
9171         path->nodes[level] = node;
9172         path->slots[level] = 0;
9173         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9174
9175         wc->refs[parent_level] = 1;
9176         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9177         wc->level = level;
9178         wc->shared_level = -1;
9179         wc->stage = DROP_REFERENCE;
9180         wc->update_ref = 0;
9181         wc->keep_locks = 1;
9182         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9183
9184         while (1) {
9185                 wret = walk_down_tree(trans, root, path, wc);
9186                 if (wret < 0) {
9187                         ret = wret;
9188                         break;
9189                 }
9190
9191                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9192                 if (wret < 0)
9193                         ret = wret;
9194                 if (wret != 0)
9195                         break;
9196         }
9197
9198         kfree(wc);
9199         btrfs_free_path(path);
9200         return ret;
9201 }
9202
9203 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9204 {
9205         u64 num_devices;
9206         u64 stripped;
9207
9208         /*
9209          * if restripe for this chunk_type is on pick target profile and
9210          * return, otherwise do the usual balance
9211          */
9212         stripped = get_restripe_target(fs_info, flags);
9213         if (stripped)
9214                 return extended_to_chunk(stripped);
9215
9216         num_devices = fs_info->fs_devices->rw_devices;
9217
9218         stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
9219                 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
9220
9221         if (num_devices == 1) {
9222                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9223                 stripped = flags & ~stripped;
9224
9225                 /* turn raid0 into single device chunks */
9226                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9227                         return stripped;
9228
9229                 /* turn mirroring into duplication */
9230                 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
9231                              BTRFS_BLOCK_GROUP_RAID10))
9232                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9233         } else {
9234                 /* they already had raid on here, just return */
9235                 if (flags & stripped)
9236                         return flags;
9237
9238                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9239                 stripped = flags & ~stripped;
9240
9241                 /* switch duplicated blocks with raid1 */
9242                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9243                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9244
9245                 /* this is drive concat, leave it alone */
9246         }
9247
9248         return flags;
9249 }
9250
9251 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9252 {
9253         struct btrfs_space_info *sinfo = cache->space_info;
9254         u64 num_bytes;
9255         u64 sinfo_used;
9256         u64 min_allocable_bytes;
9257         int ret = -ENOSPC;
9258
9259         /*
9260          * We need some metadata space and system metadata space for
9261          * allocating chunks in some corner cases until we force to set
9262          * it to be readonly.
9263          */
9264         if ((sinfo->flags &
9265              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9266             !force)
9267                 min_allocable_bytes = SZ_1M;
9268         else
9269                 min_allocable_bytes = 0;
9270
9271         spin_lock(&sinfo->lock);
9272         spin_lock(&cache->lock);
9273
9274         if (cache->ro) {
9275                 cache->ro++;
9276                 ret = 0;
9277                 goto out;
9278         }
9279
9280         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9281                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9282         sinfo_used = btrfs_space_info_used(sinfo, true);
9283
9284         if (sinfo_used + num_bytes + min_allocable_bytes <=
9285             sinfo->total_bytes) {
9286                 sinfo->bytes_readonly += num_bytes;
9287                 cache->ro++;
9288                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9289                 ret = 0;
9290         }
9291 out:
9292         spin_unlock(&cache->lock);
9293         spin_unlock(&sinfo->lock);
9294         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
9295                 btrfs_info(cache->fs_info,
9296                         "unable to make block group %llu ro",
9297                         cache->key.objectid);
9298                 btrfs_info(cache->fs_info,
9299                         "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
9300                         sinfo_used, num_bytes, min_allocable_bytes);
9301                 dump_space_info(cache->fs_info, cache->space_info, 0, 0);
9302         }
9303         return ret;
9304 }
9305
9306 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9307
9308 {
9309         struct btrfs_fs_info *fs_info = cache->fs_info;
9310         struct btrfs_trans_handle *trans;
9311         u64 alloc_flags;
9312         int ret;
9313
9314 again:
9315         trans = btrfs_join_transaction(fs_info->extent_root);
9316         if (IS_ERR(trans))
9317                 return PTR_ERR(trans);
9318
9319         /*
9320          * we're not allowed to set block groups readonly after the dirty
9321          * block groups cache has started writing.  If it already started,
9322          * back off and let this transaction commit
9323          */
9324         mutex_lock(&fs_info->ro_block_group_mutex);
9325         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9326                 u64 transid = trans->transid;
9327
9328                 mutex_unlock(&fs_info->ro_block_group_mutex);
9329                 btrfs_end_transaction(trans);
9330
9331                 ret = btrfs_wait_for_commit(fs_info, transid);
9332                 if (ret)
9333                         return ret;
9334                 goto again;
9335         }
9336
9337         /*
9338          * if we are changing raid levels, try to allocate a corresponding
9339          * block group with the new raid level.
9340          */
9341         alloc_flags = update_block_group_flags(fs_info, cache->flags);
9342         if (alloc_flags != cache->flags) {
9343                 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9344                 /*
9345                  * ENOSPC is allowed here, we may have enough space
9346                  * already allocated at the new raid level to
9347                  * carry on
9348                  */
9349                 if (ret == -ENOSPC)
9350                         ret = 0;
9351                 if (ret < 0)
9352                         goto out;
9353         }
9354
9355         ret = inc_block_group_ro(cache, 0);
9356         if (!ret)
9357                 goto out;
9358         alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9359         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9360         if (ret < 0)
9361                 goto out;
9362         ret = inc_block_group_ro(cache, 0);
9363 out:
9364         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9365                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9366                 mutex_lock(&fs_info->chunk_mutex);
9367                 check_system_chunk(trans, alloc_flags);
9368                 mutex_unlock(&fs_info->chunk_mutex);
9369         }
9370         mutex_unlock(&fs_info->ro_block_group_mutex);
9371
9372         btrfs_end_transaction(trans);
9373         return ret;
9374 }
9375
9376 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9377 {
9378         u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9379
9380         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9381 }
9382
9383 /*
9384  * helper to account the unused space of all the readonly block group in the
9385  * space_info. takes mirrors into account.
9386  */
9387 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9388 {
9389         struct btrfs_block_group_cache *block_group;
9390         u64 free_bytes = 0;
9391         int factor;
9392
9393         /* It's df, we don't care if it's racy */
9394         if (list_empty(&sinfo->ro_bgs))
9395                 return 0;
9396
9397         spin_lock(&sinfo->lock);
9398         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9399                 spin_lock(&block_group->lock);
9400
9401                 if (!block_group->ro) {
9402                         spin_unlock(&block_group->lock);
9403                         continue;
9404                 }
9405
9406                 factor = btrfs_bg_type_to_factor(block_group->flags);
9407                 free_bytes += (block_group->key.offset -
9408                                btrfs_block_group_used(&block_group->item)) *
9409                                factor;
9410
9411                 spin_unlock(&block_group->lock);
9412         }
9413         spin_unlock(&sinfo->lock);
9414
9415         return free_bytes;
9416 }
9417
9418 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9419 {
9420         struct btrfs_space_info *sinfo = cache->space_info;
9421         u64 num_bytes;
9422
9423         BUG_ON(!cache->ro);
9424
9425         spin_lock(&sinfo->lock);
9426         spin_lock(&cache->lock);
9427         if (!--cache->ro) {
9428                 num_bytes = cache->key.offset - cache->reserved -
9429                             cache->pinned - cache->bytes_super -
9430                             btrfs_block_group_used(&cache->item);
9431                 sinfo->bytes_readonly -= num_bytes;
9432                 list_del_init(&cache->ro_list);
9433         }
9434         spin_unlock(&cache->lock);
9435         spin_unlock(&sinfo->lock);
9436 }
9437
9438 /*
9439  * Checks to see if it's even possible to relocate this block group.
9440  *
9441  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9442  * ok to go ahead and try.
9443  */
9444 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9445 {
9446         struct btrfs_block_group_cache *block_group;
9447         struct btrfs_space_info *space_info;
9448         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9449         struct btrfs_device *device;
9450         u64 min_free;
9451         u64 dev_min = 1;
9452         u64 dev_nr = 0;
9453         u64 target;
9454         int debug;
9455         int index;
9456         int full = 0;
9457         int ret = 0;
9458
9459         debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9460
9461         block_group = btrfs_lookup_block_group(fs_info, bytenr);
9462
9463         /* odd, couldn't find the block group, leave it alone */
9464         if (!block_group) {
9465                 if (debug)
9466                         btrfs_warn(fs_info,
9467                                    "can't find block group for bytenr %llu",
9468                                    bytenr);
9469                 return -1;
9470         }
9471
9472         min_free = btrfs_block_group_used(&block_group->item);
9473
9474         /* no bytes used, we're good */
9475         if (!min_free)
9476                 goto out;
9477
9478         space_info = block_group->space_info;
9479         spin_lock(&space_info->lock);
9480
9481         full = space_info->full;
9482
9483         /*
9484          * if this is the last block group we have in this space, we can't
9485          * relocate it unless we're able to allocate a new chunk below.
9486          *
9487          * Otherwise, we need to make sure we have room in the space to handle
9488          * all of the extents from this block group.  If we can, we're good
9489          */
9490         if ((space_info->total_bytes != block_group->key.offset) &&
9491             (btrfs_space_info_used(space_info, false) + min_free <
9492              space_info->total_bytes)) {
9493                 spin_unlock(&space_info->lock);
9494                 goto out;
9495         }
9496         spin_unlock(&space_info->lock);
9497
9498         /*
9499          * ok we don't have enough space, but maybe we have free space on our
9500          * devices to allocate new chunks for relocation, so loop through our
9501          * alloc devices and guess if we have enough space.  if this block
9502          * group is going to be restriped, run checks against the target
9503          * profile instead of the current one.
9504          */
9505         ret = -1;
9506
9507         /*
9508          * index:
9509          *      0: raid10
9510          *      1: raid1
9511          *      2: dup
9512          *      3: raid0
9513          *      4: single
9514          */
9515         target = get_restripe_target(fs_info, block_group->flags);
9516         if (target) {
9517                 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9518         } else {
9519                 /*
9520                  * this is just a balance, so if we were marked as full
9521                  * we know there is no space for a new chunk
9522                  */
9523                 if (full) {
9524                         if (debug)
9525                                 btrfs_warn(fs_info,
9526                                            "no space to alloc new chunk for block group %llu",
9527                                            block_group->key.objectid);
9528                         goto out;
9529                 }
9530
9531                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
9532         }
9533
9534         if (index == BTRFS_RAID_RAID10) {
9535                 dev_min = 4;
9536                 /* Divide by 2 */
9537                 min_free >>= 1;
9538         } else if (index == BTRFS_RAID_RAID1) {
9539                 dev_min = 2;
9540         } else if (index == BTRFS_RAID_DUP) {
9541                 /* Multiply by 2 */
9542                 min_free <<= 1;
9543         } else if (index == BTRFS_RAID_RAID0) {
9544                 dev_min = fs_devices->rw_devices;
9545                 min_free = div64_u64(min_free, dev_min);
9546         }
9547
9548         mutex_lock(&fs_info->chunk_mutex);
9549         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9550                 u64 dev_offset;
9551
9552                 /*
9553                  * check to make sure we can actually find a chunk with enough
9554                  * space to fit our block group in.
9555                  */
9556                 if (device->total_bytes > device->bytes_used + min_free &&
9557                     !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9558                         ret = find_free_dev_extent(device, min_free,
9559                                                    &dev_offset, NULL);
9560                         if (!ret)
9561                                 dev_nr++;
9562
9563                         if (dev_nr >= dev_min)
9564                                 break;
9565
9566                         ret = -1;
9567                 }
9568         }
9569         if (debug && ret == -1)
9570                 btrfs_warn(fs_info,
9571                            "no space to allocate a new chunk for block group %llu",
9572                            block_group->key.objectid);
9573         mutex_unlock(&fs_info->chunk_mutex);
9574 out:
9575         btrfs_put_block_group(block_group);
9576         return ret;
9577 }
9578
9579 static int find_first_block_group(struct btrfs_fs_info *fs_info,
9580                                   struct btrfs_path *path,
9581                                   struct btrfs_key *key)
9582 {
9583         struct btrfs_root *root = fs_info->extent_root;
9584         int ret = 0;
9585         struct btrfs_key found_key;
9586         struct extent_buffer *leaf;
9587         struct btrfs_block_group_item bg;
9588         u64 flags;
9589         int slot;
9590
9591         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9592         if (ret < 0)
9593                 goto out;
9594
9595         while (1) {
9596                 slot = path->slots[0];
9597                 leaf = path->nodes[0];
9598                 if (slot >= btrfs_header_nritems(leaf)) {
9599                         ret = btrfs_next_leaf(root, path);
9600                         if (ret == 0)
9601                                 continue;
9602                         if (ret < 0)
9603                                 goto out;
9604                         break;
9605                 }
9606                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9607
9608                 if (found_key.objectid >= key->objectid &&
9609                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9610                         struct extent_map_tree *em_tree;
9611                         struct extent_map *em;
9612
9613                         em_tree = &root->fs_info->mapping_tree;
9614                         read_lock(&em_tree->lock);
9615                         em = lookup_extent_mapping(em_tree, found_key.objectid,
9616                                                    found_key.offset);
9617                         read_unlock(&em_tree->lock);
9618                         if (!em) {
9619                                 btrfs_err(fs_info,
9620                         "logical %llu len %llu found bg but no related chunk",
9621                                           found_key.objectid, found_key.offset);
9622                                 ret = -ENOENT;
9623                         } else if (em->start != found_key.objectid ||
9624                                    em->len != found_key.offset) {
9625                                 btrfs_err(fs_info,
9626                 "block group %llu len %llu mismatch with chunk %llu len %llu",
9627                                           found_key.objectid, found_key.offset,
9628                                           em->start, em->len);
9629                                 ret = -EUCLEAN;
9630                         } else {
9631                                 read_extent_buffer(leaf, &bg,
9632                                         btrfs_item_ptr_offset(leaf, slot),
9633                                         sizeof(bg));
9634                                 flags = btrfs_block_group_flags(&bg) &
9635                                         BTRFS_BLOCK_GROUP_TYPE_MASK;
9636
9637                                 if (flags != (em->map_lookup->type &
9638                                               BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9639                                         btrfs_err(fs_info,
9640 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9641                                                 found_key.objectid,
9642                                                 found_key.offset, flags,
9643                                                 (BTRFS_BLOCK_GROUP_TYPE_MASK &
9644                                                  em->map_lookup->type));
9645                                         ret = -EUCLEAN;
9646                                 } else {
9647                                         ret = 0;
9648                                 }
9649                         }
9650                         free_extent_map(em);
9651                         goto out;
9652                 }
9653                 path->slots[0]++;
9654         }
9655 out:
9656         return ret;
9657 }
9658
9659 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9660 {
9661         struct btrfs_block_group_cache *block_group;
9662         u64 last = 0;
9663
9664         while (1) {
9665                 struct inode *inode;
9666
9667                 block_group = btrfs_lookup_first_block_group(info, last);
9668                 while (block_group) {
9669                         wait_block_group_cache_done(block_group);
9670                         spin_lock(&block_group->lock);
9671                         if (block_group->iref)
9672                                 break;
9673                         spin_unlock(&block_group->lock);
9674                         block_group = next_block_group(block_group);
9675                 }
9676                 if (!block_group) {
9677                         if (last == 0)
9678                                 break;
9679                         last = 0;
9680                         continue;
9681                 }
9682
9683                 inode = block_group->inode;
9684                 block_group->iref = 0;
9685                 block_group->inode = NULL;
9686                 spin_unlock(&block_group->lock);
9687                 ASSERT(block_group->io_ctl.inode == NULL);
9688                 iput(inode);
9689                 last = block_group->key.objectid + block_group->key.offset;
9690                 btrfs_put_block_group(block_group);
9691         }
9692 }
9693
9694 /*
9695  * Must be called only after stopping all workers, since we could have block
9696  * group caching kthreads running, and therefore they could race with us if we
9697  * freed the block groups before stopping them.
9698  */
9699 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9700 {
9701         struct btrfs_block_group_cache *block_group;
9702         struct btrfs_space_info *space_info;
9703         struct btrfs_caching_control *caching_ctl;
9704         struct rb_node *n;
9705
9706         down_write(&info->commit_root_sem);
9707         while (!list_empty(&info->caching_block_groups)) {
9708                 caching_ctl = list_entry(info->caching_block_groups.next,
9709                                          struct btrfs_caching_control, list);
9710                 list_del(&caching_ctl->list);
9711                 put_caching_control(caching_ctl);
9712         }
9713         up_write(&info->commit_root_sem);
9714
9715         spin_lock(&info->unused_bgs_lock);
9716         while (!list_empty(&info->unused_bgs)) {
9717                 block_group = list_first_entry(&info->unused_bgs,
9718                                                struct btrfs_block_group_cache,
9719                                                bg_list);
9720                 list_del_init(&block_group->bg_list);
9721                 btrfs_put_block_group(block_group);
9722         }
9723         spin_unlock(&info->unused_bgs_lock);
9724
9725         spin_lock(&info->block_group_cache_lock);
9726         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9727                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9728                                        cache_node);
9729                 rb_erase(&block_group->cache_node,
9730                          &info->block_group_cache_tree);
9731                 RB_CLEAR_NODE(&block_group->cache_node);
9732                 spin_unlock(&info->block_group_cache_lock);
9733
9734                 down_write(&block_group->space_info->groups_sem);
9735                 list_del(&block_group->list);
9736                 up_write(&block_group->space_info->groups_sem);
9737
9738                 /*
9739                  * We haven't cached this block group, which means we could
9740                  * possibly have excluded extents on this block group.
9741                  */
9742                 if (block_group->cached == BTRFS_CACHE_NO ||
9743                     block_group->cached == BTRFS_CACHE_ERROR)
9744                         free_excluded_extents(block_group);
9745
9746                 btrfs_remove_free_space_cache(block_group);
9747                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9748                 ASSERT(list_empty(&block_group->dirty_list));
9749                 ASSERT(list_empty(&block_group->io_list));
9750                 ASSERT(list_empty(&block_group->bg_list));
9751                 ASSERT(atomic_read(&block_group->count) == 1);
9752                 btrfs_put_block_group(block_group);
9753
9754                 spin_lock(&info->block_group_cache_lock);
9755         }
9756         spin_unlock(&info->block_group_cache_lock);
9757
9758         /* now that all the block groups are freed, go through and
9759          * free all the space_info structs.  This is only called during
9760          * the final stages of unmount, and so we know nobody is
9761          * using them.  We call synchronize_rcu() once before we start,
9762          * just to be on the safe side.
9763          */
9764         synchronize_rcu();
9765
9766         release_global_block_rsv(info);
9767
9768         while (!list_empty(&info->space_info)) {
9769                 int i;
9770
9771                 space_info = list_entry(info->space_info.next,
9772                                         struct btrfs_space_info,
9773                                         list);
9774
9775                 /*
9776                  * Do not hide this behind enospc_debug, this is actually
9777                  * important and indicates a real bug if this happens.
9778                  */
9779                 if (WARN_ON(space_info->bytes_pinned > 0 ||
9780                             space_info->bytes_reserved > 0 ||
9781                             space_info->bytes_may_use > 0))
9782                         dump_space_info(info, space_info, 0, 0);
9783                 list_del(&space_info->list);
9784                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9785                         struct kobject *kobj;
9786                         kobj = space_info->block_group_kobjs[i];
9787                         space_info->block_group_kobjs[i] = NULL;
9788                         if (kobj) {
9789                                 kobject_del(kobj);
9790                                 kobject_put(kobj);
9791                         }
9792                 }
9793                 kobject_del(&space_info->kobj);
9794                 kobject_put(&space_info->kobj);
9795         }
9796         return 0;
9797 }
9798
9799 /* link_block_group will queue up kobjects to add when we're reclaim-safe */
9800 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9801 {
9802         struct btrfs_space_info *space_info;
9803         struct raid_kobject *rkobj;
9804         LIST_HEAD(list);
9805         int ret = 0;
9806
9807         spin_lock(&fs_info->pending_raid_kobjs_lock);
9808         list_splice_init(&fs_info->pending_raid_kobjs, &list);
9809         spin_unlock(&fs_info->pending_raid_kobjs_lock);
9810
9811         list_for_each_entry(rkobj, &list, list) {
9812                 space_info = btrfs_find_space_info(fs_info, rkobj->flags);
9813
9814                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9815                                 "%s", btrfs_bg_type_to_raid_name(rkobj->flags));
9816                 if (ret) {
9817                         kobject_put(&rkobj->kobj);
9818                         break;
9819                 }
9820         }
9821         if (ret)
9822                 btrfs_warn(fs_info,
9823                            "failed to add kobject for block cache, ignoring");
9824 }
9825
9826 static void link_block_group(struct btrfs_block_group_cache *cache)
9827 {
9828         struct btrfs_space_info *space_info = cache->space_info;
9829         struct btrfs_fs_info *fs_info = cache->fs_info;
9830         int index = btrfs_bg_flags_to_raid_index(cache->flags);
9831         bool first = false;
9832
9833         down_write(&space_info->groups_sem);
9834         if (list_empty(&space_info->block_groups[index]))
9835                 first = true;
9836         list_add_tail(&cache->list, &space_info->block_groups[index]);
9837         up_write(&space_info->groups_sem);
9838
9839         if (first) {
9840                 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9841                 if (!rkobj) {
9842                         btrfs_warn(cache->fs_info,
9843                                 "couldn't alloc memory for raid level kobject");
9844                         return;
9845                 }
9846                 rkobj->flags = cache->flags;
9847                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9848
9849                 spin_lock(&fs_info->pending_raid_kobjs_lock);
9850                 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
9851                 spin_unlock(&fs_info->pending_raid_kobjs_lock);
9852                 space_info->block_group_kobjs[index] = &rkobj->kobj;
9853         }
9854 }
9855
9856 static struct btrfs_block_group_cache *
9857 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9858                                u64 start, u64 size)
9859 {
9860         struct btrfs_block_group_cache *cache;
9861
9862         cache = kzalloc(sizeof(*cache), GFP_NOFS);
9863         if (!cache)
9864                 return NULL;
9865
9866         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9867                                         GFP_NOFS);
9868         if (!cache->free_space_ctl) {
9869                 kfree(cache);
9870                 return NULL;
9871         }
9872
9873         cache->key.objectid = start;
9874         cache->key.offset = size;
9875         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9876
9877         cache->fs_info = fs_info;
9878         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
9879         set_free_space_tree_thresholds(cache);
9880
9881         atomic_set(&cache->count, 1);
9882         spin_lock_init(&cache->lock);
9883         init_rwsem(&cache->data_rwsem);
9884         INIT_LIST_HEAD(&cache->list);
9885         INIT_LIST_HEAD(&cache->cluster_list);
9886         INIT_LIST_HEAD(&cache->bg_list);
9887         INIT_LIST_HEAD(&cache->ro_list);
9888         INIT_LIST_HEAD(&cache->dirty_list);
9889         INIT_LIST_HEAD(&cache->io_list);
9890         btrfs_init_free_space_ctl(cache);
9891         atomic_set(&cache->trimming, 0);
9892         mutex_init(&cache->free_space_lock);
9893         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
9894
9895         return cache;
9896 }
9897
9898
9899 /*
9900  * Iterate all chunks and verify that each of them has the corresponding block
9901  * group
9902  */
9903 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
9904 {
9905         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
9906         struct extent_map *em;
9907         struct btrfs_block_group_cache *bg;
9908         u64 start = 0;
9909         int ret = 0;
9910
9911         while (1) {
9912                 read_lock(&map_tree->lock);
9913                 /*
9914                  * lookup_extent_mapping will return the first extent map
9915                  * intersecting the range, so setting @len to 1 is enough to
9916                  * get the first chunk.
9917                  */
9918                 em = lookup_extent_mapping(map_tree, start, 1);
9919                 read_unlock(&map_tree->lock);
9920                 if (!em)
9921                         break;
9922
9923                 bg = btrfs_lookup_block_group(fs_info, em->start);
9924                 if (!bg) {
9925                         btrfs_err(fs_info,
9926         "chunk start=%llu len=%llu doesn't have corresponding block group",
9927                                      em->start, em->len);
9928                         ret = -EUCLEAN;
9929                         free_extent_map(em);
9930                         break;
9931                 }
9932                 if (bg->key.objectid != em->start ||
9933                     bg->key.offset != em->len ||
9934                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
9935                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9936                         btrfs_err(fs_info,
9937 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
9938                                 em->start, em->len,
9939                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
9940                                 bg->key.objectid, bg->key.offset,
9941                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
9942                         ret = -EUCLEAN;
9943                         free_extent_map(em);
9944                         btrfs_put_block_group(bg);
9945                         break;
9946                 }
9947                 start = em->start + em->len;
9948                 free_extent_map(em);
9949                 btrfs_put_block_group(bg);
9950         }
9951         return ret;
9952 }
9953
9954 int btrfs_read_block_groups(struct btrfs_fs_info *info)
9955 {
9956         struct btrfs_path *path;
9957         int ret;
9958         struct btrfs_block_group_cache *cache;
9959         struct btrfs_space_info *space_info;
9960         struct btrfs_key key;
9961         struct btrfs_key found_key;
9962         struct extent_buffer *leaf;
9963         int need_clear = 0;
9964         u64 cache_gen;
9965         u64 feature;
9966         int mixed;
9967
9968         feature = btrfs_super_incompat_flags(info->super_copy);
9969         mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
9970
9971         key.objectid = 0;
9972         key.offset = 0;
9973         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9974         path = btrfs_alloc_path();
9975         if (!path)
9976                 return -ENOMEM;
9977         path->reada = READA_FORWARD;
9978
9979         cache_gen = btrfs_super_cache_generation(info->super_copy);
9980         if (btrfs_test_opt(info, SPACE_CACHE) &&
9981             btrfs_super_generation(info->super_copy) != cache_gen)
9982                 need_clear = 1;
9983         if (btrfs_test_opt(info, CLEAR_CACHE))
9984                 need_clear = 1;
9985
9986         while (1) {
9987                 ret = find_first_block_group(info, path, &key);
9988                 if (ret > 0)
9989                         break;
9990                 if (ret != 0)
9991                         goto error;
9992
9993                 leaf = path->nodes[0];
9994                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9995
9996                 cache = btrfs_create_block_group_cache(info, found_key.objectid,
9997                                                        found_key.offset);
9998                 if (!cache) {
9999                         ret = -ENOMEM;
10000                         goto error;
10001                 }
10002
10003                 if (need_clear) {
10004                         /*
10005                          * When we mount with old space cache, we need to
10006                          * set BTRFS_DC_CLEAR and set dirty flag.
10007                          *
10008                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10009                          *    truncate the old free space cache inode and
10010                          *    setup a new one.
10011                          * b) Setting 'dirty flag' makes sure that we flush
10012                          *    the new space cache info onto disk.
10013                          */
10014                         if (btrfs_test_opt(info, SPACE_CACHE))
10015                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
10016                 }
10017
10018                 read_extent_buffer(leaf, &cache->item,
10019                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
10020                                    sizeof(cache->item));
10021                 cache->flags = btrfs_block_group_flags(&cache->item);
10022                 if (!mixed &&
10023                     ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10024                     (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10025                         btrfs_err(info,
10026 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10027                                   cache->key.objectid);
10028                         ret = -EINVAL;
10029                         goto error;
10030                 }
10031
10032                 key.objectid = found_key.objectid + found_key.offset;
10033                 btrfs_release_path(path);
10034
10035                 /*
10036                  * We need to exclude the super stripes now so that the space
10037                  * info has super bytes accounted for, otherwise we'll think
10038                  * we have more space than we actually do.
10039                  */
10040                 ret = exclude_super_stripes(cache);
10041                 if (ret) {
10042                         /*
10043                          * We may have excluded something, so call this just in
10044                          * case.
10045                          */
10046                         free_excluded_extents(cache);
10047                         btrfs_put_block_group(cache);
10048                         goto error;
10049                 }
10050
10051                 /*
10052                  * check for two cases, either we are full, and therefore
10053                  * don't need to bother with the caching work since we won't
10054                  * find any space, or we are empty, and we can just add all
10055                  * the space in and be done with it.  This saves us _a_lot_ of
10056                  * time, particularly in the full case.
10057                  */
10058                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10059                         cache->last_byte_to_unpin = (u64)-1;
10060                         cache->cached = BTRFS_CACHE_FINISHED;
10061                         free_excluded_extents(cache);
10062                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10063                         cache->last_byte_to_unpin = (u64)-1;
10064                         cache->cached = BTRFS_CACHE_FINISHED;
10065                         add_new_free_space(cache, found_key.objectid,
10066                                            found_key.objectid +
10067                                            found_key.offset);
10068                         free_excluded_extents(cache);
10069                 }
10070
10071                 ret = btrfs_add_block_group_cache(info, cache);
10072                 if (ret) {
10073                         btrfs_remove_free_space_cache(cache);
10074                         btrfs_put_block_group(cache);
10075                         goto error;
10076                 }
10077
10078                 trace_btrfs_add_block_group(info, cache, 0);
10079                 btrfs_update_space_info(info, cache->flags, found_key.offset,
10080                                         btrfs_block_group_used(&cache->item),
10081                                         cache->bytes_super, &space_info);
10082
10083                 cache->space_info = space_info;
10084
10085                 link_block_group(cache);
10086
10087                 set_avail_alloc_bits(info, cache->flags);
10088                 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10089                         inc_block_group_ro(cache, 1);
10090                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10091                         ASSERT(list_empty(&cache->bg_list));
10092                         btrfs_mark_bg_unused(cache);
10093                 }
10094         }
10095
10096         list_for_each_entry_rcu(space_info, &info->space_info, list) {
10097                 if (!(get_alloc_profile(info, space_info->flags) &
10098                       (BTRFS_BLOCK_GROUP_RAID10 |
10099                        BTRFS_BLOCK_GROUP_RAID1_MASK |
10100                        BTRFS_BLOCK_GROUP_RAID56_MASK |
10101                        BTRFS_BLOCK_GROUP_DUP)))
10102                         continue;
10103                 /*
10104                  * avoid allocating from un-mirrored block group if there are
10105                  * mirrored block groups.
10106                  */
10107                 list_for_each_entry(cache,
10108                                 &space_info->block_groups[BTRFS_RAID_RAID0],
10109                                 list)
10110                         inc_block_group_ro(cache, 1);
10111                 list_for_each_entry(cache,
10112                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
10113                                 list)
10114                         inc_block_group_ro(cache, 1);
10115         }
10116
10117         btrfs_add_raid_kobjects(info);
10118         init_global_block_rsv(info);
10119         ret = check_chunk_block_group_mappings(info);
10120 error:
10121         btrfs_free_path(path);
10122         return ret;
10123 }
10124
10125 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10126 {
10127         struct btrfs_fs_info *fs_info = trans->fs_info;
10128         struct btrfs_block_group_cache *block_group;
10129         struct btrfs_root *extent_root = fs_info->extent_root;
10130         struct btrfs_block_group_item item;
10131         struct btrfs_key key;
10132         int ret = 0;
10133
10134         if (!trans->can_flush_pending_bgs)
10135                 return;
10136
10137         while (!list_empty(&trans->new_bgs)) {
10138                 block_group = list_first_entry(&trans->new_bgs,
10139                                                struct btrfs_block_group_cache,
10140                                                bg_list);
10141                 if (ret)
10142                         goto next;
10143
10144                 spin_lock(&block_group->lock);
10145                 memcpy(&item, &block_group->item, sizeof(item));
10146                 memcpy(&key, &block_group->key, sizeof(key));
10147                 spin_unlock(&block_group->lock);
10148
10149                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10150                                         sizeof(item));
10151                 if (ret)
10152                         btrfs_abort_transaction(trans, ret);
10153                 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10154                 if (ret)
10155                         btrfs_abort_transaction(trans, ret);
10156                 add_block_group_free_space(trans, block_group);
10157                 /* already aborted the transaction if it failed. */
10158 next:
10159                 btrfs_delayed_refs_rsv_release(fs_info, 1);
10160                 list_del_init(&block_group->bg_list);
10161         }
10162         btrfs_trans_release_chunk_metadata(trans);
10163 }
10164
10165 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10166                            u64 type, u64 chunk_offset, u64 size)
10167 {
10168         struct btrfs_fs_info *fs_info = trans->fs_info;
10169         struct btrfs_block_group_cache *cache;
10170         int ret;
10171
10172         btrfs_set_log_full_commit(trans);
10173
10174         cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10175         if (!cache)
10176                 return -ENOMEM;
10177
10178         btrfs_set_block_group_used(&cache->item, bytes_used);
10179         btrfs_set_block_group_chunk_objectid(&cache->item,
10180                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10181         btrfs_set_block_group_flags(&cache->item, type);
10182
10183         cache->flags = type;
10184         cache->last_byte_to_unpin = (u64)-1;
10185         cache->cached = BTRFS_CACHE_FINISHED;
10186         cache->needs_free_space = 1;
10187         ret = exclude_super_stripes(cache);
10188         if (ret) {
10189                 /*
10190                  * We may have excluded something, so call this just in
10191                  * case.
10192                  */
10193                 free_excluded_extents(cache);
10194                 btrfs_put_block_group(cache);
10195                 return ret;
10196         }
10197
10198         add_new_free_space(cache, chunk_offset, chunk_offset + size);
10199
10200         free_excluded_extents(cache);
10201
10202 #ifdef CONFIG_BTRFS_DEBUG
10203         if (btrfs_should_fragment_free_space(cache)) {
10204                 u64 new_bytes_used = size - bytes_used;
10205
10206                 bytes_used += new_bytes_used >> 1;
10207                 fragment_free_space(cache);
10208         }
10209 #endif
10210         /*
10211          * Ensure the corresponding space_info object is created and
10212          * assigned to our block group. We want our bg to be added to the rbtree
10213          * with its ->space_info set.
10214          */
10215         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
10216         ASSERT(cache->space_info);
10217
10218         ret = btrfs_add_block_group_cache(fs_info, cache);
10219         if (ret) {
10220                 btrfs_remove_free_space_cache(cache);
10221                 btrfs_put_block_group(cache);
10222                 return ret;
10223         }
10224
10225         /*
10226          * Now that our block group has its ->space_info set and is inserted in
10227          * the rbtree, update the space info's counters.
10228          */
10229         trace_btrfs_add_block_group(fs_info, cache, 1);
10230         btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
10231                                 cache->bytes_super, &cache->space_info);
10232         update_global_block_rsv(fs_info);
10233
10234         link_block_group(cache);
10235
10236         list_add_tail(&cache->bg_list, &trans->new_bgs);
10237         trans->delayed_ref_updates++;
10238         btrfs_update_delayed_refs_rsv(trans);
10239
10240         set_avail_alloc_bits(fs_info, type);
10241         return 0;
10242 }
10243
10244 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10245 {
10246         u64 extra_flags = chunk_to_extended(flags) &
10247                                 BTRFS_EXTENDED_PROFILE_MASK;
10248
10249         write_seqlock(&fs_info->profiles_lock);
10250         if (flags & BTRFS_BLOCK_GROUP_DATA)
10251                 fs_info->avail_data_alloc_bits &= ~extra_flags;
10252         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10253                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10254         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10255                 fs_info->avail_system_alloc_bits &= ~extra_flags;
10256         write_sequnlock(&fs_info->profiles_lock);
10257 }
10258
10259 /*
10260  * Clear incompat bits for the following feature(s):
10261  *
10262  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
10263  *            in the whole filesystem
10264  */
10265 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
10266 {
10267         if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
10268                 struct list_head *head = &fs_info->space_info;
10269                 struct btrfs_space_info *sinfo;
10270
10271                 list_for_each_entry_rcu(sinfo, head, list) {
10272                         bool found = false;
10273
10274                         down_read(&sinfo->groups_sem);
10275                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
10276                                 found = true;
10277                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
10278                                 found = true;
10279                         up_read(&sinfo->groups_sem);
10280
10281                         if (found)
10282                                 return;
10283                 }
10284                 btrfs_clear_fs_incompat(fs_info, RAID56);
10285         }
10286 }
10287
10288 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10289                              u64 group_start, struct extent_map *em)
10290 {
10291         struct btrfs_fs_info *fs_info = trans->fs_info;
10292         struct btrfs_root *root = fs_info->extent_root;
10293         struct btrfs_path *path;
10294         struct btrfs_block_group_cache *block_group;
10295         struct btrfs_free_cluster *cluster;
10296         struct btrfs_root *tree_root = fs_info->tree_root;
10297         struct btrfs_key key;
10298         struct inode *inode;
10299         struct kobject *kobj = NULL;
10300         int ret;
10301         int index;
10302         int factor;
10303         struct btrfs_caching_control *caching_ctl = NULL;
10304         bool remove_em;
10305         bool remove_rsv = false;
10306
10307         block_group = btrfs_lookup_block_group(fs_info, group_start);
10308         BUG_ON(!block_group);
10309         BUG_ON(!block_group->ro);
10310
10311         trace_btrfs_remove_block_group(block_group);
10312         /*
10313          * Free the reserved super bytes from this block group before
10314          * remove it.
10315          */
10316         free_excluded_extents(block_group);
10317         btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10318                                   block_group->key.offset);
10319
10320         memcpy(&key, &block_group->key, sizeof(key));
10321         index = btrfs_bg_flags_to_raid_index(block_group->flags);
10322         factor = btrfs_bg_type_to_factor(block_group->flags);
10323
10324         /* make sure this block group isn't part of an allocation cluster */
10325         cluster = &fs_info->data_alloc_cluster;
10326         spin_lock(&cluster->refill_lock);
10327         btrfs_return_cluster_to_free_space(block_group, cluster);
10328         spin_unlock(&cluster->refill_lock);
10329
10330         /*
10331          * make sure this block group isn't part of a metadata
10332          * allocation cluster
10333          */
10334         cluster = &fs_info->meta_alloc_cluster;
10335         spin_lock(&cluster->refill_lock);
10336         btrfs_return_cluster_to_free_space(block_group, cluster);
10337         spin_unlock(&cluster->refill_lock);
10338
10339         path = btrfs_alloc_path();
10340         if (!path) {
10341                 ret = -ENOMEM;
10342                 goto out;
10343         }
10344
10345         /*
10346          * get the inode first so any iput calls done for the io_list
10347          * aren't the final iput (no unlinks allowed now)
10348          */
10349         inode = lookup_free_space_inode(block_group, path);
10350
10351         mutex_lock(&trans->transaction->cache_write_mutex);
10352         /*
10353          * Make sure our free space cache IO is done before removing the
10354          * free space inode
10355          */
10356         spin_lock(&trans->transaction->dirty_bgs_lock);
10357         if (!list_empty(&block_group->io_list)) {
10358                 list_del_init(&block_group->io_list);
10359
10360                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10361
10362                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10363                 btrfs_wait_cache_io(trans, block_group, path);
10364                 btrfs_put_block_group(block_group);
10365                 spin_lock(&trans->transaction->dirty_bgs_lock);
10366         }
10367
10368         if (!list_empty(&block_group->dirty_list)) {
10369                 list_del_init(&block_group->dirty_list);
10370                 remove_rsv = true;
10371                 btrfs_put_block_group(block_group);
10372         }
10373         spin_unlock(&trans->transaction->dirty_bgs_lock);
10374         mutex_unlock(&trans->transaction->cache_write_mutex);
10375
10376         if (!IS_ERR(inode)) {
10377                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10378                 if (ret) {
10379                         btrfs_add_delayed_iput(inode);
10380                         goto out;
10381                 }
10382                 clear_nlink(inode);
10383                 /* One for the block groups ref */
10384                 spin_lock(&block_group->lock);
10385                 if (block_group->iref) {
10386                         block_group->iref = 0;
10387                         block_group->inode = NULL;
10388                         spin_unlock(&block_group->lock);
10389                         iput(inode);
10390                 } else {
10391                         spin_unlock(&block_group->lock);
10392                 }
10393                 /* One for our lookup ref */
10394                 btrfs_add_delayed_iput(inode);
10395         }
10396
10397         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10398         key.offset = block_group->key.objectid;
10399         key.type = 0;
10400
10401         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10402         if (ret < 0)
10403                 goto out;
10404         if (ret > 0)
10405                 btrfs_release_path(path);
10406         if (ret == 0) {
10407                 ret = btrfs_del_item(trans, tree_root, path);
10408                 if (ret)
10409                         goto out;
10410                 btrfs_release_path(path);
10411         }
10412
10413         spin_lock(&fs_info->block_group_cache_lock);
10414         rb_erase(&block_group->cache_node,
10415                  &fs_info->block_group_cache_tree);
10416         RB_CLEAR_NODE(&block_group->cache_node);
10417
10418         if (fs_info->first_logical_byte == block_group->key.objectid)
10419                 fs_info->first_logical_byte = (u64)-1;
10420         spin_unlock(&fs_info->block_group_cache_lock);
10421
10422         down_write(&block_group->space_info->groups_sem);
10423         /*
10424          * we must use list_del_init so people can check to see if they
10425          * are still on the list after taking the semaphore
10426          */
10427         list_del_init(&block_group->list);
10428         if (list_empty(&block_group->space_info->block_groups[index])) {
10429                 kobj = block_group->space_info->block_group_kobjs[index];
10430                 block_group->space_info->block_group_kobjs[index] = NULL;
10431                 clear_avail_alloc_bits(fs_info, block_group->flags);
10432         }
10433         up_write(&block_group->space_info->groups_sem);
10434         clear_incompat_bg_bits(fs_info, block_group->flags);
10435         if (kobj) {
10436                 kobject_del(kobj);
10437                 kobject_put(kobj);
10438         }
10439
10440         if (block_group->has_caching_ctl)
10441                 caching_ctl = get_caching_control(block_group);
10442         if (block_group->cached == BTRFS_CACHE_STARTED)
10443                 wait_block_group_cache_done(block_group);
10444         if (block_group->has_caching_ctl) {
10445                 down_write(&fs_info->commit_root_sem);
10446                 if (!caching_ctl) {
10447                         struct btrfs_caching_control *ctl;
10448
10449                         list_for_each_entry(ctl,
10450                                     &fs_info->caching_block_groups, list)
10451                                 if (ctl->block_group == block_group) {
10452                                         caching_ctl = ctl;
10453                                         refcount_inc(&caching_ctl->count);
10454                                         break;
10455                                 }
10456                 }
10457                 if (caching_ctl)
10458                         list_del_init(&caching_ctl->list);
10459                 up_write(&fs_info->commit_root_sem);
10460                 if (caching_ctl) {
10461                         /* Once for the caching bgs list and once for us. */
10462                         put_caching_control(caching_ctl);
10463                         put_caching_control(caching_ctl);
10464                 }
10465         }
10466
10467         spin_lock(&trans->transaction->dirty_bgs_lock);
10468         WARN_ON(!list_empty(&block_group->dirty_list));
10469         WARN_ON(!list_empty(&block_group->io_list));
10470         spin_unlock(&trans->transaction->dirty_bgs_lock);
10471
10472         btrfs_remove_free_space_cache(block_group);
10473
10474         spin_lock(&block_group->space_info->lock);
10475         list_del_init(&block_group->ro_list);
10476
10477         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10478                 WARN_ON(block_group->space_info->total_bytes
10479                         < block_group->key.offset);
10480                 WARN_ON(block_group->space_info->bytes_readonly
10481                         < block_group->key.offset);
10482                 WARN_ON(block_group->space_info->disk_total
10483                         < block_group->key.offset * factor);
10484         }
10485         block_group->space_info->total_bytes -= block_group->key.offset;
10486         block_group->space_info->bytes_readonly -= block_group->key.offset;
10487         block_group->space_info->disk_total -= block_group->key.offset * factor;
10488
10489         spin_unlock(&block_group->space_info->lock);
10490
10491         memcpy(&key, &block_group->key, sizeof(key));
10492
10493         mutex_lock(&fs_info->chunk_mutex);
10494         spin_lock(&block_group->lock);
10495         block_group->removed = 1;
10496         /*
10497          * At this point trimming can't start on this block group, because we
10498          * removed the block group from the tree fs_info->block_group_cache_tree
10499          * so no one can't find it anymore and even if someone already got this
10500          * block group before we removed it from the rbtree, they have already
10501          * incremented block_group->trimming - if they didn't, they won't find
10502          * any free space entries because we already removed them all when we
10503          * called btrfs_remove_free_space_cache().
10504          *
10505          * And we must not remove the extent map from the fs_info->mapping_tree
10506          * to prevent the same logical address range and physical device space
10507          * ranges from being reused for a new block group. This is because our
10508          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10509          * completely transactionless, so while it is trimming a range the
10510          * currently running transaction might finish and a new one start,
10511          * allowing for new block groups to be created that can reuse the same
10512          * physical device locations unless we take this special care.
10513          *
10514          * There may also be an implicit trim operation if the file system
10515          * is mounted with -odiscard. The same protections must remain
10516          * in place until the extents have been discarded completely when
10517          * the transaction commit has completed.
10518          */
10519         remove_em = (atomic_read(&block_group->trimming) == 0);
10520         spin_unlock(&block_group->lock);
10521
10522         mutex_unlock(&fs_info->chunk_mutex);
10523
10524         ret = remove_block_group_free_space(trans, block_group);
10525         if (ret)
10526                 goto out;
10527
10528         btrfs_put_block_group(block_group);
10529         btrfs_put_block_group(block_group);
10530
10531         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10532         if (ret > 0)
10533                 ret = -EIO;
10534         if (ret < 0)
10535                 goto out;
10536
10537         ret = btrfs_del_item(trans, root, path);
10538         if (ret)
10539                 goto out;
10540
10541         if (remove_em) {
10542                 struct extent_map_tree *em_tree;
10543
10544                 em_tree = &fs_info->mapping_tree;
10545                 write_lock(&em_tree->lock);
10546                 remove_extent_mapping(em_tree, em);
10547                 write_unlock(&em_tree->lock);
10548                 /* once for the tree */
10549                 free_extent_map(em);
10550         }
10551 out:
10552         if (remove_rsv)
10553                 btrfs_delayed_refs_rsv_release(fs_info, 1);
10554         btrfs_free_path(path);
10555         return ret;
10556 }
10557
10558 struct btrfs_trans_handle *
10559 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10560                                      const u64 chunk_offset)
10561 {
10562         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
10563         struct extent_map *em;
10564         struct map_lookup *map;
10565         unsigned int num_items;
10566
10567         read_lock(&em_tree->lock);
10568         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10569         read_unlock(&em_tree->lock);
10570         ASSERT(em && em->start == chunk_offset);
10571
10572         /*
10573          * We need to reserve 3 + N units from the metadata space info in order
10574          * to remove a block group (done at btrfs_remove_chunk() and at
10575          * btrfs_remove_block_group()), which are used for:
10576          *
10577          * 1 unit for adding the free space inode's orphan (located in the tree
10578          * of tree roots).
10579          * 1 unit for deleting the block group item (located in the extent
10580          * tree).
10581          * 1 unit for deleting the free space item (located in tree of tree
10582          * roots).
10583          * N units for deleting N device extent items corresponding to each
10584          * stripe (located in the device tree).
10585          *
10586          * In order to remove a block group we also need to reserve units in the
10587          * system space info in order to update the chunk tree (update one or
10588          * more device items and remove one chunk item), but this is done at
10589          * btrfs_remove_chunk() through a call to check_system_chunk().
10590          */
10591         map = em->map_lookup;
10592         num_items = 3 + map->num_stripes;
10593         free_extent_map(em);
10594
10595         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10596                                                            num_items, 1);
10597 }
10598
10599 /*
10600  * Process the unused_bgs list and remove any that don't have any allocated
10601  * space inside of them.
10602  */
10603 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10604 {
10605         struct btrfs_block_group_cache *block_group;
10606         struct btrfs_space_info *space_info;
10607         struct btrfs_trans_handle *trans;
10608         int ret = 0;
10609
10610         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10611                 return;
10612
10613         spin_lock(&fs_info->unused_bgs_lock);
10614         while (!list_empty(&fs_info->unused_bgs)) {
10615                 u64 start, end;
10616                 int trimming;
10617
10618                 block_group = list_first_entry(&fs_info->unused_bgs,
10619                                                struct btrfs_block_group_cache,
10620                                                bg_list);
10621                 list_del_init(&block_group->bg_list);
10622
10623                 space_info = block_group->space_info;
10624
10625                 if (ret || btrfs_mixed_space_info(space_info)) {
10626                         btrfs_put_block_group(block_group);
10627                         continue;
10628                 }
10629                 spin_unlock(&fs_info->unused_bgs_lock);
10630
10631                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10632
10633                 /* Don't want to race with allocators so take the groups_sem */
10634                 down_write(&space_info->groups_sem);
10635                 spin_lock(&block_group->lock);
10636                 if (block_group->reserved || block_group->pinned ||
10637                     btrfs_block_group_used(&block_group->item) ||
10638                     block_group->ro ||
10639                     list_is_singular(&block_group->list)) {
10640                         /*
10641                          * We want to bail if we made new allocations or have
10642                          * outstanding allocations in this block group.  We do
10643                          * the ro check in case balance is currently acting on
10644                          * this block group.
10645                          */
10646                         trace_btrfs_skip_unused_block_group(block_group);
10647                         spin_unlock(&block_group->lock);
10648                         up_write(&space_info->groups_sem);
10649                         goto next;
10650                 }
10651                 spin_unlock(&block_group->lock);
10652
10653                 /* We don't want to force the issue, only flip if it's ok. */
10654                 ret = inc_block_group_ro(block_group, 0);
10655                 up_write(&space_info->groups_sem);
10656                 if (ret < 0) {
10657                         ret = 0;
10658                         goto next;
10659                 }
10660
10661                 /*
10662                  * Want to do this before we do anything else so we can recover
10663                  * properly if we fail to join the transaction.
10664                  */
10665                 trans = btrfs_start_trans_remove_block_group(fs_info,
10666                                                      block_group->key.objectid);
10667                 if (IS_ERR(trans)) {
10668                         btrfs_dec_block_group_ro(block_group);
10669                         ret = PTR_ERR(trans);
10670                         goto next;
10671                 }
10672
10673                 /*
10674                  * We could have pending pinned extents for this block group,
10675                  * just delete them, we don't care about them anymore.
10676                  */
10677                 start = block_group->key.objectid;
10678                 end = start + block_group->key.offset - 1;
10679                 /*
10680                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10681                  * btrfs_finish_extent_commit(). If we are at transaction N,
10682                  * another task might be running finish_extent_commit() for the
10683                  * previous transaction N - 1, and have seen a range belonging
10684                  * to the block group in freed_extents[] before we were able to
10685                  * clear the whole block group range from freed_extents[]. This
10686                  * means that task can lookup for the block group after we
10687                  * unpinned it from freed_extents[] and removed it, leading to
10688                  * a BUG_ON() at btrfs_unpin_extent_range().
10689                  */
10690                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10691                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10692                                   EXTENT_DIRTY);
10693                 if (ret) {
10694                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10695                         btrfs_dec_block_group_ro(block_group);
10696                         goto end_trans;
10697                 }
10698                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10699                                   EXTENT_DIRTY);
10700                 if (ret) {
10701                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10702                         btrfs_dec_block_group_ro(block_group);
10703                         goto end_trans;
10704                 }
10705                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10706
10707                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10708                 spin_lock(&space_info->lock);
10709                 spin_lock(&block_group->lock);
10710
10711                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
10712                                                      -block_group->pinned);
10713                 space_info->bytes_readonly += block_group->pinned;
10714                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
10715                                    -block_group->pinned,
10716                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
10717                 block_group->pinned = 0;
10718
10719                 spin_unlock(&block_group->lock);
10720                 spin_unlock(&space_info->lock);
10721
10722                 /* DISCARD can flip during remount */
10723                 trimming = btrfs_test_opt(fs_info, DISCARD);
10724
10725                 /* Implicit trim during transaction commit. */
10726                 if (trimming)
10727                         btrfs_get_block_group_trimming(block_group);
10728
10729                 /*
10730                  * Btrfs_remove_chunk will abort the transaction if things go
10731                  * horribly wrong.
10732                  */
10733                 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
10734
10735                 if (ret) {
10736                         if (trimming)
10737                                 btrfs_put_block_group_trimming(block_group);
10738                         goto end_trans;
10739                 }
10740
10741                 /*
10742                  * If we're not mounted with -odiscard, we can just forget
10743                  * about this block group. Otherwise we'll need to wait
10744                  * until transaction commit to do the actual discard.
10745                  */
10746                 if (trimming) {
10747                         spin_lock(&fs_info->unused_bgs_lock);
10748                         /*
10749                          * A concurrent scrub might have added us to the list
10750                          * fs_info->unused_bgs, so use a list_move operation
10751                          * to add the block group to the deleted_bgs list.
10752                          */
10753                         list_move(&block_group->bg_list,
10754                                   &trans->transaction->deleted_bgs);
10755                         spin_unlock(&fs_info->unused_bgs_lock);
10756                         btrfs_get_block_group(block_group);
10757                 }
10758 end_trans:
10759                 btrfs_end_transaction(trans);
10760 next:
10761                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10762                 btrfs_put_block_group(block_group);
10763                 spin_lock(&fs_info->unused_bgs_lock);
10764         }
10765         spin_unlock(&fs_info->unused_bgs_lock);
10766 }
10767
10768 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10769                                    u64 start, u64 end)
10770 {
10771         return unpin_extent_range(fs_info, start, end, false);
10772 }
10773
10774 /*
10775  * It used to be that old block groups would be left around forever.
10776  * Iterating over them would be enough to trim unused space.  Since we
10777  * now automatically remove them, we also need to iterate over unallocated
10778  * space.
10779  *
10780  * We don't want a transaction for this since the discard may take a
10781  * substantial amount of time.  We don't require that a transaction be
10782  * running, but we do need to take a running transaction into account
10783  * to ensure that we're not discarding chunks that were released or
10784  * allocated in the current transaction.
10785  *
10786  * Holding the chunks lock will prevent other threads from allocating
10787  * or releasing chunks, but it won't prevent a running transaction
10788  * from committing and releasing the memory that the pending chunks
10789  * list head uses.  For that, we need to take a reference to the
10790  * transaction and hold the commit root sem.  We only need to hold
10791  * it while performing the free space search since we have already
10792  * held back allocations.
10793  */
10794 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
10795 {
10796         u64 start = SZ_1M, len = 0, end = 0;
10797         int ret;
10798
10799         *trimmed = 0;
10800
10801         /* Discard not supported = nothing to do. */
10802         if (!blk_queue_discard(bdev_get_queue(device->bdev)))
10803                 return 0;
10804
10805         /* Not writable = nothing to do. */
10806         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
10807                 return 0;
10808
10809         /* No free space = nothing to do. */
10810         if (device->total_bytes <= device->bytes_used)
10811                 return 0;
10812
10813         ret = 0;
10814
10815         while (1) {
10816                 struct btrfs_fs_info *fs_info = device->fs_info;
10817                 u64 bytes;
10818
10819                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10820                 if (ret)
10821                         break;
10822
10823                 find_first_clear_extent_bit(&device->alloc_state, start,
10824                                             &start, &end,
10825                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
10826
10827                 /* Ensure we skip the reserved area in the first 1M */
10828                 start = max_t(u64, start, SZ_1M);
10829
10830                 /*
10831                  * If find_first_clear_extent_bit find a range that spans the
10832                  * end of the device it will set end to -1, in this case it's up
10833                  * to the caller to trim the value to the size of the device.
10834                  */
10835                 end = min(end, device->total_bytes - 1);
10836
10837                 len = end - start + 1;
10838
10839                 /* We didn't find any extents */
10840                 if (!len) {
10841                         mutex_unlock(&fs_info->chunk_mutex);
10842                         ret = 0;
10843                         break;
10844                 }
10845
10846                 ret = btrfs_issue_discard(device->bdev, start, len,
10847                                           &bytes);
10848                 if (!ret)
10849                         set_extent_bits(&device->alloc_state, start,
10850                                         start + bytes - 1,
10851                                         CHUNK_TRIMMED);
10852                 mutex_unlock(&fs_info->chunk_mutex);
10853
10854                 if (ret)
10855                         break;
10856
10857                 start += len;
10858                 *trimmed += bytes;
10859
10860                 if (fatal_signal_pending(current)) {
10861                         ret = -ERESTARTSYS;
10862                         break;
10863                 }
10864
10865                 cond_resched();
10866         }
10867
10868         return ret;
10869 }
10870
10871 /*
10872  * Trim the whole filesystem by:
10873  * 1) trimming the free space in each block group
10874  * 2) trimming the unallocated space on each device
10875  *
10876  * This will also continue trimming even if a block group or device encounters
10877  * an error.  The return value will be the last error, or 0 if nothing bad
10878  * happens.
10879  */
10880 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
10881 {
10882         struct btrfs_block_group_cache *cache = NULL;
10883         struct btrfs_device *device;
10884         struct list_head *devices;
10885         u64 group_trimmed;
10886         u64 start;
10887         u64 end;
10888         u64 trimmed = 0;
10889         u64 bg_failed = 0;
10890         u64 dev_failed = 0;
10891         int bg_ret = 0;
10892         int dev_ret = 0;
10893         int ret = 0;
10894
10895         cache = btrfs_lookup_first_block_group(fs_info, range->start);
10896         for (; cache; cache = next_block_group(cache)) {
10897                 if (cache->key.objectid >= (range->start + range->len)) {
10898                         btrfs_put_block_group(cache);
10899                         break;
10900                 }
10901
10902                 start = max(range->start, cache->key.objectid);
10903                 end = min(range->start + range->len,
10904                                 cache->key.objectid + cache->key.offset);
10905
10906                 if (end - start >= range->minlen) {
10907                         if (!block_group_cache_done(cache)) {
10908                                 ret = cache_block_group(cache, 0);
10909                                 if (ret) {
10910                                         bg_failed++;
10911                                         bg_ret = ret;
10912                                         continue;
10913                                 }
10914                                 ret = wait_block_group_cache_done(cache);
10915                                 if (ret) {
10916                                         bg_failed++;
10917                                         bg_ret = ret;
10918                                         continue;
10919                                 }
10920                         }
10921                         ret = btrfs_trim_block_group(cache,
10922                                                      &group_trimmed,
10923                                                      start,
10924                                                      end,
10925                                                      range->minlen);
10926
10927                         trimmed += group_trimmed;
10928                         if (ret) {
10929                                 bg_failed++;
10930                                 bg_ret = ret;
10931                                 continue;
10932                         }
10933                 }
10934         }
10935
10936         if (bg_failed)
10937                 btrfs_warn(fs_info,
10938                         "failed to trim %llu block group(s), last error %d",
10939                         bg_failed, bg_ret);
10940         mutex_lock(&fs_info->fs_devices->device_list_mutex);
10941         devices = &fs_info->fs_devices->devices;
10942         list_for_each_entry(device, devices, dev_list) {
10943                 ret = btrfs_trim_free_extents(device, &group_trimmed);
10944                 if (ret) {
10945                         dev_failed++;
10946                         dev_ret = ret;
10947                         break;
10948                 }
10949
10950                 trimmed += group_trimmed;
10951         }
10952         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
10953
10954         if (dev_failed)
10955                 btrfs_warn(fs_info,
10956                         "failed to trim %llu device(s), last error %d",
10957                         dev_failed, dev_ret);
10958         range->len = trimmed;
10959         if (bg_ret)
10960                 return bg_ret;
10961         return dev_ret;
10962 }
10963
10964 /*
10965  * btrfs_{start,end}_write_no_snapshotting() are similar to
10966  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
10967  * data into the page cache through nocow before the subvolume is snapshoted,
10968  * but flush the data into disk after the snapshot creation, or to prevent
10969  * operations while snapshotting is ongoing and that cause the snapshot to be
10970  * inconsistent (writes followed by expanding truncates for example).
10971  */
10972 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
10973 {
10974         percpu_counter_dec(&root->subv_writers->counter);
10975         cond_wake_up(&root->subv_writers->wait);
10976 }
10977
10978 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
10979 {
10980         if (atomic_read(&root->will_be_snapshotted))
10981                 return 0;
10982
10983         percpu_counter_inc(&root->subv_writers->counter);
10984         /*
10985          * Make sure counter is updated before we check for snapshot creation.
10986          */
10987         smp_mb();
10988         if (atomic_read(&root->will_be_snapshotted)) {
10989                 btrfs_end_write_no_snapshotting(root);
10990                 return 0;
10991         }
10992         return 1;
10993 }
10994
10995 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
10996 {
10997         while (true) {
10998                 int ret;
10999
11000                 ret = btrfs_start_write_no_snapshotting(root);
11001                 if (ret)
11002                         break;
11003                 wait_var_event(&root->will_be_snapshotted,
11004                                !atomic_read(&root->will_be_snapshotted));
11005         }
11006 }
11007
11008 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11009 {
11010         struct btrfs_fs_info *fs_info = bg->fs_info;
11011
11012         spin_lock(&fs_info->unused_bgs_lock);
11013         if (list_empty(&bg->bg_list)) {
11014                 btrfs_get_block_group(bg);
11015                 trace_btrfs_add_unused_block_group(bg);
11016                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11017         }
11018         spin_unlock(&fs_info->unused_bgs_lock);
11019 }