fs/btrfs/extent-tree.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/sched/signal.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/writeback.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/sort.h>
  12 #include <linux/rcupdate.h>
  13 #include <linux/kthread.h>
  14 #include <linux/slab.h>
  15 #include <linux/ratelimit.h>
  16 #include <linux/percpu_counter.h>
  17 #include <linux/lockdep.h>
  18 #include <linux/crc32c.h>
  19 #include "tree-log.h"
  20 #include "disk-io.h"
  21 #include "print-tree.h"
  22 #include "volumes.h"
  23 #include "raid56.h"
  24 #include "locking.h"
  25 #include "free-space-cache.h"
  26 #include "free-space-tree.h"
  27 #include "math.h"
  28 #include "sysfs.h"
  29 #include "qgroup.h"
  30 #include "ref-verify.h"
  31
  32 #undef SCRAMBLE_DELAYED_REFS
  33
  34 /*
  35  * control flags for do_chunk_alloc's force field
  36  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  37  * if we really need one.
  38  *
  39  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  40  * if we have very few chunks already allocated.  This is
  41  * used as part of the clustering code to help make sure
  42  * we have a good pool of storage to cluster in, without
  43  * filling the FS with empty chunks
  44  *
  45  * CHUNK_ALLOC_FORCE means it must try to allocate one
  46  *
  47  */
  48 enum {
  49         CHUNK_ALLOC_NO_FORCE = 0,
  50         CHUNK_ALLOC_LIMITED = 1,
  51         CHUNK_ALLOC_FORCE = 2,
  52 };
  53
  54 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  55                                struct btrfs_delayed_ref_node *node, u64 parent,
  56                                u64 root_objectid, u64 owner_objectid,
  57                                u64 owner_offset, int refs_to_drop,
  58                                struct btrfs_delayed_extent_op *extra_op);
  59 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  60                                     struct extent_buffer *leaf,
  61                                     struct btrfs_extent_item *ei);
  62 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  63                                       u64 parent, u64 root_objectid,
  64                                       u64 flags, u64 owner, u64 offset,
  65                                       struct btrfs_key *ins, int ref_mod);
  66 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  67                                      struct btrfs_delayed_ref_node *node,
  68                                      struct btrfs_delayed_extent_op *extent_op);
  69 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
  70                           int force);
  71 static int find_next_key(struct btrfs_path *path, int level,
  72                          struct btrfs_key *key);
  73 static void dump_space_info(struct btrfs_fs_info *fs_info,
  74                             struct btrfs_space_info *info, u64 bytes,
  75                             int dump_block_groups);
  76 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
  77                                u64 num_bytes);
  78 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
  79                                      struct btrfs_space_info *space_info,
  80                                      u64 num_bytes);
  81 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
  82                                      struct btrfs_space_info *space_info,
  83                                      u64 num_bytes);
  84
  85 static noinline int
  86 block_group_cache_done(struct btrfs_block_group_cache *cache)
  87 {
  88         smp_mb();
  89         return cache->cached == BTRFS_CACHE_FINISHED ||
  90                 cache->cached == BTRFS_CACHE_ERROR;
  91 }
  92
  93 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  94 {
  95         return (cache->flags & bits) == bits;
  96 }
  97
  98 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
  99 {
 100         atomic_inc(&cache->count);
 101 }
 102
 103 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 104 {
 105         if (atomic_dec_and_test(&cache->count)) {
 106                 WARN_ON(cache->pinned > 0);
 107                 WARN_ON(cache->reserved > 0);
 108
 109                 /*
 110                  * If not empty, someone is still holding mutex of
 111                  * full_stripe_lock, which can only be released by caller.
 112                  * And it will definitely cause use-after-free when caller
 113                  * tries to release full stripe lock.
 114                  *
 115                  * No better way to resolve, but only to warn.
 116                  */
 117                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 118                 kfree(cache->free_space_ctl);
 119                 kfree(cache);
 120         }
 121 }
 122
 123 /*
 124  * this adds the block group to the fs_info rb tree for the block group
 125  * cache
 126  */
 127 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 128                                 struct btrfs_block_group_cache *block_group)
 129 {
 130         struct rb_node **p;
 131         struct rb_node *parent = NULL;
 132         struct btrfs_block_group_cache *cache;
 133
 134         spin_lock(&info->block_group_cache_lock);
 135         p = &info->block_group_cache_tree.rb_node;
 136
 137         while (*p) {
 138                 parent = *p;
 139                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 140                                  cache_node);
 141                 if (block_group->key.objectid < cache->key.objectid) {
 142                         p = &(*p)->rb_left;
 143                 } else if (block_group->key.objectid > cache->key.objectid) {
 144                         p = &(*p)->rb_right;
 145                 } else {
 146                         spin_unlock(&info->block_group_cache_lock);
 147                         return -EEXIST;
 148                 }
 149         }
 150
 151         rb_link_node(&block_group->cache_node, parent, p);
 152         rb_insert_color(&block_group->cache_node,
 153                         &info->block_group_cache_tree);
 154
 155         if (info->first_logical_byte > block_group->key.objectid)
 156                 info->first_logical_byte = block_group->key.objectid;
 157
 158         spin_unlock(&info->block_group_cache_lock);
 159
 160         return 0;
 161 }
 162
 163 /*
 164  * This will return the block group at or after bytenr if contains is 0, else
 165  * it will return the block group that contains the bytenr
 166  */
 167 static struct btrfs_block_group_cache *
 168 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 169                               int contains)
 170 {
 171         struct btrfs_block_group_cache *cache, *ret = NULL;
 172         struct rb_node *n;
 173         u64 end, start;
 174
 175         spin_lock(&info->block_group_cache_lock);
 176         n = info->block_group_cache_tree.rb_node;
 177
 178         while (n) {
 179                 cache = rb_entry(n, struct btrfs_block_group_cache,
 180                                  cache_node);
 181                 end = cache->key.objectid + cache->key.offset - 1;
 182                 start = cache->key.objectid;
 183
 184                 if (bytenr < start) {
 185                         if (!contains && (!ret || start < ret->key.objectid))
 186                                 ret = cache;
 187                         n = n->rb_left;
 188                 } else if (bytenr > start) {
 189                         if (contains && bytenr <= end) {
 190                                 ret = cache;
 191                                 break;
 192                         }
 193                         n = n->rb_right;
 194                 } else {
 195                         ret = cache;
 196                         break;
 197                 }
 198         }
 199         if (ret) {
 200                 btrfs_get_block_group(ret);
 201                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 202                         info->first_logical_byte = ret->key.objectid;
 203         }
 204         spin_unlock(&info->block_group_cache_lock);
 205
 206         return ret;
 207 }
 208
 209 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
 210                                u64 start, u64 num_bytes)
 211 {
 212         u64 end = start + num_bytes - 1;
 213         set_extent_bits(&fs_info->freed_extents[0],
 214                         start, end, EXTENT_UPTODATE);
 215         set_extent_bits(&fs_info->freed_extents[1],
 216                         start, end, EXTENT_UPTODATE);
 217         return 0;
 218 }
 219
 220 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
 221 {
 222         struct btrfs_fs_info *fs_info = cache->fs_info;
 223         u64 start, end;
 224
 225         start = cache->key.objectid;
 226         end = start + cache->key.offset - 1;
 227
 228         clear_extent_bits(&fs_info->freed_extents[0],
 229                           start, end, EXTENT_UPTODATE);
 230         clear_extent_bits(&fs_info->freed_extents[1],
 231                           start, end, EXTENT_UPTODATE);
 232 }
 233
 234 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
 235 {
 236         struct btrfs_fs_info *fs_info = cache->fs_info;
 237         u64 bytenr;
 238         u64 *logical;
 239         int stripe_len;
 240         int i, nr, ret;
 241
 242         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 243                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 244                 cache->bytes_super += stripe_len;
 245                 ret = add_excluded_extent(fs_info, cache->key.objectid,
 246                                           stripe_len);
 247                 if (ret)
 248                         return ret;
 249         }
 250
 251         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 252                 bytenr = btrfs_sb_offset(i);
 253                 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
 254                                        bytenr, &logical, &nr, &stripe_len);
 255                 if (ret)
 256                         return ret;
 257
 258                 while (nr--) {
 259                         u64 start, len;
 260
 261                         if (logical[nr] > cache->key.objectid +
 262                             cache->key.offset)
 263                                 continue;
 264
 265                         if (logical[nr] + stripe_len <= cache->key.objectid)
 266                                 continue;
 267
 268                         start = logical[nr];
 269                         if (start < cache->key.objectid) {
 270                                 start = cache->key.objectid;
 271                                 len = (logical[nr] + stripe_len) - start;
 272                         } else {
 273                                 len = min_t(u64, stripe_len,
 274                                             cache->key.objectid +
 275                                             cache->key.offset - start);
 276                         }
 277
 278                         cache->bytes_super += len;
 279                         ret = add_excluded_extent(fs_info, start, len);
 280                         if (ret) {
 281                                 kfree(logical);
 282                                 return ret;
 283                         }
 284                 }
 285
 286                 kfree(logical);
 287         }
 288         return 0;
 289 }
 290
 291 static struct btrfs_caching_control *
 292 get_caching_control(struct btrfs_block_group_cache *cache)
 293 {
 294         struct btrfs_caching_control *ctl;
 295
 296         spin_lock(&cache->lock);
 297         if (!cache->caching_ctl) {
 298                 spin_unlock(&cache->lock);
 299                 return NULL;
 300         }
 301
 302         ctl = cache->caching_ctl;
 303         refcount_inc(&ctl->count);
 304         spin_unlock(&cache->lock);
 305         return ctl;
 306 }
 307
 308 static void put_caching_control(struct btrfs_caching_control *ctl)
 309 {
 310         if (refcount_dec_and_test(&ctl->count))
 311                 kfree(ctl);
 312 }
 313
 314 #ifdef CONFIG_BTRFS_DEBUG
 315 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
 316 {
 317         struct btrfs_fs_info *fs_info = block_group->fs_info;
 318         u64 start = block_group->key.objectid;
 319         u64 len = block_group->key.offset;
 320         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 321                 fs_info->nodesize : fs_info->sectorsize;
 322         u64 step = chunk << 1;
 323
 324         while (len > chunk) {
 325                 btrfs_remove_free_space(block_group, start, chunk);
 326                 start += step;
 327                 if (len < step)
 328                         len = 0;
 329                 else
 330                         len -= step;
 331         }
 332 }
 333 #endif
 334
 335 /*
 336  * this is only called by cache_block_group, since we could have freed extents
 337  * we need to check the pinned_extents for any extents that can't be used yet
 338  * since their free space will be released as soon as the transaction commits.
 339  */
 340 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 341                        u64 start, u64 end)
 342 {
 343         struct btrfs_fs_info *info = block_group->fs_info;
 344         u64 extent_start, extent_end, size, total_added = 0;
 345         int ret;
 346
 347         while (start < end) {
 348                 ret = find_first_extent_bit(info->pinned_extents, start,
 349                                             &extent_start, &extent_end,
 350                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 351                                             NULL);
 352                 if (ret)
 353                         break;
 354
 355                 if (extent_start <= start) {
 356                         start = extent_end + 1;
 357                 } else if (extent_start > start && extent_start < end) {
 358                         size = extent_start - start;
 359                         total_added += size;
 360                         ret = btrfs_add_free_space(block_group, start,
 361                                                    size);
 362                         BUG_ON(ret); /* -ENOMEM or logic error */
 363                         start = extent_end + 1;
 364                 } else {
 365                         break;
 366                 }
 367         }
 368
 369         if (start < end) {
 370                 size = end - start;
 371                 total_added += size;
 372                 ret = btrfs_add_free_space(block_group, start, size);
 373                 BUG_ON(ret); /* -ENOMEM or logic error */
 374         }
 375
 376         return total_added;
 377 }
 378
 379 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 380 {
 381         struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
 382         struct btrfs_fs_info *fs_info = block_group->fs_info;
 383         struct btrfs_root *extent_root = fs_info->extent_root;
 384         struct btrfs_path *path;
 385         struct extent_buffer *leaf;
 386         struct btrfs_key key;
 387         u64 total_found = 0;
 388         u64 last = 0;
 389         u32 nritems;
 390         int ret;
 391         bool wakeup = true;
 392
 393         path = btrfs_alloc_path();
 394         if (!path)
 395                 return -ENOMEM;
 396
 397         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 398
 399 #ifdef CONFIG_BTRFS_DEBUG
 400         /*
 401          * If we're fragmenting we don't want to make anybody think we can
 402          * allocate from this block group until we've had a chance to fragment
 403          * the free space.
 404          */
 405         if (btrfs_should_fragment_free_space(block_group))
 406                 wakeup = false;
 407 #endif
 408         /*
 409          * We don't want to deadlock with somebody trying to allocate a new
 410          * extent for the extent root while also trying to search the extent
 411          * root to add free space.  So we skip locking and search the commit
 412          * root, since its read-only
 413          */
 414         path->skip_locking = 1;
 415         path->search_commit_root = 1;
 416         path->reada = READA_FORWARD;
 417
 418         key.objectid = last;
 419         key.offset = 0;
 420         key.type = BTRFS_EXTENT_ITEM_KEY;
 421
 422 next:
 423         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 424         if (ret < 0)
 425                 goto out;
 426
 427         leaf = path->nodes[0];
 428         nritems = btrfs_header_nritems(leaf);
 429
 430         while (1) {
 431                 if (btrfs_fs_closing(fs_info) > 1) {
 432                         last = (u64)-1;
 433                         break;
 434                 }
 435
 436                 if (path->slots[0] < nritems) {
 437                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 438                 } else {
 439                         ret = find_next_key(path, 0, &key);
 440                         if (ret)
 441                                 break;
 442
 443                         if (need_resched() ||
 444                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 445                                 if (wakeup)
 446                                         caching_ctl->progress = last;
 447                                 btrfs_release_path(path);
 448                                 up_read(&fs_info->commit_root_sem);
 449                                 mutex_unlock(&caching_ctl->mutex);
 450                                 cond_resched();
 451                                 mutex_lock(&caching_ctl->mutex);
 452                                 down_read(&fs_info->commit_root_sem);
 453                                 goto next;
 454                         }
 455
 456                         ret = btrfs_next_leaf(extent_root, path);
 457                         if (ret < 0)
 458                                 goto out;
 459                         if (ret)
 460                                 break;
 461                         leaf = path->nodes[0];
 462                         nritems = btrfs_header_nritems(leaf);
 463                         continue;
 464                 }
 465
 466                 if (key.objectid < last) {
 467                         key.objectid = last;
 468                         key.offset = 0;
 469                         key.type = BTRFS_EXTENT_ITEM_KEY;
 470
 471                         if (wakeup)
 472                                 caching_ctl->progress = last;
 473                         btrfs_release_path(path);
 474                         goto next;
 475                 }
 476
 477                 if (key.objectid < block_group->key.objectid) {
 478                         path->slots[0]++;
 479                         continue;
 480                 }
 481
 482                 if (key.objectid >= block_group->key.objectid +
 483                     block_group->key.offset)
 484                         break;
 485
 486                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 487                     key.type == BTRFS_METADATA_ITEM_KEY) {
 488                         total_found += add_new_free_space(block_group, last,
 489                                                           key.objectid);
 490                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 491                                 last = key.objectid +
 492                                         fs_info->nodesize;
 493                         else
 494                                 last = key.objectid + key.offset;
 495
 496                         if (total_found > CACHING_CTL_WAKE_UP) {
 497                                 total_found = 0;
 498                                 if (wakeup)
 499                                         wake_up(&caching_ctl->wait);
 500                         }
 501                 }
 502                 path->slots[0]++;
 503         }
 504         ret = 0;
 505
 506         total_found += add_new_free_space(block_group, last,
 507                                           block_group->key.objectid +
 508                                           block_group->key.offset);
 509         caching_ctl->progress = (u64)-1;
 510
 511 out:
 512         btrfs_free_path(path);
 513         return ret;
 514 }
 515
 516 static noinline void caching_thread(struct btrfs_work *work)
 517 {
 518         struct btrfs_block_group_cache *block_group;
 519         struct btrfs_fs_info *fs_info;
 520         struct btrfs_caching_control *caching_ctl;
 521         int ret;
 522
 523         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 524         block_group = caching_ctl->block_group;
 525         fs_info = block_group->fs_info;
 526
 527         mutex_lock(&caching_ctl->mutex);
 528         down_read(&fs_info->commit_root_sem);
 529
 530         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 531                 ret = load_free_space_tree(caching_ctl);
 532         else
 533                 ret = load_extent_tree_free(caching_ctl);
 534
 535         spin_lock(&block_group->lock);
 536         block_group->caching_ctl = NULL;
 537         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 538         spin_unlock(&block_group->lock);
 539
 540 #ifdef CONFIG_BTRFS_DEBUG
 541         if (btrfs_should_fragment_free_space(block_group)) {
 542                 u64 bytes_used;
 543
 544                 spin_lock(&block_group->space_info->lock);
 545                 spin_lock(&block_group->lock);
 546                 bytes_used = block_group->key.offset -
 547                         btrfs_block_group_used(&block_group->item);
 548                 block_group->space_info->bytes_used += bytes_used >> 1;
 549                 spin_unlock(&block_group->lock);
 550                 spin_unlock(&block_group->space_info->lock);
 551                 fragment_free_space(block_group);
 552         }
 553 #endif
 554
 555         caching_ctl->progress = (u64)-1;
 556
 557         up_read(&fs_info->commit_root_sem);
 558         free_excluded_extents(block_group);
 559         mutex_unlock(&caching_ctl->mutex);
 560
 561         wake_up(&caching_ctl->wait);
 562
 563         put_caching_control(caching_ctl);
 564         btrfs_put_block_group(block_group);
 565 }
 566
 567 static int cache_block_group(struct btrfs_block_group_cache *cache,
 568                              int load_cache_only)
 569 {
 570         DEFINE_WAIT(wait);
 571         struct btrfs_fs_info *fs_info = cache->fs_info;
 572         struct btrfs_caching_control *caching_ctl;
 573         int ret = 0;
 574
 575         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 576         if (!caching_ctl)
 577                 return -ENOMEM;
 578
 579         INIT_LIST_HEAD(&caching_ctl->list);
 580         mutex_init(&caching_ctl->mutex);
 581         init_waitqueue_head(&caching_ctl->wait);
 582         caching_ctl->block_group = cache;
 583         caching_ctl->progress = cache->key.objectid;
 584         refcount_set(&caching_ctl->count, 1);
 585         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 586                         caching_thread, NULL, NULL);
 587
 588         spin_lock(&cache->lock);
 589         /*
 590          * This should be a rare occasion, but this could happen I think in the
 591          * case where one thread starts to load the space cache info, and then
 592          * some other thread starts a transaction commit which tries to do an
 593          * allocation while the other thread is still loading the space cache
 594          * info.  The previous loop should have kept us from choosing this block
 595          * group, but if we've moved to the state where we will wait on caching
 596          * block groups we need to first check if we're doing a fast load here,
 597          * so we can wait for it to finish, otherwise we could end up allocating
 598          * from a block group who's cache gets evicted for one reason or
 599          * another.
 600          */
 601         while (cache->cached == BTRFS_CACHE_FAST) {
 602                 struct btrfs_caching_control *ctl;
 603
 604                 ctl = cache->caching_ctl;
 605                 refcount_inc(&ctl->count);
 606                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 607                 spin_unlock(&cache->lock);
 608
 609                 schedule();
 610
 611                 finish_wait(&ctl->wait, &wait);
 612                 put_caching_control(ctl);
 613                 spin_lock(&cache->lock);
 614         }
 615
 616         if (cache->cached != BTRFS_CACHE_NO) {
 617                 spin_unlock(&cache->lock);
 618                 kfree(caching_ctl);
 619                 return 0;
 620         }
 621         WARN_ON(cache->caching_ctl);
 622         cache->caching_ctl = caching_ctl;
 623         cache->cached = BTRFS_CACHE_FAST;
 624         spin_unlock(&cache->lock);
 625
 626         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 627                 mutex_lock(&caching_ctl->mutex);
 628                 ret = load_free_space_cache(fs_info, cache);
 629
 630                 spin_lock(&cache->lock);
 631                 if (ret == 1) {
 632                         cache->caching_ctl = NULL;
 633                         cache->cached = BTRFS_CACHE_FINISHED;
 634                         cache->last_byte_to_unpin = (u64)-1;
 635                         caching_ctl->progress = (u64)-1;
 636                 } else {
 637                         if (load_cache_only) {
 638                                 cache->caching_ctl = NULL;
 639                                 cache->cached = BTRFS_CACHE_NO;
 640                         } else {
 641                                 cache->cached = BTRFS_CACHE_STARTED;
 642                                 cache->has_caching_ctl = 1;
 643                         }
 644                 }
 645                 spin_unlock(&cache->lock);
 646 #ifdef CONFIG_BTRFS_DEBUG
 647                 if (ret == 1 &&
 648                     btrfs_should_fragment_free_space(cache)) {
 649                         u64 bytes_used;
 650
 651                         spin_lock(&cache->space_info->lock);
 652                         spin_lock(&cache->lock);
 653                         bytes_used = cache->key.offset -
 654                                 btrfs_block_group_used(&cache->item);
 655                         cache->space_info->bytes_used += bytes_used >> 1;
 656                         spin_unlock(&cache->lock);
 657                         spin_unlock(&cache->space_info->lock);
 658                         fragment_free_space(cache);
 659                 }
 660 #endif
 661                 mutex_unlock(&caching_ctl->mutex);
 662
 663                 wake_up(&caching_ctl->wait);
 664                 if (ret == 1) {
 665                         put_caching_control(caching_ctl);
 666                         free_excluded_extents(cache);
 667                         return 0;
 668                 }
 669         } else {
 670                 /*
 671                  * We're either using the free space tree or no caching at all.
 672                  * Set cached to the appropriate value and wakeup any waiters.
 673                  */
 674                 spin_lock(&cache->lock);
 675                 if (load_cache_only) {
 676                         cache->caching_ctl = NULL;
 677                         cache->cached = BTRFS_CACHE_NO;
 678                 } else {
 679                         cache->cached = BTRFS_CACHE_STARTED;
 680                         cache->has_caching_ctl = 1;
 681                 }
 682                 spin_unlock(&cache->lock);
 683                 wake_up(&caching_ctl->wait);
 684         }
 685
 686         if (load_cache_only) {
 687                 put_caching_control(caching_ctl);
 688                 return 0;
 689         }
 690
 691         down_write(&fs_info->commit_root_sem);
 692         refcount_inc(&caching_ctl->count);
 693         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 694         up_write(&fs_info->commit_root_sem);
 695
 696         btrfs_get_block_group(cache);
 697
 698         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 699
 700         return ret;
 701 }
 702
 703 /*
 704  * return the block group that starts at or after bytenr
 705  */
 706 static struct btrfs_block_group_cache *
 707 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 708 {
 709         return block_group_cache_tree_search(info, bytenr, 0);
 710 }
 711
 712 /*
 713  * return the block group that contains the given bytenr
 714  */
 715 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 716                                                  struct btrfs_fs_info *info,
 717                                                  u64 bytenr)
 718 {
 719         return block_group_cache_tree_search(info, bytenr, 1);
 720 }
 721
 722 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 723                                                   u64 flags)
 724 {
 725         struct list_head *head = &info->space_info;
 726         struct btrfs_space_info *found;
 727
 728         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 729
 730         rcu_read_lock();
 731         list_for_each_entry_rcu(found, head, list) {
 732                 if (found->flags & flags) {
 733                         rcu_read_unlock();
 734                         return found;
 735                 }
 736         }
 737         rcu_read_unlock();
 738         return NULL;
 739 }
 740
 741 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
 742                              bool metadata, u64 root_objectid)
 743 {
 744         struct btrfs_space_info *space_info;
 745         u64 flags;
 746
 747         if (metadata) {
 748                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
 749                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
 750                 else
 751                         flags = BTRFS_BLOCK_GROUP_METADATA;
 752         } else {
 753                 flags = BTRFS_BLOCK_GROUP_DATA;
 754         }
 755
 756         space_info = __find_space_info(fs_info, flags);
 757         ASSERT(space_info);
 758         percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
 759                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
 760 }
 761
 762 /*
 763  * after adding space to the filesystem, we need to clear the full flags
 764  * on all the space infos.
 765  */
 766 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 767 {
 768         struct list_head *head = &info->space_info;
 769         struct btrfs_space_info *found;
 770
 771         rcu_read_lock();
 772         list_for_each_entry_rcu(found, head, list)
 773                 found->full = 0;
 774         rcu_read_unlock();
 775 }
 776
 777 /* simple helper to search for an existing data extent at a given offset */
 778 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 779 {
 780         int ret;
 781         struct btrfs_key key;
 782         struct btrfs_path *path;
 783
 784         path = btrfs_alloc_path();
 785         if (!path)
 786                 return -ENOMEM;
 787
 788         key.objectid = start;
 789         key.offset = len;
 790         key.type = BTRFS_EXTENT_ITEM_KEY;
 791         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 792         btrfs_free_path(path);
 793         return ret;
 794 }
 795
 796 /*
 797  * helper function to lookup reference count and flags of a tree block.
 798  *
 799  * the head node for delayed ref is used to store the sum of all the
 800  * reference count modifications queued up in the rbtree. the head
 801  * node may also store the extent flags to set. This way you can check
 802  * to see what the reference count and extent flags would be if all of
 803  * the delayed refs are not processed.
 804  */
 805 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 806                              struct btrfs_fs_info *fs_info, u64 bytenr,
 807                              u64 offset, int metadata, u64 *refs, u64 *flags)
 808 {
 809         struct btrfs_delayed_ref_head *head;
 810         struct btrfs_delayed_ref_root *delayed_refs;
 811         struct btrfs_path *path;
 812         struct btrfs_extent_item *ei;
 813         struct extent_buffer *leaf;
 814         struct btrfs_key key;
 815         u32 item_size;
 816         u64 num_refs;
 817         u64 extent_flags;
 818         int ret;
 819
 820         /*
 821          * If we don't have skinny metadata, don't bother doing anything
 822          * different
 823          */
 824         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
 825                 offset = fs_info->nodesize;
 826                 metadata = 0;
 827         }
 828
 829         path = btrfs_alloc_path();
 830         if (!path)
 831                 return -ENOMEM;
 832
 833         if (!trans) {
 834                 path->skip_locking = 1;
 835                 path->search_commit_root = 1;
 836         }
 837
 838 search_again:
 839         key.objectid = bytenr;
 840         key.offset = offset;
 841         if (metadata)
 842                 key.type = BTRFS_METADATA_ITEM_KEY;
 843         else
 844                 key.type = BTRFS_EXTENT_ITEM_KEY;
 845
 846         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
 847         if (ret < 0)
 848                 goto out_free;
 849
 850         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 851                 if (path->slots[0]) {
 852                         path->slots[0]--;
 853                         btrfs_item_key_to_cpu(path->nodes[0], &key,
 854                                               path->slots[0]);
 855                         if (key.objectid == bytenr &&
 856                             key.type == BTRFS_EXTENT_ITEM_KEY &&
 857                             key.offset == fs_info->nodesize)
 858                                 ret = 0;
 859                 }
 860         }
 861
 862         if (ret == 0) {
 863                 leaf = path->nodes[0];
 864                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 865                 if (item_size >= sizeof(*ei)) {
 866                         ei = btrfs_item_ptr(leaf, path->slots[0],
 867                                             struct btrfs_extent_item);
 868                         num_refs = btrfs_extent_refs(leaf, ei);
 869                         extent_flags = btrfs_extent_flags(leaf, ei);
 870                 } else {
 871                         ret = -EINVAL;
 872                         btrfs_print_v0_err(fs_info);
 873                         if (trans)
 874                                 btrfs_abort_transaction(trans, ret);
 875                         else
 876                                 btrfs_handle_fs_error(fs_info, ret, NULL);
 877
 878                         goto out_free;
 879                 }
 880
 881                 BUG_ON(num_refs == 0);
 882         } else {
 883                 num_refs = 0;
 884                 extent_flags = 0;
 885                 ret = 0;
 886         }
 887
 888         if (!trans)
 889                 goto out;
 890
 891         delayed_refs = &trans->transaction->delayed_refs;
 892         spin_lock(&delayed_refs->lock);
 893         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 894         if (head) {
 895                 if (!mutex_trylock(&head->mutex)) {
 896                         refcount_inc(&head->refs);
 897                         spin_unlock(&delayed_refs->lock);
 898
 899                         btrfs_release_path(path);
 900
 901                         /*
 902                          * Mutex was contended, block until it's released and try
 903                          * again
 904                          */
 905                         mutex_lock(&head->mutex);
 906                         mutex_unlock(&head->mutex);
 907                         btrfs_put_delayed_ref_head(head);
 908                         goto search_again;
 909                 }
 910                 spin_lock(&head->lock);
 911                 if (head->extent_op && head->extent_op->update_flags)
 912                         extent_flags |= head->extent_op->flags_to_set;
 913                 else
 914                         BUG_ON(num_refs == 0);
 915
 916                 num_refs += head->ref_mod;
 917                 spin_unlock(&head->lock);
 918                 mutex_unlock(&head->mutex);
 919         }
 920         spin_unlock(&delayed_refs->lock);
 921 out:
 922         WARN_ON(num_refs == 0);
 923         if (refs)
 924                 *refs = num_refs;
 925         if (flags)
 926                 *flags = extent_flags;
 927 out_free:
 928         btrfs_free_path(path);
 929         return ret;
 930 }
 931
 932 /*
 933  * Back reference rules.  Back refs have three main goals:
 934  *
 935  * 1) differentiate between all holders of references to an extent so that
 936  *    when a reference is dropped we can make sure it was a valid reference
 937  *    before freeing the extent.
 938  *
 939  * 2) Provide enough information to quickly find the holders of an extent
 940  *    if we notice a given block is corrupted or bad.
 941  *
 942  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 943  *    maintenance.  This is actually the same as #2, but with a slightly
 944  *    different use case.
 945  *
 946  * There are two kinds of back refs. The implicit back refs is optimized
 947  * for pointers in non-shared tree blocks. For a given pointer in a block,
 948  * back refs of this kind provide information about the block's owner tree
 949  * and the pointer's key. These information allow us to find the block by
 950  * b-tree searching. The full back refs is for pointers in tree blocks not
 951  * referenced by their owner trees. The location of tree block is recorded
 952  * in the back refs. Actually the full back refs is generic, and can be
 953  * used in all cases the implicit back refs is used. The major shortcoming
 954  * of the full back refs is its overhead. Every time a tree block gets
 955  * COWed, we have to update back refs entry for all pointers in it.
 956  *
 957  * For a newly allocated tree block, we use implicit back refs for
 958  * pointers in it. This means most tree related operations only involve
 959  * implicit back refs. For a tree block created in old transaction, the
 960  * only way to drop a reference to it is COW it. So we can detect the
 961  * event that tree block loses its owner tree's reference and do the
 962  * back refs conversion.
 963  *
 964  * When a tree block is COWed through a tree, there are four cases:
 965  *
 966  * The reference count of the block is one and the tree is the block's
 967  * owner tree. Nothing to do in this case.
 968  *
 969  * The reference count of the block is one and the tree is not the
 970  * block's owner tree. In this case, full back refs is used for pointers
 971  * in the block. Remove these full back refs, add implicit back refs for
 972  * every pointers in the new block.
 973  *
 974  * The reference count of the block is greater than one and the tree is
 975  * the block's owner tree. In this case, implicit back refs is used for
 976  * pointers in the block. Add full back refs for every pointers in the
 977  * block, increase lower level extents' reference counts. The original
 978  * implicit back refs are entailed to the new block.
 979  *
 980  * The reference count of the block is greater than one and the tree is
 981  * not the block's owner tree. Add implicit back refs for every pointer in
 982  * the new block, increase lower level extents' reference count.
 983  *
 984  * Back Reference Key composing:
 985  *
 986  * The key objectid corresponds to the first byte in the extent,
 987  * The key type is used to differentiate between types of back refs.
 988  * There are different meanings of the key offset for different types
 989  * of back refs.
 990  *
 991  * File extents can be referenced by:
 992  *
 993  * - multiple snapshots, subvolumes, or different generations in one subvol
 994  * - different files inside a single subvolume
 995  * - different offsets inside a file (bookend extents in file.c)
 996  *
 997  * The extent ref structure for the implicit back refs has fields for:
 998  *
 999  * - Objectid of the subvolume root
1000  * - objectid of the file holding the reference
1001  * - original offset in the file
1002  * - how many bookend extents
1003  *
1004  * The key offset for the implicit back refs is hash of the first
1005  * three fields.
1006  *
1007  * The extent ref structure for the full back refs has field for:
1008  *
1009  * - number of pointers in the tree leaf
1010  *
1011  * The key offset for the implicit back refs is the first byte of
1012  * the tree leaf
1013  *
1014  * When a file extent is allocated, The implicit back refs is used.
1015  * the fields are filled in:
1016  *
1017  *     (root_key.objectid, inode objectid, offset in file, 1)
1018  *
1019  * When a file extent is removed file truncation, we find the
1020  * corresponding implicit back refs and check the following fields:
1021  *
1022  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1023  *
1024  * Btree extents can be referenced by:
1025  *
1026  * - Different subvolumes
1027  *
1028  * Both the implicit back refs and the full back refs for tree blocks
1029  * only consist of key. The key offset for the implicit back refs is
1030  * objectid of block's owner tree. The key offset for the full back refs
1031  * is the first byte of parent block.
1032  *
1033  * When implicit back refs is used, information about the lowest key and
1034  * level of the tree block are required. These information are stored in
1035  * tree block info structure.
1036  */
1037
1038 /*
1039  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1040  * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
1041  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1042  */
1043 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1044                                      struct btrfs_extent_inline_ref *iref,
1045                                      enum btrfs_inline_ref_type is_data)
1046 {
1047         int type = btrfs_extent_inline_ref_type(eb, iref);
1048         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1049
1050         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1051             type == BTRFS_SHARED_BLOCK_REF_KEY ||
1052             type == BTRFS_SHARED_DATA_REF_KEY ||
1053             type == BTRFS_EXTENT_DATA_REF_KEY) {
1054                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1055                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
1056                                 return type;
1057                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1058                                 ASSERT(eb->fs_info);
1059                                 /*
1060                                  * Every shared one has parent tree
1061                                  * block, which must be aligned to
1062                                  * nodesize.
1063                                  */
1064                                 if (offset &&
1065                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1066                                         return type;
1067                         }
1068                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1069                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
1070                                 return type;
1071                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
1072                                 ASSERT(eb->fs_info);
1073                                 /*
1074                                  * Every shared one has parent tree
1075                                  * block, which must be aligned to
1076                                  * nodesize.
1077                                  */
1078                                 if (offset &&
1079                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1080                                         return type;
1081                         }
1082                 } else {
1083                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1084                         return type;
1085                 }
1086         }
1087
1088         btrfs_print_leaf((struct extent_buffer *)eb);
1089         btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1090                   eb->start, type);
1091         WARN_ON(1);
1092
1093         return BTRFS_REF_TYPE_INVALID;
1094 }
1095
1096 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1097 {
1098         u32 high_crc = ~(u32)0;
1099         u32 low_crc = ~(u32)0;
1100         __le64 lenum;
1101
1102         lenum = cpu_to_le64(root_objectid);
1103         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1104         lenum = cpu_to_le64(owner);
1105         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1106         lenum = cpu_to_le64(offset);
1107         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1108
1109         return ((u64)high_crc << 31) ^ (u64)low_crc;
1110 }
1111
1112 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1113                                      struct btrfs_extent_data_ref *ref)
1114 {
1115         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1116                                     btrfs_extent_data_ref_objectid(leaf, ref),
1117                                     btrfs_extent_data_ref_offset(leaf, ref));
1118 }
1119
1120 static int match_extent_data_ref(struct extent_buffer *leaf,
1121                                  struct btrfs_extent_data_ref *ref,
1122                                  u64 root_objectid, u64 owner, u64 offset)
1123 {
1124         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1125             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1126             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1127                 return 0;
1128         return 1;
1129 }
1130
1131 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1132                                            struct btrfs_path *path,
1133                                            u64 bytenr, u64 parent,
1134                                            u64 root_objectid,
1135                                            u64 owner, u64 offset)
1136 {
1137         struct btrfs_root *root = trans->fs_info->extent_root;
1138         struct btrfs_key key;
1139         struct btrfs_extent_data_ref *ref;
1140         struct extent_buffer *leaf;
1141         u32 nritems;
1142         int ret;
1143         int recow;
1144         int err = -ENOENT;
1145
1146         key.objectid = bytenr;
1147         if (parent) {
1148                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1149                 key.offset = parent;
1150         } else {
1151                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1152                 key.offset = hash_extent_data_ref(root_objectid,
1153                                                   owner, offset);
1154         }
1155 again:
1156         recow = 0;
1157         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1158         if (ret < 0) {
1159                 err = ret;
1160                 goto fail;
1161         }
1162
1163         if (parent) {
1164                 if (!ret)
1165                         return 0;
1166                 goto fail;
1167         }
1168
1169         leaf = path->nodes[0];
1170         nritems = btrfs_header_nritems(leaf);
1171         while (1) {
1172                 if (path->slots[0] >= nritems) {
1173                         ret = btrfs_next_leaf(root, path);
1174                         if (ret < 0)
1175                                 err = ret;
1176                         if (ret)
1177                                 goto fail;
1178
1179                         leaf = path->nodes[0];
1180                         nritems = btrfs_header_nritems(leaf);
1181                         recow = 1;
1182                 }
1183
1184                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1185                 if (key.objectid != bytenr ||
1186                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1187                         goto fail;
1188
1189                 ref = btrfs_item_ptr(leaf, path->slots[0],
1190                                      struct btrfs_extent_data_ref);
1191
1192                 if (match_extent_data_ref(leaf, ref, root_objectid,
1193                                           owner, offset)) {
1194                         if (recow) {
1195                                 btrfs_release_path(path);
1196                                 goto again;
1197                         }
1198                         err = 0;
1199                         break;
1200                 }
1201                 path->slots[0]++;
1202         }
1203 fail:
1204         return err;
1205 }
1206
1207 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1208                                            struct btrfs_path *path,
1209                                            u64 bytenr, u64 parent,
1210                                            u64 root_objectid, u64 owner,
1211                                            u64 offset, int refs_to_add)
1212 {
1213         struct btrfs_root *root = trans->fs_info->extent_root;
1214         struct btrfs_key key;
1215         struct extent_buffer *leaf;
1216         u32 size;
1217         u32 num_refs;
1218         int ret;
1219
1220         key.objectid = bytenr;
1221         if (parent) {
1222                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1223                 key.offset = parent;
1224                 size = sizeof(struct btrfs_shared_data_ref);
1225         } else {
1226                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1227                 key.offset = hash_extent_data_ref(root_objectid,
1228                                                   owner, offset);
1229                 size = sizeof(struct btrfs_extent_data_ref);
1230         }
1231
1232         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1233         if (ret && ret != -EEXIST)
1234                 goto fail;
1235
1236         leaf = path->nodes[0];
1237         if (parent) {
1238                 struct btrfs_shared_data_ref *ref;
1239                 ref = btrfs_item_ptr(leaf, path->slots[0],
1240                                      struct btrfs_shared_data_ref);
1241                 if (ret == 0) {
1242                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1243                 } else {
1244                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1245                         num_refs += refs_to_add;
1246                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1247                 }
1248         } else {
1249                 struct btrfs_extent_data_ref *ref;
1250                 while (ret == -EEXIST) {
1251                         ref = btrfs_item_ptr(leaf, path->slots[0],
1252                                              struct btrfs_extent_data_ref);
1253                         if (match_extent_data_ref(leaf, ref, root_objectid,
1254                                                   owner, offset))
1255                                 break;
1256                         btrfs_release_path(path);
1257                         key.offset++;
1258                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1259                                                       size);
1260                         if (ret && ret != -EEXIST)
1261                                 goto fail;
1262
1263                         leaf = path->nodes[0];
1264                 }
1265                 ref = btrfs_item_ptr(leaf, path->slots[0],
1266                                      struct btrfs_extent_data_ref);
1267                 if (ret == 0) {
1268                         btrfs_set_extent_data_ref_root(leaf, ref,
1269                                                        root_objectid);
1270                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1271                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1272                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1273                 } else {
1274                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1275                         num_refs += refs_to_add;
1276                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1277                 }
1278         }
1279         btrfs_mark_buffer_dirty(leaf);
1280         ret = 0;
1281 fail:
1282         btrfs_release_path(path);
1283         return ret;
1284 }
1285
1286 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1287                                            struct btrfs_path *path,
1288                                            int refs_to_drop, int *last_ref)
1289 {
1290         struct btrfs_key key;
1291         struct btrfs_extent_data_ref *ref1 = NULL;
1292         struct btrfs_shared_data_ref *ref2 = NULL;
1293         struct extent_buffer *leaf;
1294         u32 num_refs = 0;
1295         int ret = 0;
1296
1297         leaf = path->nodes[0];
1298         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1299
1300         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1301                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1302                                       struct btrfs_extent_data_ref);
1303                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1304         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1305                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1306                                       struct btrfs_shared_data_ref);
1307                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1308         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1309                 btrfs_print_v0_err(trans->fs_info);
1310                 btrfs_abort_transaction(trans, -EINVAL);
1311                 return -EINVAL;
1312         } else {
1313                 BUG();
1314         }
1315
1316         BUG_ON(num_refs < refs_to_drop);
1317         num_refs -= refs_to_drop;
1318
1319         if (num_refs == 0) {
1320                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1321                 *last_ref = 1;
1322         } else {
1323                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1324                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1325                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1326                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1327                 btrfs_mark_buffer_dirty(leaf);
1328         }
1329         return ret;
1330 }
1331
1332 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1333                                           struct btrfs_extent_inline_ref *iref)
1334 {
1335         struct btrfs_key key;
1336         struct extent_buffer *leaf;
1337         struct btrfs_extent_data_ref *ref1;
1338         struct btrfs_shared_data_ref *ref2;
1339         u32 num_refs = 0;
1340         int type;
1341
1342         leaf = path->nodes[0];
1343         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1344
1345         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1346         if (iref) {
1347                 /*
1348                  * If type is invalid, we should have bailed out earlier than
1349                  * this call.
1350                  */
1351                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1352                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1353                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1354                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1355                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1356                 } else {
1357                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1358                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1359                 }
1360         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1361                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1362                                       struct btrfs_extent_data_ref);
1363                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1364         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1365                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1366                                       struct btrfs_shared_data_ref);
1367                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1368         } else {
1369                 WARN_ON(1);
1370         }
1371         return num_refs;
1372 }
1373
1374 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1375                                           struct btrfs_path *path,
1376                                           u64 bytenr, u64 parent,
1377                                           u64 root_objectid)
1378 {
1379         struct btrfs_root *root = trans->fs_info->extent_root;
1380         struct btrfs_key key;
1381         int ret;
1382
1383         key.objectid = bytenr;
1384         if (parent) {
1385                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1386                 key.offset = parent;
1387         } else {
1388                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1389                 key.offset = root_objectid;
1390         }
1391
1392         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1393         if (ret > 0)
1394                 ret = -ENOENT;
1395         return ret;
1396 }
1397
1398 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1399                                           struct btrfs_path *path,
1400                                           u64 bytenr, u64 parent,
1401                                           u64 root_objectid)
1402 {
1403         struct btrfs_key key;
1404         int ret;
1405
1406         key.objectid = bytenr;
1407         if (parent) {
1408                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1409                 key.offset = parent;
1410         } else {
1411                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1412                 key.offset = root_objectid;
1413         }
1414
1415         ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1416                                       path, &key, 0);
1417         btrfs_release_path(path);
1418         return ret;
1419 }
1420
1421 static inline int extent_ref_type(u64 parent, u64 owner)
1422 {
1423         int type;
1424         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1425                 if (parent > 0)
1426                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1427                 else
1428                         type = BTRFS_TREE_BLOCK_REF_KEY;
1429         } else {
1430                 if (parent > 0)
1431                         type = BTRFS_SHARED_DATA_REF_KEY;
1432                 else
1433                         type = BTRFS_EXTENT_DATA_REF_KEY;
1434         }
1435         return type;
1436 }
1437
1438 static int find_next_key(struct btrfs_path *path, int level,
1439                          struct btrfs_key *key)
1440
1441 {
1442         for (; level < BTRFS_MAX_LEVEL; level++) {
1443                 if (!path->nodes[level])
1444                         break;
1445                 if (path->slots[level] + 1 >=
1446                     btrfs_header_nritems(path->nodes[level]))
1447                         continue;
1448                 if (level == 0)
1449                         btrfs_item_key_to_cpu(path->nodes[level], key,
1450                                               path->slots[level] + 1);
1451                 else
1452                         btrfs_node_key_to_cpu(path->nodes[level], key,
1453                                               path->slots[level] + 1);
1454                 return 0;
1455         }
1456         return 1;
1457 }
1458
1459 /*
1460  * look for inline back ref. if back ref is found, *ref_ret is set
1461  * to the address of inline back ref, and 0 is returned.
1462  *
1463  * if back ref isn't found, *ref_ret is set to the address where it
1464  * should be inserted, and -ENOENT is returned.
1465  *
1466  * if insert is true and there are too many inline back refs, the path
1467  * points to the extent item, and -EAGAIN is returned.
1468  *
1469  * NOTE: inline back refs are ordered in the same way that back ref
1470  *       items in the tree are ordered.
1471  */
1472 static noinline_for_stack
1473 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1474                                  struct btrfs_path *path,
1475                                  struct btrfs_extent_inline_ref **ref_ret,
1476                                  u64 bytenr, u64 num_bytes,
1477                                  u64 parent, u64 root_objectid,
1478                                  u64 owner, u64 offset, int insert)
1479 {
1480         struct btrfs_fs_info *fs_info = trans->fs_info;
1481         struct btrfs_root *root = fs_info->extent_root;
1482         struct btrfs_key key;
1483         struct extent_buffer *leaf;
1484         struct btrfs_extent_item *ei;
1485         struct btrfs_extent_inline_ref *iref;
1486         u64 flags;
1487         u64 item_size;
1488         unsigned long ptr;
1489         unsigned long end;
1490         int extra_size;
1491         int type;
1492         int want;
1493         int ret;
1494         int err = 0;
1495         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1496         int needed;
1497
1498         key.objectid = bytenr;
1499         key.type = BTRFS_EXTENT_ITEM_KEY;
1500         key.offset = num_bytes;
1501
1502         want = extent_ref_type(parent, owner);
1503         if (insert) {
1504                 extra_size = btrfs_extent_inline_ref_size(want);
1505                 path->keep_locks = 1;
1506         } else
1507                 extra_size = -1;
1508
1509         /*
1510          * Owner is our level, so we can just add one to get the level for the
1511          * block we are interested in.
1512          */
1513         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1514                 key.type = BTRFS_METADATA_ITEM_KEY;
1515                 key.offset = owner;
1516         }
1517
1518 again:
1519         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1520         if (ret < 0) {
1521                 err = ret;
1522                 goto out;
1523         }
1524
1525         /*
1526          * We may be a newly converted file system which still has the old fat
1527          * extent entries for metadata, so try and see if we have one of those.
1528          */
1529         if (ret > 0 && skinny_metadata) {
1530                 skinny_metadata = false;
1531                 if (path->slots[0]) {
1532                         path->slots[0]--;
1533                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1534                                               path->slots[0]);
1535                         if (key.objectid == bytenr &&
1536                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1537                             key.offset == num_bytes)
1538                                 ret = 0;
1539                 }
1540                 if (ret) {
1541                         key.objectid = bytenr;
1542                         key.type = BTRFS_EXTENT_ITEM_KEY;
1543                         key.offset = num_bytes;
1544                         btrfs_release_path(path);
1545                         goto again;
1546                 }
1547         }
1548
1549         if (ret && !insert) {
1550                 err = -ENOENT;
1551                 goto out;
1552         } else if (WARN_ON(ret)) {
1553                 err = -EIO;
1554                 goto out;
1555         }
1556
1557         leaf = path->nodes[0];
1558         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1559         if (unlikely(item_size < sizeof(*ei))) {
1560                 err = -EINVAL;
1561                 btrfs_print_v0_err(fs_info);
1562                 btrfs_abort_transaction(trans, err);
1563                 goto out;
1564         }
1565
1566         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1567         flags = btrfs_extent_flags(leaf, ei);
1568
1569         ptr = (unsigned long)(ei + 1);
1570         end = (unsigned long)ei + item_size;
1571
1572         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1573                 ptr += sizeof(struct btrfs_tree_block_info);
1574                 BUG_ON(ptr > end);
1575         }
1576
1577         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1578                 needed = BTRFS_REF_TYPE_DATA;
1579         else
1580                 needed = BTRFS_REF_TYPE_BLOCK;
1581
1582         err = -ENOENT;
1583         while (1) {
1584                 if (ptr >= end) {
1585                         WARN_ON(ptr > end);
1586                         break;
1587                 }
1588                 iref = (struct btrfs_extent_inline_ref *)ptr;
1589                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1590                 if (type == BTRFS_REF_TYPE_INVALID) {
1591                         err = -EUCLEAN;
1592                         goto out;
1593                 }
1594
1595                 if (want < type)
1596                         break;
1597                 if (want > type) {
1598                         ptr += btrfs_extent_inline_ref_size(type);
1599                         continue;
1600                 }
1601
1602                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1603                         struct btrfs_extent_data_ref *dref;
1604                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1605                         if (match_extent_data_ref(leaf, dref, root_objectid,
1606                                                   owner, offset)) {
1607                                 err = 0;
1608                                 break;
1609                         }
1610                         if (hash_extent_data_ref_item(leaf, dref) <
1611                             hash_extent_data_ref(root_objectid, owner, offset))
1612                                 break;
1613                 } else {
1614                         u64 ref_offset;
1615                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1616                         if (parent > 0) {
1617                                 if (parent == ref_offset) {
1618                                         err = 0;
1619                                         break;
1620                                 }
1621                                 if (ref_offset < parent)
1622                                         break;
1623                         } else {
1624                                 if (root_objectid == ref_offset) {
1625                                         err = 0;
1626                                         break;
1627                                 }
1628                                 if (ref_offset < root_objectid)
1629                                         break;
1630                         }
1631                 }
1632                 ptr += btrfs_extent_inline_ref_size(type);
1633         }
1634         if (err == -ENOENT && insert) {
1635                 if (item_size + extra_size >=
1636                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1637                         err = -EAGAIN;
1638                         goto out;
1639                 }
1640                 /*
1641                  * To add new inline back ref, we have to make sure
1642                  * there is no corresponding back ref item.
1643                  * For simplicity, we just do not add new inline back
1644                  * ref if there is any kind of item for this block
1645                  */
1646                 if (find_next_key(path, 0, &key) == 0 &&
1647                     key.objectid == bytenr &&
1648                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1649                         err = -EAGAIN;
1650                         goto out;
1651                 }
1652         }
1653         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1654 out:
1655         if (insert) {
1656                 path->keep_locks = 0;
1657                 btrfs_unlock_up_safe(path, 1);
1658         }
1659         return err;
1660 }
1661
1662 /*
1663  * helper to add new inline back ref
1664  */
1665 static noinline_for_stack
1666 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1667                                  struct btrfs_path *path,
1668                                  struct btrfs_extent_inline_ref *iref,
1669                                  u64 parent, u64 root_objectid,
1670                                  u64 owner, u64 offset, int refs_to_add,
1671                                  struct btrfs_delayed_extent_op *extent_op)
1672 {
1673         struct extent_buffer *leaf;
1674         struct btrfs_extent_item *ei;
1675         unsigned long ptr;
1676         unsigned long end;
1677         unsigned long item_offset;
1678         u64 refs;
1679         int size;
1680         int type;
1681
1682         leaf = path->nodes[0];
1683         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1684         item_offset = (unsigned long)iref - (unsigned long)ei;
1685
1686         type = extent_ref_type(parent, owner);
1687         size = btrfs_extent_inline_ref_size(type);
1688
1689         btrfs_extend_item(fs_info, path, size);
1690
1691         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1692         refs = btrfs_extent_refs(leaf, ei);
1693         refs += refs_to_add;
1694         btrfs_set_extent_refs(leaf, ei, refs);
1695         if (extent_op)
1696                 __run_delayed_extent_op(extent_op, leaf, ei);
1697
1698         ptr = (unsigned long)ei + item_offset;
1699         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1700         if (ptr < end - size)
1701                 memmove_extent_buffer(leaf, ptr + size, ptr,
1702                                       end - size - ptr);
1703
1704         iref = (struct btrfs_extent_inline_ref *)ptr;
1705         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1706         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1707                 struct btrfs_extent_data_ref *dref;
1708                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1709                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1710                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1711                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1712                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1713         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1714                 struct btrfs_shared_data_ref *sref;
1715                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1716                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1717                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1718         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1719                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1720         } else {
1721                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1722         }
1723         btrfs_mark_buffer_dirty(leaf);
1724 }
1725
1726 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1727                                  struct btrfs_path *path,
1728                                  struct btrfs_extent_inline_ref **ref_ret,
1729                                  u64 bytenr, u64 num_bytes, u64 parent,
1730                                  u64 root_objectid, u64 owner, u64 offset)
1731 {
1732         int ret;
1733
1734         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1735                                            num_bytes, parent, root_objectid,
1736                                            owner, offset, 0);
1737         if (ret != -ENOENT)
1738                 return ret;
1739
1740         btrfs_release_path(path);
1741         *ref_ret = NULL;
1742
1743         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1744                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1745                                             root_objectid);
1746         } else {
1747                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1748                                              root_objectid, owner, offset);
1749         }
1750         return ret;
1751 }
1752
1753 /*
1754  * helper to update/remove inline back ref
1755  */
1756 static noinline_for_stack
1757 void update_inline_extent_backref(struct btrfs_path *path,
1758                                   struct btrfs_extent_inline_ref *iref,
1759                                   int refs_to_mod,
1760                                   struct btrfs_delayed_extent_op *extent_op,
1761                                   int *last_ref)
1762 {
1763         struct extent_buffer *leaf = path->nodes[0];
1764         struct btrfs_fs_info *fs_info = leaf->fs_info;
1765         struct btrfs_extent_item *ei;
1766         struct btrfs_extent_data_ref *dref = NULL;
1767         struct btrfs_shared_data_ref *sref = NULL;
1768         unsigned long ptr;
1769         unsigned long end;
1770         u32 item_size;
1771         int size;
1772         int type;
1773         u64 refs;
1774
1775         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1776         refs = btrfs_extent_refs(leaf, ei);
1777         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1778         refs += refs_to_mod;
1779         btrfs_set_extent_refs(leaf, ei, refs);
1780         if (extent_op)
1781                 __run_delayed_extent_op(extent_op, leaf, ei);
1782
1783         /*
1784          * If type is invalid, we should have bailed out after
1785          * lookup_inline_extent_backref().
1786          */
1787         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1788         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1789
1790         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1791                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1792                 refs = btrfs_extent_data_ref_count(leaf, dref);
1793         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1794                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1795                 refs = btrfs_shared_data_ref_count(leaf, sref);
1796         } else {
1797                 refs = 1;
1798                 BUG_ON(refs_to_mod != -1);
1799         }
1800
1801         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1802         refs += refs_to_mod;
1803
1804         if (refs > 0) {
1805                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1806                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1807                 else
1808                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1809         } else {
1810                 *last_ref = 1;
1811                 size =  btrfs_extent_inline_ref_size(type);
1812                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1813                 ptr = (unsigned long)iref;
1814                 end = (unsigned long)ei + item_size;
1815                 if (ptr + size < end)
1816                         memmove_extent_buffer(leaf, ptr, ptr + size,
1817                                               end - ptr - size);
1818                 item_size -= size;
1819                 btrfs_truncate_item(fs_info, path, item_size, 1);
1820         }
1821         btrfs_mark_buffer_dirty(leaf);
1822 }
1823
1824 static noinline_for_stack
1825 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1826                                  struct btrfs_path *path,
1827                                  u64 bytenr, u64 num_bytes, u64 parent,
1828                                  u64 root_objectid, u64 owner,
1829                                  u64 offset, int refs_to_add,
1830                                  struct btrfs_delayed_extent_op *extent_op)
1831 {
1832         struct btrfs_extent_inline_ref *iref;
1833         int ret;
1834
1835         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1836                                            num_bytes, parent, root_objectid,
1837                                            owner, offset, 1);
1838         if (ret == 0) {
1839                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1840                 update_inline_extent_backref(path, iref, refs_to_add,
1841                                              extent_op, NULL);
1842         } else if (ret == -ENOENT) {
1843                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1844                                             root_objectid, owner, offset,
1845                                             refs_to_add, extent_op);
1846                 ret = 0;
1847         }
1848         return ret;
1849 }
1850
1851 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1852                                  struct btrfs_path *path,
1853                                  u64 bytenr, u64 parent, u64 root_objectid,
1854                                  u64 owner, u64 offset, int refs_to_add)
1855 {
1856         int ret;
1857         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1858                 BUG_ON(refs_to_add != 1);
1859                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1860                                             root_objectid);
1861         } else {
1862                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1863                                              root_objectid, owner, offset,
1864                                              refs_to_add);
1865         }
1866         return ret;
1867 }
1868
1869 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1870                                  struct btrfs_path *path,
1871                                  struct btrfs_extent_inline_ref *iref,
1872                                  int refs_to_drop, int is_data, int *last_ref)
1873 {
1874         int ret = 0;
1875
1876         BUG_ON(!is_data && refs_to_drop != 1);
1877         if (iref) {
1878                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1879                                              last_ref);
1880         } else if (is_data) {
1881                 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1882                                              last_ref);
1883         } else {
1884                 *last_ref = 1;
1885                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1886         }
1887         return ret;
1888 }
1889
1890 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1891 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1892                                u64 *discarded_bytes)
1893 {
1894         int j, ret = 0;
1895         u64 bytes_left, end;
1896         u64 aligned_start = ALIGN(start, 1 << 9);
1897
1898         if (WARN_ON(start != aligned_start)) {
1899                 len -= aligned_start - start;
1900                 len = round_down(len, 1 << 9);
1901                 start = aligned_start;
1902         }
1903
1904         *discarded_bytes = 0;
1905
1906         if (!len)
1907                 return 0;
1908
1909         end = start + len;
1910         bytes_left = len;
1911
1912         /* Skip any superblocks on this device. */
1913         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1914                 u64 sb_start = btrfs_sb_offset(j);
1915                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1916                 u64 size = sb_start - start;
1917
1918                 if (!in_range(sb_start, start, bytes_left) &&
1919                     !in_range(sb_end, start, bytes_left) &&
1920                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1921                         continue;
1922
1923                 /*
1924                  * Superblock spans beginning of range.  Adjust start and
1925                  * try again.
1926                  */
1927                 if (sb_start <= start) {
1928                         start += sb_end - start;
1929                         if (start > end) {
1930                                 bytes_left = 0;
1931                                 break;
1932                         }
1933                         bytes_left = end - start;
1934                         continue;
1935                 }
1936
1937                 if (size) {
1938                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1939                                                    GFP_NOFS, 0);
1940                         if (!ret)
1941                                 *discarded_bytes += size;
1942                         else if (ret != -EOPNOTSUPP)
1943                                 return ret;
1944                 }
1945
1946                 start = sb_end;
1947                 if (start > end) {
1948                         bytes_left = 0;
1949                         break;
1950                 }
1951                 bytes_left = end - start;
1952         }
1953
1954         if (bytes_left) {
1955                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1956                                            GFP_NOFS, 0);
1957                 if (!ret)
1958                         *discarded_bytes += bytes_left;
1959         }
1960         return ret;
1961 }
1962
1963 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1964                          u64 num_bytes, u64 *actual_bytes)
1965 {
1966         int ret;
1967         u64 discarded_bytes = 0;
1968         struct btrfs_bio *bbio = NULL;
1969
1970
1971         /*
1972          * Avoid races with device replace and make sure our bbio has devices
1973          * associated to its stripes that don't go away while we are discarding.
1974          */
1975         btrfs_bio_counter_inc_blocked(fs_info);
1976         /* Tell the block device(s) that the sectors can be discarded */
1977         ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1978                               &bbio, 0);
1979         /* Error condition is -ENOMEM */
1980         if (!ret) {
1981                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1982                 int i;
1983
1984
1985                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1986                         u64 bytes;
1987                         struct request_queue *req_q;
1988
1989                         if (!stripe->dev->bdev) {
1990                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1991                                 continue;
1992                         }
1993                         req_q = bdev_get_queue(stripe->dev->bdev);
1994                         if (!blk_queue_discard(req_q))
1995                                 continue;
1996
1997                         ret = btrfs_issue_discard(stripe->dev->bdev,
1998                                                   stripe->physical,
1999                                                   stripe->length,
2000                                                   &bytes);
2001                         if (!ret)
2002                                 discarded_bytes += bytes;
2003                         else if (ret != -EOPNOTSUPP)
2004                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2005
2006                         /*
2007                          * Just in case we get back EOPNOTSUPP for some reason,
2008                          * just ignore the return value so we don't screw up
2009                          * people calling discard_extent.
2010                          */
2011                         ret = 0;
2012                 }
2013                 btrfs_put_bbio(bbio);
2014         }
2015         btrfs_bio_counter_dec(fs_info);
2016
2017         if (actual_bytes)
2018                 *actual_bytes = discarded_bytes;
2019
2020
2021         if (ret == -EOPNOTSUPP)
2022                 ret = 0;
2023         return ret;
2024 }
2025
2026 /* Can return -ENOMEM */
2027 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2028                          struct btrfs_root *root,
2029                          u64 bytenr, u64 num_bytes, u64 parent,
2030                          u64 root_objectid, u64 owner, u64 offset)
2031 {
2032         struct btrfs_fs_info *fs_info = root->fs_info;
2033         int old_ref_mod, new_ref_mod;
2034         int ret;
2035
2036         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2037                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2038
2039         btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2040                            owner, offset, BTRFS_ADD_DELAYED_REF);
2041
2042         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2043                 ret = btrfs_add_delayed_tree_ref(trans, bytenr,
2044                                                  num_bytes, parent,
2045                                                  root_objectid, (int)owner,
2046                                                  BTRFS_ADD_DELAYED_REF, NULL,
2047                                                  &old_ref_mod, &new_ref_mod);
2048         } else {
2049                 ret = btrfs_add_delayed_data_ref(trans, bytenr,
2050                                                  num_bytes, parent,
2051                                                  root_objectid, owner, offset,
2052                                                  0, BTRFS_ADD_DELAYED_REF,
2053                                                  &old_ref_mod, &new_ref_mod);
2054         }
2055
2056         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2057                 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2058
2059                 add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2060         }
2061
2062         return ret;
2063 }
2064
2065 /*
2066  * __btrfs_inc_extent_ref - insert backreference for a given extent
2067  *
2068  * @trans:          Handle of transaction
2069  *
2070  * @node:           The delayed ref node used to get the bytenr/length for
2071  *                  extent whose references are incremented.
2072  *
2073  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2074  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2075  *                  bytenr of the parent block. Since new extents are always
2076  *                  created with indirect references, this will only be the case
2077  *                  when relocating a shared extent. In that case, root_objectid
2078  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
2079  *                  be 0
2080  *
2081  * @root_objectid:  The id of the root where this modification has originated,
2082  *                  this can be either one of the well-known metadata trees or
2083  *                  the subvolume id which references this extent.
2084  *
2085  * @owner:          For data extents it is the inode number of the owning file.
2086  *                  For metadata extents this parameter holds the level in the
2087  *                  tree of the extent.
2088  *
2089  * @offset:         For metadata extents the offset is ignored and is currently
2090  *                  always passed as 0. For data extents it is the fileoffset
2091  *                  this extent belongs to.
2092  *
2093  * @refs_to_add     Number of references to add
2094  *
2095  * @extent_op       Pointer to a structure, holding information necessary when
2096  *                  updating a tree block's flags
2097  *
2098  */
2099 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2100                                   struct btrfs_delayed_ref_node *node,
2101                                   u64 parent, u64 root_objectid,
2102                                   u64 owner, u64 offset, int refs_to_add,
2103                                   struct btrfs_delayed_extent_op *extent_op)
2104 {
2105         struct btrfs_path *path;
2106         struct extent_buffer *leaf;
2107         struct btrfs_extent_item *item;
2108         struct btrfs_key key;
2109         u64 bytenr = node->bytenr;
2110         u64 num_bytes = node->num_bytes;
2111         u64 refs;
2112         int ret;
2113
2114         path = btrfs_alloc_path();
2115         if (!path)
2116                 return -ENOMEM;
2117
2118         path->reada = READA_FORWARD;
2119         path->leave_spinning = 1;
2120         /* this will setup the path even if it fails to insert the back ref */
2121         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2122                                            parent, root_objectid, owner,
2123                                            offset, refs_to_add, extent_op);
2124         if ((ret < 0 && ret != -EAGAIN) || !ret)
2125                 goto out;
2126
2127         /*
2128          * Ok we had -EAGAIN which means we didn't have space to insert and
2129          * inline extent ref, so just update the reference count and add a
2130          * normal backref.
2131          */
2132         leaf = path->nodes[0];
2133         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2134         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2135         refs = btrfs_extent_refs(leaf, item);
2136         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2137         if (extent_op)
2138                 __run_delayed_extent_op(extent_op, leaf, item);
2139
2140         btrfs_mark_buffer_dirty(leaf);
2141         btrfs_release_path(path);
2142
2143         path->reada = READA_FORWARD;
2144         path->leave_spinning = 1;
2145         /* now insert the actual backref */
2146         ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2147                                     owner, offset, refs_to_add);
2148         if (ret)
2149                 btrfs_abort_transaction(trans, ret);
2150 out:
2151         btrfs_free_path(path);
2152         return ret;
2153 }
2154
2155 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2156                                 struct btrfs_delayed_ref_node *node,
2157                                 struct btrfs_delayed_extent_op *extent_op,
2158                                 int insert_reserved)
2159 {
2160         int ret = 0;
2161         struct btrfs_delayed_data_ref *ref;
2162         struct btrfs_key ins;
2163         u64 parent = 0;
2164         u64 ref_root = 0;
2165         u64 flags = 0;
2166
2167         ins.objectid = node->bytenr;
2168         ins.offset = node->num_bytes;
2169         ins.type = BTRFS_EXTENT_ITEM_KEY;
2170
2171         ref = btrfs_delayed_node_to_data_ref(node);
2172         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2173
2174         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2175                 parent = ref->parent;
2176         ref_root = ref->root;
2177
2178         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2179                 if (extent_op)
2180                         flags |= extent_op->flags_to_set;
2181                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2182                                                  flags, ref->objectid,
2183                                                  ref->offset, &ins,
2184                                                  node->ref_mod);
2185         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2186                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2187                                              ref->objectid, ref->offset,
2188                                              node->ref_mod, extent_op);
2189         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2190                 ret = __btrfs_free_extent(trans, node, parent,
2191                                           ref_root, ref->objectid,
2192                                           ref->offset, node->ref_mod,
2193                                           extent_op);
2194         } else {
2195                 BUG();
2196         }
2197         return ret;
2198 }
2199
2200 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2201                                     struct extent_buffer *leaf,
2202                                     struct btrfs_extent_item *ei)
2203 {
2204         u64 flags = btrfs_extent_flags(leaf, ei);
2205         if (extent_op->update_flags) {
2206                 flags |= extent_op->flags_to_set;
2207                 btrfs_set_extent_flags(leaf, ei, flags);
2208         }
2209
2210         if (extent_op->update_key) {
2211                 struct btrfs_tree_block_info *bi;
2212                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2213                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2214                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2215         }
2216 }
2217
2218 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2219                                  struct btrfs_delayed_ref_head *head,
2220                                  struct btrfs_delayed_extent_op *extent_op)
2221 {
2222         struct btrfs_fs_info *fs_info = trans->fs_info;
2223         struct btrfs_key key;
2224         struct btrfs_path *path;
2225         struct btrfs_extent_item *ei;
2226         struct extent_buffer *leaf;
2227         u32 item_size;
2228         int ret;
2229         int err = 0;
2230         int metadata = !extent_op->is_data;
2231
2232         if (trans->aborted)
2233                 return 0;
2234
2235         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2236                 metadata = 0;
2237
2238         path = btrfs_alloc_path();
2239         if (!path)
2240                 return -ENOMEM;
2241
2242         key.objectid = head->bytenr;
2243
2244         if (metadata) {
2245                 key.type = BTRFS_METADATA_ITEM_KEY;
2246                 key.offset = extent_op->level;
2247         } else {
2248                 key.type = BTRFS_EXTENT_ITEM_KEY;
2249                 key.offset = head->num_bytes;
2250         }
2251
2252 again:
2253         path->reada = READA_FORWARD;
2254         path->leave_spinning = 1;
2255         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2256         if (ret < 0) {
2257                 err = ret;
2258                 goto out;
2259         }
2260         if (ret > 0) {
2261                 if (metadata) {
2262                         if (path->slots[0] > 0) {
2263                                 path->slots[0]--;
2264                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2265                                                       path->slots[0]);
2266                                 if (key.objectid == head->bytenr &&
2267                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2268                                     key.offset == head->num_bytes)
2269                                         ret = 0;
2270                         }
2271                         if (ret > 0) {
2272                                 btrfs_release_path(path);
2273                                 metadata = 0;
2274
2275                                 key.objectid = head->bytenr;
2276                                 key.offset = head->num_bytes;
2277                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2278                                 goto again;
2279                         }
2280                 } else {
2281                         err = -EIO;
2282                         goto out;
2283                 }
2284         }
2285
2286         leaf = path->nodes[0];
2287         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2288
2289         if (unlikely(item_size < sizeof(*ei))) {
2290                 err = -EINVAL;
2291                 btrfs_print_v0_err(fs_info);
2292                 btrfs_abort_transaction(trans, err);
2293                 goto out;
2294         }
2295
2296         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2297         __run_delayed_extent_op(extent_op, leaf, ei);
2298
2299         btrfs_mark_buffer_dirty(leaf);
2300 out:
2301         btrfs_free_path(path);
2302         return err;
2303 }
2304
2305 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2306                                 struct btrfs_delayed_ref_node *node,
2307                                 struct btrfs_delayed_extent_op *extent_op,
2308                                 int insert_reserved)
2309 {
2310         int ret = 0;
2311         struct btrfs_delayed_tree_ref *ref;
2312         u64 parent = 0;
2313         u64 ref_root = 0;
2314
2315         ref = btrfs_delayed_node_to_tree_ref(node);
2316         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2317
2318         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2319                 parent = ref->parent;
2320         ref_root = ref->root;
2321
2322         if (node->ref_mod != 1) {
2323                 btrfs_err(trans->fs_info,
2324         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2325                           node->bytenr, node->ref_mod, node->action, ref_root,
2326                           parent);
2327                 return -EIO;
2328         }
2329         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2330                 BUG_ON(!extent_op || !extent_op->update_flags);
2331                 ret = alloc_reserved_tree_block(trans, node, extent_op);
2332         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2333                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2334                                              ref->level, 0, 1, extent_op);
2335         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2336                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2337                                           ref->level, 0, 1, extent_op);
2338         } else {
2339                 BUG();
2340         }
2341         return ret;
2342 }
2343
2344 /* helper function to actually process a single delayed ref entry */
2345 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2346                                struct btrfs_delayed_ref_node *node,
2347                                struct btrfs_delayed_extent_op *extent_op,
2348                                int insert_reserved)
2349 {
2350         int ret = 0;
2351
2352         if (trans->aborted) {
2353                 if (insert_reserved)
2354                         btrfs_pin_extent(trans->fs_info, node->bytenr,
2355                                          node->num_bytes, 1);
2356                 return 0;
2357         }
2358
2359         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2360             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2361                 ret = run_delayed_tree_ref(trans, node, extent_op,
2362                                            insert_reserved);
2363         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2364                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2365                 ret = run_delayed_data_ref(trans, node, extent_op,
2366                                            insert_reserved);
2367         else
2368                 BUG();
2369         return ret;
2370 }
2371
2372 static inline struct btrfs_delayed_ref_node *
2373 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2374 {
2375         struct btrfs_delayed_ref_node *ref;
2376
2377         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2378                 return NULL;
2379
2380         /*
2381          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2382          * This is to prevent a ref count from going down to zero, which deletes
2383          * the extent item from the extent tree, when there still are references
2384          * to add, which would fail because they would not find the extent item.
2385          */
2386         if (!list_empty(&head->ref_add_list))
2387                 return list_first_entry(&head->ref_add_list,
2388                                 struct btrfs_delayed_ref_node, add_list);
2389
2390         ref = rb_entry(rb_first_cached(&head->ref_tree),
2391                        struct btrfs_delayed_ref_node, ref_node);
2392         ASSERT(list_empty(&ref->add_list));
2393         return ref;
2394 }
2395
2396 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2397                                       struct btrfs_delayed_ref_head *head)
2398 {
2399         spin_lock(&delayed_refs->lock);
2400         head->processing = 0;
2401         delayed_refs->num_heads_ready++;
2402         spin_unlock(&delayed_refs->lock);
2403         btrfs_delayed_ref_unlock(head);
2404 }
2405
2406 static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2407                              struct btrfs_delayed_ref_head *head)
2408 {
2409         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2410         int ret;
2411
2412         if (!extent_op)
2413                 return 0;
2414         head->extent_op = NULL;
2415         if (head->must_insert_reserved) {
2416                 btrfs_free_delayed_extent_op(extent_op);
2417                 return 0;
2418         }
2419         spin_unlock(&head->lock);
2420         ret = run_delayed_extent_op(trans, head, extent_op);
2421         btrfs_free_delayed_extent_op(extent_op);
2422         return ret ? ret : 1;
2423 }
2424
2425 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2426                             struct btrfs_delayed_ref_head *head)
2427 {
2428
2429         struct btrfs_fs_info *fs_info = trans->fs_info;
2430         struct btrfs_delayed_ref_root *delayed_refs;
2431         int ret;
2432
2433         delayed_refs = &trans->transaction->delayed_refs;
2434
2435         ret = cleanup_extent_op(trans, head);
2436         if (ret < 0) {
2437                 unselect_delayed_ref_head(delayed_refs, head);
2438                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2439                 return ret;
2440         } else if (ret) {
2441                 return ret;
2442         }
2443
2444         /*
2445          * Need to drop our head ref lock and re-acquire the delayed ref lock
2446          * and then re-check to make sure nobody got added.
2447          */
2448         spin_unlock(&head->lock);
2449         spin_lock(&delayed_refs->lock);
2450         spin_lock(&head->lock);
2451         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2452                 spin_unlock(&head->lock);
2453                 spin_unlock(&delayed_refs->lock);
2454                 return 1;
2455         }
2456         delayed_refs->num_heads--;
2457         rb_erase_cached(&head->href_node, &delayed_refs->href_root);
2458         RB_CLEAR_NODE(&head->href_node);
2459         spin_unlock(&head->lock);
2460         spin_unlock(&delayed_refs->lock);
2461         atomic_dec(&delayed_refs->num_entries);
2462
2463         trace_run_delayed_ref_head(fs_info, head, 0);
2464
2465         if (head->total_ref_mod < 0) {
2466                 struct btrfs_space_info *space_info;
2467                 u64 flags;
2468
2469                 if (head->is_data)
2470                         flags = BTRFS_BLOCK_GROUP_DATA;
2471                 else if (head->is_system)
2472                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
2473                 else
2474                         flags = BTRFS_BLOCK_GROUP_METADATA;
2475                 space_info = __find_space_info(fs_info, flags);
2476                 ASSERT(space_info);
2477                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2478                                    -head->num_bytes,
2479                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
2480
2481                 if (head->is_data) {
2482                         spin_lock(&delayed_refs->lock);
2483                         delayed_refs->pending_csums -= head->num_bytes;
2484                         spin_unlock(&delayed_refs->lock);
2485                 }
2486         }
2487
2488         if (head->must_insert_reserved) {
2489                 btrfs_pin_extent(fs_info, head->bytenr,
2490                                  head->num_bytes, 1);
2491                 if (head->is_data) {
2492                         ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2493                                               head->num_bytes);
2494                 }
2495         }
2496
2497         /* Also free its reserved qgroup space */
2498         btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2499                                       head->qgroup_reserved);
2500         btrfs_delayed_ref_unlock(head);
2501         btrfs_put_delayed_ref_head(head);
2502         return 0;
2503 }
2504
2505 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2506                                         struct btrfs_trans_handle *trans)
2507 {
2508         struct btrfs_delayed_ref_root *delayed_refs =
2509                 &trans->transaction->delayed_refs;
2510         struct btrfs_delayed_ref_head *head = NULL;
2511         int ret;
2512
2513         spin_lock(&delayed_refs->lock);
2514         head = btrfs_select_ref_head(trans);
2515         if (!head) {
2516                 spin_unlock(&delayed_refs->lock);
2517                 return head;
2518         }
2519
2520         /*
2521          * Grab the lock that says we are going to process all the refs for
2522          * this head
2523          */
2524         ret = btrfs_delayed_ref_lock(trans, head);
2525         spin_unlock(&delayed_refs->lock);
2526
2527         /*
2528          * We may have dropped the spin lock to get the head mutex lock, and
2529          * that might have given someone else time to free the head.  If that's
2530          * true, it has been removed from our list and we can move on.
2531          */
2532         if (ret == -EAGAIN)
2533                 head = ERR_PTR(-EAGAIN);
2534
2535         return head;
2536 }
2537
2538 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2539                                     struct btrfs_delayed_ref_head *locked_ref,
2540                                     unsigned long *run_refs)
2541 {
2542         struct btrfs_fs_info *fs_info = trans->fs_info;
2543         struct btrfs_delayed_ref_root *delayed_refs;
2544         struct btrfs_delayed_extent_op *extent_op;
2545         struct btrfs_delayed_ref_node *ref;
2546         int must_insert_reserved = 0;
2547         int ret;
2548
2549         delayed_refs = &trans->transaction->delayed_refs;
2550
2551         while ((ref = select_delayed_ref(locked_ref))) {
2552                 if (ref->seq &&
2553                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
2554                         spin_unlock(&locked_ref->lock);
2555                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2556                         return -EAGAIN;
2557                 }
2558
2559                 (*run_refs)++;
2560                 ref->in_tree = 0;
2561                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2562                 RB_CLEAR_NODE(&ref->ref_node);
2563                 if (!list_empty(&ref->add_list))
2564                         list_del(&ref->add_list);
2565                 /*
2566                  * When we play the delayed ref, also correct the ref_mod on
2567                  * head
2568                  */
2569                 switch (ref->action) {
2570                 case BTRFS_ADD_DELAYED_REF:
2571                 case BTRFS_ADD_DELAYED_EXTENT:
2572                         locked_ref->ref_mod -= ref->ref_mod;
2573                         break;
2574                 case BTRFS_DROP_DELAYED_REF:
2575                         locked_ref->ref_mod += ref->ref_mod;
2576                         break;
2577                 default:
2578                         WARN_ON(1);
2579                 }
2580                 atomic_dec(&delayed_refs->num_entries);
2581
2582                 /*
2583                  * Record the must_insert_reserved flag before we drop the
2584                  * spin lock.
2585                  */
2586                 must_insert_reserved = locked_ref->must_insert_reserved;
2587                 locked_ref->must_insert_reserved = 0;
2588
2589                 extent_op = locked_ref->extent_op;
2590                 locked_ref->extent_op = NULL;
2591                 spin_unlock(&locked_ref->lock);
2592
2593                 ret = run_one_delayed_ref(trans, ref, extent_op,
2594                                           must_insert_reserved);
2595
2596                 btrfs_free_delayed_extent_op(extent_op);
2597                 if (ret) {
2598                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2599                         btrfs_put_delayed_ref(ref);
2600                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2601                                     ret);
2602                         return ret;
2603                 }
2604
2605                 btrfs_put_delayed_ref(ref);
2606                 cond_resched();
2607
2608                 spin_lock(&locked_ref->lock);
2609                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2610         }
2611
2612         return 0;
2613 }
2614
2615 /*
2616  * Returns 0 on success or if called with an already aborted transaction.
2617  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2618  */
2619 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2620                                              unsigned long nr)
2621 {
2622         struct btrfs_fs_info *fs_info = trans->fs_info;
2623         struct btrfs_delayed_ref_root *delayed_refs;
2624         struct btrfs_delayed_ref_node *ref;
2625         struct btrfs_delayed_ref_head *locked_ref = NULL;
2626         struct btrfs_delayed_extent_op *extent_op;
2627         ktime_t start = ktime_get();
2628         int ret;
2629         unsigned long count = 0;
2630         unsigned long actual_count = 0;
2631         int must_insert_reserved = 0;
2632
2633         delayed_refs = &trans->transaction->delayed_refs;
2634         while (1) {
2635                 if (!locked_ref) {
2636                         if (count >= nr)
2637                                 break;
2638
2639                         locked_ref = btrfs_obtain_ref_head(trans);
2640                         if (!locked_ref)
2641                                 break;
2642                         else if (PTR_ERR(locked_ref) == -EAGAIN) {
2643                                 locked_ref = NULL;
2644                                 count++;
2645                                 continue;
2646                         }
2647                 }
2648
2649                 /*
2650                  * We need to try and merge add/drops of the same ref since we
2651                  * can run into issues with relocate dropping the implicit ref
2652                  * and then it being added back again before the drop can
2653                  * finish.  If we merged anything we need to re-loop so we can
2654                  * get a good ref.
2655                  * Or we can get node references of the same type that weren't
2656                  * merged when created due to bumps in the tree mod seq, and
2657                  * we need to merge them to prevent adding an inline extent
2658                  * backref before dropping it (triggering a BUG_ON at
2659                  * insert_inline_extent_backref()).
2660                  */
2661                 spin_lock(&locked_ref->lock);
2662                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2663
2664                 ref = select_delayed_ref(locked_ref);
2665
2666                 if (ref && ref->seq &&
2667                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
2668                         spin_unlock(&locked_ref->lock);
2669                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2670                         locked_ref = NULL;
2671                         cond_resched();
2672                         count++;
2673                         continue;
2674                 }
2675
2676                 /*
2677                  * We're done processing refs in this ref_head, clean everything
2678                  * up and move on to the next ref_head.
2679                  */
2680                 if (!ref) {
2681                         ret = cleanup_ref_head(trans, locked_ref);
2682                         if (ret > 0 ) {
2683                                 /* We dropped our lock, we need to loop. */
2684                                 ret = 0;
2685                                 continue;
2686                         } else if (ret) {
2687                                 return ret;
2688                         }
2689                         locked_ref = NULL;
2690                         count++;
2691                         continue;
2692                 }
2693
2694                 actual_count++;
2695                 ref->in_tree = 0;
2696                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2697                 RB_CLEAR_NODE(&ref->ref_node);
2698                 if (!list_empty(&ref->add_list))
2699                         list_del(&ref->add_list);
2700                 /*
2701                  * When we play the delayed ref, also correct the ref_mod on
2702                  * head
2703                  */
2704                 switch (ref->action) {
2705                 case BTRFS_ADD_DELAYED_REF:
2706                 case BTRFS_ADD_DELAYED_EXTENT:
2707                         locked_ref->ref_mod -= ref->ref_mod;
2708                         break;
2709                 case BTRFS_DROP_DELAYED_REF:
2710                         locked_ref->ref_mod += ref->ref_mod;
2711                         break;
2712                 default:
2713                         WARN_ON(1);
2714                 }
2715                 atomic_dec(&delayed_refs->num_entries);
2716
2717                 /*
2718                  * Record the must-insert_reserved flag before we drop the spin
2719                  * lock.
2720                  */
2721                 must_insert_reserved = locked_ref->must_insert_reserved;
2722                 locked_ref->must_insert_reserved = 0;
2723
2724                 extent_op = locked_ref->extent_op;
2725                 locked_ref->extent_op = NULL;
2726                 spin_unlock(&locked_ref->lock);
2727
2728                 ret = run_one_delayed_ref(trans, ref, extent_op,
2729                                           must_insert_reserved);
2730
2731                 btrfs_free_delayed_extent_op(extent_op);
2732                 if (ret) {
2733                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2734                         btrfs_put_delayed_ref(ref);
2735                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2736                                     ret);
2737                         return ret;
2738                 }
2739
2740                 btrfs_put_delayed_ref(ref);
2741                 count++;
2742                 cond_resched();
2743         }
2744
2745         /*
2746          * We don't want to include ref heads since we can have empty ref heads
2747          * and those will drastically skew our runtime down since we just do
2748          * accounting, no actual extent tree updates.
2749          */
2750         if (actual_count > 0) {
2751                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2752                 u64 avg;
2753
2754                 /*
2755                  * We weigh the current average higher than our current runtime
2756                  * to avoid large swings in the average.
2757                  */
2758                 spin_lock(&delayed_refs->lock);
2759                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2760                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2761                 spin_unlock(&delayed_refs->lock);
2762         }
2763         return 0;
2764 }
2765
2766 #ifdef SCRAMBLE_DELAYED_REFS
2767 /*
2768  * Normally delayed refs get processed in ascending bytenr order. This
2769  * correlates in most cases to the order added. To expose dependencies on this
2770  * order, we start to process the tree in the middle instead of the beginning
2771  */
2772 static u64 find_middle(struct rb_root *root)
2773 {
2774         struct rb_node *n = root->rb_node;
2775         struct btrfs_delayed_ref_node *entry;
2776         int alt = 1;
2777         u64 middle;
2778         u64 first = 0, last = 0;
2779
2780         n = rb_first(root);
2781         if (n) {
2782                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2783                 first = entry->bytenr;
2784         }
2785         n = rb_last(root);
2786         if (n) {
2787                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2788                 last = entry->bytenr;
2789         }
2790         n = root->rb_node;
2791
2792         while (n) {
2793                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2794                 WARN_ON(!entry->in_tree);
2795
2796                 middle = entry->bytenr;
2797
2798                 if (alt)
2799                         n = n->rb_left;
2800                 else
2801                         n = n->rb_right;
2802
2803                 alt = 1 - alt;
2804         }
2805         return middle;
2806 }
2807 #endif
2808
2809 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2810 {
2811         u64 num_bytes;
2812
2813         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2814                              sizeof(struct btrfs_extent_inline_ref));
2815         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2816                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2817
2818         /*
2819          * We don't ever fill up leaves all the way so multiply by 2 just to be
2820          * closer to what we're really going to want to use.
2821          */
2822         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2823 }
2824
2825 /*
2826  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2827  * would require to store the csums for that many bytes.
2828  */
2829 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2830 {
2831         u64 csum_size;
2832         u64 num_csums_per_leaf;
2833         u64 num_csums;
2834
2835         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2836         num_csums_per_leaf = div64_u64(csum_size,
2837                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2838         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2839         num_csums += num_csums_per_leaf - 1;
2840         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2841         return num_csums;
2842 }
2843
2844 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2845                                        struct btrfs_fs_info *fs_info)
2846 {
2847         struct btrfs_block_rsv *global_rsv;
2848         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2849         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2850         unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
2851         u64 num_bytes, num_dirty_bgs_bytes;
2852         int ret = 0;
2853
2854         num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2855         num_heads = heads_to_leaves(fs_info, num_heads);
2856         if (num_heads > 1)
2857                 num_bytes += (num_heads - 1) * fs_info->nodesize;
2858         num_bytes <<= 1;
2859         num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2860                                                         fs_info->nodesize;
2861         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2862                                                              num_dirty_bgs);
2863         global_rsv = &fs_info->global_block_rsv;
2864
2865         /*
2866          * If we can't allocate any more chunks lets make sure we have _lots_ of
2867          * wiggle room since running delayed refs can create more delayed refs.
2868          */
2869         if (global_rsv->space_info->full) {
2870                 num_dirty_bgs_bytes <<= 1;
2871                 num_bytes <<= 1;
2872         }
2873
2874         spin_lock(&global_rsv->lock);
2875         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2876                 ret = 1;
2877         spin_unlock(&global_rsv->lock);
2878         return ret;
2879 }
2880
2881 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2882                                        struct btrfs_fs_info *fs_info)
2883 {
2884         u64 num_entries =
2885                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2886         u64 avg_runtime;
2887         u64 val;
2888
2889         smp_mb();
2890         avg_runtime = fs_info->avg_delayed_ref_runtime;
2891         val = num_entries * avg_runtime;
2892         if (val >= NSEC_PER_SEC)
2893                 return 1;
2894         if (val >= NSEC_PER_SEC / 2)
2895                 return 2;
2896
2897         return btrfs_check_space_for_delayed_refs(trans, fs_info);
2898 }
2899
2900 struct async_delayed_refs {
2901         struct btrfs_root *root;
2902         u64 transid;
2903         int count;
2904         int error;
2905         int sync;
2906         struct completion wait;
2907         struct btrfs_work work;
2908 };
2909
2910 static inline struct async_delayed_refs *
2911 to_async_delayed_refs(struct btrfs_work *work)
2912 {
2913         return container_of(work, struct async_delayed_refs, work);
2914 }
2915
2916 static void delayed_ref_async_start(struct btrfs_work *work)
2917 {
2918         struct async_delayed_refs *async = to_async_delayed_refs(work);
2919         struct btrfs_trans_handle *trans;
2920         struct btrfs_fs_info *fs_info = async->root->fs_info;
2921         int ret;
2922
2923         /* if the commit is already started, we don't need to wait here */
2924         if (btrfs_transaction_blocked(fs_info))
2925                 goto done;
2926
2927         trans = btrfs_join_transaction(async->root);
2928         if (IS_ERR(trans)) {
2929                 async->error = PTR_ERR(trans);
2930                 goto done;
2931         }
2932
2933         /*
2934          * trans->sync means that when we call end_transaction, we won't
2935          * wait on delayed refs
2936          */
2937         trans->sync = true;
2938
2939         /* Don't bother flushing if we got into a different transaction */
2940         if (trans->transid > async->transid)
2941                 goto end;
2942
2943         ret = btrfs_run_delayed_refs(trans, async->count);
2944         if (ret)
2945                 async->error = ret;
2946 end:
2947         ret = btrfs_end_transaction(trans);
2948         if (ret && !async->error)
2949                 async->error = ret;
2950 done:
2951         if (async->sync)
2952                 complete(&async->wait);
2953         else
2954                 kfree(async);
2955 }
2956
2957 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2958                                  unsigned long count, u64 transid, int wait)
2959 {
2960         struct async_delayed_refs *async;
2961         int ret;
2962
2963         async = kmalloc(sizeof(*async), GFP_NOFS);
2964         if (!async)
2965                 return -ENOMEM;
2966
2967         async->root = fs_info->tree_root;
2968         async->count = count;
2969         async->error = 0;
2970         async->transid = transid;
2971         if (wait)
2972                 async->sync = 1;
2973         else
2974                 async->sync = 0;
2975         init_completion(&async->wait);
2976
2977         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2978                         delayed_ref_async_start, NULL, NULL);
2979
2980         btrfs_queue_work(fs_info->extent_workers, &async->work);
2981
2982         if (wait) {
2983                 wait_for_completion(&async->wait);
2984                 ret = async->error;
2985                 kfree(async);
2986                 return ret;
2987         }
2988         return 0;
2989 }
2990
2991 /*
2992  * this starts processing the delayed reference count updates and
2993  * extent insertions we have queued up so far.  count can be
2994  * 0, which means to process everything in the tree at the start
2995  * of the run (but not newly added entries), or it can be some target
2996  * number you'd like to process.
2997  *
2998  * Returns 0 on success or if called with an aborted transaction
2999  * Returns <0 on error and aborts the transaction
3000  */
3001 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3002                            unsigned long count)
3003 {
3004         struct btrfs_fs_info *fs_info = trans->fs_info;
3005         struct rb_node *node;
3006         struct btrfs_delayed_ref_root *delayed_refs;
3007         struct btrfs_delayed_ref_head *head;
3008         int ret;
3009         int run_all = count == (unsigned long)-1;
3010         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
3011
3012         /* We'll clean this up in btrfs_cleanup_transaction */
3013         if (trans->aborted)
3014                 return 0;
3015
3016         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
3017                 return 0;
3018
3019         delayed_refs = &trans->transaction->delayed_refs;
3020         if (count == 0)
3021                 count = atomic_read(&delayed_refs->num_entries) * 2;
3022
3023 again:
3024 #ifdef SCRAMBLE_DELAYED_REFS
3025         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
3026 #endif
3027         trans->can_flush_pending_bgs = false;
3028         ret = __btrfs_run_delayed_refs(trans, count);
3029         if (ret < 0) {
3030                 btrfs_abort_transaction(trans, ret);
3031                 return ret;
3032         }
3033
3034         if (run_all) {
3035                 if (!list_empty(&trans->new_bgs))
3036                         btrfs_create_pending_block_groups(trans);
3037
3038                 spin_lock(&delayed_refs->lock);
3039                 node = rb_first_cached(&delayed_refs->href_root);
3040                 if (!node) {
3041                         spin_unlock(&delayed_refs->lock);
3042                         goto out;
3043                 }
3044                 head = rb_entry(node, struct btrfs_delayed_ref_head,
3045                                 href_node);
3046                 refcount_inc(&head->refs);
3047                 spin_unlock(&delayed_refs->lock);
3048
3049                 /* Mutex was contended, block until it's released and retry. */
3050                 mutex_lock(&head->mutex);
3051                 mutex_unlock(&head->mutex);
3052
3053                 btrfs_put_delayed_ref_head(head);
3054                 cond_resched();
3055                 goto again;
3056         }
3057 out:
3058         trans->can_flush_pending_bgs = can_flush_pending_bgs;
3059         return 0;
3060 }
3061
3062 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3063                                 struct btrfs_fs_info *fs_info,
3064                                 u64 bytenr, u64 num_bytes, u64 flags,
3065                                 int level, int is_data)
3066 {
3067         struct btrfs_delayed_extent_op *extent_op;
3068         int ret;
3069
3070         extent_op = btrfs_alloc_delayed_extent_op();
3071         if (!extent_op)
3072                 return -ENOMEM;
3073
3074         extent_op->flags_to_set = flags;
3075         extent_op->update_flags = true;
3076         extent_op->update_key = false;
3077         extent_op->is_data = is_data ? true : false;
3078         extent_op->level = level;
3079
3080         ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3081                                           num_bytes, extent_op);
3082         if (ret)
3083                 btrfs_free_delayed_extent_op(extent_op);
3084         return ret;
3085 }
3086
3087 static noinline int check_delayed_ref(struct btrfs_root *root,
3088                                       struct btrfs_path *path,
3089                                       u64 objectid, u64 offset, u64 bytenr)
3090 {
3091         struct btrfs_delayed_ref_head *head;
3092         struct btrfs_delayed_ref_node *ref;
3093         struct btrfs_delayed_data_ref *data_ref;
3094         struct btrfs_delayed_ref_root *delayed_refs;
3095         struct btrfs_transaction *cur_trans;
3096         struct rb_node *node;
3097         int ret = 0;
3098
3099         spin_lock(&root->fs_info->trans_lock);
3100         cur_trans = root->fs_info->running_transaction;
3101         if (cur_trans)
3102                 refcount_inc(&cur_trans->use_count);
3103         spin_unlock(&root->fs_info->trans_lock);
3104         if (!cur_trans)
3105                 return 0;
3106
3107         delayed_refs = &cur_trans->delayed_refs;
3108         spin_lock(&delayed_refs->lock);
3109         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3110         if (!head) {
3111                 spin_unlock(&delayed_refs->lock);
3112                 btrfs_put_transaction(cur_trans);
3113                 return 0;
3114         }
3115
3116         if (!mutex_trylock(&head->mutex)) {
3117                 refcount_inc(&head->refs);
3118                 spin_unlock(&delayed_refs->lock);
3119
3120                 btrfs_release_path(path);
3121
3122                 /*
3123                  * Mutex was contended, block until it's released and let
3124                  * caller try again
3125                  */
3126                 mutex_lock(&head->mutex);
3127                 mutex_unlock(&head->mutex);
3128                 btrfs_put_delayed_ref_head(head);
3129                 btrfs_put_transaction(cur_trans);
3130                 return -EAGAIN;
3131         }
3132         spin_unlock(&delayed_refs->lock);
3133
3134         spin_lock(&head->lock);
3135         /*
3136          * XXX: We should replace this with a proper search function in the
3137          * future.
3138          */
3139         for (node = rb_first_cached(&head->ref_tree); node;
3140              node = rb_next(node)) {
3141                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3142                 /* If it's a shared ref we know a cross reference exists */
3143                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3144                         ret = 1;
3145                         break;
3146                 }
3147
3148                 data_ref = btrfs_delayed_node_to_data_ref(ref);
3149
3150                 /*
3151                  * If our ref doesn't match the one we're currently looking at
3152                  * then we have a cross reference.
3153                  */
3154                 if (data_ref->root != root->root_key.objectid ||
3155                     data_ref->objectid != objectid ||
3156                     data_ref->offset != offset) {
3157                         ret = 1;
3158                         break;
3159                 }
3160         }
3161         spin_unlock(&head->lock);
3162         mutex_unlock(&head->mutex);
3163         btrfs_put_transaction(cur_trans);
3164         return ret;
3165 }
3166
3167 static noinline int check_committed_ref(struct btrfs_root *root,
3168                                         struct btrfs_path *path,
3169                                         u64 objectid, u64 offset, u64 bytenr)
3170 {
3171         struct btrfs_fs_info *fs_info = root->fs_info;
3172         struct btrfs_root *extent_root = fs_info->extent_root;
3173         struct extent_buffer *leaf;
3174         struct btrfs_extent_data_ref *ref;
3175         struct btrfs_extent_inline_ref *iref;
3176         struct btrfs_extent_item *ei;
3177         struct btrfs_key key;
3178         u32 item_size;
3179         int type;
3180         int ret;
3181
3182         key.objectid = bytenr;
3183         key.offset = (u64)-1;
3184         key.type = BTRFS_EXTENT_ITEM_KEY;
3185
3186         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3187         if (ret < 0)
3188                 goto out;
3189         BUG_ON(ret == 0); /* Corruption */
3190
3191         ret = -ENOENT;
3192         if (path->slots[0] == 0)
3193                 goto out;
3194
3195         path->slots[0]--;
3196         leaf = path->nodes[0];
3197         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3198
3199         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3200                 goto out;
3201
3202         ret = 1;
3203         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3204         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3205
3206         if (item_size != sizeof(*ei) +
3207             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3208                 goto out;
3209
3210         if (btrfs_extent_generation(leaf, ei) <=
3211             btrfs_root_last_snapshot(&root->root_item))
3212                 goto out;
3213
3214         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3215
3216         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3217         if (type != BTRFS_EXTENT_DATA_REF_KEY)
3218                 goto out;
3219
3220         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3221         if (btrfs_extent_refs(leaf, ei) !=
3222             btrfs_extent_data_ref_count(leaf, ref) ||
3223             btrfs_extent_data_ref_root(leaf, ref) !=
3224             root->root_key.objectid ||
3225             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3226             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3227                 goto out;
3228
3229         ret = 0;
3230 out:
3231         return ret;
3232 }
3233
3234 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3235                           u64 bytenr)
3236 {
3237         struct btrfs_path *path;
3238         int ret;
3239
3240         path = btrfs_alloc_path();
3241         if (!path)
3242                 return -ENOMEM;
3243
3244         do {
3245                 ret = check_committed_ref(root, path, objectid,
3246                                           offset, bytenr);
3247                 if (ret && ret != -ENOENT)
3248                         goto out;
3249
3250                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3251         } while (ret == -EAGAIN);
3252
3253 out:
3254         btrfs_free_path(path);
3255         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3256                 WARN_ON(ret > 0);
3257         return ret;
3258 }
3259
3260 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3261                            struct btrfs_root *root,
3262                            struct extent_buffer *buf,
3263                            int full_backref, int inc)
3264 {
3265         struct btrfs_fs_info *fs_info = root->fs_info;
3266         u64 bytenr;
3267         u64 num_bytes;
3268         u64 parent;
3269         u64 ref_root;
3270         u32 nritems;
3271         struct btrfs_key key;
3272         struct btrfs_file_extent_item *fi;
3273         int i;
3274         int level;
3275         int ret = 0;
3276         int (*process_func)(struct btrfs_trans_handle *,
3277                             struct btrfs_root *,
3278                             u64, u64, u64, u64, u64, u64);
3279
3280
3281         if (btrfs_is_testing(fs_info))
3282                 return 0;
3283
3284         ref_root = btrfs_header_owner(buf);
3285         nritems = btrfs_header_nritems(buf);
3286         level = btrfs_header_level(buf);
3287
3288         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3289                 return 0;
3290
3291         if (inc)
3292                 process_func = btrfs_inc_extent_ref;
3293         else
3294                 process_func = btrfs_free_extent;
3295
3296         if (full_backref)
3297                 parent = buf->start;
3298         else
3299                 parent = 0;
3300
3301         for (i = 0; i < nritems; i++) {
3302                 if (level == 0) {
3303                         btrfs_item_key_to_cpu(buf, &key, i);
3304                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3305                                 continue;
3306                         fi = btrfs_item_ptr(buf, i,
3307                                             struct btrfs_file_extent_item);
3308                         if (btrfs_file_extent_type(buf, fi) ==
3309                             BTRFS_FILE_EXTENT_INLINE)
3310                                 continue;
3311                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3312                         if (bytenr == 0)
3313                                 continue;
3314
3315                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3316                         key.offset -= btrfs_file_extent_offset(buf, fi);
3317                         ret = process_func(trans, root, bytenr, num_bytes,
3318                                            parent, ref_root, key.objectid,
3319                                            key.offset);
3320                         if (ret)
3321                                 goto fail;
3322                 } else {
3323                         bytenr = btrfs_node_blockptr(buf, i);
3324                         num_bytes = fs_info->nodesize;
3325                         ret = process_func(trans, root, bytenr, num_bytes,
3326                                            parent, ref_root, level - 1, 0);
3327                         if (ret)
3328                                 goto fail;
3329                 }
3330         }
3331         return 0;
3332 fail:
3333         return ret;
3334 }
3335
3336 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3337                   struct extent_buffer *buf, int full_backref)
3338 {
3339         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3340 }
3341
3342 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3343                   struct extent_buffer *buf, int full_backref)
3344 {
3345         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3346 }
3347
3348 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3349                                  struct btrfs_fs_info *fs_info,
3350                                  struct btrfs_path *path,
3351                                  struct btrfs_block_group_cache *cache)
3352 {
3353         int ret;
3354         struct btrfs_root *extent_root = fs_info->extent_root;
3355         unsigned long bi;
3356         struct extent_buffer *leaf;
3357
3358         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3359         if (ret) {
3360                 if (ret > 0)
3361                         ret = -ENOENT;
3362                 goto fail;
3363         }
3364
3365         leaf = path->nodes[0];
3366         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3367         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3368         btrfs_mark_buffer_dirty(leaf);
3369 fail:
3370         btrfs_release_path(path);
3371         return ret;
3372
3373 }
3374
3375 static struct btrfs_block_group_cache *
3376 next_block_group(struct btrfs_fs_info *fs_info,
3377                  struct btrfs_block_group_cache *cache)
3378 {
3379         struct rb_node *node;
3380
3381         spin_lock(&fs_info->block_group_cache_lock);
3382
3383         /* If our block group was removed, we need a full search. */
3384         if (RB_EMPTY_NODE(&cache->cache_node)) {
3385                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3386
3387                 spin_unlock(&fs_info->block_group_cache_lock);
3388                 btrfs_put_block_group(cache);
3389                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
3390         }
3391         node = rb_next(&cache->cache_node);
3392         btrfs_put_block_group(cache);
3393         if (node) {
3394                 cache = rb_entry(node, struct btrfs_block_group_cache,
3395                                  cache_node);
3396                 btrfs_get_block_group(cache);
3397         } else
3398                 cache = NULL;
3399         spin_unlock(&fs_info->block_group_cache_lock);
3400         return cache;
3401 }
3402
3403 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3404                             struct btrfs_trans_handle *trans,
3405                             struct btrfs_path *path)
3406 {
3407         struct btrfs_fs_info *fs_info = block_group->fs_info;
3408         struct btrfs_root *root = fs_info->tree_root;
3409         struct inode *inode = NULL;
3410         struct extent_changeset *data_reserved = NULL;
3411         u64 alloc_hint = 0;
3412         int dcs = BTRFS_DC_ERROR;
3413         u64 num_pages = 0;
3414         int retries = 0;
3415         int ret = 0;
3416
3417         /*
3418          * If this block group is smaller than 100 megs don't bother caching the
3419          * block group.
3420          */
3421         if (block_group->key.offset < (100 * SZ_1M)) {
3422                 spin_lock(&block_group->lock);
3423                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3424                 spin_unlock(&block_group->lock);
3425                 return 0;
3426         }
3427
3428         if (trans->aborted)
3429                 return 0;
3430 again:
3431         inode = lookup_free_space_inode(fs_info, block_group, path);
3432         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3433                 ret = PTR_ERR(inode);
3434                 btrfs_release_path(path);
3435                 goto out;
3436         }
3437
3438         if (IS_ERR(inode)) {
3439                 BUG_ON(retries);
3440                 retries++;
3441
3442                 if (block_group->ro)
3443                         goto out_free;
3444
3445                 ret = create_free_space_inode(fs_info, trans, block_group,
3446                                               path);
3447                 if (ret)
3448                         goto out_free;
3449                 goto again;
3450         }
3451
3452         /*
3453          * We want to set the generation to 0, that way if anything goes wrong
3454          * from here on out we know not to trust this cache when we load up next
3455          * time.
3456          */
3457         BTRFS_I(inode)->generation = 0;
3458         ret = btrfs_update_inode(trans, root, inode);
3459         if (ret) {
3460                 /*
3461                  * So theoretically we could recover from this, simply set the
3462                  * super cache generation to 0 so we know to invalidate the
3463                  * cache, but then we'd have to keep track of the block groups
3464                  * that fail this way so we know we _have_ to reset this cache
3465                  * before the next commit or risk reading stale cache.  So to
3466                  * limit our exposure to horrible edge cases lets just abort the
3467                  * transaction, this only happens in really bad situations
3468                  * anyway.
3469                  */
3470                 btrfs_abort_transaction(trans, ret);
3471                 goto out_put;
3472         }
3473         WARN_ON(ret);
3474
3475         /* We've already setup this transaction, go ahead and exit */
3476         if (block_group->cache_generation == trans->transid &&
3477             i_size_read(inode)) {
3478                 dcs = BTRFS_DC_SETUP;
3479                 goto out_put;
3480         }
3481
3482         if (i_size_read(inode) > 0) {
3483                 ret = btrfs_check_trunc_cache_free_space(fs_info,
3484                                         &fs_info->global_block_rsv);
3485                 if (ret)
3486                         goto out_put;
3487
3488                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3489                 if (ret)
3490                         goto out_put;
3491         }
3492
3493         spin_lock(&block_group->lock);
3494         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3495             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3496                 /*
3497                  * don't bother trying to write stuff out _if_
3498                  * a) we're not cached,
3499                  * b) we're with nospace_cache mount option,
3500                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
3501                  */
3502                 dcs = BTRFS_DC_WRITTEN;
3503                 spin_unlock(&block_group->lock);
3504                 goto out_put;
3505         }
3506         spin_unlock(&block_group->lock);
3507
3508         /*
3509          * We hit an ENOSPC when setting up the cache in this transaction, just
3510          * skip doing the setup, we've already cleared the cache so we're safe.
3511          */
3512         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3513                 ret = -ENOSPC;
3514                 goto out_put;
3515         }
3516
3517         /*
3518          * Try to preallocate enough space based on how big the block group is.
3519          * Keep in mind this has to include any pinned space which could end up
3520          * taking up quite a bit since it's not folded into the other space
3521          * cache.
3522          */
3523         num_pages = div_u64(block_group->key.offset, SZ_256M);
3524         if (!num_pages)
3525                 num_pages = 1;
3526
3527         num_pages *= 16;
3528         num_pages *= PAGE_SIZE;
3529
3530         ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3531         if (ret)
3532                 goto out_put;
3533
3534         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3535                                               num_pages, num_pages,
3536                                               &alloc_hint);
3537         /*
3538          * Our cache requires contiguous chunks so that we don't modify a bunch
3539          * of metadata or split extents when writing the cache out, which means
3540          * we can enospc if we are heavily fragmented in addition to just normal
3541          * out of space conditions.  So if we hit this just skip setting up any
3542          * other block groups for this transaction, maybe we'll unpin enough
3543          * space the next time around.
3544          */
3545         if (!ret)
3546                 dcs = BTRFS_DC_SETUP;
3547         else if (ret == -ENOSPC)
3548                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3549
3550 out_put:
3551         iput(inode);
3552 out_free:
3553         btrfs_release_path(path);
3554 out:
3555         spin_lock(&block_group->lock);
3556         if (!ret && dcs == BTRFS_DC_SETUP)
3557                 block_group->cache_generation = trans->transid;
3558         block_group->disk_cache_state = dcs;
3559         spin_unlock(&block_group->lock);
3560
3561         extent_changeset_free(data_reserved);
3562         return ret;
3563 }
3564
3565 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3566                             struct btrfs_fs_info *fs_info)
3567 {
3568         struct btrfs_block_group_cache *cache, *tmp;
3569         struct btrfs_transaction *cur_trans = trans->transaction;
3570         struct btrfs_path *path;
3571
3572         if (list_empty(&cur_trans->dirty_bgs) ||
3573             !btrfs_test_opt(fs_info, SPACE_CACHE))
3574                 return 0;
3575
3576         path = btrfs_alloc_path();
3577         if (!path)
3578                 return -ENOMEM;
3579
3580         /* Could add new block groups, use _safe just in case */
3581         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3582                                  dirty_list) {
3583                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3584                         cache_save_setup(cache, trans, path);
3585         }
3586
3587         btrfs_free_path(path);
3588         return 0;
3589 }
3590
3591 /*
3592  * transaction commit does final block group cache writeback during a
3593  * critical section where nothing is allowed to change the FS.  This is
3594  * required in order for the cache to actually match the block group,
3595  * but can introduce a lot of latency into the commit.
3596  *
3597  * So, btrfs_start_dirty_block_groups is here to kick off block group
3598  * cache IO.  There's a chance we'll have to redo some of it if the
3599  * block group changes again during the commit, but it greatly reduces
3600  * the commit latency by getting rid of the easy block groups while
3601  * we're still allowing others to join the commit.
3602  */
3603 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3604 {
3605         struct btrfs_fs_info *fs_info = trans->fs_info;
3606         struct btrfs_block_group_cache *cache;
3607         struct btrfs_transaction *cur_trans = trans->transaction;
3608         int ret = 0;
3609         int should_put;
3610         struct btrfs_path *path = NULL;
3611         LIST_HEAD(dirty);
3612         struct list_head *io = &cur_trans->io_bgs;
3613         int num_started = 0;
3614         int loops = 0;
3615
3616         spin_lock(&cur_trans->dirty_bgs_lock);
3617         if (list_empty(&cur_trans->dirty_bgs)) {
3618                 spin_unlock(&cur_trans->dirty_bgs_lock);
3619                 return 0;
3620         }
3621         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3622         spin_unlock(&cur_trans->dirty_bgs_lock);
3623
3624 again:
3625         /*
3626          * make sure all the block groups on our dirty list actually
3627          * exist
3628          */
3629         btrfs_create_pending_block_groups(trans);
3630
3631         if (!path) {
3632                 path = btrfs_alloc_path();
3633                 if (!path)
3634                         return -ENOMEM;
3635         }
3636
3637         /*
3638          * cache_write_mutex is here only to save us from balance or automatic
3639          * removal of empty block groups deleting this block group while we are
3640          * writing out the cache
3641          */
3642         mutex_lock(&trans->transaction->cache_write_mutex);
3643         while (!list_empty(&dirty)) {
3644                 cache = list_first_entry(&dirty,
3645                                          struct btrfs_block_group_cache,
3646                                          dirty_list);
3647                 /*
3648                  * this can happen if something re-dirties a block
3649                  * group that is already under IO.  Just wait for it to
3650                  * finish and then do it all again
3651                  */
3652                 if (!list_empty(&cache->io_list)) {
3653                         list_del_init(&cache->io_list);
3654                         btrfs_wait_cache_io(trans, cache, path);
3655                         btrfs_put_block_group(cache);
3656                 }
3657
3658
3659                 /*
3660                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3661                  * if it should update the cache_state.  Don't delete
3662                  * until after we wait.
3663                  *
3664                  * Since we're not running in the commit critical section
3665                  * we need the dirty_bgs_lock to protect from update_block_group
3666                  */
3667                 spin_lock(&cur_trans->dirty_bgs_lock);
3668                 list_del_init(&cache->dirty_list);
3669                 spin_unlock(&cur_trans->dirty_bgs_lock);
3670
3671                 should_put = 1;
3672
3673                 cache_save_setup(cache, trans, path);
3674
3675                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3676                         cache->io_ctl.inode = NULL;
3677                         ret = btrfs_write_out_cache(fs_info, trans,
3678                                                     cache, path);
3679                         if (ret == 0 && cache->io_ctl.inode) {
3680                                 num_started++;
3681                                 should_put = 0;
3682
3683                                 /*
3684                                  * The cache_write_mutex is protecting the
3685                                  * io_list, also refer to the definition of
3686                                  * btrfs_transaction::io_bgs for more details
3687                                  */
3688                                 list_add_tail(&cache->io_list, io);
3689                         } else {
3690                                 /*
3691                                  * if we failed to write the cache, the
3692                                  * generation will be bad and life goes on
3693                                  */
3694                                 ret = 0;
3695                         }
3696                 }
3697                 if (!ret) {
3698                         ret = write_one_cache_group(trans, fs_info,
3699                                                     path, cache);
3700                         /*
3701                          * Our block group might still be attached to the list
3702                          * of new block groups in the transaction handle of some
3703                          * other task (struct btrfs_trans_handle->new_bgs). This
3704                          * means its block group item isn't yet in the extent
3705                          * tree. If this happens ignore the error, as we will
3706                          * try again later in the critical section of the
3707                          * transaction commit.
3708                          */
3709                         if (ret == -ENOENT) {
3710                                 ret = 0;
3711                                 spin_lock(&cur_trans->dirty_bgs_lock);
3712                                 if (list_empty(&cache->dirty_list)) {
3713                                         list_add_tail(&cache->dirty_list,
3714                                                       &cur_trans->dirty_bgs);
3715                                         btrfs_get_block_group(cache);
3716                                 }
3717                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3718                         } else if (ret) {
3719                                 btrfs_abort_transaction(trans, ret);
3720                         }
3721                 }
3722
3723                 /* if its not on the io list, we need to put the block group */
3724                 if (should_put)
3725                         btrfs_put_block_group(cache);
3726
3727                 if (ret)
3728                         break;
3729
3730                 /*
3731                  * Avoid blocking other tasks for too long. It might even save
3732                  * us from writing caches for block groups that are going to be
3733                  * removed.
3734                  */
3735                 mutex_unlock(&trans->transaction->cache_write_mutex);
3736                 mutex_lock(&trans->transaction->cache_write_mutex);
3737         }
3738         mutex_unlock(&trans->transaction->cache_write_mutex);
3739
3740         /*
3741          * go through delayed refs for all the stuff we've just kicked off
3742          * and then loop back (just once)
3743          */
3744         ret = btrfs_run_delayed_refs(trans, 0);
3745         if (!ret && loops == 0) {
3746                 loops++;
3747                 spin_lock(&cur_trans->dirty_bgs_lock);
3748                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3749                 /*
3750                  * dirty_bgs_lock protects us from concurrent block group
3751                  * deletes too (not just cache_write_mutex).
3752                  */
3753                 if (!list_empty(&dirty)) {
3754                         spin_unlock(&cur_trans->dirty_bgs_lock);
3755                         goto again;
3756                 }
3757                 spin_unlock(&cur_trans->dirty_bgs_lock);
3758         } else if (ret < 0) {
3759                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3760         }
3761
3762         btrfs_free_path(path);
3763         return ret;
3764 }
3765
3766 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3767                                    struct btrfs_fs_info *fs_info)
3768 {
3769         struct btrfs_block_group_cache *cache;
3770         struct btrfs_transaction *cur_trans = trans->transaction;
3771         int ret = 0;
3772         int should_put;
3773         struct btrfs_path *path;
3774         struct list_head *io = &cur_trans->io_bgs;
3775         int num_started = 0;
3776
3777         path = btrfs_alloc_path();
3778         if (!path)
3779                 return -ENOMEM;
3780
3781         /*
3782          * Even though we are in the critical section of the transaction commit,
3783          * we can still have concurrent tasks adding elements to this
3784          * transaction's list of dirty block groups. These tasks correspond to
3785          * endio free space workers started when writeback finishes for a
3786          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3787          * allocate new block groups as a result of COWing nodes of the root
3788          * tree when updating the free space inode. The writeback for the space
3789          * caches is triggered by an earlier call to
3790          * btrfs_start_dirty_block_groups() and iterations of the following
3791          * loop.
3792          * Also we want to do the cache_save_setup first and then run the
3793          * delayed refs to make sure we have the best chance at doing this all
3794          * in one shot.
3795          */
3796         spin_lock(&cur_trans->dirty_bgs_lock);
3797         while (!list_empty(&cur_trans->dirty_bgs)) {
3798                 cache = list_first_entry(&cur_trans->dirty_bgs,
3799                                          struct btrfs_block_group_cache,
3800                                          dirty_list);
3801
3802                 /*
3803                  * this can happen if cache_save_setup re-dirties a block
3804                  * group that is already under IO.  Just wait for it to
3805                  * finish and then do it all again
3806                  */
3807                 if (!list_empty(&cache->io_list)) {
3808                         spin_unlock(&cur_trans->dirty_bgs_lock);
3809                         list_del_init(&cache->io_list);
3810                         btrfs_wait_cache_io(trans, cache, path);
3811                         btrfs_put_block_group(cache);
3812                         spin_lock(&cur_trans->dirty_bgs_lock);
3813                 }
3814
3815                 /*
3816                  * don't remove from the dirty list until after we've waited
3817                  * on any pending IO
3818                  */
3819                 list_del_init(&cache->dirty_list);
3820                 spin_unlock(&cur_trans->dirty_bgs_lock);
3821                 should_put = 1;
3822
3823                 cache_save_setup(cache, trans, path);
3824
3825                 if (!ret)
3826                         ret = btrfs_run_delayed_refs(trans,
3827                                                      (unsigned long) -1);
3828
3829                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3830                         cache->io_ctl.inode = NULL;
3831                         ret = btrfs_write_out_cache(fs_info, trans,
3832                                                     cache, path);
3833                         if (ret == 0 && cache->io_ctl.inode) {
3834                                 num_started++;
3835                                 should_put = 0;
3836                                 list_add_tail(&cache->io_list, io);
3837                         } else {
3838                                 /*
3839                                  * if we failed to write the cache, the
3840                                  * generation will be bad and life goes on
3841                                  */
3842                                 ret = 0;
3843                         }
3844                 }
3845                 if (!ret) {
3846                         ret = write_one_cache_group(trans, fs_info,
3847                                                     path, cache);
3848                         /*
3849                          * One of the free space endio workers might have
3850                          * created a new block group while updating a free space
3851                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
3852                          * and hasn't released its transaction handle yet, in
3853                          * which case the new block group is still attached to
3854                          * its transaction handle and its creation has not
3855                          * finished yet (no block group item in the extent tree
3856                          * yet, etc). If this is the case, wait for all free
3857                          * space endio workers to finish and retry. This is a
3858                          * a very rare case so no need for a more efficient and
3859                          * complex approach.
3860                          */
3861                         if (ret == -ENOENT) {
3862                                 wait_event(cur_trans->writer_wait,
3863                                    atomic_read(&cur_trans->num_writers) == 1);
3864                                 ret = write_one_cache_group(trans, fs_info,
3865                                                             path, cache);
3866                         }
3867                         if (ret)
3868                                 btrfs_abort_transaction(trans, ret);
3869                 }
3870
3871                 /* if its not on the io list, we need to put the block group */
3872                 if (should_put)
3873                         btrfs_put_block_group(cache);
3874                 spin_lock(&cur_trans->dirty_bgs_lock);
3875         }
3876         spin_unlock(&cur_trans->dirty_bgs_lock);
3877
3878         /*
3879          * Refer to the definition of io_bgs member for details why it's safe
3880          * to use it without any locking
3881          */
3882         while (!list_empty(io)) {
3883                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3884                                          io_list);
3885                 list_del_init(&cache->io_list);
3886                 btrfs_wait_cache_io(trans, cache, path);
3887                 btrfs_put_block_group(cache);
3888         }
3889
3890         btrfs_free_path(path);
3891         return ret;
3892 }
3893
3894 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3895 {
3896         struct btrfs_block_group_cache *block_group;
3897         int readonly = 0;
3898
3899         block_group = btrfs_lookup_block_group(fs_info, bytenr);
3900         if (!block_group || block_group->ro)
3901                 readonly = 1;
3902         if (block_group)
3903                 btrfs_put_block_group(block_group);
3904         return readonly;
3905 }
3906
3907 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3908 {
3909         struct btrfs_block_group_cache *bg;
3910         bool ret = true;
3911
3912         bg = btrfs_lookup_block_group(fs_info, bytenr);
3913         if (!bg)
3914                 return false;
3915
3916         spin_lock(&bg->lock);
3917         if (bg->ro)
3918                 ret = false;
3919         else
3920                 atomic_inc(&bg->nocow_writers);
3921         spin_unlock(&bg->lock);
3922
3923         /* no put on block group, done by btrfs_dec_nocow_writers */
3924         if (!ret)
3925                 btrfs_put_block_group(bg);
3926
3927         return ret;
3928
3929 }
3930
3931 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3932 {
3933         struct btrfs_block_group_cache *bg;
3934
3935         bg = btrfs_lookup_block_group(fs_info, bytenr);
3936         ASSERT(bg);
3937         if (atomic_dec_and_test(&bg->nocow_writers))
3938                 wake_up_var(&bg->nocow_writers);
3939         /*
3940          * Once for our lookup and once for the lookup done by a previous call
3941          * to btrfs_inc_nocow_writers()
3942          */
3943         btrfs_put_block_group(bg);
3944         btrfs_put_block_group(bg);
3945 }
3946
3947 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3948 {
3949         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3950 }
3951
3952 static const char *alloc_name(u64 flags)
3953 {
3954         switch (flags) {
3955         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3956                 return "mixed";
3957         case BTRFS_BLOCK_GROUP_METADATA:
3958                 return "metadata";
3959         case BTRFS_BLOCK_GROUP_DATA:
3960                 return "data";
3961         case BTRFS_BLOCK_GROUP_SYSTEM:
3962                 return "system";
3963         default:
3964                 WARN_ON(1);
3965                 return "invalid-combination";
3966         };
3967 }
3968
3969 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
3970 {
3971
3972         struct btrfs_space_info *space_info;
3973         int i;
3974         int ret;
3975
3976         space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3977         if (!space_info)
3978                 return -ENOMEM;
3979
3980         ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3981                                  GFP_KERNEL);
3982         if (ret) {
3983                 kfree(space_info);
3984                 return ret;
3985         }
3986
3987         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3988                 INIT_LIST_HEAD(&space_info->block_groups[i]);
3989         init_rwsem(&space_info->groups_sem);
3990         spin_lock_init(&space_info->lock);
3991         space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3992         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3993         init_waitqueue_head(&space_info->wait);
3994         INIT_LIST_HEAD(&space_info->ro_bgs);
3995         INIT_LIST_HEAD(&space_info->tickets);
3996         INIT_LIST_HEAD(&space_info->priority_tickets);
3997
3998         ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3999                                     info->space_info_kobj, "%s",
4000                                     alloc_name(space_info->flags));
4001         if (ret) {
4002                 percpu_counter_destroy(&space_info->total_bytes_pinned);
4003                 kfree(space_info);
4004                 return ret;
4005         }
4006
4007         list_add_rcu(&space_info->list, &info->space_info);
4008         if (flags & BTRFS_BLOCK_GROUP_DATA)
4009                 info->data_sinfo = space_info;
4010
4011         return ret;
4012 }
4013
4014 static void update_space_info(struct btrfs_fs_info *info, u64 flags,
4015                              u64 total_bytes, u64 bytes_used,
4016                              u64 bytes_readonly,
4017                              struct btrfs_space_info **space_info)
4018 {
4019         struct btrfs_space_info *found;
4020         int factor;
4021
4022         factor = btrfs_bg_type_to_factor(flags);
4023
4024         found = __find_space_info(info, flags);
4025         ASSERT(found);
4026         spin_lock(&found->lock);
4027         found->total_bytes += total_bytes;
4028         found->disk_total += total_bytes * factor;
4029         found->bytes_used += bytes_used;
4030         found->disk_used += bytes_used * factor;
4031         found->bytes_readonly += bytes_readonly;
4032         if (total_bytes > 0)
4033                 found->full = 0;
4034         space_info_add_new_bytes(info, found, total_bytes -
4035                                  bytes_used - bytes_readonly);
4036         spin_unlock(&found->lock);
4037         *space_info = found;
4038 }
4039
4040 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4041 {
4042         u64 extra_flags = chunk_to_extended(flags) &
4043                                 BTRFS_EXTENDED_PROFILE_MASK;
4044
4045         write_seqlock(&fs_info->profiles_lock);
4046         if (flags & BTRFS_BLOCK_GROUP_DATA)
4047                 fs_info->avail_data_alloc_bits |= extra_flags;
4048         if (flags & BTRFS_BLOCK_GROUP_METADATA)
4049                 fs_info->avail_metadata_alloc_bits |= extra_flags;
4050         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4051                 fs_info->avail_system_alloc_bits |= extra_flags;
4052         write_sequnlock(&fs_info->profiles_lock);
4053 }
4054
4055 /*
4056  * returns target flags in extended format or 0 if restripe for this
4057  * chunk_type is not in progress
4058  *
4059  * should be called with balance_lock held
4060  */
4061 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4062 {
4063         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4064         u64 target = 0;
4065
4066         if (!bctl)
4067                 return 0;
4068
4069         if (flags & BTRFS_BLOCK_GROUP_DATA &&
4070             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4071                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4072         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4073                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4074                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4075         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4076                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4077                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4078         }
4079
4080         return target;
4081 }
4082
4083 /*
4084  * @flags: available profiles in extended format (see ctree.h)
4085  *
4086  * Returns reduced profile in chunk format.  If profile changing is in
4087  * progress (either running or paused) picks the target profile (if it's
4088  * already available), otherwise falls back to plain reducing.
4089  */
4090 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4091 {
4092         u64 num_devices = fs_info->fs_devices->rw_devices;
4093         u64 target;
4094         u64 raid_type;
4095         u64 allowed = 0;
4096
4097         /*
4098          * see if restripe for this chunk_type is in progress, if so
4099          * try to reduce to the target profile
4100          */
4101         spin_lock(&fs_info->balance_lock);
4102         target = get_restripe_target(fs_info, flags);
4103         if (target) {
4104                 /* pick target profile only if it's already available */
4105                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4106                         spin_unlock(&fs_info->balance_lock);
4107                         return extended_to_chunk(target);
4108                 }
4109         }
4110         spin_unlock(&fs_info->balance_lock);
4111
4112         /* First, mask out the RAID levels which aren't possible */
4113         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4114                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4115                         allowed |= btrfs_raid_array[raid_type].bg_flag;
4116         }
4117         allowed &= flags;
4118
4119         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4120                 allowed = BTRFS_BLOCK_GROUP_RAID6;
4121         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4122                 allowed = BTRFS_BLOCK_GROUP_RAID5;
4123         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4124                 allowed = BTRFS_BLOCK_GROUP_RAID10;
4125         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4126                 allowed = BTRFS_BLOCK_GROUP_RAID1;
4127         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4128                 allowed = BTRFS_BLOCK_GROUP_RAID0;
4129
4130         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4131
4132         return extended_to_chunk(flags | allowed);
4133 }
4134
4135 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4136 {
4137         unsigned seq;
4138         u64 flags;
4139
4140         do {
4141                 flags = orig_flags;
4142                 seq = read_seqbegin(&fs_info->profiles_lock);
4143
4144                 if (flags & BTRFS_BLOCK_GROUP_DATA)
4145                         flags |= fs_info->avail_data_alloc_bits;
4146                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4147                         flags |= fs_info->avail_system_alloc_bits;
4148                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4149                         flags |= fs_info->avail_metadata_alloc_bits;
4150         } while (read_seqretry(&fs_info->profiles_lock, seq));
4151
4152         return btrfs_reduce_alloc_profile(fs_info, flags);
4153 }
4154
4155 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4156 {
4157         struct btrfs_fs_info *fs_info = root->fs_info;
4158         u64 flags;
4159         u64 ret;
4160
4161         if (data)
4162                 flags = BTRFS_BLOCK_GROUP_DATA;
4163         else if (root == fs_info->chunk_root)
4164                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
4165         else
4166                 flags = BTRFS_BLOCK_GROUP_METADATA;
4167
4168         ret = get_alloc_profile(fs_info, flags);
4169         return ret;
4170 }
4171
4172 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4173 {
4174         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4175 }
4176
4177 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4178 {
4179         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4180 }
4181
4182 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4183 {
4184         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4185 }
4186
4187 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4188                                  bool may_use_included)
4189 {
4190         ASSERT(s_info);
4191         return s_info->bytes_used + s_info->bytes_reserved +
4192                 s_info->bytes_pinned + s_info->bytes_readonly +
4193                 (may_use_included ? s_info->bytes_may_use : 0);
4194 }
4195
4196 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4197 {
4198         struct btrfs_root *root = inode->root;
4199         struct btrfs_fs_info *fs_info = root->fs_info;
4200         struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4201         u64 used;
4202         int ret = 0;
4203         int need_commit = 2;
4204         int have_pinned_space;
4205
4206         /* make sure bytes are sectorsize aligned */
4207         bytes = ALIGN(bytes, fs_info->sectorsize);
4208
4209         if (btrfs_is_free_space_inode(inode)) {
4210                 need_commit = 0;
4211                 ASSERT(current->journal_info);
4212         }
4213
4214 again:
4215         /* make sure we have enough space to handle the data first */
4216         spin_lock(&data_sinfo->lock);
4217         used = btrfs_space_info_used(data_sinfo, true);
4218
4219         if (used + bytes > data_sinfo->total_bytes) {
4220                 struct btrfs_trans_handle *trans;
4221
4222                 /*
4223                  * if we don't have enough free bytes in this space then we need
4224                  * to alloc a new chunk.
4225                  */
4226                 if (!data_sinfo->full) {
4227                         u64 alloc_target;
4228
4229                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4230                         spin_unlock(&data_sinfo->lock);
4231
4232                         alloc_target = btrfs_data_alloc_profile(fs_info);
4233                         /*
4234                          * It is ugly that we don't call nolock join
4235                          * transaction for the free space inode case here.
4236                          * But it is safe because we only do the data space
4237                          * reservation for the free space cache in the
4238                          * transaction context, the common join transaction
4239                          * just increase the counter of the current transaction
4240                          * handler, doesn't try to acquire the trans_lock of
4241                          * the fs.
4242                          */
4243                         trans = btrfs_join_transaction(root);
4244                         if (IS_ERR(trans))
4245                                 return PTR_ERR(trans);
4246
4247                         ret = do_chunk_alloc(trans, alloc_target,
4248                                              CHUNK_ALLOC_NO_FORCE);
4249                         btrfs_end_transaction(trans);
4250                         if (ret < 0) {
4251                                 if (ret != -ENOSPC)
4252                                         return ret;
4253                                 else {
4254                                         have_pinned_space = 1;
4255                                         goto commit_trans;
4256                                 }
4257                         }
4258
4259                         goto again;
4260                 }
4261
4262                 /*
4263                  * If we don't have enough pinned space to deal with this
4264                  * allocation, and no removed chunk in current transaction,
4265                  * don't bother committing the transaction.
4266                  */
4267                 have_pinned_space = __percpu_counter_compare(
4268                         &data_sinfo->total_bytes_pinned,
4269                         used + bytes - data_sinfo->total_bytes,
4270                         BTRFS_TOTAL_BYTES_PINNED_BATCH);
4271                 spin_unlock(&data_sinfo->lock);
4272
4273                 /* commit the current transaction and try again */
4274 commit_trans:
4275                 if (need_commit) {
4276                         need_commit--;
4277
4278                         if (need_commit > 0) {
4279                                 btrfs_start_delalloc_roots(fs_info, -1);
4280                                 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4281                                                          (u64)-1);
4282                         }
4283
4284                         trans = btrfs_join_transaction(root);
4285                         if (IS_ERR(trans))
4286                                 return PTR_ERR(trans);
4287                         if (have_pinned_space >= 0 ||
4288                             test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4289                                      &trans->transaction->flags) ||
4290                             need_commit > 0) {
4291                                 ret = btrfs_commit_transaction(trans);
4292                                 if (ret)
4293                                         return ret;
4294                                 /*
4295                                  * The cleaner kthread might still be doing iput
4296                                  * operations. Wait for it to finish so that
4297                                  * more space is released.
4298                                  */
4299                                 mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4300                                 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4301                                 goto again;
4302                         } else {
4303                                 btrfs_end_transaction(trans);
4304                         }
4305                 }
4306
4307                 trace_btrfs_space_reservation(fs_info,
4308                                               "space_info:enospc",
4309                                               data_sinfo->flags, bytes, 1);
4310                 return -ENOSPC;
4311         }
4312         data_sinfo->bytes_may_use += bytes;
4313         trace_btrfs_space_reservation(fs_info, "space_info",
4314                                       data_sinfo->flags, bytes, 1);
4315         spin_unlock(&data_sinfo->lock);
4316
4317         return 0;
4318 }
4319
4320 int btrfs_check_data_free_space(struct inode *inode,
4321                         struct extent_changeset **reserved, u64 start, u64 len)
4322 {
4323         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4324         int ret;
4325
4326         /* align the range */
4327         len = round_up(start + len, fs_info->sectorsize) -
4328               round_down(start, fs_info->sectorsize);
4329         start = round_down(start, fs_info->sectorsize);
4330
4331         ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4332         if (ret < 0)
4333                 return ret;
4334
4335         /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4336         ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4337         if (ret < 0)
4338                 btrfs_free_reserved_data_space_noquota(inode, start, len);
4339         else
4340                 ret = 0;
4341         return ret;
4342 }
4343
4344 /*
4345  * Called if we need to clear a data reservation for this inode
4346  * Normally in a error case.
4347  *
4348  * This one will *NOT* use accurate qgroup reserved space API, just for case
4349  * which we can't sleep and is sure it won't affect qgroup reserved space.
4350  * Like clear_bit_hook().
4351  */
4352 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4353                                             u64 len)
4354 {
4355         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4356         struct btrfs_space_info *data_sinfo;
4357
4358         /* Make sure the range is aligned to sectorsize */
4359         len = round_up(start + len, fs_info->sectorsize) -
4360               round_down(start, fs_info->sectorsize);
4361         start = round_down(start, fs_info->sectorsize);
4362
4363         data_sinfo = fs_info->data_sinfo;
4364         spin_lock(&data_sinfo->lock);
4365         if (WARN_ON(data_sinfo->bytes_may_use < len))
4366                 data_sinfo->bytes_may_use = 0;
4367         else
4368                 data_sinfo->bytes_may_use -= len;
4369         trace_btrfs_space_reservation(fs_info, "space_info",
4370                                       data_sinfo->flags, len, 0);
4371         spin_unlock(&data_sinfo->lock);
4372 }
4373
4374 /*
4375  * Called if we need to clear a data reservation for this inode
4376  * Normally in a error case.
4377  *
4378  * This one will handle the per-inode data rsv map for accurate reserved
4379  * space framework.
4380  */
4381 void btrfs_free_reserved_data_space(struct inode *inode,
4382                         struct extent_changeset *reserved, u64 start, u64 len)
4383 {
4384         struct btrfs_root *root = BTRFS_I(inode)->root;
4385
4386         /* Make sure the range is aligned to sectorsize */
4387         len = round_up(start + len, root->fs_info->sectorsize) -
4388               round_down(start, root->fs_info->sectorsize);
4389         start = round_down(start, root->fs_info->sectorsize);
4390
4391         btrfs_free_reserved_data_space_noquota(inode, start, len);
4392         btrfs_qgroup_free_data(inode, reserved, start, len);
4393 }
4394
4395 static void force_metadata_allocation(struct btrfs_fs_info *info)
4396 {
4397         struct list_head *head = &info->space_info;
4398         struct btrfs_space_info *found;
4399
4400         rcu_read_lock();
4401         list_for_each_entry_rcu(found, head, list) {
4402                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4403                         found->force_alloc = CHUNK_ALLOC_FORCE;
4404         }
4405         rcu_read_unlock();
4406 }
4407
4408 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4409 {
4410         return (global->size << 1);
4411 }
4412
4413 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4414                               struct btrfs_space_info *sinfo, int force)
4415 {
4416         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4417         u64 bytes_used = btrfs_space_info_used(sinfo, false);
4418         u64 thresh;
4419
4420         if (force == CHUNK_ALLOC_FORCE)
4421                 return 1;
4422
4423         /*
4424          * We need to take into account the global rsv because for all intents
4425          * and purposes it's used space.  Don't worry about locking the
4426          * global_rsv, it doesn't change except when the transaction commits.
4427          */
4428         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4429                 bytes_used += calc_global_rsv_need_space(global_rsv);
4430
4431         /*
4432          * in limited mode, we want to have some free space up to
4433          * about 1% of the FS size.
4434          */
4435         if (force == CHUNK_ALLOC_LIMITED) {
4436                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
4437                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4438
4439                 if (sinfo->total_bytes - bytes_used < thresh)
4440                         return 1;
4441         }
4442
4443         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4444                 return 0;
4445         return 1;
4446 }
4447
4448 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4449 {
4450         u64 num_dev;
4451
4452         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4453                     BTRFS_BLOCK_GROUP_RAID0 |
4454                     BTRFS_BLOCK_GROUP_RAID5 |
4455                     BTRFS_BLOCK_GROUP_RAID6))
4456                 num_dev = fs_info->fs_devices->rw_devices;
4457         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4458                 num_dev = 2;
4459         else
4460                 num_dev = 1;    /* DUP or single */
4461
4462         return num_dev;
4463 }
4464
4465 /*
4466  * If @is_allocation is true, reserve space in the system space info necessary
4467  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4468  * removing a chunk.
4469  */
4470 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4471 {
4472         struct btrfs_fs_info *fs_info = trans->fs_info;
4473         struct btrfs_space_info *info;
4474         u64 left;
4475         u64 thresh;
4476         int ret = 0;
4477         u64 num_devs;
4478
4479         /*
4480          * Needed because we can end up allocating a system chunk and for an
4481          * atomic and race free space reservation in the chunk block reserve.
4482          */
4483         lockdep_assert_held(&fs_info->chunk_mutex);
4484
4485         info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4486         spin_lock(&info->lock);
4487         left = info->total_bytes - btrfs_space_info_used(info, true);
4488         spin_unlock(&info->lock);
4489
4490         num_devs = get_profile_num_devs(fs_info, type);
4491
4492         /* num_devs device items to update and 1 chunk item to add or remove */
4493         thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4494                 btrfs_calc_trans_metadata_size(fs_info, 1);
4495
4496         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4497                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4498                            left, thresh, type);
4499                 dump_space_info(fs_info, info, 0, 0);
4500         }
4501
4502         if (left < thresh) {
4503                 u64 flags = btrfs_system_alloc_profile(fs_info);
4504
4505                 /*
4506                  * Ignore failure to create system chunk. We might end up not
4507                  * needing it, as we might not need to COW all nodes/leafs from
4508                  * the paths we visit in the chunk tree (they were already COWed
4509                  * or created in the current transaction for example).
4510                  */
4511                 ret = btrfs_alloc_chunk(trans, flags);
4512         }
4513
4514         if (!ret) {
4515                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4516                                           &fs_info->chunk_block_rsv,
4517                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4518                 if (!ret)
4519                         trans->chunk_bytes_reserved += thresh;
4520         }
4521 }
4522
4523 /*
4524  * If force is CHUNK_ALLOC_FORCE:
4525  *    - return 1 if it successfully allocates a chunk,
4526  *    - return errors including -ENOSPC otherwise.
4527  * If force is NOT CHUNK_ALLOC_FORCE:
4528  *    - return 0 if it doesn't need to allocate a new chunk,
4529  *    - return 1 if it successfully allocates a chunk,
4530  *    - return errors including -ENOSPC otherwise.
4531  */
4532 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4533                           int force)
4534 {
4535         struct btrfs_fs_info *fs_info = trans->fs_info;
4536         struct btrfs_space_info *space_info;
4537         bool wait_for_alloc = false;
4538         bool should_alloc = false;
4539         int ret = 0;
4540
4541         /* Don't re-enter if we're already allocating a chunk */
4542         if (trans->allocating_chunk)
4543                 return -ENOSPC;
4544
4545         space_info = __find_space_info(fs_info, flags);
4546         ASSERT(space_info);
4547
4548         do {
4549                 spin_lock(&space_info->lock);
4550                 if (force < space_info->force_alloc)
4551                         force = space_info->force_alloc;
4552                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4553                 if (space_info->full) {
4554                         /* No more free physical space */
4555                         if (should_alloc)
4556                                 ret = -ENOSPC;
4557                         else
4558                                 ret = 0;
4559                         spin_unlock(&space_info->lock);
4560                         return ret;
4561                 } else if (!should_alloc) {
4562                         spin_unlock(&space_info->lock);
4563                         return 0;
4564                 } else if (space_info->chunk_alloc) {
4565                         /*
4566                          * Someone is already allocating, so we need to block
4567                          * until this someone is finished and then loop to
4568                          * recheck if we should continue with our allocation
4569                          * attempt.
4570                          */
4571                         wait_for_alloc = true;
4572                         spin_unlock(&space_info->lock);
4573                         mutex_lock(&fs_info->chunk_mutex);
4574                         mutex_unlock(&fs_info->chunk_mutex);
4575                 } else {
4576                         /* Proceed with allocation */
4577                         space_info->chunk_alloc = 1;
4578                         wait_for_alloc = false;
4579                         spin_unlock(&space_info->lock);
4580                 }
4581
4582                 cond_resched();
4583         } while (wait_for_alloc);
4584
4585         mutex_lock(&fs_info->chunk_mutex);
4586         trans->allocating_chunk = true;
4587
4588         /*
4589          * If we have mixed data/metadata chunks we want to make sure we keep
4590          * allocating mixed chunks instead of individual chunks.
4591          */
4592         if (btrfs_mixed_space_info(space_info))
4593                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4594
4595         /*
4596          * if we're doing a data chunk, go ahead and make sure that
4597          * we keep a reasonable number of metadata chunks allocated in the
4598          * FS as well.
4599          */
4600         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4601                 fs_info->data_chunk_allocations++;
4602                 if (!(fs_info->data_chunk_allocations %
4603                       fs_info->metadata_ratio))
4604                         force_metadata_allocation(fs_info);
4605         }
4606
4607         /*
4608          * Check if we have enough space in SYSTEM chunk because we may need
4609          * to update devices.
4610          */
4611         check_system_chunk(trans, flags);
4612
4613         ret = btrfs_alloc_chunk(trans, flags);
4614         trans->allocating_chunk = false;
4615
4616         spin_lock(&space_info->lock);
4617         if (ret < 0) {
4618                 if (ret == -ENOSPC)
4619                         space_info->full = 1;
4620                 else
4621                         goto out;
4622         } else {
4623                 ret = 1;
4624         }
4625
4626         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4627 out:
4628         space_info->chunk_alloc = 0;
4629         spin_unlock(&space_info->lock);
4630         mutex_unlock(&fs_info->chunk_mutex);
4631         /*
4632          * When we allocate a new chunk we reserve space in the chunk block
4633          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4634          * add new nodes/leafs to it if we end up needing to do it when
4635          * inserting the chunk item and updating device items as part of the
4636          * second phase of chunk allocation, performed by
4637          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4638          * large number of new block groups to create in our transaction
4639          * handle's new_bgs list to avoid exhausting the chunk block reserve
4640          * in extreme cases - like having a single transaction create many new
4641          * block groups when starting to write out the free space caches of all
4642          * the block groups that were made dirty during the lifetime of the
4643          * transaction.
4644          */
4645         if (trans->can_flush_pending_bgs &&
4646             trans->chunk_bytes_reserved >= (u64)SZ_2M) {
4647                 btrfs_create_pending_block_groups(trans);
4648                 btrfs_trans_release_chunk_metadata(trans);
4649         }
4650         return ret;
4651 }
4652
4653 static int can_overcommit(struct btrfs_fs_info *fs_info,
4654                           struct btrfs_space_info *space_info, u64 bytes,
4655                           enum btrfs_reserve_flush_enum flush,
4656                           bool system_chunk)
4657 {
4658         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4659         u64 profile;
4660         u64 space_size;
4661         u64 avail;
4662         u64 used;
4663         int factor;
4664
4665         /* Don't overcommit when in mixed mode. */
4666         if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4667                 return 0;
4668
4669         if (system_chunk)
4670                 profile = btrfs_system_alloc_profile(fs_info);
4671         else
4672                 profile = btrfs_metadata_alloc_profile(fs_info);
4673
4674         used = btrfs_space_info_used(space_info, false);
4675
4676         /*
4677          * We only want to allow over committing if we have lots of actual space
4678          * free, but if we don't have enough space to handle the global reserve
4679          * space then we could end up having a real enospc problem when trying
4680          * to allocate a chunk or some other such important allocation.
4681          */
4682         spin_lock(&global_rsv->lock);
4683         space_size = calc_global_rsv_need_space(global_rsv);
4684         spin_unlock(&global_rsv->lock);
4685         if (used + space_size >= space_info->total_bytes)
4686                 return 0;
4687
4688         used += space_info->bytes_may_use;
4689
4690         avail = atomic64_read(&fs_info->free_chunk_space);
4691
4692         /*
4693          * If we have dup, raid1 or raid10 then only half of the free
4694          * space is actually useable.  For raid56, the space info used
4695          * doesn't include the parity drive, so we don't have to
4696          * change the math
4697          */
4698         factor = btrfs_bg_type_to_factor(profile);
4699         avail = div_u64(avail, factor);
4700
4701         /*
4702          * If we aren't flushing all things, let us overcommit up to
4703          * 1/2th of the space. If we can flush, don't let us overcommit
4704          * too much, let it overcommit up to 1/8 of the space.
4705          */
4706         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4707                 avail >>= 3;
4708         else
4709                 avail >>= 1;
4710
4711         if (used + bytes < space_info->total_bytes + avail)
4712                 return 1;
4713         return 0;
4714 }
4715
4716 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4717                                          unsigned long nr_pages, int nr_items)
4718 {
4719         struct super_block *sb = fs_info->sb;
4720
4721         if (down_read_trylock(&sb->s_umount)) {
4722                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4723                 up_read(&sb->s_umount);
4724         } else {
4725                 /*
4726                  * We needn't worry the filesystem going from r/w to r/o though
4727                  * we don't acquire ->s_umount mutex, because the filesystem
4728                  * should guarantee the delalloc inodes list be empty after
4729                  * the filesystem is readonly(all dirty pages are written to
4730                  * the disk).
4731                  */
4732                 btrfs_start_delalloc_roots(fs_info, nr_items);
4733                 if (!current->journal_info)
4734                         btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4735         }
4736 }
4737
4738 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4739                                         u64 to_reclaim)
4740 {
4741         u64 bytes;
4742         u64 nr;
4743
4744         bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4745         nr = div64_u64(to_reclaim, bytes);
4746         if (!nr)
4747                 nr = 1;
4748         return nr;
4749 }
4750
4751 #define EXTENT_SIZE_PER_ITEM    SZ_256K
4752
4753 /*
4754  * shrink metadata reservation for delalloc
4755  */
4756 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4757                             u64 orig, bool wait_ordered)
4758 {
4759         struct btrfs_space_info *space_info;
4760         struct btrfs_trans_handle *trans;
4761         u64 delalloc_bytes;
4762         u64 max_reclaim;
4763         u64 items;
4764         long time_left;
4765         unsigned long nr_pages;
4766         int loops;
4767
4768         /* Calc the number of the pages we need flush for space reservation */
4769         items = calc_reclaim_items_nr(fs_info, to_reclaim);
4770         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4771
4772         trans = (struct btrfs_trans_handle *)current->journal_info;
4773         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4774
4775         delalloc_bytes = percpu_counter_sum_positive(
4776                                                 &fs_info->delalloc_bytes);
4777         if (delalloc_bytes == 0) {
4778                 if (trans)
4779                         return;
4780                 if (wait_ordered)
4781                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4782                 return;
4783         }
4784
4785         loops = 0;
4786         while (delalloc_bytes && loops < 3) {
4787                 max_reclaim = min(delalloc_bytes, to_reclaim);
4788                 nr_pages = max_reclaim >> PAGE_SHIFT;
4789                 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4790                 /*
4791                  * We need to wait for the async pages to actually start before
4792                  * we do anything.
4793                  */
4794                 max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4795                 if (!max_reclaim)
4796                         goto skip_async;
4797
4798                 if (max_reclaim <= nr_pages)
4799                         max_reclaim = 0;
4800                 else
4801                         max_reclaim -= nr_pages;
4802
4803                 wait_event(fs_info->async_submit_wait,
4804                            atomic_read(&fs_info->async_delalloc_pages) <=
4805                            (int)max_reclaim);
4806 skip_async:
4807                 spin_lock(&space_info->lock);
4808                 if (list_empty(&space_info->tickets) &&
4809                     list_empty(&space_info->priority_tickets)) {
4810                         spin_unlock(&space_info->lock);
4811                         break;
4812                 }
4813                 spin_unlock(&space_info->lock);
4814
4815                 loops++;
4816                 if (wait_ordered && !trans) {
4817                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4818                 } else {
4819                         time_left = schedule_timeout_killable(1);
4820                         if (time_left)
4821                                 break;
4822                 }
4823                 delalloc_bytes = percpu_counter_sum_positive(
4824                                                 &fs_info->delalloc_bytes);
4825         }
4826 }
4827
4828 struct reserve_ticket {
4829         u64 bytes;
4830         int error;
4831         struct list_head list;
4832         wait_queue_head_t wait;
4833 };
4834
4835 /**
4836  * maybe_commit_transaction - possibly commit the transaction if its ok to
4837  * @root - the root we're allocating for
4838  * @bytes - the number of bytes we want to reserve
4839  * @force - force the commit
4840  *
4841  * This will check to make sure that committing the transaction will actually
4842  * get us somewhere and then commit the transaction if it does.  Otherwise it
4843  * will return -ENOSPC.
4844  */
4845 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4846                                   struct btrfs_space_info *space_info)
4847 {
4848         struct reserve_ticket *ticket = NULL;
4849         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4850         struct btrfs_trans_handle *trans;
4851         u64 bytes;
4852
4853         trans = (struct btrfs_trans_handle *)current->journal_info;
4854         if (trans)
4855                 return -EAGAIN;
4856
4857         spin_lock(&space_info->lock);
4858         if (!list_empty(&space_info->priority_tickets))
4859                 ticket = list_first_entry(&space_info->priority_tickets,
4860                                           struct reserve_ticket, list);
4861         else if (!list_empty(&space_info->tickets))
4862                 ticket = list_first_entry(&space_info->tickets,
4863                                           struct reserve_ticket, list);
4864         bytes = (ticket) ? ticket->bytes : 0;
4865         spin_unlock(&space_info->lock);
4866
4867         if (!bytes)
4868                 return 0;
4869
4870         /* See if there is enough pinned space to make this reservation */
4871         if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4872                                    bytes,
4873                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4874                 goto commit;
4875
4876         /*
4877          * See if there is some space in the delayed insertion reservation for
4878          * this reservation.
4879          */
4880         if (space_info != delayed_rsv->space_info)
4881                 return -ENOSPC;
4882
4883         spin_lock(&delayed_rsv->lock);
4884         if (delayed_rsv->size > bytes)
4885                 bytes = 0;
4886         else
4887                 bytes -= delayed_rsv->size;
4888         spin_unlock(&delayed_rsv->lock);
4889
4890         if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4891                                    bytes,
4892                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
4893                 return -ENOSPC;
4894         }
4895
4896 commit:
4897         trans = btrfs_join_transaction(fs_info->extent_root);
4898         if (IS_ERR(trans))
4899                 return -ENOSPC;
4900
4901         return btrfs_commit_transaction(trans);
4902 }
4903
4904 /*
4905  * Try to flush some data based on policy set by @state. This is only advisory
4906  * and may fail for various reasons. The caller is supposed to examine the
4907  * state of @space_info to detect the outcome.
4908  */
4909 static void flush_space(struct btrfs_fs_info *fs_info,
4910                        struct btrfs_space_info *space_info, u64 num_bytes,
4911                        int state)
4912 {
4913         struct btrfs_root *root = fs_info->extent_root;
4914         struct btrfs_trans_handle *trans;
4915         int nr;
4916         int ret = 0;
4917
4918         switch (state) {
4919         case FLUSH_DELAYED_ITEMS_NR:
4920         case FLUSH_DELAYED_ITEMS:
4921                 if (state == FLUSH_DELAYED_ITEMS_NR)
4922                         nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4923                 else
4924                         nr = -1;
4925
4926                 trans = btrfs_join_transaction(root);
4927                 if (IS_ERR(trans)) {
4928                         ret = PTR_ERR(trans);
4929                         break;
4930                 }
4931                 ret = btrfs_run_delayed_items_nr(trans, nr);
4932                 btrfs_end_transaction(trans);
4933                 break;
4934         case FLUSH_DELALLOC:
4935         case FLUSH_DELALLOC_WAIT:
4936                 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4937                                 state == FLUSH_DELALLOC_WAIT);
4938                 break;
4939         case ALLOC_CHUNK:
4940                 trans = btrfs_join_transaction(root);
4941                 if (IS_ERR(trans)) {
4942                         ret = PTR_ERR(trans);
4943                         break;
4944                 }
4945                 ret = do_chunk_alloc(trans,
4946                                      btrfs_metadata_alloc_profile(fs_info),
4947                                      CHUNK_ALLOC_NO_FORCE);
4948                 btrfs_end_transaction(trans);
4949                 if (ret > 0 || ret == -ENOSPC)
4950                         ret = 0;
4951                 break;
4952         case COMMIT_TRANS:
4953                 ret = may_commit_transaction(fs_info, space_info);
4954                 break;
4955         default:
4956                 ret = -ENOSPC;
4957                 break;
4958         }
4959
4960         trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4961                                 ret);
4962         return;
4963 }
4964
4965 static inline u64
4966 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4967                                  struct btrfs_space_info *space_info,
4968                                  bool system_chunk)
4969 {
4970         struct reserve_ticket *ticket;
4971         u64 used;
4972         u64 expected;
4973         u64 to_reclaim = 0;
4974
4975         list_for_each_entry(ticket, &space_info->tickets, list)
4976                 to_reclaim += ticket->bytes;
4977         list_for_each_entry(ticket, &space_info->priority_tickets, list)
4978                 to_reclaim += ticket->bytes;
4979         if (to_reclaim)
4980                 return to_reclaim;
4981
4982         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4983         if (can_overcommit(fs_info, space_info, to_reclaim,
4984                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4985                 return 0;
4986
4987         used = btrfs_space_info_used(space_info, true);
4988
4989         if (can_overcommit(fs_info, space_info, SZ_1M,
4990                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4991                 expected = div_factor_fine(space_info->total_bytes, 95);
4992         else
4993                 expected = div_factor_fine(space_info->total_bytes, 90);
4994
4995         if (used > expected)
4996                 to_reclaim = used - expected;
4997         else
4998                 to_reclaim = 0;
4999         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
5000                                      space_info->bytes_reserved);
5001         return to_reclaim;
5002 }
5003
5004 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5005                                         struct btrfs_space_info *space_info,
5006                                         u64 used, bool system_chunk)
5007 {
5008         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
5009
5010         /* If we're just plain full then async reclaim just slows us down. */
5011         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
5012                 return 0;
5013
5014         if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5015                                               system_chunk))
5016                 return 0;
5017
5018         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
5019                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5020 }
5021
5022 static void wake_all_tickets(struct list_head *head)
5023 {
5024         struct reserve_ticket *ticket;
5025
5026         while (!list_empty(head)) {
5027                 ticket = list_first_entry(head, struct reserve_ticket, list);
5028                 list_del_init(&ticket->list);
5029                 ticket->error = -ENOSPC;
5030                 wake_up(&ticket->wait);
5031         }
5032 }
5033
5034 /*
5035  * This is for normal flushers, we can wait all goddamned day if we want to.  We
5036  * will loop and continuously try to flush as long as we are making progress.
5037  * We count progress as clearing off tickets each time we have to loop.
5038  */
5039 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5040 {
5041         struct btrfs_fs_info *fs_info;
5042         struct btrfs_space_info *space_info;
5043         u64 to_reclaim;
5044         int flush_state;
5045         int commit_cycles = 0;
5046         u64 last_tickets_id;
5047
5048         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5049         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5050
5051         spin_lock(&space_info->lock);
5052         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5053                                                       false);
5054         if (!to_reclaim) {
5055                 space_info->flush = 0;
5056                 spin_unlock(&space_info->lock);
5057                 return;
5058         }
5059         last_tickets_id = space_info->tickets_id;
5060         spin_unlock(&space_info->lock);
5061
5062         flush_state = FLUSH_DELAYED_ITEMS_NR;
5063         do {
5064                 flush_space(fs_info, space_info, to_reclaim, flush_state);
5065                 spin_lock(&space_info->lock);
5066                 if (list_empty(&space_info->tickets)) {
5067                         space_info->flush = 0;
5068                         spin_unlock(&space_info->lock);
5069                         return;
5070                 }
5071                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5072                                                               space_info,
5073                                                               false);
5074                 if (last_tickets_id == space_info->tickets_id) {
5075                         flush_state++;
5076                 } else {
5077                         last_tickets_id = space_info->tickets_id;
5078                         flush_state = FLUSH_DELAYED_ITEMS_NR;
5079                         if (commit_cycles)
5080                                 commit_cycles--;
5081                 }
5082
5083                 if (flush_state > COMMIT_TRANS) {
5084                         commit_cycles++;
5085                         if (commit_cycles > 2) {
5086                                 wake_all_tickets(&space_info->tickets);
5087                                 space_info->flush = 0;
5088                         } else {
5089                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
5090                         }
5091                 }
5092                 spin_unlock(&space_info->lock);
5093         } while (flush_state <= COMMIT_TRANS);
5094 }
5095
5096 void btrfs_init_async_reclaim_work(struct work_struct *work)
5097 {
5098         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5099 }
5100
5101 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5102                                             struct btrfs_space_info *space_info,
5103                                             struct reserve_ticket *ticket)
5104 {
5105         u64 to_reclaim;
5106         int flush_state = FLUSH_DELAYED_ITEMS_NR;
5107
5108         spin_lock(&space_info->lock);
5109         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5110                                                       false);
5111         if (!to_reclaim) {
5112                 spin_unlock(&space_info->lock);
5113                 return;
5114         }
5115         spin_unlock(&space_info->lock);
5116
5117         do {
5118                 flush_space(fs_info, space_info, to_reclaim, flush_state);
5119                 flush_state++;
5120                 spin_lock(&space_info->lock);
5121                 if (ticket->bytes == 0) {
5122                         spin_unlock(&space_info->lock);
5123                         return;
5124                 }
5125                 spin_unlock(&space_info->lock);
5126
5127                 /*
5128                  * Priority flushers can't wait on delalloc without
5129                  * deadlocking.
5130                  */
5131                 if (flush_state == FLUSH_DELALLOC ||
5132                     flush_state == FLUSH_DELALLOC_WAIT)
5133                         flush_state = ALLOC_CHUNK;
5134         } while (flush_state < COMMIT_TRANS);
5135 }
5136
5137 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5138                                struct btrfs_space_info *space_info,
5139                                struct reserve_ticket *ticket, u64 orig_bytes)
5140
5141 {
5142         DEFINE_WAIT(wait);
5143         int ret = 0;
5144
5145         spin_lock(&space_info->lock);
5146         while (ticket->bytes > 0 && ticket->error == 0) {
5147                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5148                 if (ret) {
5149                         ret = -EINTR;
5150                         break;
5151                 }
5152                 spin_unlock(&space_info->lock);
5153
5154                 schedule();
5155
5156                 finish_wait(&ticket->wait, &wait);
5157                 spin_lock(&space_info->lock);
5158         }
5159         if (!ret)
5160                 ret = ticket->error;
5161         if (!list_empty(&ticket->list))
5162                 list_del_init(&ticket->list);
5163         if (ticket->bytes && ticket->bytes < orig_bytes) {
5164                 u64 num_bytes = orig_bytes - ticket->bytes;
5165                 space_info->bytes_may_use -= num_bytes;
5166                 trace_btrfs_space_reservation(fs_info, "space_info",
5167                                               space_info->flags, num_bytes, 0);
5168         }
5169         spin_unlock(&space_info->lock);
5170
5171         return ret;
5172 }
5173
5174 /**
5175  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5176  * @root - the root we're allocating for
5177  * @space_info - the space info we want to allocate from
5178  * @orig_bytes - the number of bytes we want
5179  * @flush - whether or not we can flush to make our reservation
5180  *
5181  * This will reserve orig_bytes number of bytes from the space info associated
5182  * with the block_rsv.  If there is not enough space it will make an attempt to
5183  * flush out space to make room.  It will do this by flushing delalloc if
5184  * possible or committing the transaction.  If flush is 0 then no attempts to
5185  * regain reservations will be made and this will fail if there is not enough
5186  * space already.
5187  */
5188 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5189                                     struct btrfs_space_info *space_info,
5190                                     u64 orig_bytes,
5191                                     enum btrfs_reserve_flush_enum flush,
5192                                     bool system_chunk)
5193 {
5194         struct reserve_ticket ticket;
5195         u64 used;
5196         int ret = 0;
5197
5198         ASSERT(orig_bytes);
5199         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5200
5201         spin_lock(&space_info->lock);
5202         ret = -ENOSPC;
5203         used = btrfs_space_info_used(space_info, true);
5204
5205         /*
5206          * If we have enough space then hooray, make our reservation and carry
5207          * on.  If not see if we can overcommit, and if we can, hooray carry on.
5208          * If not things get more complicated.
5209          */
5210         if (used + orig_bytes <= space_info->total_bytes) {
5211                 space_info->bytes_may_use += orig_bytes;
5212                 trace_btrfs_space_reservation(fs_info, "space_info",
5213                                               space_info->flags, orig_bytes, 1);
5214                 ret = 0;
5215         } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5216                                   system_chunk)) {
5217                 space_info->bytes_may_use += orig_bytes;
5218                 trace_btrfs_space_reservation(fs_info, "space_info",
5219                                               space_info->flags, orig_bytes, 1);
5220                 ret = 0;
5221         }
5222
5223         /*
5224          * If we couldn't make a reservation then setup our reservation ticket
5225          * and kick the async worker if it's not already running.
5226          *
5227          * If we are a priority flusher then we just need to add our ticket to
5228          * the list and we will do our own flushing further down.
5229          */
5230         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5231                 ticket.bytes = orig_bytes;
5232                 ticket.error = 0;
5233                 init_waitqueue_head(&ticket.wait);
5234                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5235                         list_add_tail(&ticket.list, &space_info->tickets);
5236                         if (!space_info->flush) {
5237                                 space_info->flush = 1;
5238                                 trace_btrfs_trigger_flush(fs_info,
5239                                                           space_info->flags,
5240                                                           orig_bytes, flush,
5241                                                           "enospc");
5242                                 queue_work(system_unbound_wq,
5243                                            &fs_info->async_reclaim_work);
5244                         }
5245                 } else {
5246                         list_add_tail(&ticket.list,
5247                                       &space_info->priority_tickets);
5248                 }
5249         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5250                 used += orig_bytes;
5251                 /*
5252                  * We will do the space reservation dance during log replay,
5253                  * which means we won't have fs_info->fs_root set, so don't do
5254                  * the async reclaim as we will panic.
5255                  */
5256                 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5257                     need_do_async_reclaim(fs_info, space_info,
5258                                           used, system_chunk) &&
5259                     !work_busy(&fs_info->async_reclaim_work)) {
5260                         trace_btrfs_trigger_flush(fs_info, space_info->flags,
5261                                                   orig_bytes, flush, "preempt");
5262                         queue_work(system_unbound_wq,
5263                                    &fs_info->async_reclaim_work);
5264                 }
5265         }
5266         spin_unlock(&space_info->lock);
5267         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5268                 return ret;
5269
5270         if (flush == BTRFS_RESERVE_FLUSH_ALL)
5271                 return wait_reserve_ticket(fs_info, space_info, &ticket,
5272                                            orig_bytes);
5273
5274         ret = 0;
5275         priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5276         spin_lock(&space_info->lock);
5277         if (ticket.bytes) {
5278                 if (ticket.bytes < orig_bytes) {
5279                         u64 num_bytes = orig_bytes - ticket.bytes;
5280                         space_info->bytes_may_use -= num_bytes;
5281                         trace_btrfs_space_reservation(fs_info, "space_info",
5282                                                       space_info->flags,
5283                                                       num_bytes, 0);
5284
5285                 }
5286                 list_del_init(&ticket.list);
5287                 ret = -ENOSPC;
5288         }
5289         spin_unlock(&space_info->lock);
5290         ASSERT(list_empty(&ticket.list));
5291         return ret;
5292 }
5293
5294 /**
5295  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5296  * @root - the root we're allocating for
5297  * @block_rsv - the block_rsv we're allocating for
5298  * @orig_bytes - the number of bytes we want
5299  * @flush - whether or not we can flush to make our reservation
5300  *
5301  * This will reserve orgi_bytes number of bytes from the space info associated
5302  * with the block_rsv.  If there is not enough space it will make an attempt to
5303  * flush out space to make room.  It will do this by flushing delalloc if
5304  * possible or committing the transaction.  If flush is 0 then no attempts to
5305  * regain reservations will be made and this will fail if there is not enough
5306  * space already.
5307  */
5308 static int reserve_metadata_bytes(struct btrfs_root *root,
5309                                   struct btrfs_block_rsv *block_rsv,
5310                                   u64 orig_bytes,
5311                                   enum btrfs_reserve_flush_enum flush)
5312 {
5313         struct btrfs_fs_info *fs_info = root->fs_info;
5314         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5315         int ret;
5316         bool system_chunk = (root == fs_info->chunk_root);
5317
5318         ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5319                                        orig_bytes, flush, system_chunk);
5320         if (ret == -ENOSPC &&
5321             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5322                 if (block_rsv != global_rsv &&
5323                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5324                         ret = 0;
5325         }
5326         if (ret == -ENOSPC) {
5327                 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5328                                               block_rsv->space_info->flags,
5329                                               orig_bytes, 1);
5330
5331                 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5332                         dump_space_info(fs_info, block_rsv->space_info,
5333                                         orig_bytes, 0);
5334         }
5335         return ret;
5336 }
5337
5338 static struct btrfs_block_rsv *get_block_rsv(
5339                                         const struct btrfs_trans_handle *trans,
5340                                         const struct btrfs_root *root)
5341 {
5342         struct btrfs_fs_info *fs_info = root->fs_info;
5343         struct btrfs_block_rsv *block_rsv = NULL;
5344
5345         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5346             (root == fs_info->csum_root && trans->adding_csums) ||
5347             (root == fs_info->uuid_root))
5348                 block_rsv = trans->block_rsv;
5349
5350         if (!block_rsv)
5351                 block_rsv = root->block_rsv;
5352
5353         if (!block_rsv)
5354                 block_rsv = &fs_info->empty_block_rsv;
5355
5356         return block_rsv;
5357 }
5358
5359 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5360                                u64 num_bytes)
5361 {
5362         int ret = -ENOSPC;
5363         spin_lock(&block_rsv->lock);
5364         if (block_rsv->reserved >= num_bytes) {
5365                 block_rsv->reserved -= num_bytes;
5366                 if (block_rsv->reserved < block_rsv->size)
5367                         block_rsv->full = 0;
5368                 ret = 0;
5369         }
5370         spin_unlock(&block_rsv->lock);
5371         return ret;
5372 }
5373
5374 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5375                                 u64 num_bytes, bool update_size)
5376 {
5377         spin_lock(&block_rsv->lock);
5378         block_rsv->reserved += num_bytes;
5379         if (update_size)
5380                 block_rsv->size += num_bytes;
5381         else if (block_rsv->reserved >= block_rsv->size)
5382                 block_rsv->full = 1;
5383         spin_unlock(&block_rsv->lock);
5384 }
5385
5386 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5387                              struct btrfs_block_rsv *dest, u64 num_bytes,
5388                              int min_factor)
5389 {
5390         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5391         u64 min_bytes;
5392
5393         if (global_rsv->space_info != dest->space_info)
5394                 return -ENOSPC;
5395
5396         spin_lock(&global_rsv->lock);
5397         min_bytes = div_factor(global_rsv->size, min_factor);
5398         if (global_rsv->reserved < min_bytes + num_bytes) {
5399                 spin_unlock(&global_rsv->lock);
5400                 return -ENOSPC;
5401         }
5402         global_rsv->reserved -= num_bytes;
5403         if (global_rsv->reserved < global_rsv->size)
5404                 global_rsv->full = 0;
5405         spin_unlock(&global_rsv->lock);
5406
5407         block_rsv_add_bytes(dest, num_bytes, true);
5408         return 0;
5409 }
5410
5411 /*
5412  * This is for space we already have accounted in space_info->bytes_may_use, so
5413  * basically when we're returning space from block_rsv's.
5414  */
5415 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5416                                      struct btrfs_space_info *space_info,
5417                                      u64 num_bytes)
5418 {
5419         struct reserve_ticket *ticket;
5420         struct list_head *head;
5421         u64 used;
5422         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5423         bool check_overcommit = false;
5424
5425         spin_lock(&space_info->lock);
5426         head = &space_info->priority_tickets;
5427
5428         /*
5429          * If we are over our limit then we need to check and see if we can
5430          * overcommit, and if we can't then we just need to free up our space
5431          * and not satisfy any requests.
5432          */
5433         used = btrfs_space_info_used(space_info, true);
5434         if (used - num_bytes >= space_info->total_bytes)
5435                 check_overcommit = true;
5436 again:
5437         while (!list_empty(head) && num_bytes) {
5438                 ticket = list_first_entry(head, struct reserve_ticket,
5439                                           list);
5440                 /*
5441                  * We use 0 bytes because this space is already reserved, so
5442                  * adding the ticket space would be a double count.
5443                  */
5444                 if (check_overcommit &&
5445                     !can_overcommit(fs_info, space_info, 0, flush, false))
5446                         break;
5447                 if (num_bytes >= ticket->bytes) {
5448                         list_del_init(&ticket->list);
5449                         num_bytes -= ticket->bytes;
5450                         ticket->bytes = 0;
5451                         space_info->tickets_id++;
5452                         wake_up(&ticket->wait);
5453                 } else {
5454                         ticket->bytes -= num_bytes;
5455                         num_bytes = 0;
5456                 }
5457         }
5458
5459         if (num_bytes && head == &space_info->priority_tickets) {
5460                 head = &space_info->tickets;
5461                 flush = BTRFS_RESERVE_FLUSH_ALL;
5462                 goto again;
5463         }
5464         space_info->bytes_may_use -= num_bytes;
5465         trace_btrfs_space_reservation(fs_info, "space_info",
5466                                       space_info->flags, num_bytes, 0);
5467         spin_unlock(&space_info->lock);
5468 }
5469
5470 /*
5471  * This is for newly allocated space that isn't accounted in
5472  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5473  * we use this helper.
5474  */
5475 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5476                                      struct btrfs_space_info *space_info,
5477                                      u64 num_bytes)
5478 {
5479         struct reserve_ticket *ticket;
5480         struct list_head *head = &space_info->priority_tickets;
5481
5482 again:
5483         while (!list_empty(head) && num_bytes) {
5484                 ticket = list_first_entry(head, struct reserve_ticket,
5485                                           list);
5486                 if (num_bytes >= ticket->bytes) {
5487                         trace_btrfs_space_reservation(fs_info, "space_info",
5488                                                       space_info->flags,
5489                                                       ticket->bytes, 1);
5490                         list_del_init(&ticket->list);
5491                         num_bytes -= ticket->bytes;
5492                         space_info->bytes_may_use += ticket->bytes;
5493                         ticket->bytes = 0;
5494                         space_info->tickets_id++;
5495                         wake_up(&ticket->wait);
5496                 } else {
5497                         trace_btrfs_space_reservation(fs_info, "space_info",
5498                                                       space_info->flags,
5499                                                       num_bytes, 1);
5500                         space_info->bytes_may_use += num_bytes;
5501                         ticket->bytes -= num_bytes;
5502                         num_bytes = 0;
5503                 }
5504         }
5505
5506         if (num_bytes && head == &space_info->priority_tickets) {
5507                 head = &space_info->tickets;
5508                 goto again;
5509         }
5510 }
5511
5512 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5513                                     struct btrfs_block_rsv *block_rsv,
5514                                     struct btrfs_block_rsv *dest, u64 num_bytes,
5515                                     u64 *qgroup_to_release_ret)
5516 {
5517         struct btrfs_space_info *space_info = block_rsv->space_info;
5518         u64 qgroup_to_release = 0;
5519         u64 ret;
5520
5521         spin_lock(&block_rsv->lock);
5522         if (num_bytes == (u64)-1) {
5523                 num_bytes = block_rsv->size;
5524                 qgroup_to_release = block_rsv->qgroup_rsv_size;
5525         }
5526         block_rsv->size -= num_bytes;
5527         if (block_rsv->reserved >= block_rsv->size) {
5528                 num_bytes = block_rsv->reserved - block_rsv->size;
5529                 block_rsv->reserved = block_rsv->size;
5530                 block_rsv->full = 1;
5531         } else {
5532                 num_bytes = 0;
5533         }
5534         if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5535                 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5536                                     block_rsv->qgroup_rsv_size;
5537                 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5538         } else {
5539                 qgroup_to_release = 0;
5540         }
5541         spin_unlock(&block_rsv->lock);
5542
5543         ret = num_bytes;
5544         if (num_bytes > 0) {
5545                 if (dest) {
5546                         spin_lock(&dest->lock);
5547                         if (!dest->full) {
5548                                 u64 bytes_to_add;
5549
5550                                 bytes_to_add = dest->size - dest->reserved;
5551                                 bytes_to_add = min(num_bytes, bytes_to_add);
5552                                 dest->reserved += bytes_to_add;
5553                                 if (dest->reserved >= dest->size)
5554                                         dest->full = 1;
5555                                 num_bytes -= bytes_to_add;
5556                         }
5557                         spin_unlock(&dest->lock);
5558                 }
5559                 if (num_bytes)
5560                         space_info_add_old_bytes(fs_info, space_info,
5561                                                  num_bytes);
5562         }
5563         if (qgroup_to_release_ret)
5564                 *qgroup_to_release_ret = qgroup_to_release;
5565         return ret;
5566 }
5567
5568 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5569                             struct btrfs_block_rsv *dst, u64 num_bytes,
5570                             bool update_size)
5571 {
5572         int ret;
5573
5574         ret = block_rsv_use_bytes(src, num_bytes);
5575         if (ret)
5576                 return ret;
5577
5578         block_rsv_add_bytes(dst, num_bytes, update_size);
5579         return 0;
5580 }
5581
5582 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5583 {
5584         memset(rsv, 0, sizeof(*rsv));
5585         spin_lock_init(&rsv->lock);
5586         rsv->type = type;
5587 }
5588
5589 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5590                                    struct btrfs_block_rsv *rsv,
5591                                    unsigned short type)
5592 {
5593         btrfs_init_block_rsv(rsv, type);
5594         rsv->space_info = __find_space_info(fs_info,
5595                                             BTRFS_BLOCK_GROUP_METADATA);
5596 }
5597
5598 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5599                                               unsigned short type)
5600 {
5601         struct btrfs_block_rsv *block_rsv;
5602
5603         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5604         if (!block_rsv)
5605                 return NULL;
5606
5607         btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5608         return block_rsv;
5609 }
5610
5611 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5612                           struct btrfs_block_rsv *rsv)
5613 {
5614         if (!rsv)
5615                 return;
5616         btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5617         kfree(rsv);
5618 }
5619
5620 int btrfs_block_rsv_add(struct btrfs_root *root,
5621                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5622                         enum btrfs_reserve_flush_enum flush)
5623 {
5624         int ret;
5625
5626         if (num_bytes == 0)
5627                 return 0;
5628
5629         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5630         if (!ret)
5631                 block_rsv_add_bytes(block_rsv, num_bytes, true);
5632
5633         return ret;
5634 }
5635
5636 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5637 {
5638         u64 num_bytes = 0;
5639         int ret = -ENOSPC;
5640
5641         if (!block_rsv)
5642                 return 0;
5643
5644         spin_lock(&block_rsv->lock);
5645         num_bytes = div_factor(block_rsv->size, min_factor);
5646         if (block_rsv->reserved >= num_bytes)
5647                 ret = 0;
5648         spin_unlock(&block_rsv->lock);
5649
5650         return ret;
5651 }
5652
5653 int btrfs_block_rsv_refill(struct btrfs_root *root,
5654                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5655                            enum btrfs_reserve_flush_enum flush)
5656 {
5657         u64 num_bytes = 0;
5658         int ret = -ENOSPC;
5659
5660         if (!block_rsv)
5661                 return 0;
5662
5663         spin_lock(&block_rsv->lock);
5664         num_bytes = min_reserved;
5665         if (block_rsv->reserved >= num_bytes)
5666                 ret = 0;
5667         else
5668                 num_bytes -= block_rsv->reserved;
5669         spin_unlock(&block_rsv->lock);
5670
5671         if (!ret)
5672                 return 0;
5673
5674         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5675         if (!ret) {
5676                 block_rsv_add_bytes(block_rsv, num_bytes, false);
5677                 return 0;
5678         }
5679
5680         return ret;
5681 }
5682
5683 /**
5684  * btrfs_inode_rsv_refill - refill the inode block rsv.
5685  * @inode - the inode we are refilling.
5686  * @flush - the flusing restriction.
5687  *
5688  * Essentially the same as btrfs_block_rsv_refill, except it uses the
5689  * block_rsv->size as the minimum size.  We'll either refill the missing amount
5690  * or return if we already have enough space.  This will also handle the resreve
5691  * tracepoint for the reserved amount.
5692  */
5693 static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5694                                   enum btrfs_reserve_flush_enum flush)
5695 {
5696         struct btrfs_root *root = inode->root;
5697         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5698         u64 num_bytes = 0;
5699         u64 qgroup_num_bytes = 0;
5700         int ret = -ENOSPC;
5701
5702         spin_lock(&block_rsv->lock);
5703         if (block_rsv->reserved < block_rsv->size)
5704                 num_bytes = block_rsv->size - block_rsv->reserved;
5705         if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5706                 qgroup_num_bytes = block_rsv->qgroup_rsv_size -
5707                                    block_rsv->qgroup_rsv_reserved;
5708         spin_unlock(&block_rsv->lock);
5709
5710         if (num_bytes == 0)
5711                 return 0;
5712
5713         ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
5714         if (ret)
5715                 return ret;
5716         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5717         if (!ret) {
5718                 block_rsv_add_bytes(block_rsv, num_bytes, false);
5719                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5720                                               btrfs_ino(inode), num_bytes, 1);
5721
5722                 /* Don't forget to increase qgroup_rsv_reserved */
5723                 spin_lock(&block_rsv->lock);
5724                 block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5725                 spin_unlock(&block_rsv->lock);
5726         } else
5727                 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5728         return ret;
5729 }
5730
5731 /**
5732  * btrfs_inode_rsv_release - release any excessive reservation.
5733  * @inode - the inode we need to release from.
5734  * @qgroup_free - free or convert qgroup meta.
5735  *   Unlike normal operation, qgroup meta reservation needs to know if we are
5736  *   freeing qgroup reservation or just converting it into per-trans.  Normally
5737  *   @qgroup_free is true for error handling, and false for normal release.
5738  *
5739  * This is the same as btrfs_block_rsv_release, except that it handles the
5740  * tracepoint for the reservation.
5741  */
5742 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5743 {
5744         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5745         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5746         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5747         u64 released = 0;
5748         u64 qgroup_to_release = 0;
5749
5750         /*
5751          * Since we statically set the block_rsv->size we just want to say we
5752          * are releasing 0 bytes, and then we'll just get the reservation over
5753          * the size free'd.
5754          */
5755         released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
5756                                            &qgroup_to_release);
5757         if (released > 0)
5758                 trace_btrfs_space_reservation(fs_info, "delalloc",
5759                                               btrfs_ino(inode), released, 0);
5760         if (qgroup_free)
5761                 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5762         else
5763                 btrfs_qgroup_convert_reserved_meta(inode->root,
5764                                                    qgroup_to_release);
5765 }
5766
5767 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5768                              struct btrfs_block_rsv *block_rsv,
5769                              u64 num_bytes)
5770 {
5771         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5772
5773         if (global_rsv == block_rsv ||
5774             block_rsv->space_info != global_rsv->space_info)
5775                 global_rsv = NULL;
5776         block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
5777 }
5778
5779 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5780 {
5781         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5782         struct btrfs_space_info *sinfo = block_rsv->space_info;
5783         u64 num_bytes;
5784
5785         /*
5786          * The global block rsv is based on the size of the extent tree, the
5787          * checksum tree and the root tree.  If the fs is empty we want to set
5788          * it to a minimal amount for safety.
5789          */
5790         num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5791                 btrfs_root_used(&fs_info->csum_root->root_item) +
5792                 btrfs_root_used(&fs_info->tree_root->root_item);
5793         num_bytes = max_t(u64, num_bytes, SZ_16M);
5794
5795         spin_lock(&sinfo->lock);
5796         spin_lock(&block_rsv->lock);
5797
5798         block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5799
5800         if (block_rsv->reserved < block_rsv->size) {
5801                 num_bytes = btrfs_space_info_used(sinfo, true);
5802                 if (sinfo->total_bytes > num_bytes) {
5803                         num_bytes = sinfo->total_bytes - num_bytes;
5804                         num_bytes = min(num_bytes,
5805                                         block_rsv->size - block_rsv->reserved);
5806                         block_rsv->reserved += num_bytes;
5807                         sinfo->bytes_may_use += num_bytes;
5808                         trace_btrfs_space_reservation(fs_info, "space_info",
5809                                                       sinfo->flags, num_bytes,
5810                                                       1);
5811                 }
5812         } else if (block_rsv->reserved > block_rsv->size) {
5813                 num_bytes = block_rsv->reserved - block_rsv->size;
5814                 sinfo->bytes_may_use -= num_bytes;
5815                 trace_btrfs_space_reservation(fs_info, "space_info",
5816                                       sinfo->flags, num_bytes, 0);
5817                 block_rsv->reserved = block_rsv->size;
5818         }
5819
5820         if (block_rsv->reserved == block_rsv->size)
5821                 block_rsv->full = 1;
5822         else
5823                 block_rsv->full = 0;
5824
5825         spin_unlock(&block_rsv->lock);
5826         spin_unlock(&sinfo->lock);
5827 }
5828
5829 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5830 {
5831         struct btrfs_space_info *space_info;
5832
5833         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5834         fs_info->chunk_block_rsv.space_info = space_info;
5835
5836         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5837         fs_info->global_block_rsv.space_info = space_info;
5838         fs_info->trans_block_rsv.space_info = space_info;
5839         fs_info->empty_block_rsv.space_info = space_info;
5840         fs_info->delayed_block_rsv.space_info = space_info;
5841
5842         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5843         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5844         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5845         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5846         if (fs_info->quota_root)
5847                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5848         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5849
5850         update_global_block_rsv(fs_info);
5851 }
5852
5853 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5854 {
5855         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5856                                 (u64)-1, NULL);
5857         WARN_ON(fs_info->trans_block_rsv.size > 0);
5858         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5859         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5860         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5861         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5862         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5863 }
5864
5865
5866 /*
5867  * To be called after all the new block groups attached to the transaction
5868  * handle have been created (btrfs_create_pending_block_groups()).
5869  */
5870 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5871 {
5872         struct btrfs_fs_info *fs_info = trans->fs_info;
5873
5874         if (!trans->chunk_bytes_reserved)
5875                 return;
5876
5877         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5878
5879         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5880                                 trans->chunk_bytes_reserved, NULL);
5881         trans->chunk_bytes_reserved = 0;
5882 }
5883
5884 /*
5885  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5886  * root: the root of the parent directory
5887  * rsv: block reservation
5888  * items: the number of items that we need do reservation
5889  * use_global_rsv: allow fallback to the global block reservation
5890  *
5891  * This function is used to reserve the space for snapshot/subvolume
5892  * creation and deletion. Those operations are different with the
5893  * common file/directory operations, they change two fs/file trees
5894  * and root tree, the number of items that the qgroup reserves is
5895  * different with the free space reservation. So we can not use
5896  * the space reservation mechanism in start_transaction().
5897  */
5898 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5899                                      struct btrfs_block_rsv *rsv, int items,
5900                                      bool use_global_rsv)
5901 {
5902         u64 qgroup_num_bytes = 0;
5903         u64 num_bytes;
5904         int ret;
5905         struct btrfs_fs_info *fs_info = root->fs_info;
5906         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5907
5908         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5909                 /* One for parent inode, two for dir entries */
5910                 qgroup_num_bytes = 3 * fs_info->nodesize;
5911                 ret = btrfs_qgroup_reserve_meta_prealloc(root,
5912                                 qgroup_num_bytes, true);
5913                 if (ret)
5914                         return ret;
5915         }
5916
5917         num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5918         rsv->space_info = __find_space_info(fs_info,
5919                                             BTRFS_BLOCK_GROUP_METADATA);
5920         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5921                                   BTRFS_RESERVE_FLUSH_ALL);
5922
5923         if (ret == -ENOSPC && use_global_rsv)
5924                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
5925
5926         if (ret && qgroup_num_bytes)
5927                 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5928
5929         return ret;
5930 }
5931
5932 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5933                                       struct btrfs_block_rsv *rsv)
5934 {
5935         btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5936 }
5937
5938 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5939                                                  struct btrfs_inode *inode)
5940 {
5941         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5942         u64 reserve_size = 0;
5943         u64 qgroup_rsv_size = 0;
5944         u64 csum_leaves;
5945         unsigned outstanding_extents;
5946
5947         lockdep_assert_held(&inode->lock);
5948         outstanding_extents = inode->outstanding_extents;
5949         if (outstanding_extents)
5950                 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
5951                                                 outstanding_extents + 1);
5952         csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
5953                                                  inode->csum_bytes);
5954         reserve_size += btrfs_calc_trans_metadata_size(fs_info,
5955                                                        csum_leaves);
5956         /*
5957          * For qgroup rsv, the calculation is very simple:
5958          * account one nodesize for each outstanding extent
5959          *
5960          * This is overestimating in most cases.
5961          */
5962         qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
5963
5964         spin_lock(&block_rsv->lock);
5965         block_rsv->size = reserve_size;
5966         block_rsv->qgroup_rsv_size = qgroup_rsv_size;
5967         spin_unlock(&block_rsv->lock);
5968 }
5969
5970 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
5971 {
5972         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5973         unsigned nr_extents;
5974         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5975         int ret = 0;
5976         bool delalloc_lock = true;
5977
5978         /* If we are a free space inode we need to not flush since we will be in
5979          * the middle of a transaction commit.  We also don't need the delalloc
5980          * mutex since we won't race with anybody.  We need this mostly to make
5981          * lockdep shut its filthy mouth.
5982          *
5983          * If we have a transaction open (can happen if we call truncate_block
5984          * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5985          */
5986         if (btrfs_is_free_space_inode(inode)) {
5987                 flush = BTRFS_RESERVE_NO_FLUSH;
5988                 delalloc_lock = false;
5989         } else {
5990                 if (current->journal_info)
5991                         flush = BTRFS_RESERVE_FLUSH_LIMIT;
5992
5993                 if (btrfs_transaction_in_commit(fs_info))
5994                         schedule_timeout(1);
5995         }
5996
5997         if (delalloc_lock)
5998                 mutex_lock(&inode->delalloc_mutex);
5999
6000         num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6001
6002         /* Add our new extents and calculate the new rsv size. */
6003         spin_lock(&inode->lock);
6004         nr_extents = count_max_extents(num_bytes);
6005         btrfs_mod_outstanding_extents(inode, nr_extents);
6006         inode->csum_bytes += num_bytes;
6007         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6008         spin_unlock(&inode->lock);
6009
6010         ret = btrfs_inode_rsv_refill(inode, flush);
6011         if (unlikely(ret))
6012                 goto out_fail;
6013
6014         if (delalloc_lock)
6015                 mutex_unlock(&inode->delalloc_mutex);
6016         return 0;
6017
6018 out_fail:
6019         spin_lock(&inode->lock);
6020         nr_extents = count_max_extents(num_bytes);
6021         btrfs_mod_outstanding_extents(inode, -nr_extents);
6022         inode->csum_bytes -= num_bytes;
6023         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6024         spin_unlock(&inode->lock);
6025
6026         btrfs_inode_rsv_release(inode, true);
6027         if (delalloc_lock)
6028                 mutex_unlock(&inode->delalloc_mutex);
6029         return ret;
6030 }
6031
6032 /**
6033  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6034  * @inode: the inode to release the reservation for.
6035  * @num_bytes: the number of bytes we are releasing.
6036  * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
6037  *
6038  * This will release the metadata reservation for an inode.  This can be called
6039  * once we complete IO for a given set of bytes to release their metadata
6040  * reservations, or on error for the same reason.
6041  */
6042 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
6043                                      bool qgroup_free)
6044 {
6045         struct btrfs_fs_info *fs_info = inode->root->fs_info;
6046
6047         num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6048         spin_lock(&inode->lock);
6049         inode->csum_bytes -= num_bytes;
6050         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6051         spin_unlock(&inode->lock);
6052
6053         if (btrfs_is_testing(fs_info))
6054                 return;
6055
6056         btrfs_inode_rsv_release(inode, qgroup_free);
6057 }
6058
6059 /**
6060  * btrfs_delalloc_release_extents - release our outstanding_extents
6061  * @inode: the inode to balance the reservation for.
6062  * @num_bytes: the number of bytes we originally reserved with
6063  * @qgroup_free: do we need to free qgroup meta reservation or convert them.
6064  *
6065  * When we reserve space we increase outstanding_extents for the extents we may
6066  * add.  Once we've set the range as delalloc or created our ordered extents we
6067  * have outstanding_extents to track the real usage, so we use this to free our
6068  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
6069  * with btrfs_delalloc_reserve_metadata.
6070  */
6071 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
6072                                     bool qgroup_free)
6073 {
6074         struct btrfs_fs_info *fs_info = inode->root->fs_info;
6075         unsigned num_extents;
6076
6077         spin_lock(&inode->lock);
6078         num_extents = count_max_extents(num_bytes);
6079         btrfs_mod_outstanding_extents(inode, -num_extents);
6080         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6081         spin_unlock(&inode->lock);
6082
6083         if (btrfs_is_testing(fs_info))
6084                 return;
6085
6086         btrfs_inode_rsv_release(inode, qgroup_free);
6087 }
6088
6089 /**
6090  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6091  * delalloc
6092  * @inode: inode we're writing to
6093  * @start: start range we are writing to
6094  * @len: how long the range we are writing to
6095  * @reserved: mandatory parameter, record actually reserved qgroup ranges of
6096  *            current reservation.
6097  *
6098  * This will do the following things
6099  *
6100  * o reserve space in data space info for num bytes
6101  *   and reserve precious corresponding qgroup space
6102  *   (Done in check_data_free_space)
6103  *
6104  * o reserve space for metadata space, based on the number of outstanding
6105  *   extents and how much csums will be needed
6106  *   also reserve metadata space in a per root over-reserve method.
6107  * o add to the inodes->delalloc_bytes
6108  * o add it to the fs_info's delalloc inodes list.
6109  *   (Above 3 all done in delalloc_reserve_metadata)
6110  *
6111  * Return 0 for success
6112  * Return <0 for error(-ENOSPC or -EQUOT)
6113  */
6114 int btrfs_delalloc_reserve_space(struct inode *inode,
6115                         struct extent_changeset **reserved, u64 start, u64 len)
6116 {
6117         int ret;
6118
6119         ret = btrfs_check_data_free_space(inode, reserved, start, len);
6120         if (ret < 0)
6121                 return ret;
6122         ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6123         if (ret < 0)
6124                 btrfs_free_reserved_data_space(inode, *reserved, start, len);
6125         return ret;
6126 }
6127
6128 /**
6129  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6130  * @inode: inode we're releasing space for
6131  * @start: start position of the space already reserved
6132  * @len: the len of the space already reserved
6133  * @release_bytes: the len of the space we consumed or didn't use
6134  *
6135  * This function will release the metadata space that was not used and will
6136  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6137  * list if there are no delalloc bytes left.
6138  * Also it will handle the qgroup reserved space.
6139  */
6140 void btrfs_delalloc_release_space(struct inode *inode,
6141                                   struct extent_changeset *reserved,
6142                                   u64 start, u64 len, bool qgroup_free)
6143 {
6144         btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6145         btrfs_free_reserved_data_space(inode, reserved, start, len);
6146 }
6147
6148 static int update_block_group(struct btrfs_trans_handle *trans,
6149                               struct btrfs_fs_info *info, u64 bytenr,
6150                               u64 num_bytes, int alloc)
6151 {
6152         struct btrfs_block_group_cache *cache = NULL;
6153         u64 total = num_bytes;
6154         u64 old_val;
6155         u64 byte_in_group;
6156         int factor;
6157
6158         /* block accounting for super block */
6159         spin_lock(&info->delalloc_root_lock);
6160         old_val = btrfs_super_bytes_used(info->super_copy);
6161         if (alloc)
6162                 old_val += num_bytes;
6163         else
6164                 old_val -= num_bytes;
6165         btrfs_set_super_bytes_used(info->super_copy, old_val);
6166         spin_unlock(&info->delalloc_root_lock);
6167
6168         while (total) {
6169                 cache = btrfs_lookup_block_group(info, bytenr);
6170                 if (!cache)
6171                         return -ENOENT;
6172                 factor = btrfs_bg_type_to_factor(cache->flags);
6173
6174                 /*
6175                  * If this block group has free space cache written out, we
6176                  * need to make sure to load it if we are removing space.  This
6177                  * is because we need the unpinning stage to actually add the
6178                  * space back to the block group, otherwise we will leak space.
6179                  */
6180                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
6181                         cache_block_group(cache, 1);
6182
6183                 byte_in_group = bytenr - cache->key.objectid;
6184                 WARN_ON(byte_in_group > cache->key.offset);
6185
6186                 spin_lock(&cache->space_info->lock);
6187                 spin_lock(&cache->lock);
6188
6189                 if (btrfs_test_opt(info, SPACE_CACHE) &&
6190                     cache->disk_cache_state < BTRFS_DC_CLEAR)
6191                         cache->disk_cache_state = BTRFS_DC_CLEAR;
6192
6193                 old_val = btrfs_block_group_used(&cache->item);
6194                 num_bytes = min(total, cache->key.offset - byte_in_group);
6195                 if (alloc) {
6196                         old_val += num_bytes;
6197                         btrfs_set_block_group_used(&cache->item, old_val);
6198                         cache->reserved -= num_bytes;
6199                         cache->space_info->bytes_reserved -= num_bytes;
6200                         cache->space_info->bytes_used += num_bytes;
6201                         cache->space_info->disk_used += num_bytes * factor;
6202                         spin_unlock(&cache->lock);
6203                         spin_unlock(&cache->space_info->lock);
6204                 } else {
6205                         old_val -= num_bytes;
6206                         btrfs_set_block_group_used(&cache->item, old_val);
6207                         cache->pinned += num_bytes;
6208                         cache->space_info->bytes_pinned += num_bytes;
6209                         cache->space_info->bytes_used -= num_bytes;
6210                         cache->space_info->disk_used -= num_bytes * factor;
6211                         spin_unlock(&cache->lock);
6212                         spin_unlock(&cache->space_info->lock);
6213
6214                         trace_btrfs_space_reservation(info, "pinned",
6215                                                       cache->space_info->flags,
6216                                                       num_bytes, 1);
6217                         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6218                                            num_bytes,
6219                                            BTRFS_TOTAL_BYTES_PINNED_BATCH);
6220                         set_extent_dirty(info->pinned_extents,
6221                                          bytenr, bytenr + num_bytes - 1,
6222                                          GFP_NOFS | __GFP_NOFAIL);
6223                 }
6224
6225                 spin_lock(&trans->transaction->dirty_bgs_lock);
6226                 if (list_empty(&cache->dirty_list)) {
6227                         list_add_tail(&cache->dirty_list,
6228                                       &trans->transaction->dirty_bgs);
6229                         trans->transaction->num_dirty_bgs++;
6230                         btrfs_get_block_group(cache);
6231                 }
6232                 spin_unlock(&trans->transaction->dirty_bgs_lock);
6233
6234                 /*
6235                  * No longer have used bytes in this block group, queue it for
6236                  * deletion. We do this after adding the block group to the
6237                  * dirty list to avoid races between cleaner kthread and space
6238                  * cache writeout.
6239                  */
6240                 if (!alloc && old_val == 0)
6241                         btrfs_mark_bg_unused(cache);
6242
6243                 btrfs_put_block_group(cache);
6244                 total -= num_bytes;
6245                 bytenr += num_bytes;
6246         }
6247         return 0;
6248 }
6249
6250 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6251 {
6252         struct btrfs_block_group_cache *cache;
6253         u64 bytenr;
6254
6255         spin_lock(&fs_info->block_group_cache_lock);
6256         bytenr = fs_info->first_logical_byte;
6257         spin_unlock(&fs_info->block_group_cache_lock);
6258
6259         if (bytenr < (u64)-1)
6260                 return bytenr;
6261
6262         cache = btrfs_lookup_first_block_group(fs_info, search_start);
6263         if (!cache)
6264                 return 0;
6265
6266         bytenr = cache->key.objectid;
6267         btrfs_put_block_group(cache);
6268
6269         return bytenr;
6270 }
6271
6272 static int pin_down_extent(struct btrfs_fs_info *fs_info,
6273                            struct btrfs_block_group_cache *cache,
6274                            u64 bytenr, u64 num_bytes, int reserved)
6275 {
6276         spin_lock(&cache->space_info->lock);
6277         spin_lock(&cache->lock);
6278         cache->pinned += num_bytes;
6279         cache->space_info->bytes_pinned += num_bytes;
6280         if (reserved) {
6281                 cache->reserved -= num_bytes;
6282                 cache->space_info->bytes_reserved -= num_bytes;
6283         }
6284         spin_unlock(&cache->lock);
6285         spin_unlock(&cache->space_info->lock);
6286
6287         trace_btrfs_space_reservation(fs_info, "pinned",
6288                                       cache->space_info->flags, num_bytes, 1);
6289         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6290                     num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6291         set_extent_dirty(fs_info->pinned_extents, bytenr,
6292                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6293         return 0;
6294 }
6295
6296 /*
6297  * this function must be called within transaction
6298  */
6299 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6300                      u64 bytenr, u64 num_bytes, int reserved)
6301 {
6302         struct btrfs_block_group_cache *cache;
6303
6304         cache = btrfs_lookup_block_group(fs_info, bytenr);
6305         BUG_ON(!cache); /* Logic error */
6306
6307         pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
6308
6309         btrfs_put_block_group(cache);
6310         return 0;
6311 }
6312
6313 /*
6314  * this function must be called within transaction
6315  */
6316 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6317                                     u64 bytenr, u64 num_bytes)
6318 {
6319         struct btrfs_block_group_cache *cache;
6320         int ret;
6321
6322         cache = btrfs_lookup_block_group(fs_info, bytenr);
6323         if (!cache)
6324                 return -EINVAL;
6325
6326         /*
6327          * pull in the free space cache (if any) so that our pin
6328          * removes the free space from the cache.  We have load_only set
6329          * to one because the slow code to read in the free extents does check
6330          * the pinned extents.
6331          */
6332         cache_block_group(cache, 1);
6333
6334         pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
6335
6336         /* remove us from the free space cache (if we're there at all) */
6337         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6338         btrfs_put_block_group(cache);
6339         return ret;
6340 }
6341
6342 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6343                                    u64 start, u64 num_bytes)
6344 {
6345         int ret;
6346         struct btrfs_block_group_cache *block_group;
6347         struct btrfs_caching_control *caching_ctl;
6348
6349         block_group = btrfs_lookup_block_group(fs_info, start);
6350         if (!block_group)
6351                 return -EINVAL;
6352
6353         cache_block_group(block_group, 0);
6354         caching_ctl = get_caching_control(block_group);
6355
6356         if (!caching_ctl) {
6357                 /* Logic error */
6358                 BUG_ON(!block_group_cache_done(block_group));
6359                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6360         } else {
6361                 mutex_lock(&caching_ctl->mutex);
6362
6363                 if (start >= caching_ctl->progress) {
6364                         ret = add_excluded_extent(fs_info, start, num_bytes);
6365                 } else if (start + num_bytes <= caching_ctl->progress) {
6366                         ret = btrfs_remove_free_space(block_group,
6367                                                       start, num_bytes);
6368                 } else {
6369                         num_bytes = caching_ctl->progress - start;
6370                         ret = btrfs_remove_free_space(block_group,
6371                                                       start, num_bytes);
6372                         if (ret)
6373                                 goto out_lock;
6374
6375                         num_bytes = (start + num_bytes) -
6376                                 caching_ctl->progress;
6377                         start = caching_ctl->progress;
6378                         ret = add_excluded_extent(fs_info, start, num_bytes);
6379                 }
6380 out_lock:
6381                 mutex_unlock(&caching_ctl->mutex);
6382                 put_caching_control(caching_ctl);
6383         }
6384         btrfs_put_block_group(block_group);
6385         return ret;
6386 }
6387
6388 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6389                                  struct extent_buffer *eb)
6390 {
6391         struct btrfs_file_extent_item *item;
6392         struct btrfs_key key;
6393         int found_type;
6394         int i;
6395         int ret = 0;
6396
6397         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6398                 return 0;
6399
6400         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6401                 btrfs_item_key_to_cpu(eb, &key, i);
6402                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6403                         continue;
6404                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6405                 found_type = btrfs_file_extent_type(eb, item);
6406                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6407                         continue;
6408                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6409                         continue;
6410                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6411                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6412                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
6413                 if (ret)
6414                         break;
6415         }
6416
6417         return ret;
6418 }
6419
6420 static void
6421 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6422 {
6423         atomic_inc(&bg->reservations);
6424 }
6425
6426 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6427                                         const u64 start)
6428 {
6429         struct btrfs_block_group_cache *bg;
6430
6431         bg = btrfs_lookup_block_group(fs_info, start);
6432         ASSERT(bg);
6433         if (atomic_dec_and_test(&bg->reservations))
6434                 wake_up_var(&bg->reservations);
6435         btrfs_put_block_group(bg);
6436 }
6437
6438 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6439 {
6440         struct btrfs_space_info *space_info = bg->space_info;
6441
6442         ASSERT(bg->ro);
6443
6444         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6445                 return;
6446
6447         /*
6448          * Our block group is read only but before we set it to read only,
6449          * some task might have had allocated an extent from it already, but it
6450          * has not yet created a respective ordered extent (and added it to a
6451          * root's list of ordered extents).
6452          * Therefore wait for any task currently allocating extents, since the
6453          * block group's reservations counter is incremented while a read lock
6454          * on the groups' semaphore is held and decremented after releasing
6455          * the read access on that semaphore and creating the ordered extent.
6456          */
6457         down_write(&space_info->groups_sem);
6458         up_write(&space_info->groups_sem);
6459
6460         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6461 }
6462
6463 /**
6464  * btrfs_add_reserved_bytes - update the block_group and space info counters
6465  * @cache:      The cache we are manipulating
6466  * @ram_bytes:  The number of bytes of file content, and will be same to
6467  *              @num_bytes except for the compress path.
6468  * @num_bytes:  The number of bytes in question
6469  * @delalloc:   The blocks are allocated for the delalloc write
6470  *
6471  * This is called by the allocator when it reserves space. If this is a
6472  * reservation and the block group has become read only we cannot make the
6473  * reservation and return -EAGAIN, otherwise this function always succeeds.
6474  */
6475 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6476                                     u64 ram_bytes, u64 num_bytes, int delalloc)
6477 {
6478         struct btrfs_space_info *space_info = cache->space_info;
6479         int ret = 0;
6480
6481         spin_lock(&space_info->lock);
6482         spin_lock(&cache->lock);
6483         if (cache->ro) {
6484                 ret = -EAGAIN;
6485         } else {
6486                 cache->reserved += num_bytes;
6487                 space_info->bytes_reserved += num_bytes;
6488                 space_info->bytes_may_use -= ram_bytes;
6489                 if (delalloc)
6490                         cache->delalloc_bytes += num_bytes;
6491         }
6492         spin_unlock(&cache->lock);
6493         spin_unlock(&space_info->lock);
6494         return ret;
6495 }
6496
6497 /**
6498  * btrfs_free_reserved_bytes - update the block_group and space info counters
6499  * @cache:      The cache we are manipulating
6500  * @num_bytes:  The number of bytes in question
6501  * @delalloc:   The blocks are allocated for the delalloc write
6502  *
6503  * This is called by somebody who is freeing space that was never actually used
6504  * on disk.  For example if you reserve some space for a new leaf in transaction
6505  * A and before transaction A commits you free that leaf, you call this with
6506  * reserve set to 0 in order to clear the reservation.
6507  */
6508
6509 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6510                                       u64 num_bytes, int delalloc)
6511 {
6512         struct btrfs_space_info *space_info = cache->space_info;
6513
6514         spin_lock(&space_info->lock);
6515         spin_lock(&cache->lock);
6516         if (cache->ro)
6517                 space_info->bytes_readonly += num_bytes;
6518         cache->reserved -= num_bytes;
6519         space_info->bytes_reserved -= num_bytes;
6520
6521         if (delalloc)
6522                 cache->delalloc_bytes -= num_bytes;
6523         spin_unlock(&cache->lock);
6524         spin_unlock(&space_info->lock);
6525 }
6526 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6527 {
6528         struct btrfs_caching_control *next;
6529         struct btrfs_caching_control *caching_ctl;
6530         struct btrfs_block_group_cache *cache;
6531
6532         down_write(&fs_info->commit_root_sem);
6533
6534         list_for_each_entry_safe(caching_ctl, next,
6535                                  &fs_info->caching_block_groups, list) {
6536                 cache = caching_ctl->block_group;
6537                 if (block_group_cache_done(cache)) {
6538                         cache->last_byte_to_unpin = (u64)-1;
6539                         list_del_init(&caching_ctl->list);
6540                         put_caching_control(caching_ctl);
6541                 } else {
6542                         cache->last_byte_to_unpin = caching_ctl->progress;
6543                 }
6544         }
6545
6546         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6547                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6548         else
6549                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6550
6551         up_write(&fs_info->commit_root_sem);
6552
6553         update_global_block_rsv(fs_info);
6554 }
6555
6556 /*
6557  * Returns the free cluster for the given space info and sets empty_cluster to
6558  * what it should be based on the mount options.
6559  */
6560 static struct btrfs_free_cluster *
6561 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6562                    struct btrfs_space_info *space_info, u64 *empty_cluster)
6563 {
6564         struct btrfs_free_cluster *ret = NULL;
6565
6566         *empty_cluster = 0;
6567         if (btrfs_mixed_space_info(space_info))
6568                 return ret;
6569
6570         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6571                 ret = &fs_info->meta_alloc_cluster;
6572                 if (btrfs_test_opt(fs_info, SSD))
6573                         *empty_cluster = SZ_2M;
6574                 else
6575                         *empty_cluster = SZ_64K;
6576         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6577                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
6578                 *empty_cluster = SZ_2M;
6579                 ret = &fs_info->data_alloc_cluster;
6580         }
6581
6582         return ret;
6583 }
6584
6585 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6586                               u64 start, u64 end,
6587                               const bool return_free_space)
6588 {
6589         struct btrfs_block_group_cache *cache = NULL;
6590         struct btrfs_space_info *space_info;
6591         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6592         struct btrfs_free_cluster *cluster = NULL;
6593         u64 len;
6594         u64 total_unpinned = 0;
6595         u64 empty_cluster = 0;
6596         bool readonly;
6597
6598         while (start <= end) {
6599                 readonly = false;
6600                 if (!cache ||
6601                     start >= cache->key.objectid + cache->key.offset) {
6602                         if (cache)
6603                                 btrfs_put_block_group(cache);
6604                         total_unpinned = 0;
6605                         cache = btrfs_lookup_block_group(fs_info, start);
6606                         BUG_ON(!cache); /* Logic error */
6607
6608                         cluster = fetch_cluster_info(fs_info,
6609                                                      cache->space_info,
6610                                                      &empty_cluster);
6611                         empty_cluster <<= 1;
6612                 }
6613
6614                 len = cache->key.objectid + cache->key.offset - start;
6615                 len = min(len, end + 1 - start);
6616
6617                 if (start < cache->last_byte_to_unpin) {
6618                         len = min(len, cache->last_byte_to_unpin - start);
6619                         if (return_free_space)
6620                                 btrfs_add_free_space(cache, start, len);
6621                 }
6622
6623                 start += len;
6624                 total_unpinned += len;
6625                 space_info = cache->space_info;
6626
6627                 /*
6628                  * If this space cluster has been marked as fragmented and we've
6629                  * unpinned enough in this block group to potentially allow a
6630                  * cluster to be created inside of it go ahead and clear the
6631                  * fragmented check.
6632                  */
6633                 if (cluster && cluster->fragmented &&
6634                     total_unpinned > empty_cluster) {
6635                         spin_lock(&cluster->lock);
6636                         cluster->fragmented = 0;
6637                         spin_unlock(&cluster->lock);
6638                 }
6639
6640                 spin_lock(&space_info->lock);
6641                 spin_lock(&cache->lock);
6642                 cache->pinned -= len;
6643                 space_info->bytes_pinned -= len;
6644
6645                 trace_btrfs_space_reservation(fs_info, "pinned",
6646                                               space_info->flags, len, 0);
6647                 space_info->max_extent_size = 0;
6648                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
6649                             -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6650                 if (cache->ro) {
6651                         space_info->bytes_readonly += len;
6652                         readonly = true;
6653                 }
6654                 spin_unlock(&cache->lock);
6655                 if (!readonly && return_free_space &&
6656                     global_rsv->space_info == space_info) {
6657                         u64 to_add = len;
6658
6659                         spin_lock(&global_rsv->lock);
6660                         if (!global_rsv->full) {
6661                                 to_add = min(len, global_rsv->size -
6662                                              global_rsv->reserved);
6663                                 global_rsv->reserved += to_add;
6664                                 space_info->bytes_may_use += to_add;
6665                                 if (global_rsv->reserved >= global_rsv->size)
6666                                         global_rsv->full = 1;
6667                                 trace_btrfs_space_reservation(fs_info,
6668                                                               "space_info",
6669                                                               space_info->flags,
6670                                                               to_add, 1);
6671                                 len -= to_add;
6672                         }
6673                         spin_unlock(&global_rsv->lock);
6674                         /* Add to any tickets we may have */
6675                         if (len)
6676                                 space_info_add_new_bytes(fs_info, space_info,
6677                                                          len);
6678                 }
6679                 spin_unlock(&space_info->lock);
6680         }
6681
6682         if (cache)
6683                 btrfs_put_block_group(cache);
6684         return 0;
6685 }
6686
6687 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6688 {
6689         struct btrfs_fs_info *fs_info = trans->fs_info;
6690         struct btrfs_block_group_cache *block_group, *tmp;
6691         struct list_head *deleted_bgs;
6692         struct extent_io_tree *unpin;
6693         u64 start;
6694         u64 end;
6695         int ret;
6696
6697         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6698                 unpin = &fs_info->freed_extents[1];
6699         else
6700                 unpin = &fs_info->freed_extents[0];
6701
6702         while (!trans->aborted) {
6703                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6704                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6705                                             EXTENT_DIRTY, NULL);
6706                 if (ret) {
6707                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6708                         break;
6709                 }
6710
6711                 if (btrfs_test_opt(fs_info, DISCARD))
6712                         ret = btrfs_discard_extent(fs_info, start,
6713                                                    end + 1 - start, NULL);
6714
6715                 clear_extent_dirty(unpin, start, end);
6716                 unpin_extent_range(fs_info, start, end, true);
6717                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6718                 cond_resched();
6719         }
6720
6721         /*
6722          * Transaction is finished.  We don't need the lock anymore.  We
6723          * do need to clean up the block groups in case of a transaction
6724          * abort.
6725          */
6726         deleted_bgs = &trans->transaction->deleted_bgs;
6727         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6728                 u64 trimmed = 0;
6729
6730                 ret = -EROFS;
6731                 if (!trans->aborted)
6732                         ret = btrfs_discard_extent(fs_info,
6733                                                    block_group->key.objectid,
6734                                                    block_group->key.offset,
6735                                                    &trimmed);
6736
6737                 list_del_init(&block_group->bg_list);
6738                 btrfs_put_block_group_trimming(block_group);
6739                 btrfs_put_block_group(block_group);
6740
6741                 if (ret) {
6742                         const char *errstr = btrfs_decode_error(ret);
6743                         btrfs_warn(fs_info,
6744                            "discard failed while removing blockgroup: errno=%d %s",
6745                                    ret, errstr);
6746                 }
6747         }
6748
6749         return 0;
6750 }
6751
6752 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6753                                struct btrfs_delayed_ref_node *node, u64 parent,
6754                                u64 root_objectid, u64 owner_objectid,
6755                                u64 owner_offset, int refs_to_drop,
6756                                struct btrfs_delayed_extent_op *extent_op)
6757 {
6758         struct btrfs_fs_info *info = trans->fs_info;
6759         struct btrfs_key key;
6760         struct btrfs_path *path;
6761         struct btrfs_root *extent_root = info->extent_root;
6762         struct extent_buffer *leaf;
6763         struct btrfs_extent_item *ei;
6764         struct btrfs_extent_inline_ref *iref;
6765         int ret;
6766         int is_data;
6767         int extent_slot = 0;
6768         int found_extent = 0;
6769         int num_to_del = 1;
6770         u32 item_size;
6771         u64 refs;
6772         u64 bytenr = node->bytenr;
6773         u64 num_bytes = node->num_bytes;
6774         int last_ref = 0;
6775         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6776
6777         path = btrfs_alloc_path();
6778         if (!path)
6779                 return -ENOMEM;
6780
6781         path->reada = READA_FORWARD;
6782         path->leave_spinning = 1;
6783
6784         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6785         BUG_ON(!is_data && refs_to_drop != 1);
6786
6787         if (is_data)
6788                 skinny_metadata = false;
6789
6790         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
6791                                     parent, root_objectid, owner_objectid,
6792                                     owner_offset);
6793         if (ret == 0) {
6794                 extent_slot = path->slots[0];
6795                 while (extent_slot >= 0) {
6796                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6797                                               extent_slot);
6798                         if (key.objectid != bytenr)
6799                                 break;
6800                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6801                             key.offset == num_bytes) {
6802                                 found_extent = 1;
6803                                 break;
6804                         }
6805                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6806                             key.offset == owner_objectid) {
6807                                 found_extent = 1;
6808                                 break;
6809                         }
6810                         if (path->slots[0] - extent_slot > 5)
6811                                 break;
6812                         extent_slot--;
6813                 }
6814
6815                 if (!found_extent) {
6816                         BUG_ON(iref);
6817                         ret = remove_extent_backref(trans, path, NULL,
6818                                                     refs_to_drop,
6819                                                     is_data, &last_ref);
6820                         if (ret) {
6821                                 btrfs_abort_transaction(trans, ret);
6822                                 goto out;
6823                         }
6824                         btrfs_release_path(path);
6825                         path->leave_spinning = 1;
6826
6827                         key.objectid = bytenr;
6828                         key.type = BTRFS_EXTENT_ITEM_KEY;
6829                         key.offset = num_bytes;
6830
6831                         if (!is_data && skinny_metadata) {
6832                                 key.type = BTRFS_METADATA_ITEM_KEY;
6833                                 key.offset = owner_objectid;
6834                         }
6835
6836                         ret = btrfs_search_slot(trans, extent_root,
6837                                                 &key, path, -1, 1);
6838                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6839                                 /*
6840                                  * Couldn't find our skinny metadata item,
6841                                  * see if we have ye olde extent item.
6842                                  */
6843                                 path->slots[0]--;
6844                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6845                                                       path->slots[0]);
6846                                 if (key.objectid == bytenr &&
6847                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6848                                     key.offset == num_bytes)
6849                                         ret = 0;
6850                         }
6851
6852                         if (ret > 0 && skinny_metadata) {
6853                                 skinny_metadata = false;
6854                                 key.objectid = bytenr;
6855                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6856                                 key.offset = num_bytes;
6857                                 btrfs_release_path(path);
6858                                 ret = btrfs_search_slot(trans, extent_root,
6859                                                         &key, path, -1, 1);
6860                         }
6861
6862                         if (ret) {
6863                                 btrfs_err(info,
6864                                           "umm, got %d back from search, was looking for %llu",
6865                                           ret, bytenr);
6866                                 if (ret > 0)
6867                                         btrfs_print_leaf(path->nodes[0]);
6868                         }
6869                         if (ret < 0) {
6870                                 btrfs_abort_transaction(trans, ret);
6871                                 goto out;
6872                         }
6873                         extent_slot = path->slots[0];
6874                 }
6875         } else if (WARN_ON(ret == -ENOENT)) {
6876                 btrfs_print_leaf(path->nodes[0]);
6877                 btrfs_err(info,
6878                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6879                         bytenr, parent, root_objectid, owner_objectid,
6880                         owner_offset);
6881                 btrfs_abort_transaction(trans, ret);
6882                 goto out;
6883         } else {
6884                 btrfs_abort_transaction(trans, ret);
6885                 goto out;
6886         }
6887
6888         leaf = path->nodes[0];
6889         item_size = btrfs_item_size_nr(leaf, extent_slot);
6890         if (unlikely(item_size < sizeof(*ei))) {
6891                 ret = -EINVAL;
6892                 btrfs_print_v0_err(info);
6893                 btrfs_abort_transaction(trans, ret);
6894                 goto out;
6895         }
6896         ei = btrfs_item_ptr(leaf, extent_slot,
6897                             struct btrfs_extent_item);
6898         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6899             key.type == BTRFS_EXTENT_ITEM_KEY) {
6900                 struct btrfs_tree_block_info *bi;
6901                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6902                 bi = (struct btrfs_tree_block_info *)(ei + 1);
6903                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6904         }
6905
6906         refs = btrfs_extent_refs(leaf, ei);
6907         if (refs < refs_to_drop) {
6908                 btrfs_err(info,
6909                           "trying to drop %d refs but we only have %Lu for bytenr %Lu",
6910                           refs_to_drop, refs, bytenr);
6911                 ret = -EINVAL;
6912                 btrfs_abort_transaction(trans, ret);
6913                 goto out;
6914         }
6915         refs -= refs_to_drop;
6916
6917         if (refs > 0) {
6918                 if (extent_op)
6919                         __run_delayed_extent_op(extent_op, leaf, ei);
6920                 /*
6921                  * In the case of inline back ref, reference count will
6922                  * be updated by remove_extent_backref
6923                  */
6924                 if (iref) {
6925                         BUG_ON(!found_extent);
6926                 } else {
6927                         btrfs_set_extent_refs(leaf, ei, refs);
6928                         btrfs_mark_buffer_dirty(leaf);
6929                 }
6930                 if (found_extent) {
6931                         ret = remove_extent_backref(trans, path, iref,
6932                                                     refs_to_drop, is_data,
6933                                                     &last_ref);
6934                         if (ret) {
6935                                 btrfs_abort_transaction(trans, ret);
6936                                 goto out;
6937                         }
6938                 }
6939         } else {
6940                 if (found_extent) {
6941                         BUG_ON(is_data && refs_to_drop !=
6942                                extent_data_ref_count(path, iref));
6943                         if (iref) {
6944                                 BUG_ON(path->slots[0] != extent_slot);
6945                         } else {
6946                                 BUG_ON(path->slots[0] != extent_slot + 1);
6947                                 path->slots[0] = extent_slot;
6948                                 num_to_del = 2;
6949                         }
6950                 }
6951
6952                 last_ref = 1;
6953                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6954                                       num_to_del);
6955                 if (ret) {
6956                         btrfs_abort_transaction(trans, ret);
6957                         goto out;
6958                 }
6959                 btrfs_release_path(path);
6960
6961                 if (is_data) {
6962                         ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
6963                         if (ret) {
6964                                 btrfs_abort_transaction(trans, ret);
6965                                 goto out;
6966                         }
6967                 }
6968
6969                 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
6970                 if (ret) {
6971                         btrfs_abort_transaction(trans, ret);
6972                         goto out;
6973                 }
6974
6975                 ret = update_block_group(trans, info, bytenr, num_bytes, 0);
6976                 if (ret) {
6977                         btrfs_abort_transaction(trans, ret);
6978                         goto out;
6979                 }
6980         }
6981         btrfs_release_path(path);
6982
6983 out:
6984         btrfs_free_path(path);
6985         return ret;
6986 }
6987
6988 /*
6989  * when we free an block, it is possible (and likely) that we free the last
6990  * delayed ref for that extent as well.  This searches the delayed ref tree for
6991  * a given extent, and if there are no other delayed refs to be processed, it
6992  * removes it from the tree.
6993  */
6994 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6995                                       u64 bytenr)
6996 {
6997         struct btrfs_delayed_ref_head *head;
6998         struct btrfs_delayed_ref_root *delayed_refs;
6999         int ret = 0;
7000
7001         delayed_refs = &trans->transaction->delayed_refs;
7002         spin_lock(&delayed_refs->lock);
7003         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
7004         if (!head)
7005                 goto out_delayed_unlock;
7006
7007         spin_lock(&head->lock);
7008         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
7009                 goto out;
7010
7011         if (head->extent_op) {
7012                 if (!head->must_insert_reserved)
7013                         goto out;
7014                 btrfs_free_delayed_extent_op(head->extent_op);
7015                 head->extent_op = NULL;
7016         }
7017
7018         /*
7019          * waiting for the lock here would deadlock.  If someone else has it
7020          * locked they are already in the process of dropping it anyway
7021          */
7022         if (!mutex_trylock(&head->mutex))
7023                 goto out;
7024
7025         /*
7026          * at this point we have a head with no other entries.  Go
7027          * ahead and process it.
7028          */
7029         rb_erase_cached(&head->href_node, &delayed_refs->href_root);
7030         RB_CLEAR_NODE(&head->href_node);
7031         atomic_dec(&delayed_refs->num_entries);
7032
7033         /*
7034          * we don't take a ref on the node because we're removing it from the
7035          * tree, so we just steal the ref the tree was holding.
7036          */
7037         delayed_refs->num_heads--;
7038         if (head->processing == 0)
7039                 delayed_refs->num_heads_ready--;
7040         head->processing = 0;
7041         spin_unlock(&head->lock);
7042         spin_unlock(&delayed_refs->lock);
7043
7044         BUG_ON(head->extent_op);
7045         if (head->must_insert_reserved)
7046                 ret = 1;
7047
7048         mutex_unlock(&head->mutex);
7049         btrfs_put_delayed_ref_head(head);
7050         return ret;
7051 out:
7052         spin_unlock(&head->lock);
7053
7054 out_delayed_unlock:
7055         spin_unlock(&delayed_refs->lock);
7056         return 0;
7057 }
7058
7059 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7060                            struct btrfs_root *root,
7061                            struct extent_buffer *buf,
7062                            u64 parent, int last_ref)
7063 {
7064         struct btrfs_fs_info *fs_info = root->fs_info;
7065         int pin = 1;
7066         int ret;
7067
7068         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7069                 int old_ref_mod, new_ref_mod;
7070
7071                 btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
7072                                    root->root_key.objectid,
7073                                    btrfs_header_level(buf), 0,
7074                                    BTRFS_DROP_DELAYED_REF);
7075                 ret = btrfs_add_delayed_tree_ref(trans, buf->start,
7076                                                  buf->len, parent,
7077                                                  root->root_key.objectid,
7078                                                  btrfs_header_level(buf),
7079                                                  BTRFS_DROP_DELAYED_REF, NULL,
7080                                                  &old_ref_mod, &new_ref_mod);
7081                 BUG_ON(ret); /* -ENOMEM */
7082                 pin = old_ref_mod >= 0 && new_ref_mod < 0;
7083         }
7084
7085         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7086                 struct btrfs_block_group_cache *cache;
7087
7088                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7089                         ret = check_ref_cleanup(trans, buf->start);
7090                         if (!ret)
7091                                 goto out;
7092                 }
7093
7094                 pin = 0;
7095                 cache = btrfs_lookup_block_group(fs_info, buf->start);
7096
7097                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7098                         pin_down_extent(fs_info, cache, buf->start,
7099                                         buf->len, 1);
7100                         btrfs_put_block_group(cache);
7101                         goto out;
7102                 }
7103
7104                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7105
7106                 btrfs_add_free_space(cache, buf->start, buf->len);
7107                 btrfs_free_reserved_bytes(cache, buf->len, 0);
7108                 btrfs_put_block_group(cache);
7109                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7110         }
7111 out:
7112         if (pin)
7113                 add_pinned_bytes(fs_info, buf->len, true,
7114                                  root->root_key.objectid);
7115
7116         if (last_ref) {
7117                 /*
7118                  * Deleting the buffer, clear the corrupt flag since it doesn't
7119                  * matter anymore.
7120                  */
7121                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7122         }
7123 }
7124
7125 /* Can return -ENOMEM */
7126 int btrfs_free_extent(struct btrfs_trans_handle *trans,
7127                       struct btrfs_root *root,
7128                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7129                       u64 owner, u64 offset)
7130 {
7131         struct btrfs_fs_info *fs_info = root->fs_info;
7132         int old_ref_mod, new_ref_mod;
7133         int ret;
7134
7135         if (btrfs_is_testing(fs_info))
7136                 return 0;
7137
7138         if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7139                 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7140                                    root_objectid, owner, offset,
7141                                    BTRFS_DROP_DELAYED_REF);
7142
7143         /*
7144          * tree log blocks never actually go into the extent allocation
7145          * tree, just update pinning info and exit early.
7146          */
7147         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7148                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7149                 /* unlocks the pinned mutex */
7150                 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7151                 old_ref_mod = new_ref_mod = 0;
7152                 ret = 0;
7153         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7154                 ret = btrfs_add_delayed_tree_ref(trans, bytenr,
7155                                                  num_bytes, parent,
7156                                                  root_objectid, (int)owner,
7157                                                  BTRFS_DROP_DELAYED_REF, NULL,
7158                                                  &old_ref_mod, &new_ref_mod);
7159         } else {
7160                 ret = btrfs_add_delayed_data_ref(trans, bytenr,
7161                                                  num_bytes, parent,
7162                                                  root_objectid, owner, offset,
7163                                                  0, BTRFS_DROP_DELAYED_REF,
7164                                                  &old_ref_mod, &new_ref_mod);
7165         }
7166
7167         if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
7168                 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
7169
7170                 add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
7171         }
7172
7173         return ret;
7174 }
7175
7176 /*
7177  * when we wait for progress in the block group caching, its because
7178  * our allocation attempt failed at least once.  So, we must sleep
7179  * and let some progress happen before we try again.
7180  *
7181  * This function will sleep at least once waiting for new free space to
7182  * show up, and then it will check the block group free space numbers
7183  * for our min num_bytes.  Another option is to have it go ahead
7184  * and look in the rbtree for a free extent of a given size, but this
7185  * is a good start.
7186  *
7187  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7188  * any of the information in this block group.
7189  */
7190 static noinline void
7191 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7192                                 u64 num_bytes)
7193 {
7194         struct btrfs_caching_control *caching_ctl;
7195
7196         caching_ctl = get_caching_control(cache);
7197         if (!caching_ctl)
7198                 return;
7199
7200         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7201                    (cache->free_space_ctl->free_space >= num_bytes));
7202
7203         put_caching_control(caching_ctl);
7204 }
7205
7206 static noinline int
7207 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7208 {
7209         struct btrfs_caching_control *caching_ctl;
7210         int ret = 0;
7211
7212         caching_ctl = get_caching_control(cache);
7213         if (!caching_ctl)
7214                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7215
7216         wait_event(caching_ctl->wait, block_group_cache_done(cache));
7217         if (cache->cached == BTRFS_CACHE_ERROR)
7218                 ret = -EIO;
7219         put_caching_control(caching_ctl);
7220         return ret;
7221 }
7222
7223 enum btrfs_loop_type {
7224         LOOP_CACHING_NOWAIT = 0,
7225         LOOP_CACHING_WAIT = 1,
7226         LOOP_ALLOC_CHUNK = 2,
7227         LOOP_NO_EMPTY_SIZE = 3,
7228 };
7229
7230 static inline void
7231 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7232                        int delalloc)
7233 {
7234         if (delalloc)
7235                 down_read(&cache->data_rwsem);
7236 }
7237
7238 static inline void
7239 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7240                        int delalloc)
7241 {
7242         btrfs_get_block_group(cache);
7243         if (delalloc)
7244                 down_read(&cache->data_rwsem);
7245 }
7246
7247 static struct btrfs_block_group_cache *
7248 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7249                    struct btrfs_free_cluster *cluster,
7250                    int delalloc)
7251 {
7252         struct btrfs_block_group_cache *used_bg = NULL;
7253
7254         spin_lock(&cluster->refill_lock);
7255         while (1) {
7256                 used_bg = cluster->block_group;
7257                 if (!used_bg)
7258                         return NULL;
7259
7260                 if (used_bg == block_group)
7261                         return used_bg;
7262
7263                 btrfs_get_block_group(used_bg);
7264
7265                 if (!delalloc)
7266                         return used_bg;
7267
7268                 if (down_read_trylock(&used_bg->data_rwsem))
7269                         return used_bg;
7270
7271                 spin_unlock(&cluster->refill_lock);
7272
7273                 /* We should only have one-level nested. */
7274                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7275
7276                 spin_lock(&cluster->refill_lock);
7277                 if (used_bg == cluster->block_group)
7278                         return used_bg;
7279
7280                 up_read(&used_bg->data_rwsem);
7281                 btrfs_put_block_group(used_bg);
7282         }
7283 }
7284
7285 static inline void
7286 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7287                          int delalloc)
7288 {
7289         if (delalloc)
7290                 up_read(&cache->data_rwsem);
7291         btrfs_put_block_group(cache);
7292 }
7293
7294 /*
7295  * walks the btree of allocated extents and find a hole of a given size.
7296  * The key ins is changed to record the hole:
7297  * ins->objectid == start position
7298  * ins->flags = BTRFS_EXTENT_ITEM_KEY
7299  * ins->offset == the size of the hole.
7300  * Any available blocks before search_start are skipped.
7301  *
7302  * If there is no suitable free space, we will record the max size of
7303  * the free space extent currently.
7304  */
7305 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7306                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7307                                 u64 hint_byte, struct btrfs_key *ins,
7308                                 u64 flags, int delalloc)
7309 {
7310         int ret = 0;
7311         struct btrfs_root *root = fs_info->extent_root;
7312         struct btrfs_free_cluster *last_ptr = NULL;
7313         struct btrfs_block_group_cache *block_group = NULL;
7314         u64 search_start = 0;
7315         u64 max_extent_size = 0;
7316         u64 empty_cluster = 0;
7317         struct btrfs_space_info *space_info;
7318         int loop = 0;
7319         int index = btrfs_bg_flags_to_raid_index(flags);
7320         bool failed_cluster_refill = false;
7321         bool failed_alloc = false;
7322         bool use_cluster = true;
7323         bool have_caching_bg = false;
7324         bool orig_have_caching_bg = false;
7325         bool full_search = false;
7326
7327         WARN_ON(num_bytes < fs_info->sectorsize);
7328         ins->type = BTRFS_EXTENT_ITEM_KEY;
7329         ins->objectid = 0;
7330         ins->offset = 0;
7331
7332         trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7333
7334         space_info = __find_space_info(fs_info, flags);
7335         if (!space_info) {
7336                 btrfs_err(fs_info, "No space info for %llu", flags);
7337                 return -ENOSPC;
7338         }
7339
7340         /*
7341          * If our free space is heavily fragmented we may not be able to make
7342          * big contiguous allocations, so instead of doing the expensive search
7343          * for free space, simply return ENOSPC with our max_extent_size so we
7344          * can go ahead and search for a more manageable chunk.
7345          *
7346          * If our max_extent_size is large enough for our allocation simply
7347          * disable clustering since we will likely not be able to find enough
7348          * space to create a cluster and induce latency trying.
7349          */
7350         if (unlikely(space_info->max_extent_size)) {
7351                 spin_lock(&space_info->lock);
7352                 if (space_info->max_extent_size &&
7353                     num_bytes > space_info->max_extent_size) {
7354                         ins->offset = space_info->max_extent_size;
7355                         spin_unlock(&space_info->lock);
7356                         return -ENOSPC;
7357                 } else if (space_info->max_extent_size) {
7358                         use_cluster = false;
7359                 }
7360                 spin_unlock(&space_info->lock);
7361         }
7362
7363         last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
7364         if (last_ptr) {
7365                 spin_lock(&last_ptr->lock);
7366                 if (last_ptr->block_group)
7367                         hint_byte = last_ptr->window_start;
7368                 if (last_ptr->fragmented) {
7369                         /*
7370                          * We still set window_start so we can keep track of the
7371                          * last place we found an allocation to try and save
7372                          * some time.
7373                          */
7374                         hint_byte = last_ptr->window_start;
7375                         use_cluster = false;
7376                 }
7377                 spin_unlock(&last_ptr->lock);
7378         }
7379
7380         search_start = max(search_start, first_logical_byte(fs_info, 0));
7381         search_start = max(search_start, hint_byte);
7382         if (search_start == hint_byte) {
7383                 block_group = btrfs_lookup_block_group(fs_info, search_start);
7384                 /*
7385                  * we don't want to use the block group if it doesn't match our
7386                  * allocation bits, or if its not cached.
7387                  *
7388                  * However if we are re-searching with an ideal block group
7389                  * picked out then we don't care that the block group is cached.
7390                  */
7391                 if (block_group && block_group_bits(block_group, flags) &&
7392                     block_group->cached != BTRFS_CACHE_NO) {
7393                         down_read(&space_info->groups_sem);
7394                         if (list_empty(&block_group->list) ||
7395                             block_group->ro) {
7396                                 /*
7397                                  * someone is removing this block group,
7398                                  * we can't jump into the have_block_group
7399                                  * target because our list pointers are not
7400                                  * valid
7401                                  */
7402                                 btrfs_put_block_group(block_group);
7403                                 up_read(&space_info->groups_sem);
7404                         } else {
7405                                 index = btrfs_bg_flags_to_raid_index(
7406                                                 block_group->flags);
7407                                 btrfs_lock_block_group(block_group, delalloc);
7408                                 goto have_block_group;
7409                         }
7410                 } else if (block_group) {
7411                         btrfs_put_block_group(block_group);
7412                 }
7413         }
7414 search:
7415         have_caching_bg = false;
7416         if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
7417                 full_search = true;
7418         down_read(&space_info->groups_sem);
7419         list_for_each_entry(block_group, &space_info->block_groups[index],
7420                             list) {
7421                 u64 offset;
7422                 int cached;
7423
7424                 /* If the block group is read-only, we can skip it entirely. */
7425                 if (unlikely(block_group->ro))
7426                         continue;
7427
7428                 btrfs_grab_block_group(block_group, delalloc);
7429                 search_start = block_group->key.objectid;
7430
7431                 /*
7432                  * this can happen if we end up cycling through all the
7433                  * raid types, but we want to make sure we only allocate
7434                  * for the proper type.
7435                  */
7436                 if (!block_group_bits(block_group, flags)) {
7437                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
7438                                 BTRFS_BLOCK_GROUP_RAID1 |
7439                                 BTRFS_BLOCK_GROUP_RAID5 |
7440                                 BTRFS_BLOCK_GROUP_RAID6 |
7441                                 BTRFS_BLOCK_GROUP_RAID10;
7442
7443                         /*
7444                          * if they asked for extra copies and this block group
7445                          * doesn't provide them, bail.  This does allow us to
7446                          * fill raid0 from raid1.
7447                          */
7448                         if ((flags & extra) && !(block_group->flags & extra))
7449                                 goto loop;
7450                 }
7451
7452 have_block_group:
7453                 cached = block_group_cache_done(block_group);
7454                 if (unlikely(!cached)) {
7455                         have_caching_bg = true;
7456                         ret = cache_block_group(block_group, 0);
7457                         BUG_ON(ret < 0);
7458                         ret = 0;
7459                 }
7460
7461                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7462                         goto loop;
7463
7464                 /*
7465                  * Ok we want to try and use the cluster allocator, so
7466                  * lets look there
7467                  */
7468                 if (last_ptr && use_cluster) {
7469                         struct btrfs_block_group_cache *used_block_group;
7470                         unsigned long aligned_cluster;
7471                         /*
7472                          * the refill lock keeps out other
7473                          * people trying to start a new cluster
7474                          */
7475                         used_block_group = btrfs_lock_cluster(block_group,
7476                                                               last_ptr,
7477                                                               delalloc);
7478                         if (!used_block_group)
7479                                 goto refill_cluster;
7480
7481                         if (used_block_group != block_group &&
7482                             (used_block_group->ro ||
7483                              !block_group_bits(used_block_group, flags)))
7484                                 goto release_cluster;
7485
7486                         offset = btrfs_alloc_from_cluster(used_block_group,
7487                                                 last_ptr,
7488                                                 num_bytes,
7489                                                 used_block_group->key.objectid,
7490                                                 &max_extent_size);
7491                         if (offset) {
7492                                 /* we have a block, we're done */
7493                                 spin_unlock(&last_ptr->refill_lock);
7494                                 trace_btrfs_reserve_extent_cluster(
7495                                                 used_block_group,
7496                                                 search_start, num_bytes);
7497                                 if (used_block_group != block_group) {
7498                                         btrfs_release_block_group(block_group,
7499                                                                   delalloc);
7500                                         block_group = used_block_group;
7501                                 }
7502                                 goto checks;
7503                         }
7504
7505                         WARN_ON(last_ptr->block_group != used_block_group);
7506 release_cluster:
7507                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7508                          * set up a new clusters, so lets just skip it
7509                          * and let the allocator find whatever block
7510                          * it can find.  If we reach this point, we
7511                          * will have tried the cluster allocator
7512                          * plenty of times and not have found
7513                          * anything, so we are likely way too
7514                          * fragmented for the clustering stuff to find
7515                          * anything.
7516                          *
7517                          * However, if the cluster is taken from the
7518                          * current block group, release the cluster
7519                          * first, so that we stand a better chance of
7520                          * succeeding in the unclustered
7521                          * allocation.  */
7522                         if (loop >= LOOP_NO_EMPTY_SIZE &&
7523                             used_block_group != block_group) {
7524                                 spin_unlock(&last_ptr->refill_lock);
7525                                 btrfs_release_block_group(used_block_group,
7526                                                           delalloc);
7527                                 goto unclustered_alloc;
7528                         }
7529
7530                         /*
7531                          * this cluster didn't work out, free it and
7532                          * start over
7533                          */
7534                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7535
7536                         if (used_block_group != block_group)
7537                                 btrfs_release_block_group(used_block_group,
7538                                                           delalloc);
7539 refill_cluster:
7540                         if (loop >= LOOP_NO_EMPTY_SIZE) {
7541                                 spin_unlock(&last_ptr->refill_lock);
7542                                 goto unclustered_alloc;
7543                         }
7544
7545                         aligned_cluster = max_t(unsigned long,
7546                                                 empty_cluster + empty_size,
7547                                               block_group->full_stripe_len);
7548
7549                         /* allocate a cluster in this block group */
7550                         ret = btrfs_find_space_cluster(fs_info, block_group,
7551                                                        last_ptr, search_start,
7552                                                        num_bytes,
7553                                                        aligned_cluster);
7554                         if (ret == 0) {
7555                                 /*
7556                                  * now pull our allocation out of this
7557                                  * cluster
7558                                  */
7559                                 offset = btrfs_alloc_from_cluster(block_group,
7560                                                         last_ptr,
7561                                                         num_bytes,
7562                                                         search_start,
7563                                                         &max_extent_size);
7564                                 if (offset) {
7565                                         /* we found one, proceed */
7566                                         spin_unlock(&last_ptr->refill_lock);
7567                                         trace_btrfs_reserve_extent_cluster(
7568                                                 block_group, search_start,
7569                                                 num_bytes);
7570                                         goto checks;
7571                                 }
7572                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
7573                                    && !failed_cluster_refill) {
7574                                 spin_unlock(&last_ptr->refill_lock);
7575
7576                                 failed_cluster_refill = true;
7577                                 wait_block_group_cache_progress(block_group,
7578                                        num_bytes + empty_cluster + empty_size);
7579                                 goto have_block_group;
7580                         }
7581
7582                         /*
7583                          * at this point we either didn't find a cluster
7584                          * or we weren't able to allocate a block from our
7585                          * cluster.  Free the cluster we've been trying
7586                          * to use, and go to the next block group
7587                          */
7588                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7589                         spin_unlock(&last_ptr->refill_lock);
7590                         goto loop;
7591                 }
7592
7593 unclustered_alloc:
7594                 /*
7595                  * We are doing an unclustered alloc, set the fragmented flag so
7596                  * we don't bother trying to setup a cluster again until we get
7597                  * more space.
7598                  */
7599                 if (unlikely(last_ptr)) {
7600                         spin_lock(&last_ptr->lock);
7601                         last_ptr->fragmented = 1;
7602                         spin_unlock(&last_ptr->lock);
7603                 }
7604                 if (cached) {
7605                         struct btrfs_free_space_ctl *ctl =
7606                                 block_group->free_space_ctl;
7607
7608                         spin_lock(&ctl->tree_lock);
7609                         if (ctl->free_space <
7610                             num_bytes + empty_cluster + empty_size) {
7611                                 if (ctl->free_space > max_extent_size)
7612                                         max_extent_size = ctl->free_space;
7613                                 spin_unlock(&ctl->tree_lock);
7614                                 goto loop;
7615                         }
7616                         spin_unlock(&ctl->tree_lock);
7617                 }
7618
7619                 offset = btrfs_find_space_for_alloc(block_group, search_start,
7620                                                     num_bytes, empty_size,
7621                                                     &max_extent_size);
7622                 /*
7623                  * If we didn't find a chunk, and we haven't failed on this
7624                  * block group before, and this block group is in the middle of
7625                  * caching and we are ok with waiting, then go ahead and wait
7626                  * for progress to be made, and set failed_alloc to true.
7627                  *
7628                  * If failed_alloc is true then we've already waited on this
7629                  * block group once and should move on to the next block group.
7630                  */
7631                 if (!offset && !failed_alloc && !cached &&
7632                     loop > LOOP_CACHING_NOWAIT) {
7633                         wait_block_group_cache_progress(block_group,
7634                                                 num_bytes + empty_size);
7635                         failed_alloc = true;
7636                         goto have_block_group;
7637                 } else if (!offset) {
7638                         goto loop;
7639                 }
7640 checks:
7641                 search_start = round_up(offset, fs_info->stripesize);
7642
7643                 /* move on to the next group */
7644                 if (search_start + num_bytes >
7645                     block_group->key.objectid + block_group->key.offset) {
7646                         btrfs_add_free_space(block_group, offset, num_bytes);
7647                         goto loop;
7648                 }
7649
7650                 if (offset < search_start)
7651                         btrfs_add_free_space(block_group, offset,
7652                                              search_start - offset);
7653
7654                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7655                                 num_bytes, delalloc);
7656                 if (ret == -EAGAIN) {
7657                         btrfs_add_free_space(block_group, offset, num_bytes);
7658                         goto loop;
7659                 }
7660                 btrfs_inc_block_group_reservations(block_group);
7661
7662                 /* we are all good, lets return */
7663                 ins->objectid = search_start;
7664                 ins->offset = num_bytes;
7665
7666                 trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
7667                 btrfs_release_block_group(block_group, delalloc);
7668                 break;
7669 loop:
7670                 failed_cluster_refill = false;
7671                 failed_alloc = false;
7672                 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7673                        index);
7674                 btrfs_release_block_group(block_group, delalloc);
7675                 cond_resched();
7676         }
7677         up_read(&space_info->groups_sem);
7678
7679         if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7680                 && !orig_have_caching_bg)
7681                 orig_have_caching_bg = true;
7682
7683         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7684                 goto search;
7685
7686         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7687                 goto search;
7688
7689         /*
7690          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7691          *                      caching kthreads as we move along
7692          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7693          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7694          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7695          *                      again
7696          */
7697         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7698                 index = 0;
7699                 if (loop == LOOP_CACHING_NOWAIT) {
7700                         /*
7701                          * We want to skip the LOOP_CACHING_WAIT step if we
7702                          * don't have any uncached bgs and we've already done a
7703                          * full search through.
7704                          */
7705                         if (orig_have_caching_bg || !full_search)
7706                                 loop = LOOP_CACHING_WAIT;
7707                         else
7708                                 loop = LOOP_ALLOC_CHUNK;
7709                 } else {
7710                         loop++;
7711                 }
7712
7713                 if (loop == LOOP_ALLOC_CHUNK) {
7714                         struct btrfs_trans_handle *trans;
7715                         int exist = 0;
7716
7717                         trans = current->journal_info;
7718                         if (trans)
7719                                 exist = 1;
7720                         else
7721                                 trans = btrfs_join_transaction(root);
7722
7723                         if (IS_ERR(trans)) {
7724                                 ret = PTR_ERR(trans);
7725                                 goto out;
7726                         }
7727
7728                         ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
7729
7730                         /*
7731                          * If we can't allocate a new chunk we've already looped
7732                          * through at least once, move on to the NO_EMPTY_SIZE
7733                          * case.
7734                          */
7735                         if (ret == -ENOSPC)
7736                                 loop = LOOP_NO_EMPTY_SIZE;
7737
7738                         /*
7739                          * Do not bail out on ENOSPC since we
7740                          * can do more things.
7741                          */
7742                         if (ret < 0 && ret != -ENOSPC)
7743                                 btrfs_abort_transaction(trans, ret);
7744                         else
7745                                 ret = 0;
7746                         if (!exist)
7747                                 btrfs_end_transaction(trans);
7748                         if (ret)
7749                                 goto out;
7750                 }
7751
7752                 if (loop == LOOP_NO_EMPTY_SIZE) {
7753                         /*
7754                          * Don't loop again if we already have no empty_size and
7755                          * no empty_cluster.
7756                          */
7757                         if (empty_size == 0 &&
7758                             empty_cluster == 0) {
7759                                 ret = -ENOSPC;
7760                                 goto out;
7761                         }
7762                         empty_size = 0;
7763                         empty_cluster = 0;
7764                 }
7765
7766                 goto search;
7767         } else if (!ins->objectid) {
7768                 ret = -ENOSPC;
7769         } else if (ins->objectid) {
7770                 if (!use_cluster && last_ptr) {
7771                         spin_lock(&last_ptr->lock);
7772                         last_ptr->window_start = ins->objectid;
7773                         spin_unlock(&last_ptr->lock);
7774                 }
7775                 ret = 0;
7776         }
7777 out:
7778         if (ret == -ENOSPC) {
7779                 spin_lock(&space_info->lock);
7780                 space_info->max_extent_size = max_extent_size;
7781                 spin_unlock(&space_info->lock);
7782                 ins->offset = max_extent_size;
7783         }
7784         return ret;
7785 }
7786
7787 static void dump_space_info(struct btrfs_fs_info *fs_info,
7788                             struct btrfs_space_info *info, u64 bytes,
7789                             int dump_block_groups)
7790 {
7791         struct btrfs_block_group_cache *cache;
7792         int index = 0;
7793
7794         spin_lock(&info->lock);
7795         btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7796                    info->flags,
7797                    info->total_bytes - btrfs_space_info_used(info, true),
7798                    info->full ? "" : "not ");
7799         btrfs_info(fs_info,
7800                 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7801                 info->total_bytes, info->bytes_used, info->bytes_pinned,
7802                 info->bytes_reserved, info->bytes_may_use,
7803                 info->bytes_readonly);
7804         spin_unlock(&info->lock);
7805
7806         if (!dump_block_groups)
7807                 return;
7808
7809         down_read(&info->groups_sem);
7810 again:
7811         list_for_each_entry(cache, &info->block_groups[index], list) {
7812                 spin_lock(&cache->lock);
7813                 btrfs_info(fs_info,
7814                         "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7815                         cache->key.objectid, cache->key.offset,
7816                         btrfs_block_group_used(&cache->item), cache->pinned,
7817                         cache->reserved, cache->ro ? "[readonly]" : "");
7818                 btrfs_dump_free_space(cache, bytes);
7819                 spin_unlock(&cache->lock);
7820         }
7821         if (++index < BTRFS_NR_RAID_TYPES)
7822                 goto again;
7823         up_read(&info->groups_sem);
7824 }
7825
7826 /*
7827  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
7828  *                        hole that is at least as big as @num_bytes.
7829  *
7830  * @root           -    The root that will contain this extent
7831  *
7832  * @ram_bytes      -    The amount of space in ram that @num_bytes take. This
7833  *                      is used for accounting purposes. This value differs
7834  *                      from @num_bytes only in the case of compressed extents.
7835  *
7836  * @num_bytes      -    Number of bytes to allocate on-disk.
7837  *
7838  * @min_alloc_size -    Indicates the minimum amount of space that the
7839  *                      allocator should try to satisfy. In some cases
7840  *                      @num_bytes may be larger than what is required and if
7841  *                      the filesystem is fragmented then allocation fails.
7842  *                      However, the presence of @min_alloc_size gives a
7843  *                      chance to try and satisfy the smaller allocation.
7844  *
7845  * @empty_size     -    A hint that you plan on doing more COW. This is the
7846  *                      size in bytes the allocator should try to find free
7847  *                      next to the block it returns.  This is just a hint and
7848  *                      may be ignored by the allocator.
7849  *
7850  * @hint_byte      -    Hint to the allocator to start searching above the byte
7851  *                      address passed. It might be ignored.
7852  *
7853  * @ins            -    This key is modified to record the found hole. It will
7854  *                      have the following values:
7855  *                      ins->objectid == start position
7856  *                      ins->flags = BTRFS_EXTENT_ITEM_KEY
7857  *                      ins->offset == the size of the hole.
7858  *
7859  * @is_data        -    Boolean flag indicating whether an extent is
7860  *                      allocated for data (true) or metadata (false)
7861  *
7862  * @delalloc       -    Boolean flag indicating whether this allocation is for
7863  *                      delalloc or not. If 'true' data_rwsem of block groups
7864  *                      is going to be acquired.
7865  *
7866  *
7867  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
7868  * case -ENOSPC is returned then @ins->offset will contain the size of the
7869  * largest available hole the allocator managed to find.
7870  */
7871 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7872                          u64 num_bytes, u64 min_alloc_size,
7873                          u64 empty_size, u64 hint_byte,
7874                          struct btrfs_key *ins, int is_data, int delalloc)
7875 {
7876         struct btrfs_fs_info *fs_info = root->fs_info;
7877         bool final_tried = num_bytes == min_alloc_size;
7878         u64 flags;
7879         int ret;
7880
7881         flags = get_alloc_profile_by_root(root, is_data);
7882 again:
7883         WARN_ON(num_bytes < fs_info->sectorsize);
7884         ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
7885                                hint_byte, ins, flags, delalloc);
7886         if (!ret && !is_data) {
7887                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
7888         } else if (ret == -ENOSPC) {
7889                 if (!final_tried && ins->offset) {
7890                         num_bytes = min(num_bytes >> 1, ins->offset);
7891                         num_bytes = round_down(num_bytes,
7892                                                fs_info->sectorsize);
7893                         num_bytes = max(num_bytes, min_alloc_size);
7894                         ram_bytes = num_bytes;
7895                         if (num_bytes == min_alloc_size)
7896                                 final_tried = true;
7897                         goto again;
7898                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
7899                         struct btrfs_space_info *sinfo;
7900
7901                         sinfo = __find_space_info(fs_info, flags);
7902                         btrfs_err(fs_info,
7903                                   "allocation failed flags %llu, wanted %llu",
7904                                   flags, num_bytes);
7905                         if (sinfo)
7906                                 dump_space_info(fs_info, sinfo, num_bytes, 1);
7907                 }
7908         }
7909
7910         return ret;
7911 }
7912
7913 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7914                                         u64 start, u64 len,
7915                                         int pin, int delalloc)
7916 {
7917         struct btrfs_block_group_cache *cache;
7918         int ret = 0;
7919
7920         cache = btrfs_lookup_block_group(fs_info, start);
7921         if (!cache) {
7922                 btrfs_err(fs_info, "Unable to find block group for %llu",
7923                           start);
7924                 return -ENOSPC;
7925         }
7926
7927         if (pin)
7928                 pin_down_extent(fs_info, cache, start, len, 1);
7929         else {
7930                 if (btrfs_test_opt(fs_info, DISCARD))
7931                         ret = btrfs_discard_extent(fs_info, start, len, NULL);
7932                 btrfs_add_free_space(cache, start, len);
7933                 btrfs_free_reserved_bytes(cache, len, delalloc);
7934                 trace_btrfs_reserved_extent_free(fs_info, start, len);
7935         }
7936
7937         btrfs_put_block_group(cache);
7938         return ret;
7939 }
7940
7941 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7942                                u64 start, u64 len, int delalloc)
7943 {
7944         return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
7945 }
7946
7947 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
7948                                        u64 start, u64 len)
7949 {
7950         return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
7951 }
7952
7953 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7954                                       u64 parent, u64 root_objectid,
7955                                       u64 flags, u64 owner, u64 offset,
7956                                       struct btrfs_key *ins, int ref_mod)
7957 {
7958         struct btrfs_fs_info *fs_info = trans->fs_info;
7959         int ret;
7960         struct btrfs_extent_item *extent_item;
7961         struct btrfs_extent_inline_ref *iref;
7962         struct btrfs_path *path;
7963         struct extent_buffer *leaf;
7964         int type;
7965         u32 size;
7966
7967         if (parent > 0)
7968                 type = BTRFS_SHARED_DATA_REF_KEY;
7969         else
7970                 type = BTRFS_EXTENT_DATA_REF_KEY;
7971
7972         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7973
7974         path = btrfs_alloc_path();
7975         if (!path)
7976                 return -ENOMEM;
7977
7978         path->leave_spinning = 1;
7979         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7980                                       ins, size);
7981         if (ret) {
7982                 btrfs_free_path(path);
7983                 return ret;
7984         }
7985
7986         leaf = path->nodes[0];
7987         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7988                                      struct btrfs_extent_item);
7989         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7990         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7991         btrfs_set_extent_flags(leaf, extent_item,
7992                                flags | BTRFS_EXTENT_FLAG_DATA);
7993
7994         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7995         btrfs_set_extent_inline_ref_type(leaf, iref, type);
7996         if (parent > 0) {
7997                 struct btrfs_shared_data_ref *ref;
7998                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7999                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8000                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8001         } else {
8002                 struct btrfs_extent_data_ref *ref;
8003                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8004                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8005                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8006                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8007                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8008         }
8009
8010         btrfs_mark_buffer_dirty(path->nodes[0]);
8011         btrfs_free_path(path);
8012
8013         ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
8014         if (ret)
8015                 return ret;
8016
8017         ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
8018         if (ret) { /* -ENOENT, logic error */
8019                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8020                         ins->objectid, ins->offset);
8021                 BUG();
8022         }
8023         trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8024         return ret;
8025 }
8026
8027 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8028                                      struct btrfs_delayed_ref_node *node,
8029                                      struct btrfs_delayed_extent_op *extent_op)
8030 {
8031         struct btrfs_fs_info *fs_info = trans->fs_info;
8032         int ret;
8033         struct btrfs_extent_item *extent_item;
8034         struct btrfs_key extent_key;
8035         struct btrfs_tree_block_info *block_info;
8036         struct btrfs_extent_inline_ref *iref;
8037         struct btrfs_path *path;
8038         struct extent_buffer *leaf;
8039         struct btrfs_delayed_tree_ref *ref;
8040         u32 size = sizeof(*extent_item) + sizeof(*iref);
8041         u64 num_bytes;
8042         u64 flags = extent_op->flags_to_set;
8043         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8044
8045         ref = btrfs_delayed_node_to_tree_ref(node);
8046
8047         extent_key.objectid = node->bytenr;
8048         if (skinny_metadata) {
8049                 extent_key.offset = ref->level;
8050                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
8051                 num_bytes = fs_info->nodesize;
8052         } else {
8053                 extent_key.offset = node->num_bytes;
8054                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
8055                 size += sizeof(*block_info);
8056                 num_bytes = node->num_bytes;
8057         }
8058
8059         path = btrfs_alloc_path();
8060         if (!path) {
8061                 btrfs_free_and_pin_reserved_extent(fs_info,
8062                                                    extent_key.objectid,
8063                                                    fs_info->nodesize);
8064                 return -ENOMEM;
8065         }
8066
8067         path->leave_spinning = 1;
8068         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8069                                       &extent_key, size);
8070         if (ret) {
8071                 btrfs_free_path(path);
8072                 btrfs_free_and_pin_reserved_extent(fs_info,
8073                                                    extent_key.objectid,
8074                                                    fs_info->nodesize);
8075                 return ret;
8076         }
8077
8078         leaf = path->nodes[0];
8079         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8080                                      struct btrfs_extent_item);
8081         btrfs_set_extent_refs(leaf, extent_item, 1);
8082         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8083         btrfs_set_extent_flags(leaf, extent_item,
8084                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8085
8086         if (skinny_metadata) {
8087                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8088         } else {
8089                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8090                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
8091                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
8092                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8093         }
8094
8095         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
8096                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8097                 btrfs_set_extent_inline_ref_type(leaf, iref,
8098                                                  BTRFS_SHARED_BLOCK_REF_KEY);
8099                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
8100         } else {
8101                 btrfs_set_extent_inline_ref_type(leaf, iref,
8102                                                  BTRFS_TREE_BLOCK_REF_KEY);
8103                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
8104         }
8105
8106         btrfs_mark_buffer_dirty(leaf);
8107         btrfs_free_path(path);
8108
8109         ret = remove_from_free_space_tree(trans, extent_key.objectid,
8110                                           num_bytes);
8111         if (ret)
8112                 return ret;
8113
8114         ret = update_block_group(trans, fs_info, extent_key.objectid,
8115                                  fs_info->nodesize, 1);
8116         if (ret) { /* -ENOENT, logic error */
8117                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8118                         extent_key.objectid, extent_key.offset);
8119                 BUG();
8120         }
8121
8122         trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
8123                                           fs_info->nodesize);
8124         return ret;
8125 }
8126
8127 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8128                                      struct btrfs_root *root, u64 owner,
8129                                      u64 offset, u64 ram_bytes,
8130                                      struct btrfs_key *ins)
8131 {
8132         int ret;
8133
8134         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8135
8136         btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8137                            root->root_key.objectid, owner, offset,
8138                            BTRFS_ADD_DELAYED_EXTENT);
8139
8140         ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
8141                                          ins->offset, 0,
8142                                          root->root_key.objectid, owner,
8143                                          offset, ram_bytes,
8144                                          BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8145         return ret;
8146 }
8147
8148 /*
8149  * this is used by the tree logging recovery code.  It records that
8150  * an extent has been allocated and makes sure to clear the free
8151  * space cache bits as well
8152  */
8153 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8154                                    u64 root_objectid, u64 owner, u64 offset,
8155                                    struct btrfs_key *ins)
8156 {
8157         struct btrfs_fs_info *fs_info = trans->fs_info;
8158         int ret;
8159         struct btrfs_block_group_cache *block_group;
8160         struct btrfs_space_info *space_info;
8161
8162         /*
8163          * Mixed block groups will exclude before processing the log so we only
8164          * need to do the exclude dance if this fs isn't mixed.
8165          */
8166         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8167                 ret = __exclude_logged_extent(fs_info, ins->objectid,
8168                                               ins->offset);
8169                 if (ret)
8170                         return ret;
8171         }
8172
8173         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8174         if (!block_group)
8175                 return -EINVAL;
8176
8177         space_info = block_group->space_info;
8178         spin_lock(&space_info->lock);
8179         spin_lock(&block_group->lock);
8180         space_info->bytes_reserved += ins->offset;
8181         block_group->reserved += ins->offset;
8182         spin_unlock(&block_group->lock);
8183         spin_unlock(&space_info->lock);
8184
8185         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
8186                                          offset, ins, 1);
8187         btrfs_put_block_group(block_group);
8188         return ret;
8189 }
8190
8191 static struct extent_buffer *
8192 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8193                       u64 bytenr, int level, u64 owner)
8194 {
8195         struct btrfs_fs_info *fs_info = root->fs_info;
8196         struct extent_buffer *buf;
8197
8198         buf = btrfs_find_create_tree_block(fs_info, bytenr);
8199         if (IS_ERR(buf))
8200                 return buf;
8201
8202         /*
8203          * Extra safety check in case the extent tree is corrupted and extent
8204          * allocator chooses to use a tree block which is already used and
8205          * locked.
8206          */
8207         if (buf->lock_owner == current->pid) {
8208                 btrfs_err_rl(fs_info,
8209 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
8210                         buf->start, btrfs_header_owner(buf), current->pid);
8211                 free_extent_buffer(buf);
8212                 return ERR_PTR(-EUCLEAN);
8213         }
8214
8215         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8216         btrfs_tree_lock(buf);
8217         clean_tree_block(fs_info, buf);
8218         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8219
8220         btrfs_set_lock_blocking(buf);
8221         set_extent_buffer_uptodate(buf);
8222
8223         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
8224         btrfs_set_header_level(buf, level);
8225         btrfs_set_header_bytenr(buf, buf->start);
8226         btrfs_set_header_generation(buf, trans->transid);
8227         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
8228         btrfs_set_header_owner(buf, owner);
8229         write_extent_buffer_fsid(buf, fs_info->fsid);
8230         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
8231         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8232                 buf->log_index = root->log_transid % 2;
8233                 /*
8234                  * we allow two log transactions at a time, use different
8235                  * EXENT bit to differentiate dirty pages.
8236                  */
8237                 if (buf->log_index == 0)
8238                         set_extent_dirty(&root->dirty_log_pages, buf->start,
8239                                         buf->start + buf->len - 1, GFP_NOFS);
8240                 else
8241                         set_extent_new(&root->dirty_log_pages, buf->start,
8242                                         buf->start + buf->len - 1);
8243         } else {
8244                 buf->log_index = -1;
8245                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8246                          buf->start + buf->len - 1, GFP_NOFS);
8247         }
8248         trans->dirty = true;
8249         /* this returns a buffer locked for blocking */
8250         return buf;
8251 }
8252
8253 static struct btrfs_block_rsv *
8254 use_block_rsv(struct btrfs_trans_handle *trans,
8255               struct btrfs_root *root, u32 blocksize)
8256 {
8257         struct btrfs_fs_info *fs_info = root->fs_info;
8258         struct btrfs_block_rsv *block_rsv;
8259         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8260         int ret;
8261         bool global_updated = false;
8262
8263         block_rsv = get_block_rsv(trans, root);
8264
8265         if (unlikely(block_rsv->size == 0))
8266                 goto try_reserve;
8267 again:
8268         ret = block_rsv_use_bytes(block_rsv, blocksize);
8269         if (!ret)
8270                 return block_rsv;
8271
8272         if (block_rsv->failfast)
8273                 return ERR_PTR(ret);
8274
8275         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8276                 global_updated = true;
8277                 update_global_block_rsv(fs_info);
8278                 goto again;
8279         }
8280
8281         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8282                 static DEFINE_RATELIMIT_STATE(_rs,
8283                                 DEFAULT_RATELIMIT_INTERVAL * 10,
8284                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
8285                 if (__ratelimit(&_rs))
8286                         WARN(1, KERN_DEBUG
8287                                 "BTRFS: block rsv returned %d\n", ret);
8288         }
8289 try_reserve:
8290         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8291                                      BTRFS_RESERVE_NO_FLUSH);
8292         if (!ret)
8293                 return block_rsv;
8294         /*
8295          * If we couldn't reserve metadata bytes try and use some from
8296          * the global reserve if its space type is the same as the global
8297          * reservation.
8298          */
8299         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8300             block_rsv->space_info == global_rsv->space_info) {
8301                 ret = block_rsv_use_bytes(global_rsv, blocksize);
8302                 if (!ret)
8303                         return global_rsv;
8304         }
8305         return ERR_PTR(ret);
8306 }
8307
8308 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8309                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
8310 {
8311         block_rsv_add_bytes(block_rsv, blocksize, false);
8312         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8313 }
8314
8315 /*
8316  * finds a free extent and does all the dirty work required for allocation
8317  * returns the tree buffer or an ERR_PTR on error.
8318  */
8319 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8320                                              struct btrfs_root *root,
8321                                              u64 parent, u64 root_objectid,
8322                                              const struct btrfs_disk_key *key,
8323                                              int level, u64 hint,
8324                                              u64 empty_size)
8325 {
8326         struct btrfs_fs_info *fs_info = root->fs_info;
8327         struct btrfs_key ins;
8328         struct btrfs_block_rsv *block_rsv;
8329         struct extent_buffer *buf;
8330         struct btrfs_delayed_extent_op *extent_op;
8331         u64 flags = 0;
8332         int ret;
8333         u32 blocksize = fs_info->nodesize;
8334         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8335
8336 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8337         if (btrfs_is_testing(fs_info)) {
8338                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8339                                             level, root_objectid);
8340                 if (!IS_ERR(buf))
8341                         root->alloc_bytenr += blocksize;
8342                 return buf;
8343         }
8344 #endif
8345
8346         block_rsv = use_block_rsv(trans, root, blocksize);
8347         if (IS_ERR(block_rsv))
8348                 return ERR_CAST(block_rsv);
8349
8350         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8351                                    empty_size, hint, &ins, 0, 0);
8352         if (ret)
8353                 goto out_unuse;
8354
8355         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8356                                     root_objectid);
8357         if (IS_ERR(buf)) {
8358                 ret = PTR_ERR(buf);
8359                 goto out_free_reserved;
8360         }
8361
8362         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8363                 if (parent == 0)
8364                         parent = ins.objectid;
8365                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8366         } else
8367                 BUG_ON(parent > 0);
8368
8369         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8370                 extent_op = btrfs_alloc_delayed_extent_op();
8371                 if (!extent_op) {
8372                         ret = -ENOMEM;
8373                         goto out_free_buf;
8374                 }
8375                 if (key)
8376                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8377                 else
8378                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8379                 extent_op->flags_to_set = flags;
8380                 extent_op->update_key = skinny_metadata ? false : true;
8381                 extent_op->update_flags = true;
8382                 extent_op->is_data = false;
8383                 extent_op->level = level;
8384
8385                 btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8386                                    root_objectid, level, 0,
8387                                    BTRFS_ADD_DELAYED_EXTENT);
8388                 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
8389                                                  ins.offset, parent,
8390                                                  root_objectid, level,
8391                                                  BTRFS_ADD_DELAYED_EXTENT,
8392                                                  extent_op, NULL, NULL);
8393                 if (ret)
8394                         goto out_free_delayed;
8395         }
8396         return buf;
8397
8398 out_free_delayed:
8399         btrfs_free_delayed_extent_op(extent_op);
8400 out_free_buf:
8401         free_extent_buffer(buf);
8402 out_free_reserved:
8403         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8404 out_unuse:
8405         unuse_block_rsv(fs_info, block_rsv, blocksize);
8406         return ERR_PTR(ret);
8407 }
8408
8409 struct walk_control {
8410         u64 refs[BTRFS_MAX_LEVEL];
8411         u64 flags[BTRFS_MAX_LEVEL];
8412         struct btrfs_key update_progress;
8413         int stage;
8414         int level;
8415         int shared_level;
8416         int update_ref;
8417         int keep_locks;
8418         int reada_slot;
8419         int reada_count;
8420 };
8421
8422 #define DROP_REFERENCE  1
8423 #define UPDATE_BACKREF  2
8424
8425 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8426                                      struct btrfs_root *root,
8427                                      struct walk_control *wc,
8428                                      struct btrfs_path *path)
8429 {
8430         struct btrfs_fs_info *fs_info = root->fs_info;
8431         u64 bytenr;
8432         u64 generation;
8433         u64 refs;
8434         u64 flags;
8435         u32 nritems;
8436         struct btrfs_key key;
8437         struct extent_buffer *eb;
8438         int ret;
8439         int slot;
8440         int nread = 0;
8441
8442         if (path->slots[wc->level] < wc->reada_slot) {
8443                 wc->reada_count = wc->reada_count * 2 / 3;
8444                 wc->reada_count = max(wc->reada_count, 2);
8445         } else {
8446                 wc->reada_count = wc->reada_count * 3 / 2;
8447                 wc->reada_count = min_t(int, wc->reada_count,
8448                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8449         }
8450
8451         eb = path->nodes[wc->level];
8452         nritems = btrfs_header_nritems(eb);
8453
8454         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8455                 if (nread >= wc->reada_count)
8456                         break;
8457
8458                 cond_resched();
8459                 bytenr = btrfs_node_blockptr(eb, slot);
8460                 generation = btrfs_node_ptr_generation(eb, slot);
8461
8462                 if (slot == path->slots[wc->level])
8463                         goto reada;
8464
8465                 if (wc->stage == UPDATE_BACKREF &&
8466                     generation <= root->root_key.offset)
8467                         continue;
8468
8469                 /* We don't lock the tree block, it's OK to be racy here */
8470                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8471                                                wc->level - 1, 1, &refs,
8472                                                &flags);
8473                 /* We don't care about errors in readahead. */
8474                 if (ret < 0)
8475                         continue;
8476                 BUG_ON(refs == 0);
8477
8478                 if (wc->stage == DROP_REFERENCE) {
8479                         if (refs == 1)
8480                                 goto reada;
8481
8482                         if (wc->level == 1 &&
8483                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8484                                 continue;
8485                         if (!wc->update_ref ||
8486                             generation <= root->root_key.offset)
8487                                 continue;
8488                         btrfs_node_key_to_cpu(eb, &key, slot);
8489                         ret = btrfs_comp_cpu_keys(&key,
8490                                                   &wc->update_progress);
8491                         if (ret < 0)
8492                                 continue;
8493                 } else {
8494                         if (wc->level == 1 &&
8495                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8496                                 continue;
8497                 }
8498 reada:
8499                 readahead_tree_block(fs_info, bytenr);
8500                 nread++;
8501         }
8502         wc->reada_slot = slot;
8503 }
8504
8505 /*
8506  * helper to process tree block while walking down the tree.
8507  *
8508  * when wc->stage == UPDATE_BACKREF, this function updates
8509  * back refs for pointers in the block.
8510  *
8511  * NOTE: return value 1 means we should stop walking down.
8512  */
8513 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8514                                    struct btrfs_root *root,
8515                                    struct btrfs_path *path,
8516                                    struct walk_control *wc, int lookup_info)
8517 {
8518         struct btrfs_fs_info *fs_info = root->fs_info;
8519         int level = wc->level;
8520         struct extent_buffer *eb = path->nodes[level];
8521         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8522         int ret;
8523
8524         if (wc->stage == UPDATE_BACKREF &&
8525             btrfs_header_owner(eb) != root->root_key.objectid)
8526                 return 1;
8527
8528         /*
8529          * when reference count of tree block is 1, it won't increase
8530          * again. once full backref flag is set, we never clear it.
8531          */
8532         if (lookup_info &&
8533             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8534              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8535                 BUG_ON(!path->locks[level]);
8536                 ret = btrfs_lookup_extent_info(trans, fs_info,
8537                                                eb->start, level, 1,
8538                                                &wc->refs[level],
8539                                                &wc->flags[level]);
8540                 BUG_ON(ret == -ENOMEM);
8541                 if (ret)
8542                         return ret;
8543                 BUG_ON(wc->refs[level] == 0);
8544         }
8545
8546         if (wc->stage == DROP_REFERENCE) {
8547                 if (wc->refs[level] > 1)
8548                         return 1;
8549
8550                 if (path->locks[level] && !wc->keep_locks) {
8551                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8552                         path->locks[level] = 0;
8553                 }
8554                 return 0;
8555         }
8556
8557         /* wc->stage == UPDATE_BACKREF */
8558         if (!(wc->flags[level] & flag)) {
8559                 BUG_ON(!path->locks[level]);
8560                 ret = btrfs_inc_ref(trans, root, eb, 1);
8561                 BUG_ON(ret); /* -ENOMEM */
8562                 ret = btrfs_dec_ref(trans, root, eb, 0);
8563                 BUG_ON(ret); /* -ENOMEM */
8564                 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8565                                                   eb->len, flag,
8566                                                   btrfs_header_level(eb), 0);
8567                 BUG_ON(ret); /* -ENOMEM */
8568                 wc->flags[level] |= flag;
8569         }
8570
8571         /*
8572          * the block is shared by multiple trees, so it's not good to
8573          * keep the tree lock
8574          */
8575         if (path->locks[level] && level > 0) {
8576                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8577                 path->locks[level] = 0;
8578         }
8579         return 0;
8580 }
8581
8582 /*
8583  * helper to process tree block pointer.
8584  *
8585  * when wc->stage == DROP_REFERENCE, this function checks
8586  * reference count of the block pointed to. if the block
8587  * is shared and we need update back refs for the subtree
8588  * rooted at the block, this function changes wc->stage to
8589  * UPDATE_BACKREF. if the block is shared and there is no
8590  * need to update back, this function drops the reference
8591  * to the block.
8592  *
8593  * NOTE: return value 1 means we should stop walking down.
8594  */
8595 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8596                                  struct btrfs_root *root,
8597                                  struct btrfs_path *path,
8598                                  struct walk_control *wc, int *lookup_info)
8599 {
8600         struct btrfs_fs_info *fs_info = root->fs_info;
8601         u64 bytenr;
8602         u64 generation;
8603         u64 parent;
8604         u32 blocksize;
8605         struct btrfs_key key;
8606         struct btrfs_key first_key;
8607         struct extent_buffer *next;
8608         int level = wc->level;
8609         int reada = 0;
8610         int ret = 0;
8611         bool need_account = false;
8612
8613         generation = btrfs_node_ptr_generation(path->nodes[level],
8614                                                path->slots[level]);
8615         /*
8616          * if the lower level block was created before the snapshot
8617          * was created, we know there is no need to update back refs
8618          * for the subtree
8619          */
8620         if (wc->stage == UPDATE_BACKREF &&
8621             generation <= root->root_key.offset) {
8622                 *lookup_info = 1;
8623                 return 1;
8624         }
8625
8626         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8627         btrfs_node_key_to_cpu(path->nodes[level], &first_key,
8628                               path->slots[level]);
8629         blocksize = fs_info->nodesize;
8630
8631         next = find_extent_buffer(fs_info, bytenr);
8632         if (!next) {
8633                 next = btrfs_find_create_tree_block(fs_info, bytenr);
8634                 if (IS_ERR(next))
8635                         return PTR_ERR(next);
8636
8637                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8638                                                level - 1);
8639                 reada = 1;
8640         }
8641         btrfs_tree_lock(next);
8642         btrfs_set_lock_blocking(next);
8643
8644         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8645                                        &wc->refs[level - 1],
8646                                        &wc->flags[level - 1]);
8647         if (ret < 0)
8648                 goto out_unlock;
8649
8650         if (unlikely(wc->refs[level - 1] == 0)) {
8651                 btrfs_err(fs_info, "Missing references.");
8652                 ret = -EIO;
8653                 goto out_unlock;
8654         }
8655         *lookup_info = 0;
8656
8657         if (wc->stage == DROP_REFERENCE) {
8658                 if (wc->refs[level - 1] > 1) {
8659                         need_account = true;
8660                         if (level == 1 &&
8661                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8662                                 goto skip;
8663
8664                         if (!wc->update_ref ||
8665                             generation <= root->root_key.offset)
8666                                 goto skip;
8667
8668                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8669                                               path->slots[level]);
8670                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8671                         if (ret < 0)
8672                                 goto skip;
8673
8674                         wc->stage = UPDATE_BACKREF;
8675                         wc->shared_level = level - 1;
8676                 }
8677         } else {
8678                 if (level == 1 &&
8679                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8680                         goto skip;
8681         }
8682
8683         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8684                 btrfs_tree_unlock(next);
8685                 free_extent_buffer(next);
8686                 next = NULL;
8687                 *lookup_info = 1;
8688         }
8689
8690         if (!next) {
8691                 if (reada && level == 1)
8692                         reada_walk_down(trans, root, wc, path);
8693                 next = read_tree_block(fs_info, bytenr, generation, level - 1,
8694                                        &first_key);
8695                 if (IS_ERR(next)) {
8696                         return PTR_ERR(next);
8697                 } else if (!extent_buffer_uptodate(next)) {
8698                         free_extent_buffer(next);
8699                         return -EIO;
8700                 }
8701                 btrfs_tree_lock(next);
8702                 btrfs_set_lock_blocking(next);
8703         }
8704
8705         level--;
8706         ASSERT(level == btrfs_header_level(next));
8707         if (level != btrfs_header_level(next)) {
8708                 btrfs_err(root->fs_info, "mismatched level");
8709                 ret = -EIO;
8710                 goto out_unlock;
8711         }
8712         path->nodes[level] = next;
8713         path->slots[level] = 0;
8714         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8715         wc->level = level;
8716         if (wc->level == 1)
8717                 wc->reada_slot = 0;
8718         return 0;
8719 skip:
8720         wc->refs[level - 1] = 0;
8721         wc->flags[level - 1] = 0;
8722         if (wc->stage == DROP_REFERENCE) {
8723                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8724                         parent = path->nodes[level]->start;
8725                 } else {
8726                         ASSERT(root->root_key.objectid ==
8727                                btrfs_header_owner(path->nodes[level]));
8728                         if (root->root_key.objectid !=
8729                             btrfs_header_owner(path->nodes[level])) {
8730                                 btrfs_err(root->fs_info,
8731                                                 "mismatched block owner");
8732                                 ret = -EIO;
8733                                 goto out_unlock;
8734                         }
8735                         parent = 0;
8736                 }
8737
8738                 if (need_account) {
8739                         ret = btrfs_qgroup_trace_subtree(trans, next,
8740                                                          generation, level - 1);
8741                         if (ret) {
8742                                 btrfs_err_rl(fs_info,
8743                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8744                                              ret);
8745                         }
8746                 }
8747                 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
8748                                         parent, root->root_key.objectid,
8749                                         level - 1, 0);
8750                 if (ret)
8751                         goto out_unlock;
8752         }
8753
8754         *lookup_info = 1;
8755         ret = 1;
8756
8757 out_unlock:
8758         btrfs_tree_unlock(next);
8759         free_extent_buffer(next);
8760
8761         return ret;
8762 }
8763
8764 /*
8765  * helper to process tree block while walking up the tree.
8766  *
8767  * when wc->stage == DROP_REFERENCE, this function drops
8768  * reference count on the block.
8769  *
8770  * when wc->stage == UPDATE_BACKREF, this function changes
8771  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8772  * to UPDATE_BACKREF previously while processing the block.
8773  *
8774  * NOTE: return value 1 means we should stop walking up.
8775  */
8776 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8777                                  struct btrfs_root *root,
8778                                  struct btrfs_path *path,
8779                                  struct walk_control *wc)
8780 {
8781         struct btrfs_fs_info *fs_info = root->fs_info;
8782         int ret;
8783         int level = wc->level;
8784         struct extent_buffer *eb = path->nodes[level];
8785         u64 parent = 0;
8786
8787         if (wc->stage == UPDATE_BACKREF) {
8788                 BUG_ON(wc->shared_level < level);
8789                 if (level < wc->shared_level)
8790                         goto out;
8791
8792                 ret = find_next_key(path, level + 1, &wc->update_progress);
8793                 if (ret > 0)
8794                         wc->update_ref = 0;
8795
8796                 wc->stage = DROP_REFERENCE;
8797                 wc->shared_level = -1;
8798                 path->slots[level] = 0;
8799
8800                 /*
8801                  * check reference count again if the block isn't locked.
8802                  * we should start walking down the tree again if reference
8803                  * count is one.
8804                  */
8805                 if (!path->locks[level]) {
8806                         BUG_ON(level == 0);
8807                         btrfs_tree_lock(eb);
8808                         btrfs_set_lock_blocking(eb);
8809                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8810
8811                         ret = btrfs_lookup_extent_info(trans, fs_info,
8812                                                        eb->start, level, 1,
8813                                                        &wc->refs[level],
8814                                                        &wc->flags[level]);
8815                         if (ret < 0) {
8816                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8817                                 path->locks[level] = 0;
8818                                 return ret;
8819                         }
8820                         BUG_ON(wc->refs[level] == 0);
8821                         if (wc->refs[level] == 1) {
8822                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8823                                 path->locks[level] = 0;
8824                                 return 1;
8825                         }
8826                 }
8827         }
8828
8829         /* wc->stage == DROP_REFERENCE */
8830         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8831
8832         if (wc->refs[level] == 1) {
8833                 if (level == 0) {
8834                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8835                                 ret = btrfs_dec_ref(trans, root, eb, 1);
8836                         else
8837                                 ret = btrfs_dec_ref(trans, root, eb, 0);
8838                         BUG_ON(ret); /* -ENOMEM */
8839                         ret = btrfs_qgroup_trace_leaf_items(trans, eb);
8840                         if (ret) {
8841                                 btrfs_err_rl(fs_info,
8842                                              "error %d accounting leaf items. Quota is out of sync, rescan required.",
8843                                              ret);
8844                         }
8845                 }
8846                 /* make block locked assertion in clean_tree_block happy */
8847                 if (!path->locks[level] &&
8848                     btrfs_header_generation(eb) == trans->transid) {
8849                         btrfs_tree_lock(eb);
8850                         btrfs_set_lock_blocking(eb);
8851                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8852                 }
8853                 clean_tree_block(fs_info, eb);
8854         }
8855
8856         if (eb == root->node) {
8857                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8858                         parent = eb->start;
8859                 else if (root->root_key.objectid != btrfs_header_owner(eb))
8860                         goto owner_mismatch;
8861         } else {
8862                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8863                         parent = path->nodes[level + 1]->start;
8864                 else if (root->root_key.objectid !=
8865                          btrfs_header_owner(path->nodes[level + 1]))
8866                         goto owner_mismatch;
8867         }
8868
8869         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8870 out:
8871         wc->refs[level] = 0;
8872         wc->flags[level] = 0;
8873         return 0;
8874
8875 owner_mismatch:
8876         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
8877                      btrfs_header_owner(eb), root->root_key.objectid);
8878         return -EUCLEAN;
8879 }
8880
8881 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8882                                    struct btrfs_root *root,
8883                                    struct btrfs_path *path,
8884                                    struct walk_control *wc)
8885 {
8886         int level = wc->level;
8887         int lookup_info = 1;
8888         int ret;
8889
8890         while (level >= 0) {
8891                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8892                 if (ret > 0)
8893                         break;
8894
8895                 if (level == 0)
8896                         break;
8897
8898                 if (path->slots[level] >=
8899                     btrfs_header_nritems(path->nodes[level]))
8900                         break;
8901
8902                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8903                 if (ret > 0) {
8904                         path->slots[level]++;
8905                         continue;
8906                 } else if (ret < 0)
8907                         return ret;
8908                 level = wc->level;
8909         }
8910         return 0;
8911 }
8912
8913 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8914                                  struct btrfs_root *root,
8915                                  struct btrfs_path *path,
8916                                  struct walk_control *wc, int max_level)
8917 {
8918         int level = wc->level;
8919         int ret;
8920
8921         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8922         while (level < max_level && path->nodes[level]) {
8923                 wc->level = level;
8924                 if (path->slots[level] + 1 <
8925                     btrfs_header_nritems(path->nodes[level])) {
8926                         path->slots[level]++;
8927                         return 0;
8928                 } else {
8929                         ret = walk_up_proc(trans, root, path, wc);
8930                         if (ret > 0)
8931                                 return 0;
8932                         if (ret < 0)
8933                                 return ret;
8934
8935                         if (path->locks[level]) {
8936                                 btrfs_tree_unlock_rw(path->nodes[level],
8937                                                      path->locks[level]);
8938                                 path->locks[level] = 0;
8939                         }
8940                         free_extent_buffer(path->nodes[level]);
8941                         path->nodes[level] = NULL;
8942                         level++;
8943                 }
8944         }
8945         return 1;
8946 }
8947
8948 /*
8949  * drop a subvolume tree.
8950  *
8951  * this function traverses the tree freeing any blocks that only
8952  * referenced by the tree.
8953  *
8954  * when a shared tree block is found. this function decreases its
8955  * reference count by one. if update_ref is true, this function
8956  * also make sure backrefs for the shared block and all lower level
8957  * blocks are properly updated.
8958  *
8959  * If called with for_reloc == 0, may exit early with -EAGAIN
8960  */
8961 int btrfs_drop_snapshot(struct btrfs_root *root,
8962                          struct btrfs_block_rsv *block_rsv, int update_ref,
8963                          int for_reloc)
8964 {
8965         struct btrfs_fs_info *fs_info = root->fs_info;
8966         struct btrfs_path *path;
8967         struct btrfs_trans_handle *trans;
8968         struct btrfs_root *tree_root = fs_info->tree_root;
8969         struct btrfs_root_item *root_item = &root->root_item;
8970         struct walk_control *wc;
8971         struct btrfs_key key;
8972         int err = 0;
8973         int ret;
8974         int level;
8975         bool root_dropped = false;
8976
8977         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
8978
8979         path = btrfs_alloc_path();
8980         if (!path) {
8981                 err = -ENOMEM;
8982                 goto out;
8983         }
8984
8985         wc = kzalloc(sizeof(*wc), GFP_NOFS);
8986         if (!wc) {
8987                 btrfs_free_path(path);
8988                 err = -ENOMEM;
8989                 goto out;
8990         }
8991
8992         trans = btrfs_start_transaction(tree_root, 0);
8993         if (IS_ERR(trans)) {
8994                 err = PTR_ERR(trans);
8995                 goto out_free;
8996         }
8997
8998         if (block_rsv)
8999                 trans->block_rsv = block_rsv;
9000
9001         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9002                 level = btrfs_header_level(root->node);
9003                 path->nodes[level] = btrfs_lock_root_node(root);
9004                 btrfs_set_lock_blocking(path->nodes[level]);
9005                 path->slots[level] = 0;
9006                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9007                 memset(&wc->update_progress, 0,
9008                        sizeof(wc->update_progress));
9009         } else {
9010                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9011                 memcpy(&wc->update_progress, &key,
9012                        sizeof(wc->update_progress));
9013
9014                 level = root_item->drop_level;
9015                 BUG_ON(level == 0);
9016                 path->lowest_level = level;
9017                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9018                 path->lowest_level = 0;
9019                 if (ret < 0) {
9020                         err = ret;
9021                         goto out_end_trans;
9022                 }
9023                 WARN_ON(ret > 0);
9024
9025                 /*
9026                  * unlock our path, this is safe because only this
9027                  * function is allowed to delete this snapshot
9028                  */
9029                 btrfs_unlock_up_safe(path, 0);
9030
9031                 level = btrfs_header_level(root->node);
9032                 while (1) {
9033                         btrfs_tree_lock(path->nodes[level]);
9034                         btrfs_set_lock_blocking(path->nodes[level]);
9035                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9036
9037                         ret = btrfs_lookup_extent_info(trans, fs_info,
9038                                                 path->nodes[level]->start,
9039                                                 level, 1, &wc->refs[level],
9040                                                 &wc->flags[level]);
9041                         if (ret < 0) {
9042                                 err = ret;
9043                                 goto out_end_trans;
9044                         }
9045                         BUG_ON(wc->refs[level] == 0);
9046
9047                         if (level == root_item->drop_level)
9048                                 break;
9049
9050                         btrfs_tree_unlock(path->nodes[level]);
9051                         path->locks[level] = 0;
9052                         WARN_ON(wc->refs[level] != 1);
9053                         level--;
9054                 }
9055         }
9056
9057         wc->level = level;
9058         wc->shared_level = -1;
9059         wc->stage = DROP_REFERENCE;
9060         wc->update_ref = update_ref;
9061         wc->keep_locks = 0;
9062         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9063
9064         while (1) {
9065
9066                 ret = walk_down_tree(trans, root, path, wc);
9067                 if (ret < 0) {
9068                         err = ret;
9069                         break;
9070                 }
9071
9072                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9073                 if (ret < 0) {
9074                         err = ret;
9075                         break;
9076                 }
9077
9078                 if (ret > 0) {
9079                         BUG_ON(wc->stage != DROP_REFERENCE);
9080                         break;
9081                 }
9082
9083                 if (wc->stage == DROP_REFERENCE) {
9084                         level = wc->level;
9085                         btrfs_node_key(path->nodes[level],
9086                                        &root_item->drop_progress,
9087                                        path->slots[level]);
9088                         root_item->drop_level = level;
9089                 }
9090
9091                 BUG_ON(wc->level == 0);
9092                 if (btrfs_should_end_transaction(trans) ||
9093                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9094                         ret = btrfs_update_root(trans, tree_root,
9095                                                 &root->root_key,
9096                                                 root_item);
9097                         if (ret) {
9098                                 btrfs_abort_transaction(trans, ret);
9099                                 err = ret;
9100                                 goto out_end_trans;
9101                         }
9102
9103                         btrfs_end_transaction_throttle(trans);
9104                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9105                                 btrfs_debug(fs_info,
9106                                             "drop snapshot early exit");
9107                                 err = -EAGAIN;
9108                                 goto out_free;
9109                         }
9110
9111                         trans = btrfs_start_transaction(tree_root, 0);
9112                         if (IS_ERR(trans)) {
9113                                 err = PTR_ERR(trans);
9114                                 goto out_free;
9115                         }
9116                         if (block_rsv)
9117                                 trans->block_rsv = block_rsv;
9118                 }
9119         }
9120         btrfs_release_path(path);
9121         if (err)
9122                 goto out_end_trans;
9123
9124         ret = btrfs_del_root(trans, &root->root_key);
9125         if (ret) {
9126                 btrfs_abort_transaction(trans, ret);
9127                 err = ret;
9128                 goto out_end_trans;
9129         }
9130
9131         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9132                 ret = btrfs_find_root(tree_root, &root->root_key, path,
9133                                       NULL, NULL);
9134                 if (ret < 0) {
9135                         btrfs_abort_transaction(trans, ret);
9136                         err = ret;
9137                         goto out_end_trans;
9138                 } else if (ret > 0) {
9139                         /* if we fail to delete the orphan item this time
9140                          * around, it'll get picked up the next time.
9141                          *
9142                          * The most common failure here is just -ENOENT.
9143                          */
9144                         btrfs_del_orphan_item(trans, tree_root,
9145                                               root->root_key.objectid);
9146                 }
9147         }
9148
9149         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9150                 btrfs_add_dropped_root(trans, root);
9151         } else {
9152                 free_extent_buffer(root->node);
9153                 free_extent_buffer(root->commit_root);
9154                 btrfs_put_fs_root(root);
9155         }
9156         root_dropped = true;
9157 out_end_trans:
9158         btrfs_end_transaction_throttle(trans);
9159 out_free:
9160         kfree(wc);
9161         btrfs_free_path(path);
9162 out:
9163         /*
9164          * So if we need to stop dropping the snapshot for whatever reason we
9165          * need to make sure to add it back to the dead root list so that we
9166          * keep trying to do the work later.  This also cleans up roots if we
9167          * don't have it in the radix (like when we recover after a power fail
9168          * or unmount) so we don't leak memory.
9169          */
9170         if (!for_reloc && !root_dropped)
9171                 btrfs_add_dead_root(root);
9172         if (err && err != -EAGAIN)
9173                 btrfs_handle_fs_error(fs_info, err, NULL);
9174         return err;
9175 }
9176
9177 /*
9178  * drop subtree rooted at tree block 'node'.
9179  *
9180  * NOTE: this function will unlock and release tree block 'node'
9181  * only used by relocation code
9182  */
9183 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9184                         struct btrfs_root *root,
9185                         struct extent_buffer *node,
9186                         struct extent_buffer *parent)
9187 {
9188         struct btrfs_fs_info *fs_info = root->fs_info;
9189         struct btrfs_path *path;
9190         struct walk_control *wc;
9191         int level;
9192         int parent_level;
9193         int ret = 0;
9194         int wret;
9195
9196         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9197
9198         path = btrfs_alloc_path();
9199         if (!path)
9200                 return -ENOMEM;
9201
9202         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9203         if (!wc) {
9204                 btrfs_free_path(path);
9205                 return -ENOMEM;
9206         }
9207
9208         btrfs_assert_tree_locked(parent);
9209         parent_level = btrfs_header_level(parent);
9210         extent_buffer_get(parent);
9211         path->nodes[parent_level] = parent;
9212         path->slots[parent_level] = btrfs_header_nritems(parent);
9213
9214         btrfs_assert_tree_locked(node);
9215         level = btrfs_header_level(node);
9216         path->nodes[level] = node;
9217         path->slots[level] = 0;
9218         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9219
9220         wc->refs[parent_level] = 1;
9221         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9222         wc->level = level;
9223         wc->shared_level = -1;
9224         wc->stage = DROP_REFERENCE;
9225         wc->update_ref = 0;
9226         wc->keep_locks = 1;
9227         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9228
9229         while (1) {
9230                 wret = walk_down_tree(trans, root, path, wc);
9231                 if (wret < 0) {
9232                         ret = wret;
9233                         break;
9234                 }
9235
9236                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9237                 if (wret < 0)
9238                         ret = wret;
9239                 if (wret != 0)
9240                         break;
9241         }
9242
9243         kfree(wc);
9244         btrfs_free_path(path);
9245         return ret;
9246 }
9247
9248 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9249 {
9250         u64 num_devices;
9251         u64 stripped;
9252
9253         /*
9254          * if restripe for this chunk_type is on pick target profile and
9255          * return, otherwise do the usual balance
9256          */
9257         stripped = get_restripe_target(fs_info, flags);
9258         if (stripped)
9259                 return extended_to_chunk(stripped);
9260
9261         num_devices = fs_info->fs_devices->rw_devices;
9262
9263         stripped = BTRFS_BLOCK_GROUP_RAID0 |
9264                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9265                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9266
9267         if (num_devices == 1) {
9268                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9269                 stripped = flags & ~stripped;
9270
9271                 /* turn raid0 into single device chunks */
9272                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9273                         return stripped;
9274
9275                 /* turn mirroring into duplication */
9276                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9277                              BTRFS_BLOCK_GROUP_RAID10))
9278                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9279         } else {
9280                 /* they already had raid on here, just return */
9281                 if (flags & stripped)
9282                         return flags;
9283
9284                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9285                 stripped = flags & ~stripped;
9286
9287                 /* switch duplicated blocks with raid1 */
9288                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9289                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9290
9291                 /* this is drive concat, leave it alone */
9292         }
9293
9294         return flags;
9295 }
9296
9297 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9298 {
9299         struct btrfs_space_info *sinfo = cache->space_info;
9300         u64 num_bytes;
9301         u64 min_allocable_bytes;
9302         int ret = -ENOSPC;
9303
9304         /*
9305          * We need some metadata space and system metadata space for
9306          * allocating chunks in some corner cases until we force to set
9307          * it to be readonly.
9308          */
9309         if ((sinfo->flags &
9310              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9311             !force)
9312                 min_allocable_bytes = SZ_1M;
9313         else
9314                 min_allocable_bytes = 0;
9315
9316         spin_lock(&sinfo->lock);
9317         spin_lock(&cache->lock);
9318
9319         if (cache->ro) {
9320                 cache->ro++;
9321                 ret = 0;
9322                 goto out;
9323         }
9324
9325         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9326                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9327
9328         if (btrfs_space_info_used(sinfo, true) + num_bytes +
9329             min_allocable_bytes <= sinfo->total_bytes) {
9330                 sinfo->bytes_readonly += num_bytes;
9331                 cache->ro++;
9332                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9333                 ret = 0;
9334         }
9335 out:
9336         spin_unlock(&cache->lock);
9337         spin_unlock(&sinfo->lock);
9338         return ret;
9339 }
9340
9341 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9342
9343 {
9344         struct btrfs_fs_info *fs_info = cache->fs_info;
9345         struct btrfs_trans_handle *trans;
9346         u64 alloc_flags;
9347         int ret;
9348
9349 again:
9350         trans = btrfs_join_transaction(fs_info->extent_root);
9351         if (IS_ERR(trans))
9352                 return PTR_ERR(trans);
9353
9354         /*
9355          * we're not allowed to set block groups readonly after the dirty
9356          * block groups cache has started writing.  If it already started,
9357          * back off and let this transaction commit
9358          */
9359         mutex_lock(&fs_info->ro_block_group_mutex);
9360         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9361                 u64 transid = trans->transid;
9362
9363                 mutex_unlock(&fs_info->ro_block_group_mutex);
9364                 btrfs_end_transaction(trans);
9365
9366                 ret = btrfs_wait_for_commit(fs_info, transid);
9367                 if (ret)
9368                         return ret;
9369                 goto again;
9370         }
9371
9372         /*
9373          * if we are changing raid levels, try to allocate a corresponding
9374          * block group with the new raid level.
9375          */
9376         alloc_flags = update_block_group_flags(fs_info, cache->flags);
9377         if (alloc_flags != cache->flags) {
9378                 ret = do_chunk_alloc(trans, alloc_flags,
9379                                      CHUNK_ALLOC_FORCE);
9380                 /*
9381                  * ENOSPC is allowed here, we may have enough space
9382                  * already allocated at the new raid level to
9383                  * carry on
9384                  */
9385                 if (ret == -ENOSPC)
9386                         ret = 0;
9387                 if (ret < 0)
9388                         goto out;
9389         }
9390
9391         ret = inc_block_group_ro(cache, 0);
9392         if (!ret)
9393                 goto out;
9394         alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9395         ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9396         if (ret < 0)
9397                 goto out;
9398         ret = inc_block_group_ro(cache, 0);
9399 out:
9400         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9401                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9402                 mutex_lock(&fs_info->chunk_mutex);
9403                 check_system_chunk(trans, alloc_flags);
9404                 mutex_unlock(&fs_info->chunk_mutex);
9405         }
9406         mutex_unlock(&fs_info->ro_block_group_mutex);
9407
9408         btrfs_end_transaction(trans);
9409         return ret;
9410 }
9411
9412 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9413 {
9414         u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9415
9416         return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9417 }
9418
9419 /*
9420  * helper to account the unused space of all the readonly block group in the
9421  * space_info. takes mirrors into account.
9422  */
9423 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9424 {
9425         struct btrfs_block_group_cache *block_group;
9426         u64 free_bytes = 0;
9427         int factor;
9428
9429         /* It's df, we don't care if it's racy */
9430         if (list_empty(&sinfo->ro_bgs))
9431                 return 0;
9432
9433         spin_lock(&sinfo->lock);
9434         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9435                 spin_lock(&block_group->lock);
9436
9437                 if (!block_group->ro) {
9438                         spin_unlock(&block_group->lock);
9439                         continue;
9440                 }
9441
9442                 factor = btrfs_bg_type_to_factor(block_group->flags);
9443                 free_bytes += (block_group->key.offset -
9444                                btrfs_block_group_used(&block_group->item)) *
9445                                factor;
9446
9447                 spin_unlock(&block_group->lock);
9448         }
9449         spin_unlock(&sinfo->lock);
9450
9451         return free_bytes;
9452 }
9453
9454 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9455 {
9456         struct btrfs_space_info *sinfo = cache->space_info;
9457         u64 num_bytes;
9458
9459         BUG_ON(!cache->ro);
9460
9461         spin_lock(&sinfo->lock);
9462         spin_lock(&cache->lock);
9463         if (!--cache->ro) {
9464                 num_bytes = cache->key.offset - cache->reserved -
9465                             cache->pinned - cache->bytes_super -
9466                             btrfs_block_group_used(&cache->item);
9467                 sinfo->bytes_readonly -= num_bytes;
9468                 list_del_init(&cache->ro_list);
9469         }
9470         spin_unlock(&cache->lock);
9471         spin_unlock(&sinfo->lock);
9472 }
9473
9474 /*
9475  * checks to see if its even possible to relocate this block group.
9476  *
9477  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9478  * ok to go ahead and try.
9479  */
9480 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9481 {
9482         struct btrfs_root *root = fs_info->extent_root;
9483         struct btrfs_block_group_cache *block_group;
9484         struct btrfs_space_info *space_info;
9485         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9486         struct btrfs_device *device;
9487         struct btrfs_trans_handle *trans;
9488         u64 min_free;
9489         u64 dev_min = 1;
9490         u64 dev_nr = 0;
9491         u64 target;
9492         int debug;
9493         int index;
9494         int full = 0;
9495         int ret = 0;
9496
9497         debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9498
9499         block_group = btrfs_lookup_block_group(fs_info, bytenr);
9500
9501         /* odd, couldn't find the block group, leave it alone */
9502         if (!block_group) {
9503                 if (debug)
9504                         btrfs_warn(fs_info,
9505                                    "can't find block group for bytenr %llu",
9506                                    bytenr);
9507                 return -1;
9508         }
9509
9510         min_free = btrfs_block_group_used(&block_group->item);
9511
9512         /* no bytes used, we're good */
9513         if (!min_free)
9514                 goto out;
9515
9516         space_info = block_group->space_info;
9517         spin_lock(&space_info->lock);
9518
9519         full = space_info->full;
9520
9521         /*
9522          * if this is the last block group we have in this space, we can't
9523          * relocate it unless we're able to allocate a new chunk below.
9524          *
9525          * Otherwise, we need to make sure we have room in the space to handle
9526          * all of the extents from this block group.  If we can, we're good
9527          */
9528         if ((space_info->total_bytes != block_group->key.offset) &&
9529             (btrfs_space_info_used(space_info, false) + min_free <
9530              space_info->total_bytes)) {
9531                 spin_unlock(&space_info->lock);
9532                 goto out;
9533         }
9534         spin_unlock(&space_info->lock);
9535
9536         /*
9537          * ok we don't have enough space, but maybe we have free space on our
9538          * devices to allocate new chunks for relocation, so loop through our
9539          * alloc devices and guess if we have enough space.  if this block
9540          * group is going to be restriped, run checks against the target
9541          * profile instead of the current one.
9542          */
9543         ret = -1;
9544
9545         /*
9546          * index:
9547          *      0: raid10
9548          *      1: raid1
9549          *      2: dup
9550          *      3: raid0
9551          *      4: single
9552          */
9553         target = get_restripe_target(fs_info, block_group->flags);
9554         if (target) {
9555                 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9556         } else {
9557                 /*
9558                  * this is just a balance, so if we were marked as full
9559                  * we know there is no space for a new chunk
9560                  */
9561                 if (full) {
9562                         if (debug)
9563                                 btrfs_warn(fs_info,
9564                                            "no space to alloc new chunk for block group %llu",
9565                                            block_group->key.objectid);
9566                         goto out;
9567                 }
9568
9569                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
9570         }
9571
9572         if (index == BTRFS_RAID_RAID10) {
9573                 dev_min = 4;
9574                 /* Divide by 2 */
9575                 min_free >>= 1;
9576         } else if (index == BTRFS_RAID_RAID1) {
9577                 dev_min = 2;
9578         } else if (index == BTRFS_RAID_DUP) {
9579                 /* Multiply by 2 */
9580                 min_free <<= 1;
9581         } else if (index == BTRFS_RAID_RAID0) {
9582                 dev_min = fs_devices->rw_devices;
9583                 min_free = div64_u64(min_free, dev_min);
9584         }
9585
9586         /* We need to do this so that we can look at pending chunks */
9587         trans = btrfs_join_transaction(root);
9588         if (IS_ERR(trans)) {
9589                 ret = PTR_ERR(trans);
9590                 goto out;
9591         }
9592
9593         mutex_lock(&fs_info->chunk_mutex);
9594         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9595                 u64 dev_offset;
9596
9597                 /*
9598                  * check to make sure we can actually find a chunk with enough
9599                  * space to fit our block group in.
9600                  */
9601                 if (device->total_bytes > device->bytes_used + min_free &&
9602                     !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9603                         ret = find_free_dev_extent(trans, device, min_free,
9604                                                    &dev_offset, NULL);
9605                         if (!ret)
9606                                 dev_nr++;
9607
9608                         if (dev_nr >= dev_min)
9609                                 break;
9610
9611                         ret = -1;
9612                 }
9613         }
9614         if (debug && ret == -1)
9615                 btrfs_warn(fs_info,
9616                            "no space to allocate a new chunk for block group %llu",
9617                            block_group->key.objectid);
9618         mutex_unlock(&fs_info->chunk_mutex);
9619         btrfs_end_transaction(trans);
9620 out:
9621         btrfs_put_block_group(block_group);
9622         return ret;
9623 }
9624
9625 static int find_first_block_group(struct btrfs_fs_info *fs_info,
9626                                   struct btrfs_path *path,
9627                                   struct btrfs_key *key)
9628 {
9629         struct btrfs_root *root = fs_info->extent_root;
9630         int ret = 0;
9631         struct btrfs_key found_key;
9632         struct extent_buffer *leaf;
9633         struct btrfs_block_group_item bg;
9634         u64 flags;
9635         int slot;
9636
9637         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9638         if (ret < 0)
9639                 goto out;
9640
9641         while (1) {
9642                 slot = path->slots[0];
9643                 leaf = path->nodes[0];
9644                 if (slot >= btrfs_header_nritems(leaf)) {
9645                         ret = btrfs_next_leaf(root, path);
9646                         if (ret == 0)
9647                                 continue;
9648                         if (ret < 0)
9649                                 goto out;
9650                         break;
9651                 }
9652                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9653
9654                 if (found_key.objectid >= key->objectid &&
9655                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9656                         struct extent_map_tree *em_tree;
9657                         struct extent_map *em;
9658
9659                         em_tree = &root->fs_info->mapping_tree.map_tree;
9660                         read_lock(&em_tree->lock);
9661                         em = lookup_extent_mapping(em_tree, found_key.objectid,
9662                                                    found_key.offset);
9663                         read_unlock(&em_tree->lock);
9664                         if (!em) {
9665                                 btrfs_err(fs_info,
9666                         "logical %llu len %llu found bg but no related chunk",
9667                                           found_key.objectid, found_key.offset);
9668                                 ret = -ENOENT;
9669                         } else if (em->start != found_key.objectid ||
9670                                    em->len != found_key.offset) {
9671                                 btrfs_err(fs_info,
9672                 "block group %llu len %llu mismatch with chunk %llu len %llu",
9673                                           found_key.objectid, found_key.offset,
9674                                           em->start, em->len);
9675                                 ret = -EUCLEAN;
9676                         } else {
9677                                 read_extent_buffer(leaf, &bg,
9678                                         btrfs_item_ptr_offset(leaf, slot),
9679                                         sizeof(bg));
9680                                 flags = btrfs_block_group_flags(&bg) &
9681                                         BTRFS_BLOCK_GROUP_TYPE_MASK;
9682
9683                                 if (flags != (em->map_lookup->type &
9684                                               BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9685                                         btrfs_err(fs_info,
9686 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9687                                                 found_key.objectid,
9688                                                 found_key.offset, flags,
9689                                                 (BTRFS_BLOCK_GROUP_TYPE_MASK &
9690                                                  em->map_lookup->type));
9691                                         ret = -EUCLEAN;
9692                                 } else {
9693                                         ret = 0;
9694                                 }
9695                         }
9696                         free_extent_map(em);
9697                         goto out;
9698                 }
9699                 path->slots[0]++;
9700         }
9701 out:
9702         return ret;
9703 }
9704
9705 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9706 {
9707         struct btrfs_block_group_cache *block_group;
9708         u64 last = 0;
9709
9710         while (1) {
9711                 struct inode *inode;
9712
9713                 block_group = btrfs_lookup_first_block_group(info, last);
9714                 while (block_group) {
9715                         wait_block_group_cache_done(block_group);
9716                         spin_lock(&block_group->lock);
9717                         if (block_group->iref)
9718                                 break;
9719                         spin_unlock(&block_group->lock);
9720                         block_group = next_block_group(info, block_group);
9721                 }
9722                 if (!block_group) {
9723                         if (last == 0)
9724                                 break;
9725                         last = 0;
9726                         continue;
9727                 }
9728
9729                 inode = block_group->inode;
9730                 block_group->iref = 0;
9731                 block_group->inode = NULL;
9732                 spin_unlock(&block_group->lock);
9733                 ASSERT(block_group->io_ctl.inode == NULL);
9734                 iput(inode);
9735                 last = block_group->key.objectid + block_group->key.offset;
9736                 btrfs_put_block_group(block_group);
9737         }
9738 }
9739
9740 /*
9741  * Must be called only after stopping all workers, since we could have block
9742  * group caching kthreads running, and therefore they could race with us if we
9743  * freed the block groups before stopping them.
9744  */
9745 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9746 {
9747         struct btrfs_block_group_cache *block_group;
9748         struct btrfs_space_info *space_info;
9749         struct btrfs_caching_control *caching_ctl;
9750         struct rb_node *n;
9751
9752         down_write(&info->commit_root_sem);
9753         while (!list_empty(&info->caching_block_groups)) {
9754                 caching_ctl = list_entry(info->caching_block_groups.next,
9755                                          struct btrfs_caching_control, list);
9756                 list_del(&caching_ctl->list);
9757                 put_caching_control(caching_ctl);
9758         }
9759         up_write(&info->commit_root_sem);
9760
9761         spin_lock(&info->unused_bgs_lock);
9762         while (!list_empty(&info->unused_bgs)) {
9763                 block_group = list_first_entry(&info->unused_bgs,
9764                                                struct btrfs_block_group_cache,
9765                                                bg_list);
9766                 list_del_init(&block_group->bg_list);
9767                 btrfs_put_block_group(block_group);
9768         }
9769         spin_unlock(&info->unused_bgs_lock);
9770
9771         spin_lock(&info->block_group_cache_lock);
9772         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9773                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9774                                        cache_node);
9775                 rb_erase(&block_group->cache_node,
9776                          &info->block_group_cache_tree);
9777                 RB_CLEAR_NODE(&block_group->cache_node);
9778                 spin_unlock(&info->block_group_cache_lock);
9779
9780                 down_write(&block_group->space_info->groups_sem);
9781                 list_del(&block_group->list);
9782                 up_write(&block_group->space_info->groups_sem);
9783
9784                 /*
9785                  * We haven't cached this block group, which means we could
9786                  * possibly have excluded extents on this block group.
9787                  */
9788                 if (block_group->cached == BTRFS_CACHE_NO ||
9789                     block_group->cached == BTRFS_CACHE_ERROR)
9790                         free_excluded_extents(block_group);
9791
9792                 btrfs_remove_free_space_cache(block_group);
9793                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9794                 ASSERT(list_empty(&block_group->dirty_list));
9795                 ASSERT(list_empty(&block_group->io_list));
9796                 ASSERT(list_empty(&block_group->bg_list));
9797                 ASSERT(atomic_read(&block_group->count) == 1);
9798                 btrfs_put_block_group(block_group);
9799
9800                 spin_lock(&info->block_group_cache_lock);
9801         }
9802         spin_unlock(&info->block_group_cache_lock);
9803
9804         /* now that all the block groups are freed, go through and
9805          * free all the space_info structs.  This is only called during
9806          * the final stages of unmount, and so we know nobody is
9807          * using them.  We call synchronize_rcu() once before we start,
9808          * just to be on the safe side.
9809          */
9810         synchronize_rcu();
9811
9812         release_global_block_rsv(info);
9813
9814         while (!list_empty(&info->space_info)) {
9815                 int i;
9816
9817                 space_info = list_entry(info->space_info.next,
9818                                         struct btrfs_space_info,
9819                                         list);
9820
9821                 /*
9822                  * Do not hide this behind enospc_debug, this is actually
9823                  * important and indicates a real bug if this happens.
9824                  */
9825                 if (WARN_ON(space_info->bytes_pinned > 0 ||
9826                             space_info->bytes_reserved > 0 ||
9827                             space_info->bytes_may_use > 0))
9828                         dump_space_info(info, space_info, 0, 0);
9829                 list_del(&space_info->list);
9830                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9831                         struct kobject *kobj;
9832                         kobj = space_info->block_group_kobjs[i];
9833                         space_info->block_group_kobjs[i] = NULL;
9834                         if (kobj) {
9835                                 kobject_del(kobj);
9836                                 kobject_put(kobj);
9837                         }
9838                 }
9839                 kobject_del(&space_info->kobj);
9840                 kobject_put(&space_info->kobj);
9841         }
9842         return 0;
9843 }
9844
9845 /* link_block_group will queue up kobjects to add when we're reclaim-safe */
9846 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9847 {
9848         struct btrfs_space_info *space_info;
9849         struct raid_kobject *rkobj;
9850         LIST_HEAD(list);
9851         int index;
9852         int ret = 0;
9853
9854         spin_lock(&fs_info->pending_raid_kobjs_lock);
9855         list_splice_init(&fs_info->pending_raid_kobjs, &list);
9856         spin_unlock(&fs_info->pending_raid_kobjs_lock);
9857
9858         list_for_each_entry(rkobj, &list, list) {
9859                 space_info = __find_space_info(fs_info, rkobj->flags);
9860                 index = btrfs_bg_flags_to_raid_index(rkobj->flags);
9861
9862                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9863                                   "%s", get_raid_name(index));
9864                 if (ret) {
9865                         kobject_put(&rkobj->kobj);
9866                         break;
9867                 }
9868         }
9869         if (ret)
9870                 btrfs_warn(fs_info,
9871                            "failed to add kobject for block cache, ignoring");
9872 }
9873
9874 static void link_block_group(struct btrfs_block_group_cache *cache)
9875 {
9876         struct btrfs_space_info *space_info = cache->space_info;
9877         struct btrfs_fs_info *fs_info = cache->fs_info;
9878         int index = btrfs_bg_flags_to_raid_index(cache->flags);
9879         bool first = false;
9880
9881         down_write(&space_info->groups_sem);
9882         if (list_empty(&space_info->block_groups[index]))
9883                 first = true;
9884         list_add_tail(&cache->list, &space_info->block_groups[index]);
9885         up_write(&space_info->groups_sem);
9886
9887         if (first) {
9888                 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9889                 if (!rkobj) {
9890                         btrfs_warn(cache->fs_info,
9891                                 "couldn't alloc memory for raid level kobject");
9892                         return;
9893                 }
9894                 rkobj->flags = cache->flags;
9895                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9896
9897                 spin_lock(&fs_info->pending_raid_kobjs_lock);
9898                 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
9899                 spin_unlock(&fs_info->pending_raid_kobjs_lock);
9900                 space_info->block_group_kobjs[index] = &rkobj->kobj;
9901         }
9902 }
9903
9904 static struct btrfs_block_group_cache *
9905 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9906                                u64 start, u64 size)
9907 {
9908         struct btrfs_block_group_cache *cache;
9909
9910         cache = kzalloc(sizeof(*cache), GFP_NOFS);
9911         if (!cache)
9912                 return NULL;
9913
9914         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9915                                         GFP_NOFS);
9916         if (!cache->free_space_ctl) {
9917                 kfree(cache);
9918                 return NULL;
9919         }
9920
9921         cache->key.objectid = start;
9922         cache->key.offset = size;
9923         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9924
9925         cache->fs_info = fs_info;
9926         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
9927         set_free_space_tree_thresholds(cache);
9928
9929         atomic_set(&cache->count, 1);
9930         spin_lock_init(&cache->lock);
9931         init_rwsem(&cache->data_rwsem);
9932         INIT_LIST_HEAD(&cache->list);
9933         INIT_LIST_HEAD(&cache->cluster_list);
9934         INIT_LIST_HEAD(&cache->bg_list);
9935         INIT_LIST_HEAD(&cache->ro_list);
9936         INIT_LIST_HEAD(&cache->dirty_list);
9937         INIT_LIST_HEAD(&cache->io_list);
9938         btrfs_init_free_space_ctl(cache);
9939         atomic_set(&cache->trimming, 0);
9940         mutex_init(&cache->free_space_lock);
9941         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
9942
9943         return cache;
9944 }
9945
9946
9947 /*
9948  * Iterate all chunks and verify that each of them has the corresponding block
9949  * group
9950  */
9951 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
9952 {
9953         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
9954         struct extent_map *em;
9955         struct btrfs_block_group_cache *bg;
9956         u64 start = 0;
9957         int ret = 0;
9958
9959         while (1) {
9960                 read_lock(&map_tree->map_tree.lock);
9961                 /*
9962                  * lookup_extent_mapping will return the first extent map
9963                  * intersecting the range, so setting @len to 1 is enough to
9964                  * get the first chunk.
9965                  */
9966                 em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
9967                 read_unlock(&map_tree->map_tree.lock);
9968                 if (!em)
9969                         break;
9970
9971                 bg = btrfs_lookup_block_group(fs_info, em->start);
9972                 if (!bg) {
9973                         btrfs_err(fs_info,
9974         "chunk start=%llu len=%llu doesn't have corresponding block group",
9975                                      em->start, em->len);
9976                         ret = -EUCLEAN;
9977                         free_extent_map(em);
9978                         break;
9979                 }
9980                 if (bg->key.objectid != em->start ||
9981                     bg->key.offset != em->len ||
9982                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
9983                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9984                         btrfs_err(fs_info,
9985 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
9986                                 em->start, em->len,
9987                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
9988                                 bg->key.objectid, bg->key.offset,
9989                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
9990                         ret = -EUCLEAN;
9991                         free_extent_map(em);
9992                         btrfs_put_block_group(bg);
9993                         break;
9994                 }
9995                 start = em->start + em->len;
9996                 free_extent_map(em);
9997                 btrfs_put_block_group(bg);
9998         }
9999         return ret;
10000 }
10001
10002 int btrfs_read_block_groups(struct btrfs_fs_info *info)
10003 {
10004         struct btrfs_path *path;
10005         int ret;
10006         struct btrfs_block_group_cache *cache;
10007         struct btrfs_space_info *space_info;
10008         struct btrfs_key key;
10009         struct btrfs_key found_key;
10010         struct extent_buffer *leaf;
10011         int need_clear = 0;
10012         u64 cache_gen;
10013         u64 feature;
10014         int mixed;
10015
10016         feature = btrfs_super_incompat_flags(info->super_copy);
10017         mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10018
10019         key.objectid = 0;
10020         key.offset = 0;
10021         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10022         path = btrfs_alloc_path();
10023         if (!path)
10024                 return -ENOMEM;
10025         path->reada = READA_FORWARD;
10026
10027         cache_gen = btrfs_super_cache_generation(info->super_copy);
10028         if (btrfs_test_opt(info, SPACE_CACHE) &&
10029             btrfs_super_generation(info->super_copy) != cache_gen)
10030                 need_clear = 1;
10031         if (btrfs_test_opt(info, CLEAR_CACHE))
10032                 need_clear = 1;
10033
10034         while (1) {
10035                 ret = find_first_block_group(info, path, &key);
10036                 if (ret > 0)
10037                         break;
10038                 if (ret != 0)
10039                         goto error;
10040
10041                 leaf = path->nodes[0];
10042                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10043
10044                 cache = btrfs_create_block_group_cache(info, found_key.objectid,
10045                                                        found_key.offset);
10046                 if (!cache) {
10047                         ret = -ENOMEM;
10048                         goto error;
10049                 }
10050
10051                 if (need_clear) {
10052                         /*
10053                          * When we mount with old space cache, we need to
10054                          * set BTRFS_DC_CLEAR and set dirty flag.
10055                          *
10056                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10057                          *    truncate the old free space cache inode and
10058                          *    setup a new one.
10059                          * b) Setting 'dirty flag' makes sure that we flush
10060                          *    the new space cache info onto disk.
10061                          */
10062                         if (btrfs_test_opt(info, SPACE_CACHE))
10063                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
10064                 }
10065
10066                 read_extent_buffer(leaf, &cache->item,
10067                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
10068                                    sizeof(cache->item));
10069                 cache->flags = btrfs_block_group_flags(&cache->item);
10070                 if (!mixed &&
10071                     ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10072                     (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10073                         btrfs_err(info,
10074 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10075                                   cache->key.objectid);
10076                         ret = -EINVAL;
10077                         goto error;
10078                 }
10079
10080                 key.objectid = found_key.objectid + found_key.offset;
10081                 btrfs_release_path(path);
10082
10083                 /*
10084                  * We need to exclude the super stripes now so that the space
10085                  * info has super bytes accounted for, otherwise we'll think
10086                  * we have more space than we actually do.
10087                  */
10088                 ret = exclude_super_stripes(cache);
10089                 if (ret) {
10090                         /*
10091                          * We may have excluded something, so call this just in
10092                          * case.
10093                          */
10094                         free_excluded_extents(cache);
10095                         btrfs_put_block_group(cache);
10096                         goto error;
10097                 }
10098
10099                 /*
10100                  * check for two cases, either we are full, and therefore
10101                  * don't need to bother with the caching work since we won't
10102                  * find any space, or we are empty, and we can just add all
10103                  * the space in and be done with it.  This saves us _alot_ of
10104                  * time, particularly in the full case.
10105                  */
10106                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10107                         cache->last_byte_to_unpin = (u64)-1;
10108                         cache->cached = BTRFS_CACHE_FINISHED;
10109                         free_excluded_extents(cache);
10110                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10111                         cache->last_byte_to_unpin = (u64)-1;
10112                         cache->cached = BTRFS_CACHE_FINISHED;
10113                         add_new_free_space(cache, found_key.objectid,
10114                                            found_key.objectid +
10115                                            found_key.offset);
10116                         free_excluded_extents(cache);
10117                 }
10118
10119                 ret = btrfs_add_block_group_cache(info, cache);
10120                 if (ret) {
10121                         btrfs_remove_free_space_cache(cache);
10122                         btrfs_put_block_group(cache);
10123                         goto error;
10124                 }
10125
10126                 trace_btrfs_add_block_group(info, cache, 0);
10127                 update_space_info(info, cache->flags, found_key.offset,
10128                                   btrfs_block_group_used(&cache->item),
10129                                   cache->bytes_super, &space_info);
10130
10131                 cache->space_info = space_info;
10132
10133                 link_block_group(cache);
10134
10135                 set_avail_alloc_bits(info, cache->flags);
10136                 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10137                         inc_block_group_ro(cache, 1);
10138                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10139                         ASSERT(list_empty(&cache->bg_list));
10140                         btrfs_mark_bg_unused(cache);
10141                 }
10142         }
10143
10144         list_for_each_entry_rcu(space_info, &info->space_info, list) {
10145                 if (!(get_alloc_profile(info, space_info->flags) &
10146                       (BTRFS_BLOCK_GROUP_RAID10 |
10147                        BTRFS_BLOCK_GROUP_RAID1 |
10148                        BTRFS_BLOCK_GROUP_RAID5 |
10149                        BTRFS_BLOCK_GROUP_RAID6 |
10150                        BTRFS_BLOCK_GROUP_DUP)))
10151                         continue;
10152                 /*
10153                  * avoid allocating from un-mirrored block group if there are
10154                  * mirrored block groups.
10155                  */
10156                 list_for_each_entry(cache,
10157                                 &space_info->block_groups[BTRFS_RAID_RAID0],
10158                                 list)
10159                         inc_block_group_ro(cache, 1);
10160                 list_for_each_entry(cache,
10161                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
10162                                 list)
10163                         inc_block_group_ro(cache, 1);
10164         }
10165
10166         btrfs_add_raid_kobjects(info);
10167         init_global_block_rsv(info);
10168         ret = check_chunk_block_group_mappings(info);
10169 error:
10170         btrfs_free_path(path);
10171         return ret;
10172 }
10173
10174 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10175 {
10176         struct btrfs_fs_info *fs_info = trans->fs_info;
10177         struct btrfs_block_group_cache *block_group, *tmp;
10178         struct btrfs_root *extent_root = fs_info->extent_root;
10179         struct btrfs_block_group_item item;
10180         struct btrfs_key key;
10181         int ret = 0;
10182         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10183
10184         trans->can_flush_pending_bgs = false;
10185         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10186                 if (ret)
10187                         goto next;
10188
10189                 spin_lock(&block_group->lock);
10190                 memcpy(&item, &block_group->item, sizeof(item));
10191                 memcpy(&key, &block_group->key, sizeof(key));
10192                 spin_unlock(&block_group->lock);
10193
10194                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10195                                         sizeof(item));
10196                 if (ret)
10197                         btrfs_abort_transaction(trans, ret);
10198                 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10199                 if (ret)
10200                         btrfs_abort_transaction(trans, ret);
10201                 add_block_group_free_space(trans, block_group);
10202                 /* already aborted the transaction if it failed. */
10203 next:
10204                 list_del_init(&block_group->bg_list);
10205         }
10206         trans->can_flush_pending_bgs = can_flush_pending_bgs;
10207 }
10208
10209 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10210                            u64 type, u64 chunk_offset, u64 size)
10211 {
10212         struct btrfs_fs_info *fs_info = trans->fs_info;
10213         struct btrfs_block_group_cache *cache;
10214         int ret;
10215
10216         btrfs_set_log_full_commit(fs_info, trans);
10217
10218         cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10219         if (!cache)
10220                 return -ENOMEM;
10221
10222         btrfs_set_block_group_used(&cache->item, bytes_used);
10223         btrfs_set_block_group_chunk_objectid(&cache->item,
10224                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10225         btrfs_set_block_group_flags(&cache->item, type);
10226
10227         cache->flags = type;
10228         cache->last_byte_to_unpin = (u64)-1;
10229         cache->cached = BTRFS_CACHE_FINISHED;
10230         cache->needs_free_space = 1;
10231         ret = exclude_super_stripes(cache);
10232         if (ret) {
10233                 /*
10234                  * We may have excluded something, so call this just in
10235                  * case.
10236                  */
10237                 free_excluded_extents(cache);
10238                 btrfs_put_block_group(cache);
10239                 return ret;
10240         }
10241
10242         add_new_free_space(cache, chunk_offset, chunk_offset + size);
10243
10244         free_excluded_extents(cache);
10245
10246 #ifdef CONFIG_BTRFS_DEBUG
10247         if (btrfs_should_fragment_free_space(cache)) {
10248                 u64 new_bytes_used = size - bytes_used;
10249
10250                 bytes_used += new_bytes_used >> 1;
10251                 fragment_free_space(cache);
10252         }
10253 #endif
10254         /*
10255          * Ensure the corresponding space_info object is created and
10256          * assigned to our block group. We want our bg to be added to the rbtree
10257          * with its ->space_info set.
10258          */
10259         cache->space_info = __find_space_info(fs_info, cache->flags);
10260         ASSERT(cache->space_info);
10261
10262         ret = btrfs_add_block_group_cache(fs_info, cache);
10263         if (ret) {
10264                 btrfs_remove_free_space_cache(cache);
10265                 btrfs_put_block_group(cache);
10266                 return ret;
10267         }
10268
10269         /*
10270          * Now that our block group has its ->space_info set and is inserted in
10271          * the rbtree, update the space info's counters.
10272          */
10273         trace_btrfs_add_block_group(fs_info, cache, 1);
10274         update_space_info(fs_info, cache->flags, size, bytes_used,
10275                                 cache->bytes_super, &cache->space_info);
10276         update_global_block_rsv(fs_info);
10277
10278         link_block_group(cache);
10279
10280         list_add_tail(&cache->bg_list, &trans->new_bgs);
10281
10282         set_avail_alloc_bits(fs_info, type);
10283         return 0;
10284 }
10285
10286 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10287 {
10288         u64 extra_flags = chunk_to_extended(flags) &
10289                                 BTRFS_EXTENDED_PROFILE_MASK;
10290
10291         write_seqlock(&fs_info->profiles_lock);
10292         if (flags & BTRFS_BLOCK_GROUP_DATA)
10293                 fs_info->avail_data_alloc_bits &= ~extra_flags;
10294         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10295                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10296         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10297                 fs_info->avail_system_alloc_bits &= ~extra_flags;
10298         write_sequnlock(&fs_info->profiles_lock);
10299 }
10300
10301 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10302                              u64 group_start, struct extent_map *em)
10303 {
10304         struct btrfs_fs_info *fs_info = trans->fs_info;
10305         struct btrfs_root *root = fs_info->extent_root;
10306         struct btrfs_path *path;
10307         struct btrfs_block_group_cache *block_group;
10308         struct btrfs_free_cluster *cluster;
10309         struct btrfs_root *tree_root = fs_info->tree_root;
10310         struct btrfs_key key;
10311         struct inode *inode;
10312         struct kobject *kobj = NULL;
10313         int ret;
10314         int index;
10315         int factor;
10316         struct btrfs_caching_control *caching_ctl = NULL;
10317         bool remove_em;
10318
10319         block_group = btrfs_lookup_block_group(fs_info, group_start);
10320         BUG_ON(!block_group);
10321         BUG_ON(!block_group->ro);
10322
10323         trace_btrfs_remove_block_group(block_group);
10324         /*
10325          * Free the reserved super bytes from this block group before
10326          * remove it.
10327          */
10328         free_excluded_extents(block_group);
10329         btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10330                                   block_group->key.offset);
10331
10332         memcpy(&key, &block_group->key, sizeof(key));
10333         index = btrfs_bg_flags_to_raid_index(block_group->flags);
10334         factor = btrfs_bg_type_to_factor(block_group->flags);
10335
10336         /* make sure this block group isn't part of an allocation cluster */
10337         cluster = &fs_info->data_alloc_cluster;
10338         spin_lock(&cluster->refill_lock);
10339         btrfs_return_cluster_to_free_space(block_group, cluster);
10340         spin_unlock(&cluster->refill_lock);
10341
10342         /*
10343          * make sure this block group isn't part of a metadata
10344          * allocation cluster
10345          */
10346         cluster = &fs_info->meta_alloc_cluster;
10347         spin_lock(&cluster->refill_lock);
10348         btrfs_return_cluster_to_free_space(block_group, cluster);
10349         spin_unlock(&cluster->refill_lock);
10350
10351         path = btrfs_alloc_path();
10352         if (!path) {
10353                 ret = -ENOMEM;
10354                 goto out;
10355         }
10356
10357         /*
10358          * get the inode first so any iput calls done for the io_list
10359          * aren't the final iput (no unlinks allowed now)
10360          */
10361         inode = lookup_free_space_inode(fs_info, block_group, path);
10362
10363         mutex_lock(&trans->transaction->cache_write_mutex);
10364         /*
10365          * make sure our free spache cache IO is done before remove the
10366          * free space inode
10367          */
10368         spin_lock(&trans->transaction->dirty_bgs_lock);
10369         if (!list_empty(&block_group->io_list)) {
10370                 list_del_init(&block_group->io_list);
10371
10372                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10373
10374                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10375                 btrfs_wait_cache_io(trans, block_group, path);
10376                 btrfs_put_block_group(block_group);
10377                 spin_lock(&trans->transaction->dirty_bgs_lock);
10378         }
10379
10380         if (!list_empty(&block_group->dirty_list)) {
10381                 list_del_init(&block_group->dirty_list);
10382                 btrfs_put_block_group(block_group);
10383         }
10384         spin_unlock(&trans->transaction->dirty_bgs_lock);
10385         mutex_unlock(&trans->transaction->cache_write_mutex);
10386
10387         if (!IS_ERR(inode)) {
10388                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10389                 if (ret) {
10390                         btrfs_add_delayed_iput(inode);
10391                         goto out;
10392                 }
10393                 clear_nlink(inode);
10394                 /* One for the block groups ref */
10395                 spin_lock(&block_group->lock);
10396                 if (block_group->iref) {
10397                         block_group->iref = 0;
10398                         block_group->inode = NULL;
10399                         spin_unlock(&block_group->lock);
10400                         iput(inode);
10401                 } else {
10402                         spin_unlock(&block_group->lock);
10403                 }
10404                 /* One for our lookup ref */
10405                 btrfs_add_delayed_iput(inode);
10406         }
10407
10408         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10409         key.offset = block_group->key.objectid;
10410         key.type = 0;
10411
10412         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10413         if (ret < 0)
10414                 goto out;
10415         if (ret > 0)
10416                 btrfs_release_path(path);
10417         if (ret == 0) {
10418                 ret = btrfs_del_item(trans, tree_root, path);
10419                 if (ret)
10420                         goto out;
10421                 btrfs_release_path(path);
10422         }
10423
10424         spin_lock(&fs_info->block_group_cache_lock);
10425         rb_erase(&block_group->cache_node,
10426                  &fs_info->block_group_cache_tree);
10427         RB_CLEAR_NODE(&block_group->cache_node);
10428
10429         if (fs_info->first_logical_byte == block_group->key.objectid)
10430                 fs_info->first_logical_byte = (u64)-1;
10431         spin_unlock(&fs_info->block_group_cache_lock);
10432
10433         down_write(&block_group->space_info->groups_sem);
10434         /*
10435          * we must use list_del_init so people can check to see if they
10436          * are still on the list after taking the semaphore
10437          */
10438         list_del_init(&block_group->list);
10439         if (list_empty(&block_group->space_info->block_groups[index])) {
10440                 kobj = block_group->space_info->block_group_kobjs[index];
10441                 block_group->space_info->block_group_kobjs[index] = NULL;
10442                 clear_avail_alloc_bits(fs_info, block_group->flags);
10443         }
10444         up_write(&block_group->space_info->groups_sem);
10445         if (kobj) {
10446                 kobject_del(kobj);
10447                 kobject_put(kobj);
10448         }
10449
10450         if (block_group->has_caching_ctl)
10451                 caching_ctl = get_caching_control(block_group);
10452         if (block_group->cached == BTRFS_CACHE_STARTED)
10453                 wait_block_group_cache_done(block_group);
10454         if (block_group->has_caching_ctl) {
10455                 down_write(&fs_info->commit_root_sem);
10456                 if (!caching_ctl) {
10457                         struct btrfs_caching_control *ctl;
10458
10459                         list_for_each_entry(ctl,
10460                                     &fs_info->caching_block_groups, list)
10461                                 if (ctl->block_group == block_group) {
10462                                         caching_ctl = ctl;
10463                                         refcount_inc(&caching_ctl->count);
10464                                         break;
10465                                 }
10466                 }
10467                 if (caching_ctl)
10468                         list_del_init(&caching_ctl->list);
10469                 up_write(&fs_info->commit_root_sem);
10470                 if (caching_ctl) {
10471                         /* Once for the caching bgs list and once for us. */
10472                         put_caching_control(caching_ctl);
10473                         put_caching_control(caching_ctl);
10474                 }
10475         }
10476
10477         spin_lock(&trans->transaction->dirty_bgs_lock);
10478         if (!list_empty(&block_group->dirty_list)) {
10479                 WARN_ON(1);
10480         }
10481         if (!list_empty(&block_group->io_list)) {
10482                 WARN_ON(1);
10483         }
10484         spin_unlock(&trans->transaction->dirty_bgs_lock);
10485         btrfs_remove_free_space_cache(block_group);
10486
10487         spin_lock(&block_group->space_info->lock);
10488         list_del_init(&block_group->ro_list);
10489
10490         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10491                 WARN_ON(block_group->space_info->total_bytes
10492                         < block_group->key.offset);
10493                 WARN_ON(block_group->space_info->bytes_readonly
10494                         < block_group->key.offset);
10495                 WARN_ON(block_group->space_info->disk_total
10496                         < block_group->key.offset * factor);
10497         }
10498         block_group->space_info->total_bytes -= block_group->key.offset;
10499         block_group->space_info->bytes_readonly -= block_group->key.offset;
10500         block_group->space_info->disk_total -= block_group->key.offset * factor;
10501
10502         spin_unlock(&block_group->space_info->lock);
10503
10504         memcpy(&key, &block_group->key, sizeof(key));
10505
10506         mutex_lock(&fs_info->chunk_mutex);
10507         if (!list_empty(&em->list)) {
10508                 /* We're in the transaction->pending_chunks list. */
10509                 free_extent_map(em);
10510         }
10511         spin_lock(&block_group->lock);
10512         block_group->removed = 1;
10513         /*
10514          * At this point trimming can't start on this block group, because we
10515          * removed the block group from the tree fs_info->block_group_cache_tree
10516          * so no one can't find it anymore and even if someone already got this
10517          * block group before we removed it from the rbtree, they have already
10518          * incremented block_group->trimming - if they didn't, they won't find
10519          * any free space entries because we already removed them all when we
10520          * called btrfs_remove_free_space_cache().
10521          *
10522          * And we must not remove the extent map from the fs_info->mapping_tree
10523          * to prevent the same logical address range and physical device space
10524          * ranges from being reused for a new block group. This is because our
10525          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10526          * completely transactionless, so while it is trimming a range the
10527          * currently running transaction might finish and a new one start,
10528          * allowing for new block groups to be created that can reuse the same
10529          * physical device locations unless we take this special care.
10530          *
10531          * There may also be an implicit trim operation if the file system
10532          * is mounted with -odiscard. The same protections must remain
10533          * in place until the extents have been discarded completely when
10534          * the transaction commit has completed.
10535          */
10536         remove_em = (atomic_read(&block_group->trimming) == 0);
10537         /*
10538          * Make sure a trimmer task always sees the em in the pinned_chunks list
10539          * if it sees block_group->removed == 1 (needs to lock block_group->lock
10540          * before checking block_group->removed).
10541          */
10542         if (!remove_em) {
10543                 /*
10544                  * Our em might be in trans->transaction->pending_chunks which
10545                  * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10546                  * and so is the fs_info->pinned_chunks list.
10547                  *
10548                  * So at this point we must be holding the chunk_mutex to avoid
10549                  * any races with chunk allocation (more specifically at
10550                  * volumes.c:contains_pending_extent()), to ensure it always
10551                  * sees the em, either in the pending_chunks list or in the
10552                  * pinned_chunks list.
10553                  */
10554                 list_move_tail(&em->list, &fs_info->pinned_chunks);
10555         }
10556         spin_unlock(&block_group->lock);
10557
10558         if (remove_em) {
10559                 struct extent_map_tree *em_tree;
10560
10561                 em_tree = &fs_info->mapping_tree.map_tree;
10562                 write_lock(&em_tree->lock);
10563                 /*
10564                  * The em might be in the pending_chunks list, so make sure the
10565                  * chunk mutex is locked, since remove_extent_mapping() will
10566                  * delete us from that list.
10567                  */
10568                 remove_extent_mapping(em_tree, em);
10569                 write_unlock(&em_tree->lock);
10570                 /* once for the tree */
10571                 free_extent_map(em);
10572         }
10573
10574         mutex_unlock(&fs_info->chunk_mutex);
10575
10576         ret = remove_block_group_free_space(trans, block_group);
10577         if (ret)
10578                 goto out;
10579
10580         btrfs_put_block_group(block_group);
10581         btrfs_put_block_group(block_group);
10582
10583         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10584         if (ret > 0)
10585                 ret = -EIO;
10586         if (ret < 0)
10587                 goto out;
10588
10589         ret = btrfs_del_item(trans, root, path);
10590 out:
10591         btrfs_free_path(path);
10592         return ret;
10593 }
10594
10595 struct btrfs_trans_handle *
10596 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10597                                      const u64 chunk_offset)
10598 {
10599         struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10600         struct extent_map *em;
10601         struct map_lookup *map;
10602         unsigned int num_items;
10603
10604         read_lock(&em_tree->lock);
10605         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10606         read_unlock(&em_tree->lock);
10607         ASSERT(em && em->start == chunk_offset);
10608
10609         /*
10610          * We need to reserve 3 + N units from the metadata space info in order
10611          * to remove a block group (done at btrfs_remove_chunk() and at
10612          * btrfs_remove_block_group()), which are used for:
10613          *
10614          * 1 unit for adding the free space inode's orphan (located in the tree
10615          * of tree roots).
10616          * 1 unit for deleting the block group item (located in the extent
10617          * tree).
10618          * 1 unit for deleting the free space item (located in tree of tree
10619          * roots).
10620          * N units for deleting N device extent items corresponding to each
10621          * stripe (located in the device tree).
10622          *
10623          * In order to remove a block group we also need to reserve units in the
10624          * system space info in order to update the chunk tree (update one or
10625          * more device items and remove one chunk item), but this is done at
10626          * btrfs_remove_chunk() through a call to check_system_chunk().
10627          */
10628         map = em->map_lookup;
10629         num_items = 3 + map->num_stripes;
10630         free_extent_map(em);
10631
10632         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10633                                                            num_items, 1);
10634 }
10635
10636 /*
10637  * Process the unused_bgs list and remove any that don't have any allocated
10638  * space inside of them.
10639  */
10640 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10641 {
10642         struct btrfs_block_group_cache *block_group;
10643         struct btrfs_space_info *space_info;
10644         struct btrfs_trans_handle *trans;
10645         int ret = 0;
10646
10647         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10648                 return;
10649
10650         spin_lock(&fs_info->unused_bgs_lock);
10651         while (!list_empty(&fs_info->unused_bgs)) {
10652                 u64 start, end;
10653                 int trimming;
10654
10655                 block_group = list_first_entry(&fs_info->unused_bgs,
10656                                                struct btrfs_block_group_cache,
10657                                                bg_list);
10658                 list_del_init(&block_group->bg_list);
10659
10660                 space_info = block_group->space_info;
10661
10662                 if (ret || btrfs_mixed_space_info(space_info)) {
10663                         btrfs_put_block_group(block_group);
10664                         continue;
10665                 }
10666                 spin_unlock(&fs_info->unused_bgs_lock);
10667
10668                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10669
10670                 /* Don't want to race with allocators so take the groups_sem */
10671                 down_write(&space_info->groups_sem);
10672                 spin_lock(&block_group->lock);
10673                 if (block_group->reserved || block_group->pinned ||
10674                     btrfs_block_group_used(&block_group->item) ||
10675                     block_group->ro ||
10676                     list_is_singular(&block_group->list)) {
10677                         /*
10678                          * We want to bail if we made new allocations or have
10679                          * outstanding allocations in this block group.  We do
10680                          * the ro check in case balance is currently acting on
10681                          * this block group.
10682                          */
10683                         trace_btrfs_skip_unused_block_group(block_group);
10684                         spin_unlock(&block_group->lock);
10685                         up_write(&space_info->groups_sem);
10686                         goto next;
10687                 }
10688                 spin_unlock(&block_group->lock);
10689
10690                 /* We don't want to force the issue, only flip if it's ok. */
10691                 ret = inc_block_group_ro(block_group, 0);
10692                 up_write(&space_info->groups_sem);
10693                 if (ret < 0) {
10694                         ret = 0;
10695                         goto next;
10696                 }
10697
10698                 /*
10699                  * Want to do this before we do anything else so we can recover
10700                  * properly if we fail to join the transaction.
10701                  */
10702                 trans = btrfs_start_trans_remove_block_group(fs_info,
10703                                                      block_group->key.objectid);
10704                 if (IS_ERR(trans)) {
10705                         btrfs_dec_block_group_ro(block_group);
10706                         ret = PTR_ERR(trans);
10707                         goto next;
10708                 }
10709
10710                 /*
10711                  * We could have pending pinned extents for this block group,
10712                  * just delete them, we don't care about them anymore.
10713                  */
10714                 start = block_group->key.objectid;
10715                 end = start + block_group->key.offset - 1;
10716                 /*
10717                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10718                  * btrfs_finish_extent_commit(). If we are at transaction N,
10719                  * another task might be running finish_extent_commit() for the
10720                  * previous transaction N - 1, and have seen a range belonging
10721                  * to the block group in freed_extents[] before we were able to
10722                  * clear the whole block group range from freed_extents[]. This
10723                  * means that task can lookup for the block group after we
10724                  * unpinned it from freed_extents[] and removed it, leading to
10725                  * a BUG_ON() at btrfs_unpin_extent_range().
10726                  */
10727                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10728                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10729                                   EXTENT_DIRTY);
10730                 if (ret) {
10731                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10732                         btrfs_dec_block_group_ro(block_group);
10733                         goto end_trans;
10734                 }
10735                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10736                                   EXTENT_DIRTY);
10737                 if (ret) {
10738                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10739                         btrfs_dec_block_group_ro(block_group);
10740                         goto end_trans;
10741                 }
10742                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10743
10744                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10745                 spin_lock(&space_info->lock);
10746                 spin_lock(&block_group->lock);
10747
10748                 space_info->bytes_pinned -= block_group->pinned;
10749                 space_info->bytes_readonly += block_group->pinned;
10750                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
10751                                    -block_group->pinned,
10752                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
10753                 block_group->pinned = 0;
10754
10755                 spin_unlock(&block_group->lock);
10756                 spin_unlock(&space_info->lock);
10757
10758                 /* DISCARD can flip during remount */
10759                 trimming = btrfs_test_opt(fs_info, DISCARD);
10760
10761                 /* Implicit trim during transaction commit. */
10762                 if (trimming)
10763                         btrfs_get_block_group_trimming(block_group);
10764
10765                 /*
10766                  * Btrfs_remove_chunk will abort the transaction if things go
10767                  * horribly wrong.
10768                  */
10769                 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
10770
10771                 if (ret) {
10772                         if (trimming)
10773                                 btrfs_put_block_group_trimming(block_group);
10774                         goto end_trans;
10775                 }
10776
10777                 /*
10778                  * If we're not mounted with -odiscard, we can just forget
10779                  * about this block group. Otherwise we'll need to wait
10780                  * until transaction commit to do the actual discard.
10781                  */
10782                 if (trimming) {
10783                         spin_lock(&fs_info->unused_bgs_lock);
10784                         /*
10785                          * A concurrent scrub might have added us to the list
10786                          * fs_info->unused_bgs, so use a list_move operation
10787                          * to add the block group to the deleted_bgs list.
10788                          */
10789                         list_move(&block_group->bg_list,
10790                                   &trans->transaction->deleted_bgs);
10791                         spin_unlock(&fs_info->unused_bgs_lock);
10792                         btrfs_get_block_group(block_group);
10793                 }
10794 end_trans:
10795                 btrfs_end_transaction(trans);
10796 next:
10797                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10798                 btrfs_put_block_group(block_group);
10799                 spin_lock(&fs_info->unused_bgs_lock);
10800         }
10801         spin_unlock(&fs_info->unused_bgs_lock);
10802 }
10803
10804 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10805 {
10806         struct btrfs_super_block *disk_super;
10807         u64 features;
10808         u64 flags;
10809         int mixed = 0;
10810         int ret;
10811
10812         disk_super = fs_info->super_copy;
10813         if (!btrfs_super_root(disk_super))
10814                 return -EINVAL;
10815
10816         features = btrfs_super_incompat_flags(disk_super);
10817         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10818                 mixed = 1;
10819
10820         flags = BTRFS_BLOCK_GROUP_SYSTEM;
10821         ret = create_space_info(fs_info, flags);
10822         if (ret)
10823                 goto out;
10824
10825         if (mixed) {
10826                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10827                 ret = create_space_info(fs_info, flags);
10828         } else {
10829                 flags = BTRFS_BLOCK_GROUP_METADATA;
10830                 ret = create_space_info(fs_info, flags);
10831                 if (ret)
10832                         goto out;
10833
10834                 flags = BTRFS_BLOCK_GROUP_DATA;
10835                 ret = create_space_info(fs_info, flags);
10836         }
10837 out:
10838         return ret;
10839 }
10840
10841 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10842                                    u64 start, u64 end)
10843 {
10844         return unpin_extent_range(fs_info, start, end, false);
10845 }
10846
10847 /*
10848  * It used to be that old block groups would be left around forever.
10849  * Iterating over them would be enough to trim unused space.  Since we
10850  * now automatically remove them, we also need to iterate over unallocated
10851  * space.
10852  *
10853  * We don't want a transaction for this since the discard may take a
10854  * substantial amount of time.  We don't require that a transaction be
10855  * running, but we do need to take a running transaction into account
10856  * to ensure that we're not discarding chunks that were released or
10857  * allocated in the current transaction.
10858  *
10859  * Holding the chunks lock will prevent other threads from allocating
10860  * or releasing chunks, but it won't prevent a running transaction
10861  * from committing and releasing the memory that the pending chunks
10862  * list head uses.  For that, we need to take a reference to the
10863  * transaction and hold the commit root sem.  We only need to hold
10864  * it while performing the free space search since we have already
10865  * held back allocations.
10866  */
10867 static int btrfs_trim_free_extents(struct btrfs_device *device,
10868                                    u64 minlen, u64 *trimmed)
10869 {
10870         u64 start = 0, len = 0;
10871         int ret;
10872
10873         *trimmed = 0;
10874
10875         /* Discard not supported = nothing to do. */
10876         if (!blk_queue_discard(bdev_get_queue(device->bdev)))
10877                 return 0;
10878
10879         /* Not writeable = nothing to do. */
10880         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
10881                 return 0;
10882
10883         /* No free space = nothing to do. */
10884         if (device->total_bytes <= device->bytes_used)
10885                 return 0;
10886
10887         ret = 0;
10888
10889         while (1) {
10890                 struct btrfs_fs_info *fs_info = device->fs_info;
10891                 struct btrfs_transaction *trans;
10892                 u64 bytes;
10893
10894                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10895                 if (ret)
10896                         break;
10897
10898                 ret = down_read_killable(&fs_info->commit_root_sem);
10899                 if (ret) {
10900                         mutex_unlock(&fs_info->chunk_mutex);
10901                         break;
10902                 }
10903
10904                 spin_lock(&fs_info->trans_lock);
10905                 trans = fs_info->running_transaction;
10906                 if (trans)
10907                         refcount_inc(&trans->use_count);
10908                 spin_unlock(&fs_info->trans_lock);
10909
10910                 if (!trans)
10911                         up_read(&fs_info->commit_root_sem);
10912
10913                 ret = find_free_dev_extent_start(trans, device, minlen, start,
10914                                                  &start, &len);
10915                 if (trans) {
10916                         up_read(&fs_info->commit_root_sem);
10917                         btrfs_put_transaction(trans);
10918                 }
10919
10920                 if (ret) {
10921                         mutex_unlock(&fs_info->chunk_mutex);
10922                         if (ret == -ENOSPC)
10923                                 ret = 0;
10924                         break;
10925                 }
10926
10927                 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10928                 mutex_unlock(&fs_info->chunk_mutex);
10929
10930                 if (ret)
10931                         break;
10932
10933                 start += len;
10934                 *trimmed += bytes;
10935
10936                 if (fatal_signal_pending(current)) {
10937                         ret = -ERESTARTSYS;
10938                         break;
10939                 }
10940
10941                 cond_resched();
10942         }
10943
10944         return ret;
10945 }
10946
10947 /*
10948  * Trim the whole filesystem by:
10949  * 1) trimming the free space in each block group
10950  * 2) trimming the unallocated space on each device
10951  *
10952  * This will also continue trimming even if a block group or device encounters
10953  * an error.  The return value will be the last error, or 0 if nothing bad
10954  * happens.
10955  */
10956 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
10957 {
10958         struct btrfs_block_group_cache *cache = NULL;
10959         struct btrfs_device *device;
10960         struct list_head *devices;
10961         u64 group_trimmed;
10962         u64 start;
10963         u64 end;
10964         u64 trimmed = 0;
10965         u64 bg_failed = 0;
10966         u64 dev_failed = 0;
10967         int bg_ret = 0;
10968         int dev_ret = 0;
10969         int ret = 0;
10970
10971         cache = btrfs_lookup_first_block_group(fs_info, range->start);
10972         for (; cache; cache = next_block_group(fs_info, cache)) {
10973                 if (cache->key.objectid >= (range->start + range->len)) {
10974                         btrfs_put_block_group(cache);
10975                         break;
10976                 }
10977
10978                 start = max(range->start, cache->key.objectid);
10979                 end = min(range->start + range->len,
10980                                 cache->key.objectid + cache->key.offset);
10981
10982                 if (end - start >= range->minlen) {
10983                         if (!block_group_cache_done(cache)) {
10984                                 ret = cache_block_group(cache, 0);
10985                                 if (ret) {
10986                                         bg_failed++;
10987                                         bg_ret = ret;
10988                                         continue;
10989                                 }
10990                                 ret = wait_block_group_cache_done(cache);
10991                                 if (ret) {
10992                                         bg_failed++;
10993                                         bg_ret = ret;
10994                                         continue;
10995                                 }
10996                         }
10997                         ret = btrfs_trim_block_group(cache,
10998                                                      &group_trimmed,
10999                                                      start,
11000                                                      end,
11001                                                      range->minlen);
11002
11003                         trimmed += group_trimmed;
11004                         if (ret) {
11005                                 bg_failed++;
11006                                 bg_ret = ret;
11007                                 continue;
11008                         }
11009                 }
11010         }
11011
11012         if (bg_failed)
11013                 btrfs_warn(fs_info,
11014                         "failed to trim %llu block group(s), last error %d",
11015                         bg_failed, bg_ret);
11016         mutex_lock(&fs_info->fs_devices->device_list_mutex);
11017         devices = &fs_info->fs_devices->devices;
11018         list_for_each_entry(device, devices, dev_list) {
11019                 ret = btrfs_trim_free_extents(device, range->minlen,
11020                                               &group_trimmed);
11021                 if (ret) {
11022                         dev_failed++;
11023                         dev_ret = ret;
11024                         break;
11025                 }
11026
11027                 trimmed += group_trimmed;
11028         }
11029         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11030
11031         if (dev_failed)
11032                 btrfs_warn(fs_info,
11033                         "failed to trim %llu device(s), last error %d",
11034                         dev_failed, dev_ret);
11035         range->len = trimmed;
11036         if (bg_ret)
11037                 return bg_ret;
11038         return dev_ret;
11039 }
11040
11041 /*
11042  * btrfs_{start,end}_write_no_snapshotting() are similar to
11043  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11044  * data into the page cache through nocow before the subvolume is snapshoted,
11045  * but flush the data into disk after the snapshot creation, or to prevent
11046  * operations while snapshotting is ongoing and that cause the snapshot to be
11047  * inconsistent (writes followed by expanding truncates for example).
11048  */
11049 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11050 {
11051         percpu_counter_dec(&root->subv_writers->counter);
11052         cond_wake_up(&root->subv_writers->wait);
11053 }
11054
11055 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11056 {
11057         if (atomic_read(&root->will_be_snapshotted))
11058                 return 0;
11059
11060         percpu_counter_inc(&root->subv_writers->counter);
11061         /*
11062          * Make sure counter is updated before we check for snapshot creation.
11063          */
11064         smp_mb();
11065         if (atomic_read(&root->will_be_snapshotted)) {
11066                 btrfs_end_write_no_snapshotting(root);
11067                 return 0;
11068         }
11069         return 1;
11070 }
11071
11072 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11073 {
11074         while (true) {
11075                 int ret;
11076
11077                 ret = btrfs_start_write_no_snapshotting(root);
11078                 if (ret)
11079                         break;
11080                 wait_var_event(&root->will_be_snapshotted,
11081                                !atomic_read(&root->will_be_snapshotted));
11082         }
11083 }
11084
11085 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11086 {
11087         struct btrfs_fs_info *fs_info = bg->fs_info;
11088
11089         spin_lock(&fs_info->unused_bgs_lock);
11090         if (list_empty(&bg->bg_list)) {
11091                 btrfs_get_block_group(bg);
11092                 trace_btrfs_add_unused_block_group(bg);
11093                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11094         }
11095         spin_unlock(&fs_info->unused_bgs_lock);
11096 }