// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
21 #include "print-tree.h"
25 #include "free-space-cache.h"
26 #include "free-space-tree.h"
30 #include "ref-verify.h"
31 #include "space-info.h"
32 #include "block-rsv.h"
34 #undef SCRAMBLE_DELAYED_REFS
37 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
38 struct btrfs_delayed_ref_node *node, u64 parent,
39 u64 root_objectid, u64 owner_objectid,
40 u64 owner_offset, int refs_to_drop,
41 struct btrfs_delayed_extent_op *extra_op);
42 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
43 struct extent_buffer *leaf,
44 struct btrfs_extent_item *ei);
45 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
46 u64 parent, u64 root_objectid,
47 u64 flags, u64 owner, u64 offset,
48 struct btrfs_key *ins, int ref_mod);
49 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
50 struct btrfs_delayed_ref_node *node,
51 struct btrfs_delayed_extent_op *extent_op);
52 static int find_next_key(struct btrfs_path *path, int level,
53 struct btrfs_key *key);
56 block_group_cache_done(struct btrfs_block_group_cache *cache)
59 return cache->cached == BTRFS_CACHE_FINISHED ||
60 cache->cached == BTRFS_CACHE_ERROR;
63 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65 return (cache->flags & bits) == bits;
68 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
70 atomic_inc(&cache->count);
73 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
75 if (atomic_dec_and_test(&cache->count)) {
76 WARN_ON(cache->pinned > 0);
77 WARN_ON(cache->reserved > 0);
/*
 * If not empty, someone else is still holding the full_stripe_lock
 * mutex, which can only be released by that caller, and freeing the
 * block group here would cause a use-after-free when the caller tries
 * to release the full stripe lock.
 *
 * There is no better way to resolve this, so just warn.
 */
87 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
88 kfree(cache->free_space_ctl);
/*
 * This adds the block group to the fs_info rb tree for the block group cache.
 */
97 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
98 struct btrfs_block_group_cache *block_group)
101 struct rb_node *parent = NULL;
102 struct btrfs_block_group_cache *cache;
104 spin_lock(&info->block_group_cache_lock);
105 p = &info->block_group_cache_tree.rb_node;
109 cache = rb_entry(parent, struct btrfs_block_group_cache,
111 if (block_group->key.objectid < cache->key.objectid) {
113 } else if (block_group->key.objectid > cache->key.objectid) {
116 spin_unlock(&info->block_group_cache_lock);
121 rb_link_node(&block_group->cache_node, parent, p);
122 rb_insert_color(&block_group->cache_node,
123 &info->block_group_cache_tree);
125 if (info->first_logical_byte > block_group->key.objectid)
126 info->first_logical_byte = block_group->key.objectid;
128 spin_unlock(&info->block_group_cache_lock);
/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr.
 */
137 static struct btrfs_block_group_cache *
138 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
141 struct btrfs_block_group_cache *cache, *ret = NULL;
145 spin_lock(&info->block_group_cache_lock);
146 n = info->block_group_cache_tree.rb_node;
149 cache = rb_entry(n, struct btrfs_block_group_cache,
151 end = cache->key.objectid + cache->key.offset - 1;
152 start = cache->key.objectid;
154 if (bytenr < start) {
155 if (!contains && (!ret || start < ret->key.objectid))
158 } else if (bytenr > start) {
159 if (contains && bytenr <= end) {
170 btrfs_get_block_group(ret);
171 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
172 info->first_logical_byte = ret->key.objectid;
174 spin_unlock(&info->block_group_cache_lock);
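/*
 * For example, btrfs_lookup_first_block_group() below calls this helper with
 * contains == 0 to find the first block group at or after a logical address,
 * while btrfs_lookup_block_group() passes contains == 1 to require that the
 * address falls inside the returned group.  Either way the returned group has
 * had its refcount bumped and must be released with btrfs_put_block_group().
 */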
179 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
180 u64 start, u64 num_bytes)
182 u64 end = start + num_bytes - 1;
183 set_extent_bits(&fs_info->freed_extents[0],
184 start, end, EXTENT_UPTODATE);
185 set_extent_bits(&fs_info->freed_extents[1],
186 start, end, EXTENT_UPTODATE);
190 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
192 struct btrfs_fs_info *fs_info = cache->fs_info;
195 start = cache->key.objectid;
196 end = start + cache->key.offset - 1;
198 clear_extent_bits(&fs_info->freed_extents[0],
199 start, end, EXTENT_UPTODATE);
200 clear_extent_bits(&fs_info->freed_extents[1],
201 start, end, EXTENT_UPTODATE);
204 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
206 struct btrfs_fs_info *fs_info = cache->fs_info;
212 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
213 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
214 cache->bytes_super += stripe_len;
215 ret = add_excluded_extent(fs_info, cache->key.objectid,
221 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
222 bytenr = btrfs_sb_offset(i);
223 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
224 bytenr, &logical, &nr, &stripe_len);
231 if (logical[nr] > cache->key.objectid +
235 if (logical[nr] + stripe_len <= cache->key.objectid)
239 if (start < cache->key.objectid) {
240 start = cache->key.objectid;
241 len = (logical[nr] + stripe_len) - start;
243 len = min_t(u64, stripe_len,
244 cache->key.objectid +
245 cache->key.offset - start);
248 cache->bytes_super += len;
249 ret = add_excluded_extent(fs_info, start, len);
261 static struct btrfs_caching_control *
262 get_caching_control(struct btrfs_block_group_cache *cache)
264 struct btrfs_caching_control *ctl;
266 spin_lock(&cache->lock);
267 if (!cache->caching_ctl) {
268 spin_unlock(&cache->lock);
272 ctl = cache->caching_ctl;
273 refcount_inc(&ctl->count);
274 spin_unlock(&cache->lock);
278 static void put_caching_control(struct btrfs_caching_control *ctl)
280 if (refcount_dec_and_test(&ctl->count))
284 #ifdef CONFIG_BTRFS_DEBUG
285 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
287 struct btrfs_fs_info *fs_info = block_group->fs_info;
288 u64 start = block_group->key.objectid;
289 u64 len = block_group->key.offset;
290 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
291 fs_info->nodesize : fs_info->sectorsize;
292 u64 step = chunk << 1;
294 while (len > chunk) {
295 btrfs_remove_free_space(block_group, start, chunk);
/*
 * This is only called by cache_block_group.  Since we could have freed
 * extents, we need to check pinned_extents for any extents that can't be
 * used yet, because their free space will not be released until the current
 * transaction commits.
 */
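/*
 * As an illustration (example values only): if the block group covers
 * [1G, 2G) and [1G + 64K, 1G + 128K) is still pinned by the running
 * transaction, only the ranges around the pinned extent are added to the
 * free space cache here; the pinned range becomes usable once the
 * transaction commits and the extents are unpinned.
 */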
310 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
313 struct btrfs_fs_info *info = block_group->fs_info;
314 u64 extent_start, extent_end, size, total_added = 0;
317 while (start < end) {
318 ret = find_first_extent_bit(info->pinned_extents, start,
319 &extent_start, &extent_end,
320 EXTENT_DIRTY | EXTENT_UPTODATE,
325 if (extent_start <= start) {
326 start = extent_end + 1;
327 } else if (extent_start > start && extent_start < end) {
328 size = extent_start - start;
330 ret = btrfs_add_free_space(block_group, start,
332 BUG_ON(ret); /* -ENOMEM or logic error */
333 start = extent_end + 1;
342 ret = btrfs_add_free_space(block_group, start, size);
343 BUG_ON(ret); /* -ENOMEM or logic error */
349 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
351 struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
352 struct btrfs_fs_info *fs_info = block_group->fs_info;
353 struct btrfs_root *extent_root = fs_info->extent_root;
354 struct btrfs_path *path;
355 struct extent_buffer *leaf;
356 struct btrfs_key key;
363 path = btrfs_alloc_path();
367 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
369 #ifdef CONFIG_BTRFS_DEBUG
/*
 * If we're fragmenting we don't want to make anybody think we can
 * allocate from this block group until we've had a chance to fragment
 * the free space.
 */
375 if (btrfs_should_fragment_free_space(block_group))
/*
 * We don't want to deadlock with somebody trying to allocate a new
 * extent for the extent root while also trying to search the extent
 * root to add free space.  So we skip locking and search the commit
 * root, since it's read-only.
 */
384 path->skip_locking = 1;
385 path->search_commit_root = 1;
386 path->reada = READA_FORWARD;
390 key.type = BTRFS_EXTENT_ITEM_KEY;
393 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
397 leaf = path->nodes[0];
398 nritems = btrfs_header_nritems(leaf);
401 if (btrfs_fs_closing(fs_info) > 1) {
406 if (path->slots[0] < nritems) {
407 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
409 ret = find_next_key(path, 0, &key);
413 if (need_resched() ||
414 rwsem_is_contended(&fs_info->commit_root_sem)) {
416 caching_ctl->progress = last;
417 btrfs_release_path(path);
418 up_read(&fs_info->commit_root_sem);
419 mutex_unlock(&caching_ctl->mutex);
421 mutex_lock(&caching_ctl->mutex);
422 down_read(&fs_info->commit_root_sem);
426 ret = btrfs_next_leaf(extent_root, path);
431 leaf = path->nodes[0];
432 nritems = btrfs_header_nritems(leaf);
436 if (key.objectid < last) {
439 key.type = BTRFS_EXTENT_ITEM_KEY;
442 caching_ctl->progress = last;
443 btrfs_release_path(path);
447 if (key.objectid < block_group->key.objectid) {
452 if (key.objectid >= block_group->key.objectid +
453 block_group->key.offset)
456 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
457 key.type == BTRFS_METADATA_ITEM_KEY) {
458 total_found += add_new_free_space(block_group, last,
460 if (key.type == BTRFS_METADATA_ITEM_KEY)
461 last = key.objectid +
464 last = key.objectid + key.offset;
466 if (total_found > CACHING_CTL_WAKE_UP) {
469 wake_up(&caching_ctl->wait);
476 total_found += add_new_free_space(block_group, last,
477 block_group->key.objectid +
478 block_group->key.offset);
479 caching_ctl->progress = (u64)-1;
482 btrfs_free_path(path);
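/*
 * Note: the scan above wakes anything sleeping on caching_ctl->wait every
 * CACHING_CTL_WAKE_UP bytes of discovered free space, so allocators can start
 * using the partially cached block group before the whole scan finishes.
 */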
486 static noinline void caching_thread(struct btrfs_work *work)
488 struct btrfs_block_group_cache *block_group;
489 struct btrfs_fs_info *fs_info;
490 struct btrfs_caching_control *caching_ctl;
493 caching_ctl = container_of(work, struct btrfs_caching_control, work);
494 block_group = caching_ctl->block_group;
495 fs_info = block_group->fs_info;
497 mutex_lock(&caching_ctl->mutex);
498 down_read(&fs_info->commit_root_sem);
500 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
501 ret = load_free_space_tree(caching_ctl);
503 ret = load_extent_tree_free(caching_ctl);
505 spin_lock(&block_group->lock);
506 block_group->caching_ctl = NULL;
507 block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
508 spin_unlock(&block_group->lock);
510 #ifdef CONFIG_BTRFS_DEBUG
511 if (btrfs_should_fragment_free_space(block_group)) {
514 spin_lock(&block_group->space_info->lock);
515 spin_lock(&block_group->lock);
516 bytes_used = block_group->key.offset -
517 btrfs_block_group_used(&block_group->item);
518 block_group->space_info->bytes_used += bytes_used >> 1;
519 spin_unlock(&block_group->lock);
520 spin_unlock(&block_group->space_info->lock);
521 fragment_free_space(block_group);
525 caching_ctl->progress = (u64)-1;
527 up_read(&fs_info->commit_root_sem);
528 free_excluded_extents(block_group);
529 mutex_unlock(&caching_ctl->mutex);
531 wake_up(&caching_ctl->wait);
533 put_caching_control(caching_ctl);
534 btrfs_put_block_group(block_group);
537 static int cache_block_group(struct btrfs_block_group_cache *cache,
541 struct btrfs_fs_info *fs_info = cache->fs_info;
542 struct btrfs_caching_control *caching_ctl;
545 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
549 INIT_LIST_HEAD(&caching_ctl->list);
550 mutex_init(&caching_ctl->mutex);
551 init_waitqueue_head(&caching_ctl->wait);
552 caching_ctl->block_group = cache;
553 caching_ctl->progress = cache->key.objectid;
554 refcount_set(&caching_ctl->count, 1);
555 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
556 caching_thread, NULL, NULL);
558 spin_lock(&cache->lock);
/*
 * This should be a rare occasion, but this could happen I think in the
 * case where one thread starts to load the space cache info, and then
 * some other thread starts a transaction commit which tries to do an
 * allocation while the other thread is still loading the space cache
 * info.  The previous loop should have kept us from choosing this block
 * group, but if we've moved to the state where we will wait on caching
 * block groups we need to first check if we're doing a fast load here,
 * so we can wait for it to finish, otherwise we could end up allocating
 * from a block group whose cache gets evicted for one reason or
 * another.
 */
571 while (cache->cached == BTRFS_CACHE_FAST) {
572 struct btrfs_caching_control *ctl;
574 ctl = cache->caching_ctl;
575 refcount_inc(&ctl->count);
576 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
577 spin_unlock(&cache->lock);
581 finish_wait(&ctl->wait, &wait);
582 put_caching_control(ctl);
583 spin_lock(&cache->lock);
586 if (cache->cached != BTRFS_CACHE_NO) {
587 spin_unlock(&cache->lock);
591 WARN_ON(cache->caching_ctl);
592 cache->caching_ctl = caching_ctl;
593 cache->cached = BTRFS_CACHE_FAST;
594 spin_unlock(&cache->lock);
596 if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
597 mutex_lock(&caching_ctl->mutex);
598 ret = load_free_space_cache(cache);
600 spin_lock(&cache->lock);
602 cache->caching_ctl = NULL;
603 cache->cached = BTRFS_CACHE_FINISHED;
604 cache->last_byte_to_unpin = (u64)-1;
605 caching_ctl->progress = (u64)-1;
607 if (load_cache_only) {
608 cache->caching_ctl = NULL;
609 cache->cached = BTRFS_CACHE_NO;
611 cache->cached = BTRFS_CACHE_STARTED;
612 cache->has_caching_ctl = 1;
615 spin_unlock(&cache->lock);
616 #ifdef CONFIG_BTRFS_DEBUG
618 btrfs_should_fragment_free_space(cache)) {
621 spin_lock(&cache->space_info->lock);
622 spin_lock(&cache->lock);
623 bytes_used = cache->key.offset -
624 btrfs_block_group_used(&cache->item);
625 cache->space_info->bytes_used += bytes_used >> 1;
626 spin_unlock(&cache->lock);
627 spin_unlock(&cache->space_info->lock);
628 fragment_free_space(cache);
631 mutex_unlock(&caching_ctl->mutex);
633 wake_up(&caching_ctl->wait);
635 put_caching_control(caching_ctl);
636 free_excluded_extents(cache);
/*
 * We're either using the free space tree or no caching at all.
 * Set cached to the appropriate value and wake up any waiters.
 */
644 spin_lock(&cache->lock);
645 if (load_cache_only) {
646 cache->caching_ctl = NULL;
647 cache->cached = BTRFS_CACHE_NO;
649 cache->cached = BTRFS_CACHE_STARTED;
650 cache->has_caching_ctl = 1;
652 spin_unlock(&cache->lock);
653 wake_up(&caching_ctl->wait);
656 if (load_cache_only) {
657 put_caching_control(caching_ctl);
661 down_write(&fs_info->commit_root_sem);
662 refcount_inc(&caching_ctl->count);
663 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
664 up_write(&fs_info->commit_root_sem);
666 btrfs_get_block_group(cache);
668 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
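/*
 * To summarize the flow above (informal): with the SPACE_CACHE mount option
 * the on-disk free space cache is tried synchronously first, and a complete
 * load marks the group BTRFS_CACHE_FINISHED right away.  Otherwise, unless
 * the caller asked for a cache-only load, the group is marked
 * BTRFS_CACHE_STARTED and caching_thread() is queued to rebuild the free
 * space information from the free space tree or the extent tree.
 */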
/*
 * Return the block group that starts at or after bytenr.
 */
676 static struct btrfs_block_group_cache *
677 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
679 return block_group_cache_tree_search(info, bytenr, 0);
/*
 * Return the block group that contains the given bytenr.
 */
685 struct btrfs_block_group_cache *btrfs_lookup_block_group(
686 struct btrfs_fs_info *info,
689 return block_group_cache_tree_search(info, bytenr, 1);
692 static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
694 if (ref->type == BTRFS_REF_METADATA) {
695 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
696 return BTRFS_BLOCK_GROUP_SYSTEM;
698 return BTRFS_BLOCK_GROUP_METADATA;
700 return BTRFS_BLOCK_GROUP_DATA;
703 static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
704 struct btrfs_ref *ref)
706 struct btrfs_space_info *space_info;
707 u64 flags = generic_ref_to_space_flags(ref);
709 space_info = btrfs_find_space_info(fs_info, flags);
711 percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
712 BTRFS_TOTAL_BYTES_PINNED_BATCH);
715 static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
716 struct btrfs_ref *ref)
718 struct btrfs_space_info *space_info;
719 u64 flags = generic_ref_to_space_flags(ref);
721 space_info = btrfs_find_space_info(fs_info, flags);
723 percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
724 BTRFS_TOTAL_BYTES_PINNED_BATCH);
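/*
 * Note: add_pinned_bytes()/sub_pinned_bytes() above adjust the per-space-info
 * total_bytes_pinned counter through percpu_counter_add_batch() with
 * BTRFS_TOTAL_BYTES_PINNED_BATCH, so small per-CPU deltas are accumulated
 * locally and only folded into the global counter once they exceed the batch
 * size, avoiding cross-CPU contention on every reference update.
 */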
727 /* simple helper to search for an existing data extent at a given offset */
728 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
731 struct btrfs_key key;
732 struct btrfs_path *path;
734 path = btrfs_alloc_path();
738 key.objectid = start;
740 key.type = BTRFS_EXTENT_ITEM_KEY;
741 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
742 btrfs_free_path(path);
/*
 * Helper function to look up the reference count and flags of an extent.
 *
 * The delayed ref head node is used to store the sum of all the reference
 * count modifications queued up in the rbtree.  The head node may also store
 * the extent flags to set.  This way you can check to see what the reference
 * count and extent flags would be once all of the delayed refs have been
 * processed.
 */
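/*
 * For example (illustrative numbers): if the extent item on disk records 2
 * references and the delayed ref head for the extent has a pending ref_mod of
 * +1, this returns *refs == 3, i.e. the value the extent will have once the
 * queued delayed refs are run.
 */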
755 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
756 struct btrfs_fs_info *fs_info, u64 bytenr,
757 u64 offset, int metadata, u64 *refs, u64 *flags)
759 struct btrfs_delayed_ref_head *head;
760 struct btrfs_delayed_ref_root *delayed_refs;
761 struct btrfs_path *path;
762 struct btrfs_extent_item *ei;
763 struct extent_buffer *leaf;
764 struct btrfs_key key;
/*
 * If we don't have skinny metadata, don't bother doing anything
 * different.
 */
774 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
775 offset = fs_info->nodesize;
779 path = btrfs_alloc_path();
784 path->skip_locking = 1;
785 path->search_commit_root = 1;
789 key.objectid = bytenr;
792 key.type = BTRFS_METADATA_ITEM_KEY;
794 key.type = BTRFS_EXTENT_ITEM_KEY;
796 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
800 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
801 if (path->slots[0]) {
803 btrfs_item_key_to_cpu(path->nodes[0], &key,
805 if (key.objectid == bytenr &&
806 key.type == BTRFS_EXTENT_ITEM_KEY &&
807 key.offset == fs_info->nodesize)
813 leaf = path->nodes[0];
814 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
815 if (item_size >= sizeof(*ei)) {
816 ei = btrfs_item_ptr(leaf, path->slots[0],
817 struct btrfs_extent_item);
818 num_refs = btrfs_extent_refs(leaf, ei);
819 extent_flags = btrfs_extent_flags(leaf, ei);
822 btrfs_print_v0_err(fs_info);
824 btrfs_abort_transaction(trans, ret);
826 btrfs_handle_fs_error(fs_info, ret, NULL);
831 BUG_ON(num_refs == 0);
841 delayed_refs = &trans->transaction->delayed_refs;
842 spin_lock(&delayed_refs->lock);
843 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
845 if (!mutex_trylock(&head->mutex)) {
846 refcount_inc(&head->refs);
847 spin_unlock(&delayed_refs->lock);
849 btrfs_release_path(path);
/*
 * Mutex was contended, block until it's released and try
 * again.
 */
855 mutex_lock(&head->mutex);
856 mutex_unlock(&head->mutex);
857 btrfs_put_delayed_ref_head(head);
860 spin_lock(&head->lock);
861 if (head->extent_op && head->extent_op->update_flags)
862 extent_flags |= head->extent_op->flags_to_set;
864 BUG_ON(num_refs == 0);
866 num_refs += head->ref_mod;
867 spin_unlock(&head->lock);
868 mutex_unlock(&head->mutex);
870 spin_unlock(&delayed_refs->lock);
872 WARN_ON(num_refs == 0);
876 *flags = extent_flags;
878 btrfs_free_path(path);
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually the full back refs are generic and
 * can be used in all cases where implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead: every time a tree block
 * gets COWed, we have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs and add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts.  The
 * original implicit back refs are inherited by the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block and increase the lower level extents' reference counts.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of the tree
 * leaf holding the pointers.
 *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is the
 * objectid of the block's owner tree.  The key offset for the full back refs
 * is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block are required.  This information is stored in
 * the tree block info structure.
 */
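/*
 * A worked example (illustrative values only): a data extent referenced from
 * subvolume 5, inode 257, file offset 0 gets an implicit back ref keyed as
 *
 *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * while a full back ref from a shared leaf at bytenr 30408704 is keyed as
 *
 *     (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, 30408704)
 */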
/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
993 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
994 struct btrfs_extent_inline_ref *iref,
995 enum btrfs_inline_ref_type is_data)
997 int type = btrfs_extent_inline_ref_type(eb, iref);
998 u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1000 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1001 type == BTRFS_SHARED_BLOCK_REF_KEY ||
1002 type == BTRFS_SHARED_DATA_REF_KEY ||
1003 type == BTRFS_EXTENT_DATA_REF_KEY) {
1004 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1005 if (type == BTRFS_TREE_BLOCK_REF_KEY)
1007 if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1008 ASSERT(eb->fs_info);
/*
 * Every shared one has a parent tree block,
 * which must be aligned to the node size.
 */
if (offset &&
    IS_ALIGNED(offset, eb->fs_info->nodesize))
1018 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1019 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1021 if (type == BTRFS_SHARED_DATA_REF_KEY) {
1022 ASSERT(eb->fs_info);
/*
 * Every shared one has a parent tree block,
 * which must be aligned to the node size.
 */
if (offset &&
    IS_ALIGNED(offset, eb->fs_info->nodesize))
1033 ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1038 btrfs_print_leaf((struct extent_buffer *)eb);
1039 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1043 return BTRFS_REF_TYPE_INVALID;
1046 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1048 u32 high_crc = ~(u32)0;
1049 u32 low_crc = ~(u32)0;
1052 lenum = cpu_to_le64(root_objectid);
1053 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1054 lenum = cpu_to_le64(owner);
1055 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1056 lenum = cpu_to_le64(offset);
1057 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1059 return ((u64)high_crc << 31) ^ (u64)low_crc;
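/*
 * In other words, the EXTENT_DATA_REF key offset computed above is
 * ((u64)crc32c(root_objectid) << 31) ^ crc32c(owner, offset): the owning
 * root is hashed into the upper bits and the (inode, file offset) pair into
 * the lower bits, giving a single 64-bit value used to order the implicit
 * data back refs within an extent item's key space.
 */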
1062 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1063 struct btrfs_extent_data_ref *ref)
1065 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1066 btrfs_extent_data_ref_objectid(leaf, ref),
1067 btrfs_extent_data_ref_offset(leaf, ref));
1070 static int match_extent_data_ref(struct extent_buffer *leaf,
1071 struct btrfs_extent_data_ref *ref,
1072 u64 root_objectid, u64 owner, u64 offset)
1074 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1075 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1076 btrfs_extent_data_ref_offset(leaf, ref) != offset)
1081 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1082 struct btrfs_path *path,
1083 u64 bytenr, u64 parent,
1085 u64 owner, u64 offset)
1087 struct btrfs_root *root = trans->fs_info->extent_root;
1088 struct btrfs_key key;
1089 struct btrfs_extent_data_ref *ref;
1090 struct extent_buffer *leaf;
1096 key.objectid = bytenr;
1098 key.type = BTRFS_SHARED_DATA_REF_KEY;
1099 key.offset = parent;
1101 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1102 key.offset = hash_extent_data_ref(root_objectid,
1107 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1119 leaf = path->nodes[0];
1120 nritems = btrfs_header_nritems(leaf);
1122 if (path->slots[0] >= nritems) {
1123 ret = btrfs_next_leaf(root, path);
1129 leaf = path->nodes[0];
1130 nritems = btrfs_header_nritems(leaf);
1134 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1135 if (key.objectid != bytenr ||
1136 key.type != BTRFS_EXTENT_DATA_REF_KEY)
1139 ref = btrfs_item_ptr(leaf, path->slots[0],
1140 struct btrfs_extent_data_ref);
1142 if (match_extent_data_ref(leaf, ref, root_objectid,
1145 btrfs_release_path(path);
1157 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1158 struct btrfs_path *path,
1159 u64 bytenr, u64 parent,
1160 u64 root_objectid, u64 owner,
1161 u64 offset, int refs_to_add)
1163 struct btrfs_root *root = trans->fs_info->extent_root;
1164 struct btrfs_key key;
1165 struct extent_buffer *leaf;
1170 key.objectid = bytenr;
1172 key.type = BTRFS_SHARED_DATA_REF_KEY;
1173 key.offset = parent;
1174 size = sizeof(struct btrfs_shared_data_ref);
1176 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1177 key.offset = hash_extent_data_ref(root_objectid,
1179 size = sizeof(struct btrfs_extent_data_ref);
1182 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1183 if (ret && ret != -EEXIST)
1186 leaf = path->nodes[0];
1188 struct btrfs_shared_data_ref *ref;
1189 ref = btrfs_item_ptr(leaf, path->slots[0],
1190 struct btrfs_shared_data_ref);
1192 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1194 num_refs = btrfs_shared_data_ref_count(leaf, ref);
1195 num_refs += refs_to_add;
1196 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1199 struct btrfs_extent_data_ref *ref;
1200 while (ret == -EEXIST) {
1201 ref = btrfs_item_ptr(leaf, path->slots[0],
1202 struct btrfs_extent_data_ref);
1203 if (match_extent_data_ref(leaf, ref, root_objectid,
1206 btrfs_release_path(path);
1208 ret = btrfs_insert_empty_item(trans, root, path, &key,
1210 if (ret && ret != -EEXIST)
1213 leaf = path->nodes[0];
1215 ref = btrfs_item_ptr(leaf, path->slots[0],
1216 struct btrfs_extent_data_ref);
1218 btrfs_set_extent_data_ref_root(leaf, ref,
1220 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1221 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1222 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1224 num_refs = btrfs_extent_data_ref_count(leaf, ref);
1225 num_refs += refs_to_add;
1226 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1229 btrfs_mark_buffer_dirty(leaf);
1232 btrfs_release_path(path);
1236 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1237 struct btrfs_path *path,
1238 int refs_to_drop, int *last_ref)
1240 struct btrfs_key key;
1241 struct btrfs_extent_data_ref *ref1 = NULL;
1242 struct btrfs_shared_data_ref *ref2 = NULL;
1243 struct extent_buffer *leaf;
1247 leaf = path->nodes[0];
1248 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1250 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1251 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1252 struct btrfs_extent_data_ref);
1253 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1254 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1255 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1256 struct btrfs_shared_data_ref);
1257 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1258 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1259 btrfs_print_v0_err(trans->fs_info);
1260 btrfs_abort_transaction(trans, -EINVAL);
1266 BUG_ON(num_refs < refs_to_drop);
1267 num_refs -= refs_to_drop;
1269 if (num_refs == 0) {
1270 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1273 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1274 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1275 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1276 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1277 btrfs_mark_buffer_dirty(leaf);
1282 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1283 struct btrfs_extent_inline_ref *iref)
1285 struct btrfs_key key;
1286 struct extent_buffer *leaf;
1287 struct btrfs_extent_data_ref *ref1;
1288 struct btrfs_shared_data_ref *ref2;
1292 leaf = path->nodes[0];
1293 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1295 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
/*
 * If type is invalid, we should have bailed out earlier than
 * here.
 */
1301 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1302 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1303 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1304 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1305 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1307 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1308 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1310 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1311 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1312 struct btrfs_extent_data_ref);
1313 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1314 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1315 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1316 struct btrfs_shared_data_ref);
1317 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1324 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1325 struct btrfs_path *path,
1326 u64 bytenr, u64 parent,
1329 struct btrfs_root *root = trans->fs_info->extent_root;
1330 struct btrfs_key key;
1333 key.objectid = bytenr;
1335 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1336 key.offset = parent;
1338 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1339 key.offset = root_objectid;
1342 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1348 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1349 struct btrfs_path *path,
1350 u64 bytenr, u64 parent,
1353 struct btrfs_key key;
1356 key.objectid = bytenr;
1358 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1359 key.offset = parent;
1361 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1362 key.offset = root_objectid;
1365 ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1367 btrfs_release_path(path);
1371 static inline int extent_ref_type(u64 parent, u64 owner)
1374 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1376 type = BTRFS_SHARED_BLOCK_REF_KEY;
1378 type = BTRFS_TREE_BLOCK_REF_KEY;
1381 type = BTRFS_SHARED_DATA_REF_KEY;
1383 type = BTRFS_EXTENT_DATA_REF_KEY;
1388 static int find_next_key(struct btrfs_path *path, int level,
1389 struct btrfs_key *key)
1392 for (; level < BTRFS_MAX_LEVEL; level++) {
1393 if (!path->nodes[level])
1395 if (path->slots[level] + 1 >=
1396 btrfs_header_nritems(path->nodes[level]))
1399 btrfs_item_key_to_cpu(path->nodes[level], key,
1400 path->slots[level] + 1);
1402 btrfs_node_key_to_cpu(path->nodes[level], key,
1403 path->slots[level] + 1);
/*
 * Look for an inline back ref.  If the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
1422 static noinline_for_stack
1423 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1424 struct btrfs_path *path,
1425 struct btrfs_extent_inline_ref **ref_ret,
1426 u64 bytenr, u64 num_bytes,
1427 u64 parent, u64 root_objectid,
1428 u64 owner, u64 offset, int insert)
1430 struct btrfs_fs_info *fs_info = trans->fs_info;
1431 struct btrfs_root *root = fs_info->extent_root;
1432 struct btrfs_key key;
1433 struct extent_buffer *leaf;
1434 struct btrfs_extent_item *ei;
1435 struct btrfs_extent_inline_ref *iref;
1445 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1448 key.objectid = bytenr;
1449 key.type = BTRFS_EXTENT_ITEM_KEY;
1450 key.offset = num_bytes;
1452 want = extent_ref_type(parent, owner);
1454 extra_size = btrfs_extent_inline_ref_size(want);
1455 path->keep_locks = 1;
1460 * Owner is our level, so we can just add one to get the level for the
1461 * block we are interested in.
1463 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1464 key.type = BTRFS_METADATA_ITEM_KEY;
1469 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
/*
 * We may be a newly converted file system which still has the old fat
 * extent entries for metadata, so try and see if we have one of those.
 */
1479 if (ret > 0 && skinny_metadata) {
1480 skinny_metadata = false;
1481 if (path->slots[0]) {
1483 btrfs_item_key_to_cpu(path->nodes[0], &key,
1485 if (key.objectid == bytenr &&
1486 key.type == BTRFS_EXTENT_ITEM_KEY &&
1487 key.offset == num_bytes)
1491 key.objectid = bytenr;
1492 key.type = BTRFS_EXTENT_ITEM_KEY;
1493 key.offset = num_bytes;
1494 btrfs_release_path(path);
1499 if (ret && !insert) {
1502 } else if (WARN_ON(ret)) {
1507 leaf = path->nodes[0];
1508 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1509 if (unlikely(item_size < sizeof(*ei))) {
1511 btrfs_print_v0_err(fs_info);
1512 btrfs_abort_transaction(trans, err);
1516 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1517 flags = btrfs_extent_flags(leaf, ei);
1519 ptr = (unsigned long)(ei + 1);
1520 end = (unsigned long)ei + item_size;
1522 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1523 ptr += sizeof(struct btrfs_tree_block_info);
1527 if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1528 needed = BTRFS_REF_TYPE_DATA;
1530 needed = BTRFS_REF_TYPE_BLOCK;
1538 iref = (struct btrfs_extent_inline_ref *)ptr;
1539 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1540 if (type == BTRFS_REF_TYPE_INVALID) {
1548 ptr += btrfs_extent_inline_ref_size(type);
1552 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1553 struct btrfs_extent_data_ref *dref;
1554 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1555 if (match_extent_data_ref(leaf, dref, root_objectid,
1560 if (hash_extent_data_ref_item(leaf, dref) <
1561 hash_extent_data_ref(root_objectid, owner, offset))
1565 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1567 if (parent == ref_offset) {
1571 if (ref_offset < parent)
1574 if (root_objectid == ref_offset) {
1578 if (ref_offset < root_objectid)
1582 ptr += btrfs_extent_inline_ref_size(type);
1584 if (err == -ENOENT && insert) {
1585 if (item_size + extra_size >=
1586 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
/*
 * To add a new inline back ref, we have to make sure
 * there is no corresponding back ref item.
 * For simplicity, we just do not add a new inline back
 * ref if there is any kind of item for this block.
 */
1596 if (find_next_key(path, 0, &key) == 0 &&
1597 key.objectid == bytenr &&
1598 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1603 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1606 path->keep_locks = 0;
1607 btrfs_unlock_up_safe(path, 1);
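/*
 * Illustrative usage: insert_inline_extent_backref() below treats a return of
 * 0 as "update the existing inline ref at *ref_ret", -ENOENT as "set up a new
 * inline ref at *ref_ret", and lets -EAGAIN propagate so that
 * __btrfs_inc_extent_ref() falls back to a separate keyed back ref item.
 */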
/*
 * Helper to add a new inline back ref.
 */
1615 static noinline_for_stack
1616 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1617 struct btrfs_path *path,
1618 struct btrfs_extent_inline_ref *iref,
1619 u64 parent, u64 root_objectid,
1620 u64 owner, u64 offset, int refs_to_add,
1621 struct btrfs_delayed_extent_op *extent_op)
1623 struct extent_buffer *leaf;
1624 struct btrfs_extent_item *ei;
1627 unsigned long item_offset;
1632 leaf = path->nodes[0];
1633 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1634 item_offset = (unsigned long)iref - (unsigned long)ei;
1636 type = extent_ref_type(parent, owner);
1637 size = btrfs_extent_inline_ref_size(type);
1639 btrfs_extend_item(path, size);
1641 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1642 refs = btrfs_extent_refs(leaf, ei);
1643 refs += refs_to_add;
1644 btrfs_set_extent_refs(leaf, ei, refs);
1646 __run_delayed_extent_op(extent_op, leaf, ei);
1648 ptr = (unsigned long)ei + item_offset;
1649 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1650 if (ptr < end - size)
1651 memmove_extent_buffer(leaf, ptr + size, ptr,
1654 iref = (struct btrfs_extent_inline_ref *)ptr;
1655 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1656 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1657 struct btrfs_extent_data_ref *dref;
1658 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1659 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1660 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1661 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1662 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1663 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1664 struct btrfs_shared_data_ref *sref;
1665 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1666 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1667 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1668 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1669 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1671 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1673 btrfs_mark_buffer_dirty(leaf);
1676 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1677 struct btrfs_path *path,
1678 struct btrfs_extent_inline_ref **ref_ret,
1679 u64 bytenr, u64 num_bytes, u64 parent,
1680 u64 root_objectid, u64 owner, u64 offset)
1684 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1685 num_bytes, parent, root_objectid,
1690 btrfs_release_path(path);
1693 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1694 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1697 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1698 root_objectid, owner, offset);
/*
 * Helper to update/remove an inline back ref.
 */
1706 static noinline_for_stack
1707 void update_inline_extent_backref(struct btrfs_path *path,
1708 struct btrfs_extent_inline_ref *iref,
1710 struct btrfs_delayed_extent_op *extent_op,
1713 struct extent_buffer *leaf = path->nodes[0];
1714 struct btrfs_extent_item *ei;
1715 struct btrfs_extent_data_ref *dref = NULL;
1716 struct btrfs_shared_data_ref *sref = NULL;
1724 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1725 refs = btrfs_extent_refs(leaf, ei);
1726 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1727 refs += refs_to_mod;
1728 btrfs_set_extent_refs(leaf, ei, refs);
1730 __run_delayed_extent_op(extent_op, leaf, ei);
1733 * If type is invalid, we should have bailed out after
1734 * lookup_inline_extent_backref().
1736 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1737 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1739 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1740 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1741 refs = btrfs_extent_data_ref_count(leaf, dref);
1742 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1743 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1744 refs = btrfs_shared_data_ref_count(leaf, sref);
1747 BUG_ON(refs_to_mod != -1);
1750 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1751 refs += refs_to_mod;
1754 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1755 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1757 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1760 size = btrfs_extent_inline_ref_size(type);
1761 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1762 ptr = (unsigned long)iref;
1763 end = (unsigned long)ei + item_size;
1764 if (ptr + size < end)
1765 memmove_extent_buffer(leaf, ptr, ptr + size,
1768 btrfs_truncate_item(path, item_size, 1);
1770 btrfs_mark_buffer_dirty(leaf);
1773 static noinline_for_stack
1774 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1775 struct btrfs_path *path,
1776 u64 bytenr, u64 num_bytes, u64 parent,
1777 u64 root_objectid, u64 owner,
1778 u64 offset, int refs_to_add,
1779 struct btrfs_delayed_extent_op *extent_op)
1781 struct btrfs_extent_inline_ref *iref;
1784 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1785 num_bytes, parent, root_objectid,
1788 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1789 update_inline_extent_backref(path, iref, refs_to_add,
1791 } else if (ret == -ENOENT) {
1792 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1793 root_objectid, owner, offset,
1794 refs_to_add, extent_op);
1800 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1801 struct btrfs_path *path,
1802 u64 bytenr, u64 parent, u64 root_objectid,
1803 u64 owner, u64 offset, int refs_to_add)
1806 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1807 BUG_ON(refs_to_add != 1);
1808 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1811 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1812 root_objectid, owner, offset,
1818 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1819 struct btrfs_path *path,
1820 struct btrfs_extent_inline_ref *iref,
1821 int refs_to_drop, int is_data, int *last_ref)
1825 BUG_ON(!is_data && refs_to_drop != 1);
1827 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1829 } else if (is_data) {
1830 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1834 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1839 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1840 u64 *discarded_bytes)
1843 u64 bytes_left, end;
1844 u64 aligned_start = ALIGN(start, 1 << 9);
1846 if (WARN_ON(start != aligned_start)) {
1847 len -= aligned_start - start;
1848 len = round_down(len, 1 << 9);
1849 start = aligned_start;
1852 *discarded_bytes = 0;
1860 /* Skip any superblocks on this device. */
1861 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1862 u64 sb_start = btrfs_sb_offset(j);
1863 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1864 u64 size = sb_start - start;
1866 if (!in_range(sb_start, start, bytes_left) &&
1867 !in_range(sb_end, start, bytes_left) &&
1868 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
/* Superblock spans beginning of range; adjust start and bytes_left. */
if (sb_start <= start) {
1876 start += sb_end - start;
1881 bytes_left = end - start;
1886 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1889 *discarded_bytes += size;
1890 else if (ret != -EOPNOTSUPP)
1899 bytes_left = end - start;
1903 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1906 *discarded_bytes += bytes_left;
1911 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1912 u64 num_bytes, u64 *actual_bytes)
1915 u64 discarded_bytes = 0;
1916 struct btrfs_bio *bbio = NULL;
/*
 * Avoid races with device replace and make sure our bbio has devices
 * associated to its stripes that don't go away while we are discarding.
 */
1923 btrfs_bio_counter_inc_blocked(fs_info);
1924 /* Tell the block device(s) that the sectors can be discarded */
1925 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1927 /* Error condition is -ENOMEM */
1929 struct btrfs_bio_stripe *stripe = bbio->stripes;
1933 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1935 struct request_queue *req_q;
1937 if (!stripe->dev->bdev) {
1938 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1941 req_q = bdev_get_queue(stripe->dev->bdev);
1942 if (!blk_queue_discard(req_q))
1945 ret = btrfs_issue_discard(stripe->dev->bdev,
1950 discarded_bytes += bytes;
1951 else if (ret != -EOPNOTSUPP)
break; /* Logic errors or -ENOMEM, or -EIO, but we don't know how that could happen. */
1955 * Just in case we get back EOPNOTSUPP for some reason,
1956 * just ignore the return value so we don't screw up
1957 * people calling discard_extent.
1961 btrfs_put_bbio(bbio);
1963 btrfs_bio_counter_dec(fs_info);
1966 *actual_bytes = discarded_bytes;
1969 if (ret == -EOPNOTSUPP)
1974 /* Can return -ENOMEM */
1975 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1976 struct btrfs_ref *generic_ref)
1978 struct btrfs_fs_info *fs_info = trans->fs_info;
1979 int old_ref_mod, new_ref_mod;
1982 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1983 generic_ref->action);
1984 BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1985 generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
1987 if (generic_ref->type == BTRFS_REF_METADATA)
1988 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
1989 NULL, &old_ref_mod, &new_ref_mod);
1991 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
1992 &old_ref_mod, &new_ref_mod);
1994 btrfs_ref_tree_mod(fs_info, generic_ref);
1996 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
1997 sub_pinned_bytes(fs_info, generic_ref);
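/*
 * Note: btrfs_inc_extent_ref() above only queues a delayed ref; the extent
 * tree itself is modified later, when the delayed ref is run through
 * __btrfs_inc_extent_ref() below.
 */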
/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for the
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 *		    be 0
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the file offset
 *		    this extent belongs to.
 *
 * @refs_to_add:    Number of references to add
 *
 * @extent_op:	    Pointer to a structure, holding information necessary when
 *		    updating a tree block's flags
 *
 */
2036 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2037 struct btrfs_delayed_ref_node *node,
2038 u64 parent, u64 root_objectid,
2039 u64 owner, u64 offset, int refs_to_add,
2040 struct btrfs_delayed_extent_op *extent_op)
2042 struct btrfs_path *path;
2043 struct extent_buffer *leaf;
2044 struct btrfs_extent_item *item;
2045 struct btrfs_key key;
2046 u64 bytenr = node->bytenr;
2047 u64 num_bytes = node->num_bytes;
2051 path = btrfs_alloc_path();
2055 path->reada = READA_FORWARD;
2056 path->leave_spinning = 1;
/* this will set up the path even if it fails to insert the back ref */
2058 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2059 parent, root_objectid, owner,
2060 offset, refs_to_add, extent_op);
2061 if ((ret < 0 && ret != -EAGAIN) || !ret)
/*
 * Ok we had -EAGAIN which means we didn't have space to insert an
 * inline extent ref, so just update the reference count and add a
 * normal backref.
 */
2069 leaf = path->nodes[0];
2070 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2071 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2072 refs = btrfs_extent_refs(leaf, item);
2073 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2075 __run_delayed_extent_op(extent_op, leaf, item);
2077 btrfs_mark_buffer_dirty(leaf);
2078 btrfs_release_path(path);
2080 path->reada = READA_FORWARD;
2081 path->leave_spinning = 1;
2082 /* now insert the actual backref */
2083 ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2084 owner, offset, refs_to_add);
2086 btrfs_abort_transaction(trans, ret);
2088 btrfs_free_path(path);
2092 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2093 struct btrfs_delayed_ref_node *node,
2094 struct btrfs_delayed_extent_op *extent_op,
2095 int insert_reserved)
2098 struct btrfs_delayed_data_ref *ref;
2099 struct btrfs_key ins;
2104 ins.objectid = node->bytenr;
2105 ins.offset = node->num_bytes;
2106 ins.type = BTRFS_EXTENT_ITEM_KEY;
2108 ref = btrfs_delayed_node_to_data_ref(node);
2109 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2111 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2112 parent = ref->parent;
2113 ref_root = ref->root;
2115 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2117 flags |= extent_op->flags_to_set;
2118 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2119 flags, ref->objectid,
2122 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2123 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2124 ref->objectid, ref->offset,
2125 node->ref_mod, extent_op);
2126 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2127 ret = __btrfs_free_extent(trans, node, parent,
2128 ref_root, ref->objectid,
2129 ref->offset, node->ref_mod,
2137 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2138 struct extent_buffer *leaf,
2139 struct btrfs_extent_item *ei)
2141 u64 flags = btrfs_extent_flags(leaf, ei);
2142 if (extent_op->update_flags) {
2143 flags |= extent_op->flags_to_set;
2144 btrfs_set_extent_flags(leaf, ei, flags);
2147 if (extent_op->update_key) {
2148 struct btrfs_tree_block_info *bi;
2149 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2150 bi = (struct btrfs_tree_block_info *)(ei + 1);
2151 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2155 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2156 struct btrfs_delayed_ref_head *head,
2157 struct btrfs_delayed_extent_op *extent_op)
2159 struct btrfs_fs_info *fs_info = trans->fs_info;
2160 struct btrfs_key key;
2161 struct btrfs_path *path;
2162 struct btrfs_extent_item *ei;
2163 struct extent_buffer *leaf;
2167 int metadata = !extent_op->is_data;
2172 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2175 path = btrfs_alloc_path();
2179 key.objectid = head->bytenr;
2182 key.type = BTRFS_METADATA_ITEM_KEY;
2183 key.offset = extent_op->level;
2185 key.type = BTRFS_EXTENT_ITEM_KEY;
2186 key.offset = head->num_bytes;
2190 path->reada = READA_FORWARD;
2191 path->leave_spinning = 1;
2192 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2199 if (path->slots[0] > 0) {
2201 btrfs_item_key_to_cpu(path->nodes[0], &key,
2203 if (key.objectid == head->bytenr &&
2204 key.type == BTRFS_EXTENT_ITEM_KEY &&
2205 key.offset == head->num_bytes)
2209 btrfs_release_path(path);
2212 key.objectid = head->bytenr;
2213 key.offset = head->num_bytes;
2214 key.type = BTRFS_EXTENT_ITEM_KEY;
2223 leaf = path->nodes[0];
2224 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2226 if (unlikely(item_size < sizeof(*ei))) {
2228 btrfs_print_v0_err(fs_info);
2229 btrfs_abort_transaction(trans, err);
2233 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2234 __run_delayed_extent_op(extent_op, leaf, ei);
2236 btrfs_mark_buffer_dirty(leaf);
2238 btrfs_free_path(path);
2242 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2243 struct btrfs_delayed_ref_node *node,
2244 struct btrfs_delayed_extent_op *extent_op,
2245 int insert_reserved)
2248 struct btrfs_delayed_tree_ref *ref;
2252 ref = btrfs_delayed_node_to_tree_ref(node);
2253 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2255 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2256 parent = ref->parent;
2257 ref_root = ref->root;
2259 if (node->ref_mod != 1) {
2260 btrfs_err(trans->fs_info,
2261 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2262 node->bytenr, node->ref_mod, node->action, ref_root,
2266 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2267 BUG_ON(!extent_op || !extent_op->update_flags);
2268 ret = alloc_reserved_tree_block(trans, node, extent_op);
2269 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2270 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2271 ref->level, 0, 1, extent_op);
2272 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2273 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2274 ref->level, 0, 1, extent_op);
2281 /* helper function to actually process a single delayed ref entry */
2282 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2283 struct btrfs_delayed_ref_node *node,
2284 struct btrfs_delayed_extent_op *extent_op,
2285 int insert_reserved)
2289 if (trans->aborted) {
2290 if (insert_reserved)
2291 btrfs_pin_extent(trans->fs_info, node->bytenr,
2292 node->num_bytes, 1);
2296 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2297 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2298 ret = run_delayed_tree_ref(trans, node, extent_op,
2300 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2301 node->type == BTRFS_SHARED_DATA_REF_KEY)
2302 ret = run_delayed_data_ref(trans, node, extent_op,
2306 if (ret && insert_reserved)
2307 btrfs_pin_extent(trans->fs_info, node->bytenr,
2308 node->num_bytes, 1);
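/*
 * Note on the pinning above: when the transaction is aborted or running the
 * ref fails, extents whose space was already reserved (insert_reserved) are
 * pinned instead, so the space is not leaked and gets reclaimed when the
 * pinned extents are unpinned at transaction commit.
 */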
2312 static inline struct btrfs_delayed_ref_node *
2313 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2315 struct btrfs_delayed_ref_node *ref;
2317 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
/*
 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
 * This is to prevent a ref count from going down to zero, which deletes
 * the extent item from the extent tree, while there are still references
 * to add; those adds would then fail because they would not find the
 * extent item.
 */
2326 if (!list_empty(&head->ref_add_list))
2327 return list_first_entry(&head->ref_add_list,
2328 struct btrfs_delayed_ref_node, add_list);
2330 ref = rb_entry(rb_first_cached(&head->ref_tree),
2331 struct btrfs_delayed_ref_node, ref_node);
2332 ASSERT(list_empty(&ref->add_list));
2336 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2337 struct btrfs_delayed_ref_head *head)
2339 spin_lock(&delayed_refs->lock);
2340 head->processing = 0;
2341 delayed_refs->num_heads_ready++;
2342 spin_unlock(&delayed_refs->lock);
2343 btrfs_delayed_ref_unlock(head);
2346 static struct btrfs_delayed_extent_op *cleanup_extent_op(
2347 struct btrfs_delayed_ref_head *head)
2349 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2354 if (head->must_insert_reserved) {
2355 head->extent_op = NULL;
2356 btrfs_free_delayed_extent_op(extent_op);
2362 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2363 struct btrfs_delayed_ref_head *head)
2365 struct btrfs_delayed_extent_op *extent_op;
2368 extent_op = cleanup_extent_op(head);
2371 head->extent_op = NULL;
2372 spin_unlock(&head->lock);
2373 ret = run_delayed_extent_op(trans, head, extent_op);
2374 btrfs_free_delayed_extent_op(extent_op);
2375 return ret ? ret : 1;
2378 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2379 struct btrfs_delayed_ref_root *delayed_refs,
2380 struct btrfs_delayed_ref_head *head)
2382 int nr_items = 1; /* Dropping this ref head update. */
2384 if (head->total_ref_mod < 0) {
2385 struct btrfs_space_info *space_info;
2389 flags = BTRFS_BLOCK_GROUP_DATA;
2390 else if (head->is_system)
2391 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2393 flags = BTRFS_BLOCK_GROUP_METADATA;
2394 space_info = btrfs_find_space_info(fs_info, flags);
2396 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2398 BTRFS_TOTAL_BYTES_PINNED_BATCH);
2401 * We had csum deletions accounted for in our delayed refs rsv,
2402 * so we need to drop the csum leaves for this update from our
2405 if (head->is_data) {
2406 spin_lock(&delayed_refs->lock);
2407 delayed_refs->pending_csums -= head->num_bytes;
2408 spin_unlock(&delayed_refs->lock);
2409 nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2414 btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2417 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2418 struct btrfs_delayed_ref_head *head)
2421 struct btrfs_fs_info *fs_info = trans->fs_info;
2422 struct btrfs_delayed_ref_root *delayed_refs;
2425 delayed_refs = &trans->transaction->delayed_refs;
2427 ret = run_and_cleanup_extent_op(trans, head);
2429 unselect_delayed_ref_head(delayed_refs, head);
2430 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2437 * Need to drop our head ref lock and re-acquire the delayed ref lock
2438 * and then re-check to make sure nobody got added.
2440 spin_unlock(&head->lock);
2441 spin_lock(&delayed_refs->lock);
2442 spin_lock(&head->lock);
2443 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2444 spin_unlock(&head->lock);
2445 spin_unlock(&delayed_refs->lock);
2448 btrfs_delete_ref_head(delayed_refs, head);
2449 spin_unlock(&head->lock);
2450 spin_unlock(&delayed_refs->lock);
2452 if (head->must_insert_reserved) {
2453 btrfs_pin_extent(fs_info, head->bytenr,
2454 head->num_bytes, 1);
2455 if (head->is_data) {
2456 ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2461 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2463 trace_run_delayed_ref_head(fs_info, head, 0);
2464 btrfs_delayed_ref_unlock(head);
2465 btrfs_put_delayed_ref_head(head);
2469 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2470 struct btrfs_trans_handle *trans)
2472 struct btrfs_delayed_ref_root *delayed_refs =
2473 &trans->transaction->delayed_refs;
2474 struct btrfs_delayed_ref_head *head = NULL;
2477 spin_lock(&delayed_refs->lock);
2478 head = btrfs_select_ref_head(delayed_refs);
2480 spin_unlock(&delayed_refs->lock);
2485 * Grab the lock that says we are going to process all the refs for
2488 ret = btrfs_delayed_ref_lock(delayed_refs, head);
2489 spin_unlock(&delayed_refs->lock);
2492 * We may have dropped the spin lock to get the head mutex lock, and
2493 * that might have given someone else time to free the head. If that's
2494 * true, it has been removed from our list and we can move on.
2497 head = ERR_PTR(-EAGAIN);
2502 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2503 struct btrfs_delayed_ref_head *locked_ref,
2504 unsigned long *run_refs)
2506 struct btrfs_fs_info *fs_info = trans->fs_info;
2507 struct btrfs_delayed_ref_root *delayed_refs;
2508 struct btrfs_delayed_extent_op *extent_op;
2509 struct btrfs_delayed_ref_node *ref;
2510 int must_insert_reserved = 0;
2513 delayed_refs = &trans->transaction->delayed_refs;
2515 lockdep_assert_held(&locked_ref->mutex);
2516 lockdep_assert_held(&locked_ref->lock);
2518 while ((ref = select_delayed_ref(locked_ref))) {
2520 btrfs_check_delayed_seq(fs_info, ref->seq)) {
2521 spin_unlock(&locked_ref->lock);
2522 unselect_delayed_ref_head(delayed_refs, locked_ref);
2528 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2529 RB_CLEAR_NODE(&ref->ref_node);
2530 if (!list_empty(&ref->add_list))
2531 list_del(&ref->add_list);
2533 * When we play the delayed ref, also correct the ref_mod on
2536 switch (ref->action) {
2537 case BTRFS_ADD_DELAYED_REF:
2538 case BTRFS_ADD_DELAYED_EXTENT:
2539 locked_ref->ref_mod -= ref->ref_mod;
2541 case BTRFS_DROP_DELAYED_REF:
2542 locked_ref->ref_mod += ref->ref_mod;
2547 atomic_dec(&delayed_refs->num_entries);
2550 * Record the must_insert_reserved flag before we drop the
2553 must_insert_reserved = locked_ref->must_insert_reserved;
2554 locked_ref->must_insert_reserved = 0;
2556 extent_op = locked_ref->extent_op;
2557 locked_ref->extent_op = NULL;
2558 spin_unlock(&locked_ref->lock);
2560 ret = run_one_delayed_ref(trans, ref, extent_op,
2561 must_insert_reserved);
2563 btrfs_free_delayed_extent_op(extent_op);
2565 unselect_delayed_ref_head(delayed_refs, locked_ref);
2566 btrfs_put_delayed_ref(ref);
2567 btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2572 btrfs_put_delayed_ref(ref);
2575 spin_lock(&locked_ref->lock);
2576 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2583 * Returns 0 on success or if called with an already aborted transaction.
2584 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2586 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2589 struct btrfs_fs_info *fs_info = trans->fs_info;
2590 struct btrfs_delayed_ref_root *delayed_refs;
2591 struct btrfs_delayed_ref_head *locked_ref = NULL;
2592 ktime_t start = ktime_get();
2594 unsigned long count = 0;
2595 unsigned long actual_count = 0;
2597 delayed_refs = &trans->transaction->delayed_refs;
2600 locked_ref = btrfs_obtain_ref_head(trans);
2601 if (IS_ERR_OR_NULL(locked_ref)) {
2602 if (PTR_ERR(locked_ref) == -EAGAIN) {
2611 * We need to try and merge add/drops of the same ref since we
2612 * can run into issues with relocate dropping the implicit ref
2613 * and then it being added back again before the drop can
2614 * finish. If we merged anything we need to re-loop so we can
2616 * Or we can get node references of the same type that weren't
2617 * merged when created due to bumps in the tree mod seq, and
2618 * we need to merge them to prevent adding an inline extent
2619 * backref before dropping it (triggering a BUG_ON at
2620 * insert_inline_extent_backref()).
2622 spin_lock(&locked_ref->lock);
2623 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2625 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2627 if (ret < 0 && ret != -EAGAIN) {
2629 * Error, btrfs_run_delayed_refs_for_head already
2630 * unlocked everything so just bail out
2635 * Success, perform the usual cleanup of a processed
2638 ret = cleanup_ref_head(trans, locked_ref);
2640 /* We dropped our lock, we need to loop. */
2649 * Either success case or btrfs_run_delayed_refs_for_head
2650 * returned -EAGAIN, meaning we need to select another head
2655 } while ((nr != -1 && count < nr) || locked_ref);
2658 * We don't want to include ref heads since we can have empty ref heads
2659 * and those will drastically skew our runtime down since we just do
2660 * accounting, no actual extent tree updates.
2662 if (actual_count > 0) {
2663 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2667 * We weigh the current average higher than our current runtime
2668 * to avoid large swings in the average.
2670 spin_lock(&delayed_refs->lock);
2671 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2672 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
2673 spin_unlock(&delayed_refs->lock);
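/*
 * A sketch of the smoothing above: new_avg = (3 * old_avg + runtime) / 4.
 * With hypothetical numbers, an old average of 1ms and a 5ms run move the
 * average to (3 + 5) / 4 = 2ms, so a single slow batch only nudges the
 * estimate instead of replacing it.
 */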
2678 #ifdef SCRAMBLE_DELAYED_REFS
2680 * Normally delayed refs get processed in ascending bytenr order. This
2681 * correlates in most cases to the order added. To expose dependencies on this
2682 * order, we start to process the tree in the middle instead of the beginning
2684 static u64 find_middle(struct rb_root *root)
2686 struct rb_node *n = root->rb_node;
2687 struct btrfs_delayed_ref_node *entry;
2690 u64 first = 0, last = 0;
2694 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2695 first = entry->bytenr;
2699 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2700 last = entry->bytenr;
2705 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2706 WARN_ON(!entry->in_tree);
2708 middle = entry->bytenr;
2721 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2725 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2726 sizeof(struct btrfs_extent_inline_ref));
2727 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2728 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2731 * We don't ever fill up leaves all the way so multiply by 2 just to be
2732 * closer to what we're really going to want to use.
2734 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2738 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2739 * would require to store the csums for that many bytes.
2741 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2744 u64 num_csums_per_leaf;
2747 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2748 num_csums_per_leaf = div64_u64(csum_size,
2749 (u64)btrfs_super_csum_size(fs_info->super_copy));
2750 num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2751 num_csums += num_csums_per_leaf - 1;
2752 num_csums = div64_u64(num_csums, num_csums_per_leaf);
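/*
 * Rough example, assuming a hypothetical 16K nodesize and 4 byte crc32c
 * csums: one leaf-sized item holds roughly 4000 csums, and 1GiB of data
 * at a 4K sectorsize needs 256K csums, so the result is on the order of
 * 65 leaves.
 */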
2756 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2758 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2759 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2763 spin_lock(&global_rsv->lock);
2764 reserved = global_rsv->reserved;
2765 spin_unlock(&global_rsv->lock);
2768 * Since the global reserve is just kind of magic we don't really want
2769 * to rely on it to save our bacon, so if our size is more than the
2770 * delayed_refs_rsv and the global rsv then it's time to think about
2773 spin_lock(&delayed_refs_rsv->lock);
2774 reserved += delayed_refs_rsv->reserved;
2775 if (delayed_refs_rsv->size >= reserved)
2777 spin_unlock(&delayed_refs_rsv->lock);
2781 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2784 atomic_read(&trans->transaction->delayed_refs.num_entries);
2789 avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2790 val = num_entries * avg_runtime;
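/*
 * val is the projected time (in ns) to flush everything currently queued
 * at the measured average rate. For example, 100000 queued entries at an
 * average of 10us each project to a full second, crossing the one second
 * threshold checked below.
 */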
2791 if (val >= NSEC_PER_SEC)
2793 if (val >= NSEC_PER_SEC / 2)
2796 return btrfs_check_space_for_delayed_refs(trans->fs_info);
2800 * this starts processing the delayed reference count updates and
2801 * extent insertions we have queued up so far. count can be
2802 * 0, which means to process everything in the tree at the start
2803 * of the run (but not newly added entries), or it can be some target
2804 * number you'd like to process.
2806 * Returns 0 on success or if called with an aborted transaction
2807 * Returns <0 on error and aborts the transaction
2809 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2810 unsigned long count)
2812 struct btrfs_fs_info *fs_info = trans->fs_info;
2813 struct rb_node *node;
2814 struct btrfs_delayed_ref_root *delayed_refs;
2815 struct btrfs_delayed_ref_head *head;
2817 int run_all = count == (unsigned long)-1;
2819 /* We'll clean this up in btrfs_cleanup_transaction */
2823 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2826 delayed_refs = &trans->transaction->delayed_refs;
2828 count = atomic_read(&delayed_refs->num_entries) * 2;
2831 #ifdef SCRAMBLE_DELAYED_REFS
2832 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2834 ret = __btrfs_run_delayed_refs(trans, count);
2836 btrfs_abort_transaction(trans, ret);
2841 btrfs_create_pending_block_groups(trans);
2843 spin_lock(&delayed_refs->lock);
2844 node = rb_first_cached(&delayed_refs->href_root);
2846 spin_unlock(&delayed_refs->lock);
2849 head = rb_entry(node, struct btrfs_delayed_ref_head,
2851 refcount_inc(&head->refs);
2852 spin_unlock(&delayed_refs->lock);
2854 /* Mutex was contended, block until it's released and retry. */
2855 mutex_lock(&head->mutex);
2856 mutex_unlock(&head->mutex);
2858 btrfs_put_delayed_ref_head(head);
2866 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2867 u64 bytenr, u64 num_bytes, u64 flags,
2868 int level, int is_data)
2870 struct btrfs_delayed_extent_op *extent_op;
2873 extent_op = btrfs_alloc_delayed_extent_op();
2877 extent_op->flags_to_set = flags;
2878 extent_op->update_flags = true;
2879 extent_op->update_key = false;
2880 extent_op->is_data = is_data ? true : false;
2881 extent_op->level = level;
2883 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2885 btrfs_free_delayed_extent_op(extent_op);
2889 static noinline int check_delayed_ref(struct btrfs_root *root,
2890 struct btrfs_path *path,
2891 u64 objectid, u64 offset, u64 bytenr)
2893 struct btrfs_delayed_ref_head *head;
2894 struct btrfs_delayed_ref_node *ref;
2895 struct btrfs_delayed_data_ref *data_ref;
2896 struct btrfs_delayed_ref_root *delayed_refs;
2897 struct btrfs_transaction *cur_trans;
2898 struct rb_node *node;
2901 spin_lock(&root->fs_info->trans_lock);
2902 cur_trans = root->fs_info->running_transaction;
2904 refcount_inc(&cur_trans->use_count);
2905 spin_unlock(&root->fs_info->trans_lock);
2909 delayed_refs = &cur_trans->delayed_refs;
2910 spin_lock(&delayed_refs->lock);
2911 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2913 spin_unlock(&delayed_refs->lock);
2914 btrfs_put_transaction(cur_trans);
2918 if (!mutex_trylock(&head->mutex)) {
2919 refcount_inc(&head->refs);
2920 spin_unlock(&delayed_refs->lock);
2922 btrfs_release_path(path);
2925 * Mutex was contended, block until it's released and let
2928 mutex_lock(&head->mutex);
2929 mutex_unlock(&head->mutex);
2930 btrfs_put_delayed_ref_head(head);
2931 btrfs_put_transaction(cur_trans);
2934 spin_unlock(&delayed_refs->lock);
2936 spin_lock(&head->lock);
2938 * XXX: We should replace this with a proper search function in the
2941 for (node = rb_first_cached(&head->ref_tree); node;
2942 node = rb_next(node)) {
2943 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2944 /* If it's a shared ref we know a cross reference exists */
2945 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2950 data_ref = btrfs_delayed_node_to_data_ref(ref);
2953 * If our ref doesn't match the one we're currently looking at
2954 * then we have a cross reference.
2956 if (data_ref->root != root->root_key.objectid ||
2957 data_ref->objectid != objectid ||
2958 data_ref->offset != offset) {
2963 spin_unlock(&head->lock);
2964 mutex_unlock(&head->mutex);
2965 btrfs_put_transaction(cur_trans);
2969 static noinline int check_committed_ref(struct btrfs_root *root,
2970 struct btrfs_path *path,
2971 u64 objectid, u64 offset, u64 bytenr)
2973 struct btrfs_fs_info *fs_info = root->fs_info;
2974 struct btrfs_root *extent_root = fs_info->extent_root;
2975 struct extent_buffer *leaf;
2976 struct btrfs_extent_data_ref *ref;
2977 struct btrfs_extent_inline_ref *iref;
2978 struct btrfs_extent_item *ei;
2979 struct btrfs_key key;
2984 key.objectid = bytenr;
2985 key.offset = (u64)-1;
2986 key.type = BTRFS_EXTENT_ITEM_KEY;
2988 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2991 BUG_ON(ret == 0); /* Corruption */
2994 if (path->slots[0] == 0)
2998 leaf = path->nodes[0];
2999 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3001 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3005 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3006 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3008 if (item_size != sizeof(*ei) +
3009 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3012 if (btrfs_extent_generation(leaf, ei) <=
3013 btrfs_root_last_snapshot(&root->root_item))
3016 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3018 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3019 if (type != BTRFS_EXTENT_DATA_REF_KEY)
3022 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3023 if (btrfs_extent_refs(leaf, ei) !=
3024 btrfs_extent_data_ref_count(leaf, ref) ||
3025 btrfs_extent_data_ref_root(leaf, ref) !=
3026 root->root_key.objectid ||
3027 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3028 btrfs_extent_data_ref_offset(leaf, ref) != offset)
3036 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3039 struct btrfs_path *path;
3042 path = btrfs_alloc_path();
3047 ret = check_committed_ref(root, path, objectid,
3049 if (ret && ret != -ENOENT)
3052 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3053 } while (ret == -EAGAIN);
3056 btrfs_free_path(path);
3057 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3062 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3063 struct btrfs_root *root,
3064 struct extent_buffer *buf,
3065 int full_backref, int inc)
3067 struct btrfs_fs_info *fs_info = root->fs_info;
3073 struct btrfs_key key;
3074 struct btrfs_file_extent_item *fi;
3075 struct btrfs_ref generic_ref = { 0 };
3076 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
3082 if (btrfs_is_testing(fs_info))
3085 ref_root = btrfs_header_owner(buf);
3086 nritems = btrfs_header_nritems(buf);
3087 level = btrfs_header_level(buf);
3089 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3093 parent = buf->start;
3097 action = BTRFS_ADD_DELAYED_REF;
3099 action = BTRFS_DROP_DELAYED_REF;
3101 for (i = 0; i < nritems; i++) {
3103 btrfs_item_key_to_cpu(buf, &key, i);
3104 if (key.type != BTRFS_EXTENT_DATA_KEY)
3106 fi = btrfs_item_ptr(buf, i,
3107 struct btrfs_file_extent_item);
3108 if (btrfs_file_extent_type(buf, fi) ==
3109 BTRFS_FILE_EXTENT_INLINE)
3111 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3115 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3116 key.offset -= btrfs_file_extent_offset(buf, fi);
3117 btrfs_init_generic_ref(&generic_ref, action, bytenr,
3119 generic_ref.real_root = root->root_key.objectid;
3120 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
3122 generic_ref.skip_qgroup = for_reloc;
3124 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3126 ret = btrfs_free_extent(trans, &generic_ref);
3130 bytenr = btrfs_node_blockptr(buf, i);
3131 num_bytes = fs_info->nodesize;
3132 btrfs_init_generic_ref(&generic_ref, action, bytenr,
3134 generic_ref.real_root = root->root_key.objectid;
3135 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
3136 generic_ref.skip_qgroup = for_reloc;
3138 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3140 ret = btrfs_free_extent(trans, &generic_ref);
3150 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3151 struct extent_buffer *buf, int full_backref)
3153 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3156 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3157 struct extent_buffer *buf, int full_backref)
3159 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3162 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3163 struct btrfs_path *path,
3164 struct btrfs_block_group_cache *cache)
3166 struct btrfs_fs_info *fs_info = trans->fs_info;
3168 struct btrfs_root *extent_root = fs_info->extent_root;
3170 struct extent_buffer *leaf;
3172 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3179 leaf = path->nodes[0];
3180 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3181 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3182 btrfs_mark_buffer_dirty(leaf);
3184 btrfs_release_path(path);
3189 static struct btrfs_block_group_cache *next_block_group(
3190 struct btrfs_block_group_cache *cache)
3192 struct btrfs_fs_info *fs_info = cache->fs_info;
3193 struct rb_node *node;
3195 spin_lock(&fs_info->block_group_cache_lock);
3197 /* If our block group was removed, we need a full search. */
3198 if (RB_EMPTY_NODE(&cache->cache_node)) {
3199 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3201 spin_unlock(&fs_info->block_group_cache_lock);
3202 btrfs_put_block_group(cache);
3203 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
return cache;
3205 node = rb_next(&cache->cache_node);
3206 btrfs_put_block_group(cache);
3208 cache = rb_entry(node, struct btrfs_block_group_cache,
3210 btrfs_get_block_group(cache);
3213 spin_unlock(&fs_info->block_group_cache_lock);
3217 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3218 struct btrfs_trans_handle *trans,
3219 struct btrfs_path *path)
3221 struct btrfs_fs_info *fs_info = block_group->fs_info;
3222 struct btrfs_root *root = fs_info->tree_root;
3223 struct inode *inode = NULL;
3224 struct extent_changeset *data_reserved = NULL;
3226 int dcs = BTRFS_DC_ERROR;
3232 * If this block group is smaller than 100 megs don't bother caching the
3235 if (block_group->key.offset < (100 * SZ_1M)) {
3236 spin_lock(&block_group->lock);
3237 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3238 spin_unlock(&block_group->lock);
3245 inode = lookup_free_space_inode(block_group, path);
3246 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3247 ret = PTR_ERR(inode);
3248 btrfs_release_path(path);
3252 if (IS_ERR(inode)) {
3256 if (block_group->ro)
3259 ret = create_free_space_inode(trans, block_group, path);
3266 * We want to set the generation to 0, so that if anything goes wrong
3267 * from here on out we know not to trust this cache when we load up next
3270 BTRFS_I(inode)->generation = 0;
3271 ret = btrfs_update_inode(trans, root, inode);
3274 * So theoretically we could recover from this, simply set the
3275 * super cache generation to 0 so we know to invalidate the
3276 * cache, but then we'd have to keep track of the block groups
3277 * that fail this way so we know we _have_ to reset this cache
3278 * before the next commit or risk reading stale cache. So to
3279 * limit our exposure to horrible edge cases, let's just abort the
3280 * transaction; this only happens in really bad situations
3283 btrfs_abort_transaction(trans, ret);
3288 /* We've already setup this transaction, go ahead and exit */
3289 if (block_group->cache_generation == trans->transid &&
3290 i_size_read(inode)) {
3291 dcs = BTRFS_DC_SETUP;
3295 if (i_size_read(inode) > 0) {
3296 ret = btrfs_check_trunc_cache_free_space(fs_info,
3297 &fs_info->global_block_rsv);
3301 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3306 spin_lock(&block_group->lock);
3307 if (block_group->cached != BTRFS_CACHE_FINISHED ||
3308 !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3310 * don't bother trying to write stuff out _if_
3311 * a) we're not cached,
3312 * b) we're mounted with the nospace_cache option,
3313 * c) we're using the v2 space_cache (FREE_SPACE_TREE).
3315 dcs = BTRFS_DC_WRITTEN;
3316 spin_unlock(&block_group->lock);
3319 spin_unlock(&block_group->lock);
3322 * We hit an ENOSPC when setting up the cache in this transaction, so just
3323 * skip doing the setup; we've already cleared the cache so we're safe.
3325 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3331 * Try to preallocate enough space based on how big the block group is.
3332 * Keep in mind this has to include any pinned space which could end up
3333 * taking up quite a bit since it's not folded into the other space
3336 num_pages = div_u64(block_group->key.offset, SZ_256M);
3341 num_pages *= PAGE_SIZE;
3343 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3347 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3348 num_pages, num_pages,
3351 * Our cache requires contiguous chunks so that we don't modify a bunch
3352 * of metadata or split extents when writing the cache out, which means
3353 * we can hit ENOSPC if we are heavily fragmented in addition to just normal
3354 * out of space conditions. So if we hit this, just skip setting up any
3355 * other block groups for this transaction; maybe we'll unpin enough
3356 * space the next time around.
3359 dcs = BTRFS_DC_SETUP;
3360 else if (ret == -ENOSPC)
3361 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3366 btrfs_release_path(path);
3368 spin_lock(&block_group->lock);
3369 if (!ret && dcs == BTRFS_DC_SETUP)
3370 block_group->cache_generation = trans->transid;
3371 block_group->disk_cache_state = dcs;
3372 spin_unlock(&block_group->lock);
3374 extent_changeset_free(data_reserved);
3378 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3380 struct btrfs_fs_info *fs_info = trans->fs_info;
3381 struct btrfs_block_group_cache *cache, *tmp;
3382 struct btrfs_transaction *cur_trans = trans->transaction;
3383 struct btrfs_path *path;
3385 if (list_empty(&cur_trans->dirty_bgs) ||
3386 !btrfs_test_opt(fs_info, SPACE_CACHE))
3389 path = btrfs_alloc_path();
3393 /* Could add new block groups, use _safe just in case */
3394 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3396 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3397 cache_save_setup(cache, trans, path);
3400 btrfs_free_path(path);
3405 * transaction commit does final block group cache writeback during a
3406 * critical section where nothing is allowed to change the FS. This is
3407 * required in order for the cache to actually match the block group,
3408 * but can introduce a lot of latency into the commit.
3410 * So, btrfs_start_dirty_block_groups is here to kick off block group
3411 * cache IO. There's a chance we'll have to redo some of it if the
3412 * block group changes again during the commit, but it greatly reduces
3413 * the commit latency by getting rid of the easy block groups while
3414 * we're still allowing others to join the commit.
3416 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3418 struct btrfs_fs_info *fs_info = trans->fs_info;
3419 struct btrfs_block_group_cache *cache;
3420 struct btrfs_transaction *cur_trans = trans->transaction;
3423 struct btrfs_path *path = NULL;
3425 struct list_head *io = &cur_trans->io_bgs;
3426 int num_started = 0;
3429 spin_lock(&cur_trans->dirty_bgs_lock);
3430 if (list_empty(&cur_trans->dirty_bgs)) {
3431 spin_unlock(&cur_trans->dirty_bgs_lock);
3434 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3435 spin_unlock(&cur_trans->dirty_bgs_lock);
3439 * make sure all the block groups on our dirty list actually
3442 btrfs_create_pending_block_groups(trans);
3445 path = btrfs_alloc_path();
3451 * cache_write_mutex is here only to save us from balance or automatic
3452 * removal of empty block groups deleting this block group while we are
3453 * writing out the cache
3455 mutex_lock(&trans->transaction->cache_write_mutex);
3456 while (!list_empty(&dirty)) {
3457 bool drop_reserve = true;
3459 cache = list_first_entry(&dirty,
3460 struct btrfs_block_group_cache,
3463 * this can happen if something re-dirties a block
3464 * group that is already under IO. Just wait for it to
3465 * finish and then do it all again
3467 if (!list_empty(&cache->io_list)) {
3468 list_del_init(&cache->io_list);
3469 btrfs_wait_cache_io(trans, cache, path);
3470 btrfs_put_block_group(cache);
3475 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3476 * if it should update the cache_state. Don't delete
3477 * until after we wait.
3479 * Since we're not running in the commit critical section
3480 * we need the dirty_bgs_lock to protect from update_block_group
3482 spin_lock(&cur_trans->dirty_bgs_lock);
3483 list_del_init(&cache->dirty_list);
3484 spin_unlock(&cur_trans->dirty_bgs_lock);
3488 cache_save_setup(cache, trans, path);
3490 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3491 cache->io_ctl.inode = NULL;
3492 ret = btrfs_write_out_cache(trans, cache, path);
3493 if (ret == 0 && cache->io_ctl.inode) {
3498 * The cache_write_mutex is protecting the
3499 * io_list, also refer to the definition of
3500 * btrfs_transaction::io_bgs for more details
3502 list_add_tail(&cache->io_list, io);
3505 * if we failed to write the cache, the
3506 * generation will be bad and life goes on
3512 ret = write_one_cache_group(trans, path, cache);
3514 * Our block group might still be attached to the list
3515 * of new block groups in the transaction handle of some
3516 * other task (struct btrfs_trans_handle->new_bgs). This
3517 * means its block group item isn't yet in the extent
3518 * tree. If this happens ignore the error, as we will
3519 * try again later in the critical section of the
3520 * transaction commit.
3522 if (ret == -ENOENT) {
3524 spin_lock(&cur_trans->dirty_bgs_lock);
3525 if (list_empty(&cache->dirty_list)) {
3526 list_add_tail(&cache->dirty_list,
3527 &cur_trans->dirty_bgs);
3528 btrfs_get_block_group(cache);
3529 drop_reserve = false;
3531 spin_unlock(&cur_trans->dirty_bgs_lock);
3533 btrfs_abort_transaction(trans, ret);
3537 /* if it's not on the io list, we need to put the block group */
3539 btrfs_put_block_group(cache);
3541 btrfs_delayed_refs_rsv_release(fs_info, 1);
3547 * Avoid blocking other tasks for too long. It might even save
3548 * us from writing caches for block groups that are going to be
3551 mutex_unlock(&trans->transaction->cache_write_mutex);
3552 mutex_lock(&trans->transaction->cache_write_mutex);
3554 mutex_unlock(&trans->transaction->cache_write_mutex);
3557 * go through delayed refs for all the stuff we've just kicked off
3558 * and then loop back (just once)
3560 ret = btrfs_run_delayed_refs(trans, 0);
3561 if (!ret && loops == 0) {
3563 spin_lock(&cur_trans->dirty_bgs_lock);
3564 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3566 * dirty_bgs_lock protects us from concurrent block group
3567 * deletes too (not just cache_write_mutex).
3569 if (!list_empty(&dirty)) {
3570 spin_unlock(&cur_trans->dirty_bgs_lock);
3573 spin_unlock(&cur_trans->dirty_bgs_lock);
3574 } else if (ret < 0) {
3575 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3578 btrfs_free_path(path);
3582 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3584 struct btrfs_fs_info *fs_info = trans->fs_info;
3585 struct btrfs_block_group_cache *cache;
3586 struct btrfs_transaction *cur_trans = trans->transaction;
3589 struct btrfs_path *path;
3590 struct list_head *io = &cur_trans->io_bgs;
3591 int num_started = 0;
3593 path = btrfs_alloc_path();
3598 * Even though we are in the critical section of the transaction commit,
3599 * we can still have concurrent tasks adding elements to this
3600 * transaction's list of dirty block groups. These tasks correspond to
3601 * endio free space workers started when writeback finishes for a
3602 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3603 * allocate new block groups as a result of COWing nodes of the root
3604 * tree when updating the free space inode. The writeback for the space
3605 * caches is triggered by an earlier call to
3606 * btrfs_start_dirty_block_groups() and iterations of the following
3608 * Also we want to do the cache_save_setup first and then run the
3609 * delayed refs to make sure we have the best chance at doing this all
3612 spin_lock(&cur_trans->dirty_bgs_lock);
3613 while (!list_empty(&cur_trans->dirty_bgs)) {
3614 cache = list_first_entry(&cur_trans->dirty_bgs,
3615 struct btrfs_block_group_cache,
3619 * this can happen if cache_save_setup re-dirties a block
3620 * group that is already under IO. Just wait for it to
3621 * finish and then do it all again
3623 if (!list_empty(&cache->io_list)) {
3624 spin_unlock(&cur_trans->dirty_bgs_lock);
3625 list_del_init(&cache->io_list);
3626 btrfs_wait_cache_io(trans, cache, path);
3627 btrfs_put_block_group(cache);
3628 spin_lock(&cur_trans->dirty_bgs_lock);
3632 * don't remove from the dirty list until after we've waited
3635 list_del_init(&cache->dirty_list);
3636 spin_unlock(&cur_trans->dirty_bgs_lock);
3639 cache_save_setup(cache, trans, path);
3642 ret = btrfs_run_delayed_refs(trans,
3643 (unsigned long) -1);
3645 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3646 cache->io_ctl.inode = NULL;
3647 ret = btrfs_write_out_cache(trans, cache, path);
3648 if (ret == 0 && cache->io_ctl.inode) {
3651 list_add_tail(&cache->io_list, io);
3654 * if we failed to write the cache, the
3655 * generation will be bad and life goes on
3661 ret = write_one_cache_group(trans, path, cache);
3663 * One of the free space endio workers might have
3664 * created a new block group while updating a free space
3665 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3666 * and hasn't released its transaction handle yet, in
3667 * which case the new block group is still attached to
3668 * its transaction handle and its creation has not
3669 * finished yet (no block group item in the extent tree
3670 * yet, etc). If this is the case, wait for all free
3671 * space endio workers to finish and retry. This is
3672 * a very rare case so no need for a more efficient and
3675 if (ret == -ENOENT) {
3676 wait_event(cur_trans->writer_wait,
3677 atomic_read(&cur_trans->num_writers) == 1);
3678 ret = write_one_cache_group(trans, path, cache);
3681 btrfs_abort_transaction(trans, ret);
3684 /* if it's not on the io list, we need to put the block group */
3686 btrfs_put_block_group(cache);
3687 btrfs_delayed_refs_rsv_release(fs_info, 1);
3688 spin_lock(&cur_trans->dirty_bgs_lock);
3690 spin_unlock(&cur_trans->dirty_bgs_lock);
3693 * Refer to the definition of the io_bgs member for details on why it's safe
3694 * to use it without any locking
3696 while (!list_empty(io)) {
3697 cache = list_first_entry(io, struct btrfs_block_group_cache,
3699 list_del_init(&cache->io_list);
3700 btrfs_wait_cache_io(trans, cache, path);
3701 btrfs_put_block_group(cache);
3704 btrfs_free_path(path);
3708 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3710 struct btrfs_block_group_cache *block_group;
3713 block_group = btrfs_lookup_block_group(fs_info, bytenr);
3714 if (!block_group || block_group->ro)
3717 btrfs_put_block_group(block_group);
3721 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3723 struct btrfs_block_group_cache *bg;
3726 bg = btrfs_lookup_block_group(fs_info, bytenr);
3730 spin_lock(&bg->lock);
3734 atomic_inc(&bg->nocow_writers);
3735 spin_unlock(&bg->lock);
3737 /* no put on block group, done by btrfs_dec_nocow_writers */
3739 btrfs_put_block_group(bg);
3745 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3747 struct btrfs_block_group_cache *bg;
3749 bg = btrfs_lookup_block_group(fs_info, bytenr);
3751 if (atomic_dec_and_test(&bg->nocow_writers))
3752 wake_up_var(&bg->nocow_writers);
3754 * Once for our lookup and once for the lookup done by a previous call
3755 * to btrfs_inc_nocow_writers()
3757 btrfs_put_block_group(bg);
3758 btrfs_put_block_group(bg);
3761 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3763 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3766 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3768 u64 extra_flags = chunk_to_extended(flags) &
3769 BTRFS_EXTENDED_PROFILE_MASK;
3771 write_seqlock(&fs_info->profiles_lock);
3772 if (flags & BTRFS_BLOCK_GROUP_DATA)
3773 fs_info->avail_data_alloc_bits |= extra_flags;
3774 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3775 fs_info->avail_metadata_alloc_bits |= extra_flags;
3776 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3777 fs_info->avail_system_alloc_bits |= extra_flags;
3778 write_sequnlock(&fs_info->profiles_lock);
3782 * returns target flags in extended format or 0 if restripe for this
3783 * chunk_type is not in progress
3785 * should be called with balance_lock held
3787 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3789 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3795 if (flags & BTRFS_BLOCK_GROUP_DATA &&
3796 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3797 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3798 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3799 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3800 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3801 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3802 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3803 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
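/*
 * For example, while a balance is converting data chunks to RAID1, a query
 * for data flags yields BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1
 * (in extended format), so callers can aim new allocations at the
 * conversion target.
 */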
3810 * @flags: available profiles in extended format (see ctree.h)
3812 * Returns reduced profile in chunk format. If profile changing is in
3813 * progress (either running or paused) picks the target profile (if it's
3814 * already available), otherwise falls back to plain reducing.
3816 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
3818 u64 num_devices = fs_info->fs_devices->rw_devices;
3824 * see if restripe for this chunk_type is in progress, if so
3825 * try to reduce to the target profile
3827 spin_lock(&fs_info->balance_lock);
3828 target = get_restripe_target(fs_info, flags);
3830 /* pick target profile only if it's already available */
3831 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3832 spin_unlock(&fs_info->balance_lock);
3833 return extended_to_chunk(target);
3836 spin_unlock(&fs_info->balance_lock);
3838 /* First, mask out the RAID levels which aren't possible */
3839 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3840 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
3841 allowed |= btrfs_raid_array[raid_type].bg_flag;
3845 if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3846 allowed = BTRFS_BLOCK_GROUP_RAID6;
3847 else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3848 allowed = BTRFS_BLOCK_GROUP_RAID5;
3849 else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3850 allowed = BTRFS_BLOCK_GROUP_RAID10;
3851 else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3852 allowed = BTRFS_BLOCK_GROUP_RAID1;
3853 else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3854 allowed = BTRFS_BLOCK_GROUP_RAID0;
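/*
 * The chain above collapses the remaining candidates to a single profile,
 * preferring the "highest" one still allowed: RAID6, then RAID5, RAID10,
 * RAID1 and finally RAID0. For example, an allowed mask containing both
 * RAID1 and RAID0 reduces to RAID1.
 */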
3856 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3858 return extended_to_chunk(flags | allowed);
3861 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
3868 seq = read_seqbegin(&fs_info->profiles_lock);
3870 if (flags & BTRFS_BLOCK_GROUP_DATA)
3871 flags |= fs_info->avail_data_alloc_bits;
3872 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3873 flags |= fs_info->avail_system_alloc_bits;
3874 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3875 flags |= fs_info->avail_metadata_alloc_bits;
3876 } while (read_seqretry(&fs_info->profiles_lock, seq));
3878 return btrfs_reduce_alloc_profile(fs_info, flags);
3881 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
3883 struct btrfs_fs_info *fs_info = root->fs_info;
3888 flags = BTRFS_BLOCK_GROUP_DATA;
3889 else if (root == fs_info->chunk_root)
3890 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3892 flags = BTRFS_BLOCK_GROUP_METADATA;
3894 ret = get_alloc_profile(fs_info, flags);
3898 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
3900 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
3903 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
3905 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3908 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
3910 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3913 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
3915 struct btrfs_root *root = inode->root;
3916 struct btrfs_fs_info *fs_info = root->fs_info;
3917 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
3920 int need_commit = 2;
3921 int have_pinned_space;
3923 /* make sure bytes are sectorsize aligned */
3924 bytes = ALIGN(bytes, fs_info->sectorsize);
3926 if (btrfs_is_free_space_inode(inode)) {
3928 ASSERT(current->journal_info);
3932 /* make sure we have enough space to handle the data first */
3933 spin_lock(&data_sinfo->lock);
3934 used = btrfs_space_info_used(data_sinfo, true);
3936 if (used + bytes > data_sinfo->total_bytes) {
3937 struct btrfs_trans_handle *trans;
3940 * if we don't have enough free bytes in this space then we need
3941 * to alloc a new chunk.
3943 if (!data_sinfo->full) {
3946 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3947 spin_unlock(&data_sinfo->lock);
3949 alloc_target = btrfs_data_alloc_profile(fs_info);
3951 * It is ugly that we don't call nolock join
3952 * transaction for the free space inode case here.
3953 * But it is safe because we only do the data space
3954 * reservation for the free space cache in the
3955 * transaction context; the common join transaction
3956 * just increases the counter of the current transaction
3957 * handle and doesn't try to acquire the trans_lock of
3960 trans = btrfs_join_transaction(root);
3962 return PTR_ERR(trans);
3964 ret = btrfs_chunk_alloc(trans, alloc_target,
3965 CHUNK_ALLOC_NO_FORCE);
3966 btrfs_end_transaction(trans);
3971 have_pinned_space = 1;
3980 * If we don't have enough pinned space to deal with this
3981 * allocation, and no chunk was removed in the current transaction,
3982 * don't bother committing the transaction.
3984 have_pinned_space = __percpu_counter_compare(
3985 &data_sinfo->total_bytes_pinned,
3986 used + bytes - data_sinfo->total_bytes,
3987 BTRFS_TOTAL_BYTES_PINNED_BATCH);
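/*
 * Example of the check above, with hypothetical numbers: 10GiB of data
 * space, 9.5GiB used and a 1GiB request leave a 512MiB shortfall;
 * committing is only worthwhile if at least that much is currently pinned
 * (or a chunk was removed in this transaction), since that is what a
 * commit can actually free.
 */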
3988 spin_unlock(&data_sinfo->lock);
3990 /* commit the current transaction and try again */
3995 if (need_commit > 0) {
3996 btrfs_start_delalloc_roots(fs_info, -1);
3997 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4001 trans = btrfs_join_transaction(root);
4003 return PTR_ERR(trans);
4004 if (have_pinned_space >= 0 ||
4005 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4006 &trans->transaction->flags) ||
4008 ret = btrfs_commit_transaction(trans);
4012 * The cleaner kthread might still be doing iput
4013 * operations. Wait for it to finish so that
4014 * more space is released. We don't need to
4015 * explicitly run the delayed iputs here because
4016 * the commit_transaction would have woken up
4019 ret = btrfs_wait_on_delayed_iputs(fs_info);
4024 btrfs_end_transaction(trans);
4028 trace_btrfs_space_reservation(fs_info,
4029 "space_info:enospc",
4030 data_sinfo->flags, bytes, 1);
4033 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
4034 trace_btrfs_space_reservation(fs_info, "space_info",
4035 data_sinfo->flags, bytes, 1);
4036 spin_unlock(&data_sinfo->lock);
4041 int btrfs_check_data_free_space(struct inode *inode,
4042 struct extent_changeset **reserved, u64 start, u64 len)
4044 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4047 /* align the range */
4048 len = round_up(start + len, fs_info->sectorsize) -
4049 round_down(start, fs_info->sectorsize);
4050 start = round_down(start, fs_info->sectorsize);
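/*
 * e.g. with a 4K sectorsize, a request for 3000 bytes at offset 1000
 * becomes start=0, len=4096, so the reservation always covers whole
 * sectors.
 */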
4052 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4056 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4057 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4059 btrfs_free_reserved_data_space_noquota(inode, start, len);
4066 * Called if we need to clear a data reservation for this inode
4067 * Normally in an error case.
4069 * This one will *NOT* use the accurate qgroup reserved space API, just for the
4070 * case where we can't sleep and are sure it won't affect qgroup reserved space.
4071 * Like clear_bit_hook().
4073 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4076 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4077 struct btrfs_space_info *data_sinfo;
4079 /* Make sure the range is aligned to sectorsize */
4080 len = round_up(start + len, fs_info->sectorsize) -
4081 round_down(start, fs_info->sectorsize);
4082 start = round_down(start, fs_info->sectorsize);
4084 data_sinfo = fs_info->data_sinfo;
4085 spin_lock(&data_sinfo->lock);
4086 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
4087 trace_btrfs_space_reservation(fs_info, "space_info",
4088 data_sinfo->flags, len, 0);
4089 spin_unlock(&data_sinfo->lock);
4093 * Called if we need to clear a data reservation for this inode
4094 * Normally in an error case.
4096 * This one will handle the per-inode data rsv map for accurate reserved
4099 void btrfs_free_reserved_data_space(struct inode *inode,
4100 struct extent_changeset *reserved, u64 start, u64 len)
4102 struct btrfs_root *root = BTRFS_I(inode)->root;
4104 /* Make sure the range is aligned to sectorsize */
4105 len = round_up(start + len, root->fs_info->sectorsize) -
4106 round_down(start, root->fs_info->sectorsize);
4107 start = round_down(start, root->fs_info->sectorsize);
4109 btrfs_free_reserved_data_space_noquota(inode, start, len);
4110 btrfs_qgroup_free_data(inode, reserved, start, len);
4113 static void force_metadata_allocation(struct btrfs_fs_info *info)
4115 struct list_head *head = &info->space_info;
4116 struct btrfs_space_info *found;
4119 list_for_each_entry_rcu(found, head, list) {
4120 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4121 found->force_alloc = CHUNK_ALLOC_FORCE;
4126 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4127 struct btrfs_space_info *sinfo, int force)
4129 u64 bytes_used = btrfs_space_info_used(sinfo, false);
4132 if (force == CHUNK_ALLOC_FORCE)
4136 * in limited mode, we want to have some free space up to
4137 * about 1% of the FS size.
4139 if (force == CHUNK_ALLOC_LIMITED) {
4140 thresh = btrfs_super_total_bytes(fs_info->super_copy);
4141 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
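/*
 * i.e. keep about 1% of the filesystem size (but at least 64M) free in
 * this space info while in limited mode; on a 1TiB filesystem that is on
 * the order of 10GiB of slack.
 */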
4143 if (sinfo->total_bytes - bytes_used < thresh)
4147 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4152 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4156 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4158 num_dev = fs_info->fs_devices->rw_devices;
4164 * Reserve space in the system space info necessary for allocating or
4165 * removing a chunk (device item updates plus the chunk item itself).
4168 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4170 struct btrfs_fs_info *fs_info = trans->fs_info;
4171 struct btrfs_space_info *info;
4178 * Needed because we can end up allocating a system chunk and need an
4179 * atomic, race-free space reservation in the chunk block reserve.
4181 lockdep_assert_held(&fs_info->chunk_mutex);
4183 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4184 spin_lock(&info->lock);
4185 left = info->total_bytes - btrfs_space_info_used(info, true);
4186 spin_unlock(&info->lock);
4188 num_devs = get_profile_num_devs(fs_info, type);
4190 /* num_devs device items to update and 1 chunk item to add or remove */
4191 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4192 btrfs_calc_trans_metadata_size(fs_info, 1);
4194 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4195 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4196 left, thresh, type);
4197 btrfs_dump_space_info(fs_info, info, 0, 0);
4200 if (left < thresh) {
4201 u64 flags = btrfs_system_alloc_profile(fs_info);
4204 * Ignore failure to create system chunk. We might end up not
4205 * needing it, as we might not need to COW all nodes/leafs from
4206 * the paths we visit in the chunk tree (they were already COWed
4207 * or created in the current transaction for example).
4209 ret = btrfs_alloc_chunk(trans, flags);
4213 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4214 &fs_info->chunk_block_rsv,
4215 thresh, BTRFS_RESERVE_NO_FLUSH);
4217 trans->chunk_bytes_reserved += thresh;
4222 * If force is CHUNK_ALLOC_FORCE:
4223 * - return 1 if it successfully allocates a chunk,
4224 * - return errors including -ENOSPC otherwise.
4225 * If force is NOT CHUNK_ALLOC_FORCE:
4226 * - return 0 if it doesn't need to allocate a new chunk,
4227 * - return 1 if it successfully allocates a chunk,
4228 * - return errors including -ENOSPC otherwise.
4230 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4231 enum btrfs_chunk_alloc_enum force)
4233 struct btrfs_fs_info *fs_info = trans->fs_info;
4234 struct btrfs_space_info *space_info;
4235 bool wait_for_alloc = false;
4236 bool should_alloc = false;
4239 /* Don't re-enter if we're already allocating a chunk */
4240 if (trans->allocating_chunk)
4243 space_info = btrfs_find_space_info(fs_info, flags);
4247 spin_lock(&space_info->lock);
4248 if (force < space_info->force_alloc)
4249 force = space_info->force_alloc;
4250 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4251 if (space_info->full) {
4252 /* No more free physical space */
4257 spin_unlock(&space_info->lock);
4259 } else if (!should_alloc) {
4260 spin_unlock(&space_info->lock);
4262 } else if (space_info->chunk_alloc) {
4264 * Someone is already allocating, so we need to block
4265 * until this someone is finished and then loop to
4266 * recheck if we should continue with our allocation
4269 wait_for_alloc = true;
4270 spin_unlock(&space_info->lock);
4271 mutex_lock(&fs_info->chunk_mutex);
4272 mutex_unlock(&fs_info->chunk_mutex);
4274 /* Proceed with allocation */
4275 space_info->chunk_alloc = 1;
4276 wait_for_alloc = false;
4277 spin_unlock(&space_info->lock);
4281 } while (wait_for_alloc);
4283 mutex_lock(&fs_info->chunk_mutex);
4284 trans->allocating_chunk = true;
4287 * If we have mixed data/metadata chunks we want to make sure we keep
4288 * allocating mixed chunks instead of individual chunks.
4290 if (btrfs_mixed_space_info(space_info))
4291 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4294 * if we're doing a data chunk, go ahead and make sure that
4295 * we keep a reasonable number of metadata chunks allocated in the
4298 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4299 fs_info->data_chunk_allocations++;
4300 if (!(fs_info->data_chunk_allocations %
4301 fs_info->metadata_ratio))
4302 force_metadata_allocation(fs_info);
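/*
 * For example, with metadata_ratio=8 every 8th data chunk allocation also
 * forces a metadata chunk allocation, keeping metadata chunks roughly in
 * that proportion to data chunks.
 */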
4306 * Check if we have enough space in SYSTEM chunk because we may need
4307 * to update devices.
4309 check_system_chunk(trans, flags);
4311 ret = btrfs_alloc_chunk(trans, flags);
4312 trans->allocating_chunk = false;
4314 spin_lock(&space_info->lock);
4317 space_info->full = 1;
4322 space_info->max_extent_size = 0;
4325 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4327 space_info->chunk_alloc = 0;
4328 spin_unlock(&space_info->lock);
4329 mutex_unlock(&fs_info->chunk_mutex);
4331 * When we allocate a new chunk we reserve space in the chunk block
4332 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4333 * add new nodes/leafs to it if we end up needing to do it when
4334 * inserting the chunk item and updating device items as part of the
4335 * second phase of chunk allocation, performed by
4336 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4337 * large number of new block groups to create in our transaction
4338 * handle's new_bgs list to avoid exhausting the chunk block reserve
4339 * in extreme cases - like having a single transaction create many new
4340 * block groups when starting to write out the free space caches of all
4341 * the block groups that were made dirty during the lifetime of the
4344 if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4345 btrfs_create_pending_block_groups(trans);
4350 static struct btrfs_block_rsv *get_block_rsv(
4351 const struct btrfs_trans_handle *trans,
4352 const struct btrfs_root *root)
4354 struct btrfs_fs_info *fs_info = root->fs_info;
4355 struct btrfs_block_rsv *block_rsv = NULL;
4357 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4358 (root == fs_info->csum_root && trans->adding_csums) ||
4359 (root == fs_info->uuid_root))
4360 block_rsv = trans->block_rsv;
4363 block_rsv = root->block_rsv;
4366 block_rsv = &fs_info->empty_block_rsv;
4371 int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
4374 spin_lock(&block_rsv->lock);
4375 if (block_rsv->reserved >= num_bytes) {
4376 block_rsv->reserved -= num_bytes;
4377 if (block_rsv->reserved < block_rsv->size)
4378 block_rsv->full = 0;
4381 spin_unlock(&block_rsv->lock);
4385 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4386 u64 num_bytes, bool update_size)
4388 spin_lock(&block_rsv->lock);
4389 block_rsv->reserved += num_bytes;
4391 block_rsv->size += num_bytes;
4392 else if (block_rsv->reserved >= block_rsv->size)
4393 block_rsv->full = 1;
4394 spin_unlock(&block_rsv->lock);
4397 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4398 struct btrfs_block_rsv *dest, u64 num_bytes,
4401 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4404 if (global_rsv->space_info != dest->space_info)
4407 spin_lock(&global_rsv->lock);
4408 min_bytes = div_factor(global_rsv->size, min_factor);
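/*
 * div_factor(size, f) is size * f / 10, so e.g. min_factor=5 refuses the
 * transfer unless the global rsv would still keep at least half of its
 * size reserved afterwards.
 */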
4409 if (global_rsv->reserved < min_bytes + num_bytes) {
4410 spin_unlock(&global_rsv->lock);
4413 global_rsv->reserved -= num_bytes;
4414 if (global_rsv->reserved < global_rsv->size)
4415 global_rsv->full = 0;
4416 spin_unlock(&global_rsv->lock);
4418 block_rsv_add_bytes(dest, num_bytes, true);
4423 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
4424 * @fs_info - the fs info for our fs.
4425 * @src - the source block rsv to transfer from.
4426 * @num_bytes - the number of bytes to transfer.
4428 * This transfers up to the num_bytes amount from the src rsv to the
4429 * delayed_refs_rsv. Any extra bytes are returned to the space info.
4431 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
4432 struct btrfs_block_rsv *src,
4435 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4438 spin_lock(&src->lock);
4439 src->reserved -= num_bytes;
4440 src->size -= num_bytes;
4441 spin_unlock(&src->lock);
4443 spin_lock(&delayed_refs_rsv->lock);
4444 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
4445 u64 delta = delayed_refs_rsv->size -
4446 delayed_refs_rsv->reserved;
4447 if (num_bytes > delta) {
4448 to_free = num_bytes - delta;
4452 to_free = num_bytes;
4457 delayed_refs_rsv->reserved += num_bytes;
4458 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
4459 delayed_refs_rsv->full = 1;
4460 spin_unlock(&delayed_refs_rsv->lock);
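/*
 * Example of the clamping above, with hypothetical numbers: if the rsv is
 * 1MiB short of its size and 1.5MiB were taken from @src, 1MiB tops up
 * the rsv and the remaining 512KiB is handed back to the space info via
 * the to_free path below.
 */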
4463 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
4466 btrfs_space_info_add_old_bytes(fs_info,
4467 delayed_refs_rsv->space_info, to_free);
4471 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
4472 * @fs_info - the fs_info for our fs.
4473 * @flush - control how we can flush for this reservation.
4475 * This will refill the delayed block_rsv up to one item's worth of space and
4476 * will return -ENOSPC if we can't make the reservation.
4478 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
4479 enum btrfs_reserve_flush_enum flush)
4481 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
4482 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
4486 spin_lock(&block_rsv->lock);
4487 if (block_rsv->reserved < block_rsv->size) {
4488 num_bytes = block_rsv->size - block_rsv->reserved;
4489 num_bytes = min(num_bytes, limit);
4491 spin_unlock(&block_rsv->lock);
4496 ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
4500 block_rsv_add_bytes(block_rsv, num_bytes, false);
4501 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
4506 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4507 struct btrfs_block_rsv *block_rsv,
4508 struct btrfs_block_rsv *dest, u64 num_bytes,
4509 u64 *qgroup_to_release_ret)
4511 struct btrfs_space_info *space_info = block_rsv->space_info;
4512 u64 qgroup_to_release = 0;
4515 spin_lock(&block_rsv->lock);
4516 if (num_bytes == (u64)-1) {
4517 num_bytes = block_rsv->size;
4518 qgroup_to_release = block_rsv->qgroup_rsv_size;
4520 block_rsv->size -= num_bytes;
4521 if (block_rsv->reserved >= block_rsv->size) {
4522 num_bytes = block_rsv->reserved - block_rsv->size;
4523 block_rsv->reserved = block_rsv->size;
4524 block_rsv->full = 1;
4528 if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
4529 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
4530 block_rsv->qgroup_rsv_size;
4531 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
4533 qgroup_to_release = 0;
4535 spin_unlock(&block_rsv->lock);
4538 if (num_bytes > 0) {
4540 spin_lock(&dest->lock);
4544 bytes_to_add = dest->size - dest->reserved;
4545 bytes_to_add = min(num_bytes, bytes_to_add);
4546 dest->reserved += bytes_to_add;
4547 if (dest->reserved >= dest->size)
4549 num_bytes -= bytes_to_add;
4551 spin_unlock(&dest->lock);
4554 btrfs_space_info_add_old_bytes(fs_info, space_info,
4557 if (qgroup_to_release_ret)
4558 *qgroup_to_release_ret = qgroup_to_release;
4562 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
4563 struct btrfs_block_rsv *dst, u64 num_bytes,
4568 ret = btrfs_block_rsv_use_bytes(src, num_bytes);
4572 block_rsv_add_bytes(dst, num_bytes, update_size);
4576 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4578 memset(rsv, 0, sizeof(*rsv));
4579 spin_lock_init(&rsv->lock);
4583 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
4584 struct btrfs_block_rsv *rsv,
4585 unsigned short type)
4587 btrfs_init_block_rsv(rsv, type);
4588 rsv->space_info = btrfs_find_space_info(fs_info,
4589 BTRFS_BLOCK_GROUP_METADATA);
4592 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
4593 unsigned short type)
4595 struct btrfs_block_rsv *block_rsv;
4597 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4601 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
4605 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
4606 struct btrfs_block_rsv *rsv)
4610 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
4614 int btrfs_block_rsv_add(struct btrfs_root *root,
4615 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4616 enum btrfs_reserve_flush_enum flush)
4623 ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4625 block_rsv_add_bytes(block_rsv, num_bytes, true);
4630 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
4638 spin_lock(&block_rsv->lock);
4639 num_bytes = div_factor(block_rsv->size, min_factor);
4640 if (block_rsv->reserved >= num_bytes)
4642 spin_unlock(&block_rsv->lock);
4647 int btrfs_block_rsv_refill(struct btrfs_root *root,
4648 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4649 enum btrfs_reserve_flush_enum flush)
4657 spin_lock(&block_rsv->lock);
4658 num_bytes = min_reserved;
4659 if (block_rsv->reserved >= num_bytes)
4662 num_bytes -= block_rsv->reserved;
4663 spin_unlock(&block_rsv->lock);
4668 ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4670 block_rsv_add_bytes(block_rsv, num_bytes, false);
4677 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
4678 struct btrfs_block_rsv *block_rsv,
4679 u64 num_bytes, u64 *qgroup_to_release)
4681 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4682 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
4683 struct btrfs_block_rsv *target = delayed_rsv;
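/*
 * Prefer topping up the delayed refs rsv with the released bytes; fall
 * back to the global rsv if it is already full or is the rsv being
 * released, and only transfer between rsvs backed by the same space info.
 */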
4685 if (target->full || target == block_rsv)
4686 target = global_rsv;
4688 if (block_rsv->space_info != target->space_info)
4691 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
4695 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
4696 struct btrfs_block_rsv *block_rsv,
4699 __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
4703 * btrfs_inode_rsv_release - release any excessive reservation.
4704 * @inode - the inode we need to release from.
4705 * @qgroup_free - free or convert qgroup meta.
4706 * Unlike normal operation, qgroup meta reservation needs to know if we are
4707 * freeing qgroup reservation or just converting it into per-trans. Normally
4708 * @qgroup_free is true for error handling, and false for normal release.
4710 * This is the same as btrfs_block_rsv_release, except that it handles the
4711 * tracepoint for the reservation.
4713 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
4715 struct btrfs_fs_info *fs_info = inode->root->fs_info;
4716 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4718 u64 qgroup_to_release = 0;
4721 * Since we statically set the block_rsv->size we just want to say we
4722 are releasing 0 bytes, and then we'll just get the reservation over the size freed.
4725 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
4726 &qgroup_to_release);
4728 trace_btrfs_space_reservation(fs_info, "delalloc",
4729 btrfs_ino(inode), released, 0);
4731 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
4733 btrfs_qgroup_convert_reserved_meta(inode->root,
4738 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
4739 * @fs_info - the fs_info for our fs.
4740 * @nr - the number of items to drop.
4742 * This drops the delayed ref head's count from the delayed refs rsv and frees
4743 * any excess reservation we had.
4745 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
4747 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
4748 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4749 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
4752 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
4755 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
4759 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4761 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4762 struct btrfs_space_info *sinfo = block_rsv->space_info;
4766 * The global block rsv is based on the size of the extent tree, the
4767 * checksum tree and the root tree. If the fs is empty we want to set
4768 * it to a minimal amount for safety.
4770 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
4771 btrfs_root_used(&fs_info->csum_root->root_item) +
4772 btrfs_root_used(&fs_info->tree_root->root_item);
4773 num_bytes = max_t(u64, num_bytes, SZ_16M);
4775 spin_lock(&sinfo->lock);
4776 spin_lock(&block_rsv->lock);
4778 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
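/*
 * If the reserve is under-filled, pull up to the shortfall from unused
 * space in the metadata space info; if it is over-filled, give the
 * excess back to the space info.
 */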
4780 if (block_rsv->reserved < block_rsv->size) {
4781 num_bytes = btrfs_space_info_used(sinfo, true);
4782 if (sinfo->total_bytes > num_bytes) {
4783 num_bytes = sinfo->total_bytes - num_bytes;
4784 num_bytes = min(num_bytes,
4785 block_rsv->size - block_rsv->reserved);
4786 block_rsv->reserved += num_bytes;
4787 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
4789 trace_btrfs_space_reservation(fs_info, "space_info",
4790 sinfo->flags, num_bytes,
4793 } else if (block_rsv->reserved > block_rsv->size) {
4794 num_bytes = block_rsv->reserved - block_rsv->size;
4795 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
4797 trace_btrfs_space_reservation(fs_info, "space_info",
4798 sinfo->flags, num_bytes, 0);
4799 block_rsv->reserved = block_rsv->size;
4802 if (block_rsv->reserved == block_rsv->size)
4803 block_rsv->full = 1;
4805 block_rsv->full = 0;
4807 spin_unlock(&block_rsv->lock);
4808 spin_unlock(&sinfo->lock);
4811 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4813 struct btrfs_space_info *space_info;
4815 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4816 fs_info->chunk_block_rsv.space_info = space_info;
4818 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4819 fs_info->global_block_rsv.space_info = space_info;
4820 fs_info->trans_block_rsv.space_info = space_info;
4821 fs_info->empty_block_rsv.space_info = space_info;
4822 fs_info->delayed_block_rsv.space_info = space_info;
4823 fs_info->delayed_refs_rsv.space_info = space_info;
4825 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
4826 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
4827 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4828 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4829 if (fs_info->quota_root)
4830 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
4831 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4833 update_global_block_rsv(fs_info);
4836 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4838 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4840 WARN_ON(fs_info->trans_block_rsv.size > 0);
4841 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4842 WARN_ON(fs_info->chunk_block_rsv.size > 0);
4843 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4844 WARN_ON(fs_info->delayed_block_rsv.size > 0);
4845 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4846 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
4847 WARN_ON(fs_info->delayed_refs_rsv.size > 0);
4851 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
4852 * @trans - the trans that may have generated delayed refs
4854 This is to be called anytime we may have adjusted trans->delayed_ref_updates;
4855 it will calculate the additional size and add it to the delayed_refs_rsv.
4857 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
4859 struct btrfs_fs_info *fs_info = trans->fs_info;
4860 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
4863 if (!trans->delayed_ref_updates)
4866 num_bytes = btrfs_calc_trans_metadata_size(fs_info,
4867 trans->delayed_ref_updates);
4868 spin_lock(&delayed_rsv->lock);
4869 delayed_rsv->size += num_bytes;
4870 delayed_rsv->full = 0;
4871 spin_unlock(&delayed_rsv->lock);
4872 trans->delayed_ref_updates = 0;
4876 * To be called after all the new block groups attached to the transaction
4877 * handle have been created (btrfs_create_pending_block_groups()).
4879 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
4881 struct btrfs_fs_info *fs_info = trans->fs_info;
4883 if (!trans->chunk_bytes_reserved)
4886 WARN_ON_ONCE(!list_empty(&trans->new_bgs));
4888 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
4889 trans->chunk_bytes_reserved, NULL);
4890 trans->chunk_bytes_reserved = 0;
4894 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4895 * root: the root of the parent directory
4896 * rsv: block reservation
4897 items: the number of items that we need to reserve space for
4898 * use_global_rsv: allow fallback to the global block reservation
4900 * This function is used to reserve the space for snapshot/subvolume
4901 creation and deletion. Those operations differ from the
4902 common file/directory operations: they change two fs/file trees
4903 and the root tree, and the number of items that the qgroup reserves is
4904 different from the free space reservation. So we cannot use
4905 the space reservation mechanism in start_transaction().
4907 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4908 struct btrfs_block_rsv *rsv, int items,
4909 bool use_global_rsv)
4911 u64 qgroup_num_bytes = 0;
4914 struct btrfs_fs_info *fs_info = root->fs_info;
4915 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4917 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
4918 /* One for parent inode, two for dir entries */
4919 qgroup_num_bytes = 3 * fs_info->nodesize;
4920 ret = btrfs_qgroup_reserve_meta_prealloc(root,
4921 qgroup_num_bytes, true);
4926 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
4927 rsv->space_info = btrfs_find_space_info(fs_info,
4928 BTRFS_BLOCK_GROUP_METADATA);
4929 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4930 BTRFS_RESERVE_FLUSH_ALL);
4932 if (ret == -ENOSPC && use_global_rsv)
4933 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
4935 if (ret && qgroup_num_bytes)
4936 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
4941 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
4942 struct btrfs_block_rsv *rsv)
4944 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
4947 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
4948 struct btrfs_inode *inode)
4950 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4951 u64 reserve_size = 0;
4952 u64 qgroup_rsv_size = 0;
4954 unsigned outstanding_extents;
4956 lockdep_assert_held(&inode->lock);
4957 outstanding_extents = inode->outstanding_extents;
4958 if (outstanding_extents)
4959 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
4960 outstanding_extents + 1);
4961 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
4963 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
4966 * For qgroup rsv, the calculation is very simple:
4967 * account one nodesize for each outstanding extent
4969 * This is overestimating in most cases.
4971 qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
4973 spin_lock(&block_rsv->lock);
4974 block_rsv->size = reserve_size;
4975 block_rsv->qgroup_rsv_size = qgroup_rsv_size;
4976 spin_unlock(&block_rsv->lock);
4979 static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
4980 u64 num_bytes, u64 *meta_reserve,
4981 u64 *qgroup_reserve)
4983 u64 nr_extents = count_max_extents(num_bytes);
4984 u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
4986 /* We add one for the inode update at finish ordered time */
4987 *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
4988 nr_extents + csum_leaves + 1);
4989 *qgroup_reserve = nr_extents * fs_info->nodesize;
4992 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
4994 struct btrfs_root *root = inode->root;
4995 struct btrfs_fs_info *fs_info = root->fs_info;
4996 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4997 u64 meta_reserve, qgroup_reserve;
4998 unsigned nr_extents;
4999 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5001 bool delalloc_lock = true;
5003 /* If we are a free space inode we need to not flush since we will be in
5004 * the middle of a transaction commit. We also don't need the delalloc
5005 * mutex since we won't race with anybody. We need this mostly to make
5006 * lockdep shut its filthy mouth.
5008 * If we have a transaction open (can happen if we call truncate_block
5009 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5011 if (btrfs_is_free_space_inode(inode)) {
5012 flush = BTRFS_RESERVE_NO_FLUSH;
5013 delalloc_lock = false;
5015 if (current->journal_info)
5016 flush = BTRFS_RESERVE_FLUSH_LIMIT;
5018 if (btrfs_transaction_in_commit(fs_info))
5019 schedule_timeout(1);
5023 mutex_lock(&inode->delalloc_mutex);
5025 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5028 * We always want to do it this way, every other way is wrong and ends
5029 * in tears. Pre-reserving the amount we are going to add will always
5030 * be the right way, because otherwise if we have enough parallelism we
5031 * could end up with thousands of inodes all holding little bits of
5032 * reservations they were able to make previously and the only way to
5033 * reclaim that space is to ENOSPC out the operations and clear
5034 * everything out and try again, which is bad. This way we just
5035 * over-reserve slightly, and clean up the mess when we are done.
5037 calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
5039 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
5042 ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
5047 * Now we need to update our outstanding extents and csum bytes _first_
5048 * and then add the reservation to the block_rsv. This keeps us from
5049 * racing with an ordered completion or some such that would think it
5050 * needs to free the reservation we just made.
5052 spin_lock(&inode->lock);
5053 nr_extents = count_max_extents(num_bytes);
5054 btrfs_mod_outstanding_extents(inode, nr_extents);
5055 inode->csum_bytes += num_bytes;
5056 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5057 spin_unlock(&inode->lock);
5059 /* Now we can safely add our space to our block rsv */
5060 block_rsv_add_bytes(block_rsv, meta_reserve, false);
5061 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5062 btrfs_ino(inode), meta_reserve, 1);
5064 spin_lock(&block_rsv->lock);
5065 block_rsv->qgroup_rsv_reserved += qgroup_reserve;
5066 spin_unlock(&block_rsv->lock);
5069 mutex_unlock(&inode->delalloc_mutex);
5072 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
5074 btrfs_inode_rsv_release(inode, true);
5076 mutex_unlock(&inode->delalloc_mutex);
5081 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5082 * @inode: the inode to release the reservation for.
5083 * @num_bytes: the number of bytes we are releasing.
5084 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
5086 * This will release the metadata reservation for an inode. This can be called
5087 * once we complete IO for a given set of bytes to release their metadata
5088 * reservations, or on error for the same reason.
5090 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
5093 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5095 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5096 spin_lock(&inode->lock);
5097 inode->csum_bytes -= num_bytes;
5098 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5099 spin_unlock(&inode->lock);
5101 if (btrfs_is_testing(fs_info))
5104 btrfs_inode_rsv_release(inode, qgroup_free);
5108 * btrfs_delalloc_release_extents - release our outstanding_extents
5109 * @inode: the inode to balance the reservation for.
5110 * @num_bytes: the number of bytes we originally reserved with
5111 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
5113 * When we reserve space we increase outstanding_extents for the extents we may
5114 * add. Once we've set the range as delalloc or created our ordered extents we
5115 * have outstanding_extents to track the real usage, so we use this to free our
5116 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
5117 * with btrfs_delalloc_reserve_metadata.
5119 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
5122 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5123 unsigned num_extents;
5125 spin_lock(&inode->lock);
5126 num_extents = count_max_extents(num_bytes);
5127 btrfs_mod_outstanding_extents(inode, -num_extents);
5128 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5129 spin_unlock(&inode->lock);
5131 if (btrfs_is_testing(fs_info))
5134 btrfs_inode_rsv_release(inode, qgroup_free);
5138 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5140 * @inode: inode we're writing to
5141 * @start: start range we are writing to
5142 * @len: how long the range we are writing to
5143 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
5144 * current reservation.
5146 * This will do the following things
5148 * o reserve space in data space info for num bytes
5149 * and reserve precious corresponding qgroup space
5150 * (Done in check_data_free_space)
5152 * o reserve space for metadata space, based on the number of outstanding
5153 extents and how many csums will be needed
5154 * also reserve metadata space in a per root over-reserve method.
5155 * o add to the inodes->delalloc_bytes
5156 * o add it to the fs_info's delalloc inodes list.
5157 * (Above 3 all done in delalloc_reserve_metadata)
5159 * Return 0 for success
5160 Return <0 for error (-ENOSPC or -EDQUOT)
5162 int btrfs_delalloc_reserve_space(struct inode *inode,
5163 struct extent_changeset **reserved, u64 start, u64 len)
5167 ret = btrfs_check_data_free_space(inode, reserved, start, len);
5170 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
5172 btrfs_free_reserved_data_space(inode, *reserved, start, len);
5177 * btrfs_delalloc_release_space - release data and metadata space for delalloc
5178 * @inode: inode we're releasing space for
5179 * @start: start position of the space already reserved
5180 * @len: the len of the space already reserved
5181 @qgroup_free: free the qgroup reservation or convert it to per-trans reservation
5183 * This function will release the metadata space that was not used and will
5184 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5185 * list if there are no delalloc bytes left.
5186 * Also it will handle the qgroup reserved space.
5188 void btrfs_delalloc_release_space(struct inode *inode,
5189 struct extent_changeset *reserved,
5190 u64 start, u64 len, bool qgroup_free)
5192 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
5193 btrfs_free_reserved_data_space(inode, reserved, start, len);
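/*
 * Update the used-bytes accounting for an allocation or a free: adjust
 * bytes_used in the superblock and in every block group touched by
 * [bytenr, bytenr + num_bytes), moving the bytes from reserved to used
 * on allocation, or from used to pinned on free.
 */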
5196 static int update_block_group(struct btrfs_trans_handle *trans,
5197 u64 bytenr, u64 num_bytes, int alloc)
5199 struct btrfs_fs_info *info = trans->fs_info;
5200 struct btrfs_block_group_cache *cache = NULL;
5201 u64 total = num_bytes;
5207 /* block accounting for super block */
5208 spin_lock(&info->delalloc_root_lock);
5209 old_val = btrfs_super_bytes_used(info->super_copy);
5211 old_val += num_bytes;
5213 old_val -= num_bytes;
5214 btrfs_set_super_bytes_used(info->super_copy, old_val);
5215 spin_unlock(&info->delalloc_root_lock);
5218 cache = btrfs_lookup_block_group(info, bytenr);
5223 factor = btrfs_bg_type_to_factor(cache->flags);
5226 * If this block group has free space cache written out, we
5227 * need to make sure to load it if we are removing space. This
5228 * is because we need the unpinning stage to actually add the
5229 * space back to the block group, otherwise we will leak space.
5231 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5232 cache_block_group(cache, 1);
5234 byte_in_group = bytenr - cache->key.objectid;
5235 WARN_ON(byte_in_group > cache->key.offset);
5237 spin_lock(&cache->space_info->lock);
5238 spin_lock(&cache->lock);
5240 if (btrfs_test_opt(info, SPACE_CACHE) &&
5241 cache->disk_cache_state < BTRFS_DC_CLEAR)
5242 cache->disk_cache_state = BTRFS_DC_CLEAR;
5244 old_val = btrfs_block_group_used(&cache->item);
5245 num_bytes = min(total, cache->key.offset - byte_in_group);
5247 old_val += num_bytes;
5248 btrfs_set_block_group_used(&cache->item, old_val);
5249 cache->reserved -= num_bytes;
5250 cache->space_info->bytes_reserved -= num_bytes;
5251 cache->space_info->bytes_used += num_bytes;
5252 cache->space_info->disk_used += num_bytes * factor;
5253 spin_unlock(&cache->lock);
5254 spin_unlock(&cache->space_info->lock);
5256 old_val -= num_bytes;
5257 btrfs_set_block_group_used(&cache->item, old_val);
5258 cache->pinned += num_bytes;
5259 btrfs_space_info_update_bytes_pinned(info,
5260 cache->space_info, num_bytes);
5261 cache->space_info->bytes_used -= num_bytes;
5262 cache->space_info->disk_used -= num_bytes * factor;
5263 spin_unlock(&cache->lock);
5264 spin_unlock(&cache->space_info->lock);
5266 trace_btrfs_space_reservation(info, "pinned",
5267 cache->space_info->flags,
5269 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
5271 BTRFS_TOTAL_BYTES_PINNED_BATCH);
5272 set_extent_dirty(info->pinned_extents,
5273 bytenr, bytenr + num_bytes - 1,
5274 GFP_NOFS | __GFP_NOFAIL);
5277 spin_lock(&trans->transaction->dirty_bgs_lock);
5278 if (list_empty(&cache->dirty_list)) {
5279 list_add_tail(&cache->dirty_list,
5280 &trans->transaction->dirty_bgs);
5281 trans->delayed_ref_updates++;
5282 btrfs_get_block_group(cache);
5284 spin_unlock(&trans->transaction->dirty_bgs_lock);
5287 * No longer have used bytes in this block group, queue it for
5288 * deletion. We do this after adding the block group to the
5289 dirty list to avoid races between cleaner kthread and space cache writeout.
5292 if (!alloc && old_val == 0)
5293 btrfs_mark_bg_unused(cache);
5295 btrfs_put_block_group(cache);
5297 bytenr += num_bytes;
5300 /* Modified block groups are accounted for in the delayed_refs_rsv. */
5301 btrfs_update_delayed_refs_rsv(trans);
5305 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
5307 struct btrfs_block_group_cache *cache;
5310 spin_lock(&fs_info->block_group_cache_lock);
5311 bytenr = fs_info->first_logical_byte;
5312 spin_unlock(&fs_info->block_group_cache_lock);
5314 if (bytenr < (u64)-1)
5317 cache = btrfs_lookup_first_block_group(fs_info, search_start);
5321 bytenr = cache->key.objectid;
5322 btrfs_put_block_group(cache);
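/*
 * Pin num_bytes of this block group: bump the pinned counters (dropping
 * the reservation if @reserved is set) and mark the range dirty in the
 * pinned extents tree so it gets unpinned at transaction commit.
 */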
5327 static int pin_down_extent(struct btrfs_block_group_cache *cache,
5328 u64 bytenr, u64 num_bytes, int reserved)
5330 struct btrfs_fs_info *fs_info = cache->fs_info;
5332 spin_lock(&cache->space_info->lock);
5333 spin_lock(&cache->lock);
5334 cache->pinned += num_bytes;
5335 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
5338 cache->reserved -= num_bytes;
5339 cache->space_info->bytes_reserved -= num_bytes;
5341 spin_unlock(&cache->lock);
5342 spin_unlock(&cache->space_info->lock);
5344 trace_btrfs_space_reservation(fs_info, "pinned",
5345 cache->space_info->flags, num_bytes, 1);
5346 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
5347 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
5348 set_extent_dirty(fs_info->pinned_extents, bytenr,
5349 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5354 * this function must be called within transaction
5356 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
5357 u64 bytenr, u64 num_bytes, int reserved)
5359 struct btrfs_block_group_cache *cache;
5361 cache = btrfs_lookup_block_group(fs_info, bytenr);
5362 BUG_ON(!cache); /* Logic error */
5364 pin_down_extent(cache, bytenr, num_bytes, reserved);
5366 btrfs_put_block_group(cache);
5371 * this function must be called within transaction
5373 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
5374 u64 bytenr, u64 num_bytes)
5376 struct btrfs_block_group_cache *cache;
5379 cache = btrfs_lookup_block_group(fs_info, bytenr);
5384 * pull in the free space cache (if any) so that our pin
5385 * removes the free space from the cache. We have load_only set
5386 * to one because the slow code to read in the free extents does check
5387 * the pinned extents.
5389 cache_block_group(cache, 1);
5391 pin_down_extent(cache, bytenr, num_bytes, 0);
5393 /* remove us from the free space cache (if we're there at all) */
5394 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5395 btrfs_put_block_group(cache);
5399 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
5400 u64 start, u64 num_bytes)
5403 struct btrfs_block_group_cache *block_group;
5404 struct btrfs_caching_control *caching_ctl;
5406 block_group = btrfs_lookup_block_group(fs_info, start);
5410 cache_block_group(block_group, 0);
5411 caching_ctl = get_caching_control(block_group);
5415 BUG_ON(!block_group_cache_done(block_group));
5416 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5418 mutex_lock(&caching_ctl->mutex);
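/*
 * Depending on how far caching has progressed, the range is either
 * removed from the free space cache (already scanned), added to the
 * excluded extents (not scanned yet), or split between the two.
 */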
5420 if (start >= caching_ctl->progress) {
5421 ret = add_excluded_extent(fs_info, start, num_bytes);
5422 } else if (start + num_bytes <= caching_ctl->progress) {
5423 ret = btrfs_remove_free_space(block_group,
5426 num_bytes = caching_ctl->progress - start;
5427 ret = btrfs_remove_free_space(block_group,
5432 num_bytes = (start + num_bytes) -
5433 caching_ctl->progress;
5434 start = caching_ctl->progress;
5435 ret = add_excluded_extent(fs_info, start, num_bytes);
5438 mutex_unlock(&caching_ctl->mutex);
5439 put_caching_control(caching_ctl);
5441 btrfs_put_block_group(block_group);
5445 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
5447 struct btrfs_fs_info *fs_info = eb->fs_info;
5448 struct btrfs_file_extent_item *item;
5449 struct btrfs_key key;
5454 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
5457 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5458 btrfs_item_key_to_cpu(eb, &key, i);
5459 if (key.type != BTRFS_EXTENT_DATA_KEY)
5461 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5462 found_type = btrfs_file_extent_type(eb, item);
5463 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5465 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5467 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5468 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5469 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
5478 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
5480 atomic_inc(&bg->reservations);
5483 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
5486 struct btrfs_block_group_cache *bg;
5488 bg = btrfs_lookup_block_group(fs_info, start);
5490 if (atomic_dec_and_test(&bg->reservations))
5491 wake_up_var(&bg->reservations);
5492 btrfs_put_block_group(bg);
5495 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
5497 struct btrfs_space_info *space_info = bg->space_info;
5501 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
5505 * Our block group is read only but before we set it to read only,
5506 some task might have allocated an extent from it already, but it
5507 * has not yet created a respective ordered extent (and added it to a
5508 * root's list of ordered extents).
5509 * Therefore wait for any task currently allocating extents, since the
5510 * block group's reservations counter is incremented while a read lock
5511 * on the groups' semaphore is held and decremented after releasing
5512 * the read access on that semaphore and creating the ordered extent.
5514 down_write(&space_info->groups_sem);
5515 up_write(&space_info->groups_sem);
5517 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
5521 * btrfs_add_reserved_bytes - update the block_group and space info counters
5522 * @cache: The cache we are manipulating
5523 @ram_bytes: The number of bytes of file content, and will be the same as
5524 * @num_bytes except for the compress path.
5525 * @num_bytes: The number of bytes in question
5526 * @delalloc: The blocks are allocated for the delalloc write
5528 * This is called by the allocator when it reserves space. If this is a
5529 * reservation and the block group has become read only we cannot make the
5530 * reservation and return -EAGAIN, otherwise this function always succeeds.
5532 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
5533 u64 ram_bytes, u64 num_bytes, int delalloc)
5535 struct btrfs_space_info *space_info = cache->space_info;
5538 spin_lock(&space_info->lock);
5539 spin_lock(&cache->lock);
5543 cache->reserved += num_bytes;
5544 space_info->bytes_reserved += num_bytes;
5545 btrfs_space_info_update_bytes_may_use(cache->fs_info,
5546 space_info, -ram_bytes);
5548 cache->delalloc_bytes += num_bytes;
5550 spin_unlock(&cache->lock);
5551 spin_unlock(&space_info->lock);
5556 * btrfs_free_reserved_bytes - update the block_group and space info counters
5557 * @cache: The cache we are manipulating
5558 * @num_bytes: The number of bytes in question
5559 * @delalloc: The blocks are allocated for the delalloc write
5561 * This is called by somebody who is freeing space that was never actually used
5562 * on disk. For example if you reserve some space for a new leaf in transaction
5563 * A and before transaction A commits you free that leaf, you call this with
5564 * reserve set to 0 in order to clear the reservation.
5567 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
5568 u64 num_bytes, int delalloc)
5570 struct btrfs_space_info *space_info = cache->space_info;
5572 spin_lock(&space_info->lock);
5573 spin_lock(&cache->lock);
5575 space_info->bytes_readonly += num_bytes;
5576 cache->reserved -= num_bytes;
5577 space_info->bytes_reserved -= num_bytes;
5578 space_info->max_extent_size = 0;
5581 cache->delalloc_bytes -= num_bytes;
5582 spin_unlock(&cache->lock);
5583 spin_unlock(&space_info->lock);
5585 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
5587 struct btrfs_caching_control *next;
5588 struct btrfs_caching_control *caching_ctl;
5589 struct btrfs_block_group_cache *cache;
5591 down_write(&fs_info->commit_root_sem);
5593 list_for_each_entry_safe(caching_ctl, next,
5594 &fs_info->caching_block_groups, list) {
5595 cache = caching_ctl->block_group;
5596 if (block_group_cache_done(cache)) {
5597 cache->last_byte_to_unpin = (u64)-1;
5598 list_del_init(&caching_ctl->list);
5599 put_caching_control(caching_ctl);
5601 cache->last_byte_to_unpin = caching_ctl->progress;
5605 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5606 fs_info->pinned_extents = &fs_info->freed_extents[1];
5608 fs_info->pinned_extents = &fs_info->freed_extents[0];
5610 up_write(&fs_info->commit_root_sem);
5612 update_global_block_rsv(fs_info);
5616 * Returns the free cluster for the given space info and sets empty_cluster to
5617 * what it should be based on the mount options.
5619 static struct btrfs_free_cluster *
5620 fetch_cluster_info(struct btrfs_fs_info *fs_info,
5621 struct btrfs_space_info *space_info, u64 *empty_cluster)
5623 struct btrfs_free_cluster *ret = NULL;
5626 if (btrfs_mixed_space_info(space_info))
5629 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5630 ret = &fs_info->meta_alloc_cluster;
5631 if (btrfs_test_opt(fs_info, SSD))
5632 *empty_cluster = SZ_2M;
5634 *empty_cluster = SZ_64K;
5635 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
5636 btrfs_test_opt(fs_info, SSD_SPREAD)) {
5637 *empty_cluster = SZ_2M;
5638 ret = &fs_info->data_alloc_cluster;
5644 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
5646 const bool return_free_space)
5648 struct btrfs_block_group_cache *cache = NULL;
5649 struct btrfs_space_info *space_info;
5650 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5651 struct btrfs_free_cluster *cluster = NULL;
5653 u64 total_unpinned = 0;
5654 u64 empty_cluster = 0;
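/*
 * Walk the range one block group at a time, returning the pinned bytes
 * to the free space cache, then the global rsv, and finally to any
 * pending space info tickets.
 */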
5657 while (start <= end) {
5660 start >= cache->key.objectid + cache->key.offset) {
5662 btrfs_put_block_group(cache);
5664 cache = btrfs_lookup_block_group(fs_info, start);
5665 BUG_ON(!cache); /* Logic error */
5667 cluster = fetch_cluster_info(fs_info,
5670 empty_cluster <<= 1;
5673 len = cache->key.objectid + cache->key.offset - start;
5674 len = min(len, end + 1 - start);
5676 if (start < cache->last_byte_to_unpin) {
5677 len = min(len, cache->last_byte_to_unpin - start);
5678 if (return_free_space)
5679 btrfs_add_free_space(cache, start, len);
5683 total_unpinned += len;
5684 space_info = cache->space_info;
5687 * If this space cluster has been marked as fragmented and we've
5688 * unpinned enough in this block group to potentially allow a
5689 cluster to be created inside of it, go ahead and clear the fragmented flag.
5692 if (cluster && cluster->fragmented &&
5693 total_unpinned > empty_cluster) {
5694 spin_lock(&cluster->lock);
5695 cluster->fragmented = 0;
5696 spin_unlock(&cluster->lock);
5699 spin_lock(&space_info->lock);
5700 spin_lock(&cache->lock);
5701 cache->pinned -= len;
5702 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
5704 trace_btrfs_space_reservation(fs_info, "pinned",
5705 space_info->flags, len, 0);
5706 space_info->max_extent_size = 0;
5707 percpu_counter_add_batch(&space_info->total_bytes_pinned,
5708 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
5710 space_info->bytes_readonly += len;
5713 spin_unlock(&cache->lock);
5714 if (!readonly && return_free_space &&
5715 global_rsv->space_info == space_info) {
5718 spin_lock(&global_rsv->lock);
5719 if (!global_rsv->full) {
5720 to_add = min(len, global_rsv->size -
5721 global_rsv->reserved);
5722 global_rsv->reserved += to_add;
5723 btrfs_space_info_update_bytes_may_use(fs_info,
5724 space_info, to_add);
5725 if (global_rsv->reserved >= global_rsv->size)
5726 global_rsv->full = 1;
5727 trace_btrfs_space_reservation(fs_info,
5733 spin_unlock(&global_rsv->lock);
5734 /* Add to any tickets we may have */
5736 btrfs_space_info_add_new_bytes(fs_info,
5739 spin_unlock(&space_info->lock);
5743 btrfs_put_block_group(cache);
5747 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
5749 struct btrfs_fs_info *fs_info = trans->fs_info;
5750 struct btrfs_block_group_cache *block_group, *tmp;
5751 struct list_head *deleted_bgs;
5752 struct extent_io_tree *unpin;
5757 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5758 unpin = &fs_info->freed_extents[1];
5760 unpin = &fs_info->freed_extents[0];
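/*
 * Walk every range that was pinned during this transaction: optionally
 * discard it, clear its dirty bit and return the space to the free
 * space caches and block reserves.
 */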
5762 while (!trans->aborted) {
5763 struct extent_state *cached_state = NULL;
5765 mutex_lock(&fs_info->unused_bg_unpin_mutex);
5766 ret = find_first_extent_bit(unpin, 0, &start, &end,
5767 EXTENT_DIRTY, &cached_state);
5769 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5773 if (btrfs_test_opt(fs_info, DISCARD))
5774 ret = btrfs_discard_extent(fs_info, start,
5775 end + 1 - start, NULL);
5777 clear_extent_dirty(unpin, start, end, &cached_state);
5778 unpin_extent_range(fs_info, start, end, true);
5779 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5780 free_extent_state(cached_state);
5785 * Transaction is finished. We don't need the lock anymore. We
5786 do need to clean up the block groups in case of a transaction abort.
5789 deleted_bgs = &trans->transaction->deleted_bgs;
5790 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
5794 if (!trans->aborted)
5795 ret = btrfs_discard_extent(fs_info,
5796 block_group->key.objectid,
5797 block_group->key.offset,
5800 list_del_init(&block_group->bg_list);
5801 btrfs_put_block_group_trimming(block_group);
5802 btrfs_put_block_group(block_group);
5805 const char *errstr = btrfs_decode_error(ret);
5807 "discard failed while removing blockgroup: errno=%d %s",
5815 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5816 struct btrfs_delayed_ref_node *node, u64 parent,
5817 u64 root_objectid, u64 owner_objectid,
5818 u64 owner_offset, int refs_to_drop,
5819 struct btrfs_delayed_extent_op *extent_op)
5821 struct btrfs_fs_info *info = trans->fs_info;
5822 struct btrfs_key key;
5823 struct btrfs_path *path;
5824 struct btrfs_root *extent_root = info->extent_root;
5825 struct extent_buffer *leaf;
5826 struct btrfs_extent_item *ei;
5827 struct btrfs_extent_inline_ref *iref;
5830 int extent_slot = 0;
5831 int found_extent = 0;
5835 u64 bytenr = node->bytenr;
5836 u64 num_bytes = node->num_bytes;
5838 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
5840 path = btrfs_alloc_path();
5844 path->reada = READA_FORWARD;
5845 path->leave_spinning = 1;
5847 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5848 BUG_ON(!is_data && refs_to_drop != 1);
5851 skinny_metadata = false;
5853 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
5854 parent, root_objectid, owner_objectid,
5857 extent_slot = path->slots[0];
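/*
 * lookup_extent_backref() left us on the backref item; walk backwards
 * in the leaf looking for the EXTENT_ITEM or METADATA_ITEM for this
 * bytenr.
 */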
5858 while (extent_slot >= 0) {
5859 btrfs_item_key_to_cpu(path->nodes[0], &key,
5861 if (key.objectid != bytenr)
5863 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5864 key.offset == num_bytes) {
5868 if (key.type == BTRFS_METADATA_ITEM_KEY &&
5869 key.offset == owner_objectid) {
5873 if (path->slots[0] - extent_slot > 5)
5878 if (!found_extent) {
5880 ret = remove_extent_backref(trans, path, NULL,
5882 is_data, &last_ref);
5884 btrfs_abort_transaction(trans, ret);
5887 btrfs_release_path(path);
5888 path->leave_spinning = 1;
5890 key.objectid = bytenr;
5891 key.type = BTRFS_EXTENT_ITEM_KEY;
5892 key.offset = num_bytes;
5894 if (!is_data && skinny_metadata) {
5895 key.type = BTRFS_METADATA_ITEM_KEY;
5896 key.offset = owner_objectid;
5899 ret = btrfs_search_slot(trans, extent_root,
5901 if (ret > 0 && skinny_metadata && path->slots[0]) {
5903 * Couldn't find our skinny metadata item,
5904 * see if we have ye olde extent item.
5907 btrfs_item_key_to_cpu(path->nodes[0], &key,
5909 if (key.objectid == bytenr &&
5910 key.type == BTRFS_EXTENT_ITEM_KEY &&
5911 key.offset == num_bytes)
5915 if (ret > 0 && skinny_metadata) {
5916 skinny_metadata = false;
5917 key.objectid = bytenr;
5918 key.type = BTRFS_EXTENT_ITEM_KEY;
5919 key.offset = num_bytes;
5920 btrfs_release_path(path);
5921 ret = btrfs_search_slot(trans, extent_root,
5927 "umm, got %d back from search, was looking for %llu",
5930 btrfs_print_leaf(path->nodes[0]);
5933 btrfs_abort_transaction(trans, ret);
5936 extent_slot = path->slots[0];
5938 } else if (WARN_ON(ret == -ENOENT)) {
5939 btrfs_print_leaf(path->nodes[0]);
5941 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
5942 bytenr, parent, root_objectid, owner_objectid,
5944 btrfs_abort_transaction(trans, ret);
5947 btrfs_abort_transaction(trans, ret);
5951 leaf = path->nodes[0];
5952 item_size = btrfs_item_size_nr(leaf, extent_slot);
5953 if (unlikely(item_size < sizeof(*ei))) {
5955 btrfs_print_v0_err(info);
5956 btrfs_abort_transaction(trans, ret);
5959 ei = btrfs_item_ptr(leaf, extent_slot,
5960 struct btrfs_extent_item);
5961 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
5962 key.type == BTRFS_EXTENT_ITEM_KEY) {
5963 struct btrfs_tree_block_info *bi;
5964 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5965 bi = (struct btrfs_tree_block_info *)(ei + 1);
5966 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5969 refs = btrfs_extent_refs(leaf, ei);
5970 if (refs < refs_to_drop) {
5972 "trying to drop %d refs but we only have %Lu for bytenr %Lu",
5973 refs_to_drop, refs, bytenr);
5975 btrfs_abort_transaction(trans, ret);
5978 refs -= refs_to_drop;
5982 __run_delayed_extent_op(extent_op, leaf, ei);
5984 * In the case of inline back ref, reference count will
5985 * be updated by remove_extent_backref
5988 BUG_ON(!found_extent);
5990 btrfs_set_extent_refs(leaf, ei, refs);
5991 btrfs_mark_buffer_dirty(leaf);
5994 ret = remove_extent_backref(trans, path, iref,
5995 refs_to_drop, is_data,
5998 btrfs_abort_transaction(trans, ret);
6004 BUG_ON(is_data && refs_to_drop !=
6005 extent_data_ref_count(path, iref));
6007 BUG_ON(path->slots[0] != extent_slot);
6009 BUG_ON(path->slots[0] != extent_slot + 1);
6010 path->slots[0] = extent_slot;
6016 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6019 btrfs_abort_transaction(trans, ret);
6022 btrfs_release_path(path);
6025 ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
6027 btrfs_abort_transaction(trans, ret);
6032 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
6034 btrfs_abort_transaction(trans, ret);
6038 ret = update_block_group(trans, bytenr, num_bytes, 0);
6040 btrfs_abort_transaction(trans, ret);
6044 btrfs_release_path(path);
6047 btrfs_free_path(path);
6052 when we free a block, it is possible (and likely) that we free the last
6053 * delayed ref for that extent as well. This searches the delayed ref tree for
6054 * a given extent, and if there are no other delayed refs to be processed, it
6055 * removes it from the tree.
6057 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6060 struct btrfs_delayed_ref_head *head;
6061 struct btrfs_delayed_ref_root *delayed_refs;
6064 delayed_refs = &trans->transaction->delayed_refs;
6065 spin_lock(&delayed_refs->lock);
6066 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
6068 goto out_delayed_unlock;
6070 spin_lock(&head->lock);
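/*
 * If the head still has queued refs or a pending extent op we cannot
 * clean it up early, so bail out and let normal delayed ref processing
 * handle it.
 */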
6071 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
6074 if (cleanup_extent_op(head) != NULL)
6078 * waiting for the lock here would deadlock. If someone else has it
6079 * locked they are already in the process of dropping it anyway
6081 if (!mutex_trylock(&head->mutex))
6084 btrfs_delete_ref_head(delayed_refs, head);
6085 head->processing = 0;
6087 spin_unlock(&head->lock);
6088 spin_unlock(&delayed_refs->lock);
6090 BUG_ON(head->extent_op);
6091 if (head->must_insert_reserved)
6094 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
6095 mutex_unlock(&head->mutex);
6096 btrfs_put_delayed_ref_head(head);
6099 spin_unlock(&head->lock);
6102 spin_unlock(&delayed_refs->lock);
6106 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6107 struct btrfs_root *root,
6108 struct extent_buffer *buf,
6109 u64 parent, int last_ref)
6111 struct btrfs_fs_info *fs_info = root->fs_info;
6112 struct btrfs_ref generic_ref = { 0 };
6116 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
6117 buf->start, buf->len, parent);
6118 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
6119 root->root_key.objectid);
6121 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6122 int old_ref_mod, new_ref_mod;
6124 btrfs_ref_tree_mod(fs_info, &generic_ref);
6125 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
6126 &old_ref_mod, &new_ref_mod);
6127 BUG_ON(ret); /* -ENOMEM */
6128 pin = old_ref_mod >= 0 && new_ref_mod < 0;
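/*
 * If this was the last reference and the buffer was allocated in the
 * current transaction we may avoid pinning it: clean up its delayed ref
 * head and, if the buffer was never written, hand the extent straight
 * back to the free space cache.
 */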
6131 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
6132 struct btrfs_block_group_cache *cache;
6134 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6135 ret = check_ref_cleanup(trans, buf->start);
6141 cache = btrfs_lookup_block_group(fs_info, buf->start);
6143 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6144 pin_down_extent(cache, buf->start, buf->len, 1);
6145 btrfs_put_block_group(cache);
6149 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6151 btrfs_add_free_space(cache, buf->start, buf->len);
6152 btrfs_free_reserved_bytes(cache, buf->len, 0);
6153 btrfs_put_block_group(cache);
6154 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
6158 add_pinned_bytes(fs_info, &generic_ref);
6162 Deleting the buffer, clear the corrupt flag since it doesn't matter anymore.
6165 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6169 /* Can return -ENOMEM */
6170 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
6172 struct btrfs_fs_info *fs_info = trans->fs_info;
6173 int old_ref_mod, new_ref_mod;
6176 if (btrfs_is_testing(fs_info))
6180 * tree log blocks never actually go into the extent allocation
6181 * tree, just update pinning info and exit early.
6183 if ((ref->type == BTRFS_REF_METADATA &&
6184 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
6185 (ref->type == BTRFS_REF_DATA &&
6186 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
6187 /* unlocks the pinned mutex */
6188 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
6189 old_ref_mod = new_ref_mod = 0;
6191 } else if (ref->type == BTRFS_REF_METADATA) {
6192 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
6193 &old_ref_mod, &new_ref_mod);
6195 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
6196 &old_ref_mod, &new_ref_mod);
6199 if (!((ref->type == BTRFS_REF_METADATA &&
6200 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
6201 (ref->type == BTRFS_REF_DATA &&
6202 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
6203 btrfs_ref_tree_mod(fs_info, ref);
6205 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
6206 add_pinned_bytes(fs_info, ref);
6212 when we wait for progress in the block group caching, it's because
6213 * our allocation attempt failed at least once. So, we must sleep
6214 * and let some progress happen before we try again.
6216 * This function will sleep at least once waiting for new free space to
6217 * show up, and then it will check the block group free space numbers
6218 * for our min num_bytes. Another option is to have it go ahead
6219 and look in the rbtree for a free extent of a given size, but this is a pretty big drain on the queue.
6222 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6223 * any of the information in this block group.
6225 static noinline void
6226 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6229 struct btrfs_caching_control *caching_ctl;
6231 caching_ctl = get_caching_control(cache);
6235 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6236 (cache->free_space_ctl->free_space >= num_bytes));
6238 put_caching_control(caching_ctl);
6242 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6244 struct btrfs_caching_control *caching_ctl;
6247 caching_ctl = get_caching_control(cache);
6249 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6251 wait_event(caching_ctl->wait, block_group_cache_done(cache));
6252 if (cache->cached == BTRFS_CACHE_ERROR)
6254 put_caching_control(caching_ctl);
6258 enum btrfs_loop_type {
6259 LOOP_CACHING_NOWAIT,
6266 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6270 down_read(&cache->data_rwsem);
6274 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6277 btrfs_get_block_group(cache);
6279 down_read(&cache->data_rwsem);
6282 static struct btrfs_block_group_cache *
6283 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6284 struct btrfs_free_cluster *cluster,
6287 struct btrfs_block_group_cache *used_bg = NULL;
6289 spin_lock(&cluster->refill_lock);
6291 used_bg = cluster->block_group;
6295 if (used_bg == block_group)
6298 btrfs_get_block_group(used_bg);
6303 if (down_read_trylock(&used_bg->data_rwsem))
6306 spin_unlock(&cluster->refill_lock);
6308 /* We should only have one-level nested. */
6309 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
6311 spin_lock(&cluster->refill_lock);
6312 if (used_bg == cluster->block_group)
6315 up_read(&used_bg->data_rwsem);
6316 btrfs_put_block_group(used_bg);
6321 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
6325 up_read(&cache->data_rwsem);
6326 btrfs_put_block_group(cache);
6330 * Structure used internally for find_free_extent() function. Wraps needed
6333 struct find_free_extent_ctl {
6334 /* Basic allocation info */
6341 /* Where to start the search inside the bg */
6344 /* For clustered allocation */
6347 bool have_caching_bg;
6348 bool orig_have_caching_bg;
6350 /* RAID index, converted from flags */
6354 * Current loop number, check find_free_extent_update_loop() for details
6359 * Whether we're refilling a cluster, if true we need to re-search
6360 * current block group but don't try to refill the cluster again.
6362 bool retry_clustered;
6365 * Whether we're updating free space cache, if true we need to re-search
6366 * current block group but don't try updating free space cache again.
6368 bool retry_unclustered;
6370 /* If current block group is cached */
6373 /* Max contiguous hole found */
6374 u64 max_extent_size;
6376 /* Total free space from free space cache, not always contiguous */
6377 u64 total_free_space;
6385 * Helper function for find_free_extent().
6387 * Return -ENOENT to inform caller that we need to fall back to unclustered mode.
6388 * Return -EAGAIN to inform caller that we need to re-search this block group
6389 * Return >0 to inform caller that we found nothing
6390 * Return 0 means we have found a location and set ffe_ctl->found_offset.
6392 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
6393 struct btrfs_free_cluster *last_ptr,
6394 struct find_free_extent_ctl *ffe_ctl,
6395 struct btrfs_block_group_cache **cluster_bg_ret)
6397 struct btrfs_block_group_cache *cluster_bg;
6398 u64 aligned_cluster;
6402 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
6404 goto refill_cluster;
6405 if (cluster_bg != bg && (cluster_bg->ro ||
6406 !block_group_bits(cluster_bg, ffe_ctl->flags)))
6407 goto release_cluster;
6409 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
6410 ffe_ctl->num_bytes, cluster_bg->key.objectid,
6411 &ffe_ctl->max_extent_size);
6413 /* We have a block, we're done */
6414 spin_unlock(&last_ptr->refill_lock);
6415 trace_btrfs_reserve_extent_cluster(cluster_bg,
6416 ffe_ctl->search_start, ffe_ctl->num_bytes);
6417 *cluster_bg_ret = cluster_bg;
6418 ffe_ctl->found_offset = offset;
6421 WARN_ON(last_ptr->block_group != cluster_bg);
6425 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
6426 let's just skip it and let the allocator find whatever block it can
6427 * find. If we reach this point, we will have tried the cluster
6428 * allocator plenty of times and not have found anything, so we are
6429 * likely way too fragmented for the clustering stuff to find anything.
6431 * However, if the cluster is taken from the current block group,
6432 * release the cluster first, so that we stand a better chance of
6433 * succeeding in the unclustered allocation.
6435 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
6436 spin_unlock(&last_ptr->refill_lock);
6437 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
6441 /* This cluster didn't work out, free it and start over */
6442 btrfs_return_cluster_to_free_space(NULL, last_ptr);
6444 if (cluster_bg != bg)
6445 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
6448 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
6449 spin_unlock(&last_ptr->refill_lock);
6453 aligned_cluster = max_t(u64,
6454 ffe_ctl->empty_cluster + ffe_ctl->empty_size,
6455 bg->full_stripe_len);
6456 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
6457 ffe_ctl->num_bytes, aligned_cluster);
6459 /* Now pull our allocation out of this cluster */
6460 offset = btrfs_alloc_from_cluster(bg, last_ptr,
6461 ffe_ctl->num_bytes, ffe_ctl->search_start,
6462 &ffe_ctl->max_extent_size);
6464 /* We found one, proceed */
6465 spin_unlock(&last_ptr->refill_lock);
6466 trace_btrfs_reserve_extent_cluster(bg,
6467 ffe_ctl->search_start,
6468 ffe_ctl->num_bytes);
6469 ffe_ctl->found_offset = offset;
6472 } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
6473 !ffe_ctl->retry_clustered) {
6474 spin_unlock(&last_ptr->refill_lock);
6476 ffe_ctl->retry_clustered = true;
6477 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
6478 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
6482 * At this point we either didn't find a cluster or we weren't able to
6483 * allocate a block from our cluster. Free the cluster we've been
6484 * trying to use, and go to the next block group.
6486 btrfs_return_cluster_to_free_space(NULL, last_ptr);
6487 spin_unlock(&last_ptr->refill_lock);
6492 * Return >0 to inform caller that we found nothing
6493 * Return 0 when we found a free extent and set ffe_ctl->found_offset
6494 * Return -EAGAIN to inform caller that we need to re-search this block group
6496 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
6497 struct btrfs_free_cluster *last_ptr,
6498 struct find_free_extent_ctl *ffe_ctl)
6503 * We are doing an unclustered allocation, set the fragmented flag so
6504 we don't bother trying to set up a cluster again until we get more space.
6507 if (unlikely(last_ptr)) {
6508 spin_lock(&last_ptr->lock);
6509 last_ptr->fragmented = 1;
6510 spin_unlock(&last_ptr->lock);
6512 if (ffe_ctl->cached) {
6513 struct btrfs_free_space_ctl *free_space_ctl;
6515 free_space_ctl = bg->free_space_ctl;
6516 spin_lock(&free_space_ctl->tree_lock);
6517 if (free_space_ctl->free_space <
6518 ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
6519 ffe_ctl->empty_size) {
6520 ffe_ctl->total_free_space = max_t(u64,
6521 ffe_ctl->total_free_space,
6522 free_space_ctl->free_space);
6523 spin_unlock(&free_space_ctl->tree_lock);
6526 spin_unlock(&free_space_ctl->tree_lock);
6529 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
6530 ffe_ctl->num_bytes, ffe_ctl->empty_size,
6531 &ffe_ctl->max_extent_size);
6534 * If we didn't find a chunk, and we haven't failed on this block group
6535 * before, and this block group is in the middle of caching and we are
6536 * ok with waiting, then go ahead and wait for progress to be made, and
6537 * set @retry_unclustered to true.
6539 * If @retry_unclustered is true then we've already waited on this
6540 * block group once and should move on to the next block group.
6542 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
6543 ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
6544 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
6545 ffe_ctl->empty_size);
6546 ffe_ctl->retry_unclustered = true;
6548 } else if (!offset) {
6551 ffe_ctl->found_offset = offset;
6556 * Return >0 means caller needs to re-search for a free extent.
6557 * Return 0 means we have the needed free extent.
6558 * Return <0 means we failed to locate any free extent.
6560 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
6561 struct btrfs_free_cluster *last_ptr,
6562 struct btrfs_key *ins,
6563 struct find_free_extent_ctl *ffe_ctl,
6564 int full_search, bool use_cluster)
6566 struct btrfs_root *root = fs_info->extent_root;
6569 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
6570 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
6571 ffe_ctl->orig_have_caching_bg = true;
6573 if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
6574 ffe_ctl->have_caching_bg)
6577 if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
6580 if (ins->objectid) {
6581 if (!use_cluster && last_ptr) {
6582 spin_lock(&last_ptr->lock);
6583 last_ptr->window_start = ins->objectid;
6584 spin_unlock(&last_ptr->lock);
6590 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6591 * caching kthreads as we move along
6592 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6593 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6594 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try again
6597 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
6599 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
6601 * We want to skip the LOOP_CACHING_WAIT step if we
6602 * don't have any uncached bgs and we've already done a
6603 * full search through.
6605 if (ffe_ctl->orig_have_caching_bg || !full_search)
6606 ffe_ctl->loop = LOOP_CACHING_WAIT;
6608 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
6613 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
6614 struct btrfs_trans_handle *trans;
6617 trans = current->journal_info;
6621 trans = btrfs_join_transaction(root);
6623 if (IS_ERR(trans)) {
6624 ret = PTR_ERR(trans);
6628 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
6632 * If we can't allocate a new chunk and we've already looped
6633 * through at least once, move on to the NO_EMPTY_SIZE case.
6637 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
6639 /* Do not bail out on ENOSPC since we can do more. */
6640 if (ret < 0 && ret != -ENOSPC)
6641 btrfs_abort_transaction(trans, ret);
6645 btrfs_end_transaction(trans);
6650 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
6652 * Don't loop again if we already have no empty_size and no empty_cluster.
6655 if (ffe_ctl->empty_size == 0 &&
6656 ffe_ctl->empty_cluster == 0)
6658 ffe_ctl->empty_size = 0;
6659 ffe_ctl->empty_cluster = 0;
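
/*
 * Illustrative sketch, not part of the original source: how the caller is
 * expected to react to the three return classes documented above.  The
 * variable names mirror those used by find_free_extent() below.
 *
 *	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
 *					   full_search, use_cluster);
 *	if (ret > 0)
 *		goto search;	(widen the search and scan block groups again)
 *	else if (ret < 0)
 *		return ret;	(hard failure, typically -ENOSPC)
 *	(ret == 0: *ins describes the reserved extent)
 */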
6667 * walks the btree of allocated extents and finds a hole of a given size.
6668 * The key ins is changed to record the hole:
6669 * ins->objectid == start position
6670 * ins->flags = BTRFS_EXTENT_ITEM_KEY
6671 * ins->offset == the size of the hole.
6672 * Any available blocks before search_start are skipped.
6674 * If there is no suitable free space, we will record the size of the
6675 * largest free space extent seen during the search.
6677 * The overall logic and call chain:
6679 * find_free_extent()
6680 * |- Iterate through all block groups
6681 * | |- Get a valid block group
6682 * | |- Try to do clustered allocation in that block group
6683 * | |- Try to do unclustered allocation in that block group
6684 * | |- Check if the result is valid
6685 * | | |- If valid, then exit
6686 * | |- Jump to next block group
6688 * |- Push harder to find free extents
6689 * |- If not found, re-iterate all block groups
6691 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
6692 u64 ram_bytes, u64 num_bytes, u64 empty_size,
6693 u64 hint_byte, struct btrfs_key *ins,
6694 u64 flags, int delalloc)
6697 struct btrfs_free_cluster *last_ptr = NULL;
6698 struct btrfs_block_group_cache *block_group = NULL;
6699 struct find_free_extent_ctl ffe_ctl = {0};
6700 struct btrfs_space_info *space_info;
6701 bool use_cluster = true;
6702 bool full_search = false;
6704 WARN_ON(num_bytes < fs_info->sectorsize);
6706 ffe_ctl.ram_bytes = ram_bytes;
6707 ffe_ctl.num_bytes = num_bytes;
6708 ffe_ctl.empty_size = empty_size;
6709 ffe_ctl.flags = flags;
6710 ffe_ctl.search_start = 0;
6711 ffe_ctl.retry_clustered = false;
6712 ffe_ctl.retry_unclustered = false;
6713 ffe_ctl.delalloc = delalloc;
6714 ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
6715 ffe_ctl.have_caching_bg = false;
6716 ffe_ctl.orig_have_caching_bg = false;
6717 ffe_ctl.found_offset = 0;
6719 ins->type = BTRFS_EXTENT_ITEM_KEY;
6723 trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
6725 space_info = btrfs_find_space_info(fs_info, flags);
6727 btrfs_err(fs_info, "No space info for %llu", flags);
6732 * If our free space is heavily fragmented we may not be able to make
6733 * big contiguous allocations, so instead of doing the expensive search
6734 * for free space, simply return ENOSPC with our max_extent_size so we
6735 * can go ahead and search for a more manageable chunk.
6737 * If our max_extent_size is large enough for our allocation simply
6738 * disable clustering since we will likely not be able to find enough
6739 * space to create a cluster and induce latency trying.
6741 if (unlikely(space_info->max_extent_size)) {
6742 spin_lock(&space_info->lock);
6743 if (space_info->max_extent_size &&
6744 num_bytes > space_info->max_extent_size) {
6745 ins->offset = space_info->max_extent_size;
6746 spin_unlock(&space_info->lock);
6748 } else if (space_info->max_extent_size) {
6749 use_cluster = false;
6751 spin_unlock(&space_info->lock);
6754 last_ptr = fetch_cluster_info(fs_info, space_info,
6755 &ffe_ctl.empty_cluster);
6757 spin_lock(&last_ptr->lock);
6758 if (last_ptr->block_group)
6759 hint_byte = last_ptr->window_start;
6760 if (last_ptr->fragmented) {
6762 * We still set window_start so we can keep track of the
6763 * last place we found an allocation to try and save some cpu in the searching code.
6766 hint_byte = last_ptr->window_start;
6767 use_cluster = false;
6769 spin_unlock(&last_ptr->lock);
6772 ffe_ctl.search_start = max(ffe_ctl.search_start,
6773 first_logical_byte(fs_info, 0));
6774 ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
6775 if (ffe_ctl.search_start == hint_byte) {
6776 block_group = btrfs_lookup_block_group(fs_info,
6777 ffe_ctl.search_start);
6779 * we don't want to use the block group if it doesn't match our
6780 * allocation bits, or if it's not cached.
6782 * However if we are re-searching with an ideal block group
6783 * picked out then we don't care that the block group is cached.
6785 if (block_group && block_group_bits(block_group, flags) &&
6786 block_group->cached != BTRFS_CACHE_NO) {
6787 down_read(&space_info->groups_sem);
6788 if (list_empty(&block_group->list) ||
6791 * someone is removing this block group,
6792 * we can't jump into the have_block_group
6793 * target because our list pointers are not valid.
6796 btrfs_put_block_group(block_group);
6797 up_read(&space_info->groups_sem);
6799 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
6800 block_group->flags);
6801 btrfs_lock_block_group(block_group, delalloc);
6802 goto have_block_group;
6804 } else if (block_group) {
6805 btrfs_put_block_group(block_group);
6809 ffe_ctl.have_caching_bg = false;
6810 if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
6813 down_read(&space_info->groups_sem);
6814 list_for_each_entry(block_group,
6815 &space_info->block_groups[ffe_ctl.index], list) {
6816 /* If the block group is read-only, we can skip it entirely. */
6817 if (unlikely(block_group->ro))
6820 btrfs_grab_block_group(block_group, delalloc);
6821 ffe_ctl.search_start = block_group->key.objectid;
6824 * this can happen if we end up cycling through all the
6825 * raid types, but we want to make sure we only allocate
6826 * for the proper type.
6828 if (!block_group_bits(block_group, flags)) {
6829 u64 extra = BTRFS_BLOCK_GROUP_DUP |
6830 BTRFS_BLOCK_GROUP_RAID1_MASK |
6831 BTRFS_BLOCK_GROUP_RAID56_MASK |
6832 BTRFS_BLOCK_GROUP_RAID10;
6835 * if they asked for extra copies and this block group
6836 * doesn't provide them, bail. This does allow us to
6837 * fill raid0 from raid1.
6839 if ((flags & extra) && !(block_group->flags & extra))
6844 ffe_ctl.cached = block_group_cache_done(block_group);
6845 if (unlikely(!ffe_ctl.cached)) {
6846 ffe_ctl.have_caching_bg = true;
6847 ret = cache_block_group(block_group, 0);
6852 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6856 * Ok we want to try and use the cluster allocator, so let's look there.
6859 if (last_ptr && use_cluster) {
6860 struct btrfs_block_group_cache *cluster_bg = NULL;
6862 ret = find_free_extent_clustered(block_group, last_ptr,
6863 &ffe_ctl, &cluster_bg);
6866 if (cluster_bg && cluster_bg != block_group) {
6867 btrfs_release_block_group(block_group,
6869 block_group = cluster_bg;
6872 } else if (ret == -EAGAIN) {
6873 goto have_block_group;
6874 } else if (ret > 0) {
6877 /* ret == -ENOENT case falls through */
6880 ret = find_free_extent_unclustered(block_group, last_ptr,
6883 goto have_block_group;
6886 /* ret == 0 case falls through */
6888 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
6889 fs_info->stripesize);
6891 /* move on to the next group */
6892 if (ffe_ctl.search_start + num_bytes >
6893 block_group->key.objectid + block_group->key.offset) {
6894 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6899 if (ffe_ctl.found_offset < ffe_ctl.search_start)
6900 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6901 ffe_ctl.search_start - ffe_ctl.found_offset);
6903 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
6904 num_bytes, delalloc);
6905 if (ret == -EAGAIN) {
6906 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6910 btrfs_inc_block_group_reservations(block_group);
6912 /* we are all good, let's return */
6913 ins->objectid = ffe_ctl.search_start;
6914 ins->offset = num_bytes;
6916 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
6918 btrfs_release_block_group(block_group, delalloc);
6921 ffe_ctl.retry_clustered = false;
6922 ffe_ctl.retry_unclustered = false;
6923 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
6925 btrfs_release_block_group(block_group, delalloc);
6928 up_read(&space_info->groups_sem);
6930 ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
6931 full_search, use_cluster);
6935 if (ret == -ENOSPC) {
6937 * Use ffe_ctl->total_free_space as fallback if we can't find
6938 * any contiguous hole.
6940 if (!ffe_ctl.max_extent_size)
6941 ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
6942 spin_lock(&space_info->lock);
6943 space_info->max_extent_size = ffe_ctl.max_extent_size;
6944 spin_unlock(&space_info->lock);
6945 ins->offset = ffe_ctl.max_extent_size;
6951 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
6952 * hole that is at least as big as @num_bytes.
6954 * @root - The root that will contain this extent
6956 * @ram_bytes - The amount of space in ram that @num_bytes take. This
6957 * is used for accounting purposes. This value differs
6958 * from @num_bytes only in the case of compressed extents.
6960 * @num_bytes - Number of bytes to allocate on-disk.
6962 * @min_alloc_size - Indicates the minimum amount of space that the
6963 * allocator should try to satisfy. In some cases
6964 * @num_bytes may be larger than what is required and if
6965 * the filesystem is fragmented then allocation fails.
6966 * However, the presence of @min_alloc_size gives a
6967 * chance to try and satisfy the smaller allocation.
6969 * @empty_size - A hint that you plan on doing more COW. This is the
6970 * size in bytes the allocator should try to find free
6971 * next to the block it returns. This is just a hint and
6972 * may be ignored by the allocator.
6974 * @hint_byte - Hint to the allocator to start searching above the byte
6975 * address passed. It might be ignored.
6977 * @ins - This key is modified to record the found hole. It will
6978 * have the following values:
6979 * ins->objectid == start position
6980 * ins->flags = BTRFS_EXTENT_ITEM_KEY
6981 * ins->offset == the size of the hole.
6983 * @is_data - Boolean flag indicating whether an extent is
6984 * allocated for data (true) or metadata (false)
6986 * @delalloc - Boolean flag indicating whether this allocation is for
6987 * delalloc or not. If 'true' data_rwsem of block groups
6988 * is going to be acquired.
6991 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
6992 * case -ENOSPC is returned then @ins->offset will contain the size of the
6993 * largest available hole the allocator managed to find.
6995 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
6996 u64 num_bytes, u64 min_alloc_size,
6997 u64 empty_size, u64 hint_byte,
6998 struct btrfs_key *ins, int is_data, int delalloc)
7000 struct btrfs_fs_info *fs_info = root->fs_info;
7001 bool final_tried = num_bytes == min_alloc_size;
7005 flags = get_alloc_profile_by_root(root, is_data);
7007 WARN_ON(num_bytes < fs_info->sectorsize);
7008 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
7009 hint_byte, ins, flags, delalloc);
7010 if (!ret && !is_data) {
7011 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
7012 } else if (ret == -ENOSPC) {
7013 if (!final_tried && ins->offset) {
7014 num_bytes = min(num_bytes >> 1, ins->offset);
7015 num_bytes = round_down(num_bytes,
7016 fs_info->sectorsize);
7017 num_bytes = max(num_bytes, min_alloc_size);
7018 ram_bytes = num_bytes;
7019 if (num_bytes == min_alloc_size)
7022 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
7023 struct btrfs_space_info *sinfo;
7025 sinfo = btrfs_find_space_info(fs_info, flags);
7027 "allocation failed flags %llu, wanted %llu",
7030 btrfs_dump_space_info(fs_info, sinfo,
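
/*
 * Illustrative sketch, not part of the original file: a minimal caller of
 * btrfs_reserve_extent() following the contract documented above.  The helper
 * name and its policy (data allocation, fall back down to one sector, no
 * delalloc locking) are hypothetical.
 */
static int example_reserve_data_extent(struct btrfs_root *root, u64 num_bytes,
					u64 hint_byte, struct btrfs_key *ins)
{
	int ret;

	/* Ask for num_bytes but let the allocator halve down to sectorsize. */
	ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
				   root->fs_info->sectorsize, 0, hint_byte,
				   ins, 1, 0);
	/* On -ENOSPC, ins->offset holds the largest hole that was found. */
	return ret;
}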
7038 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7040 int pin, int delalloc)
7042 struct btrfs_block_group_cache *cache;
7045 cache = btrfs_lookup_block_group(fs_info, start);
7047 btrfs_err(fs_info, "Unable to find block group for %llu",
7053 pin_down_extent(cache, start, len, 1);
7055 if (btrfs_test_opt(fs_info, DISCARD))
7056 ret = btrfs_discard_extent(fs_info, start, len, NULL);
7057 btrfs_add_free_space(cache, start, len);
7058 btrfs_free_reserved_bytes(cache, len, delalloc);
7059 trace_btrfs_reserved_extent_free(fs_info, start, len);
7062 btrfs_put_block_group(cache);
7066 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7067 u64 start, u64 len, int delalloc)
7069 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
7072 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
7075 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
7078 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7079 u64 parent, u64 root_objectid,
7080 u64 flags, u64 owner, u64 offset,
7081 struct btrfs_key *ins, int ref_mod)
7083 struct btrfs_fs_info *fs_info = trans->fs_info;
7085 struct btrfs_extent_item *extent_item;
7086 struct btrfs_extent_inline_ref *iref;
7087 struct btrfs_path *path;
7088 struct extent_buffer *leaf;
7093 type = BTRFS_SHARED_DATA_REF_KEY;
7095 type = BTRFS_EXTENT_DATA_REF_KEY;
7097 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7099 path = btrfs_alloc_path();
7103 path->leave_spinning = 1;
7104 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7107 btrfs_free_path(path);
7111 leaf = path->nodes[0];
7112 extent_item = btrfs_item_ptr(leaf, path->slots[0],
7113 struct btrfs_extent_item);
7114 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7115 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7116 btrfs_set_extent_flags(leaf, extent_item,
7117 flags | BTRFS_EXTENT_FLAG_DATA);
7119 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7120 btrfs_set_extent_inline_ref_type(leaf, iref, type);
7122 struct btrfs_shared_data_ref *ref;
7123 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7124 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7125 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7127 struct btrfs_extent_data_ref *ref;
7128 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7129 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7130 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7131 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7132 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7135 btrfs_mark_buffer_dirty(path->nodes[0]);
7136 btrfs_free_path(path);
7138 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
7142 ret = update_block_group(trans, ins->objectid, ins->offset, 1);
7143 if (ret) { /* -ENOENT, logic error */
7144 btrfs_err(fs_info, "update block group failed for %llu %llu",
7145 ins->objectid, ins->offset);
7148 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
7152 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7153 struct btrfs_delayed_ref_node *node,
7154 struct btrfs_delayed_extent_op *extent_op)
7156 struct btrfs_fs_info *fs_info = trans->fs_info;
7158 struct btrfs_extent_item *extent_item;
7159 struct btrfs_key extent_key;
7160 struct btrfs_tree_block_info *block_info;
7161 struct btrfs_extent_inline_ref *iref;
7162 struct btrfs_path *path;
7163 struct extent_buffer *leaf;
7164 struct btrfs_delayed_tree_ref *ref;
7165 u32 size = sizeof(*extent_item) + sizeof(*iref);
7167 u64 flags = extent_op->flags_to_set;
7168 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
7170 ref = btrfs_delayed_node_to_tree_ref(node);
7172 extent_key.objectid = node->bytenr;
7173 if (skinny_metadata) {
7174 extent_key.offset = ref->level;
7175 extent_key.type = BTRFS_METADATA_ITEM_KEY;
7176 num_bytes = fs_info->nodesize;
7178 extent_key.offset = node->num_bytes;
7179 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
7180 size += sizeof(*block_info);
7181 num_bytes = node->num_bytes;
7184 path = btrfs_alloc_path();
7188 path->leave_spinning = 1;
7189 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7192 btrfs_free_path(path);
7196 leaf = path->nodes[0];
7197 extent_item = btrfs_item_ptr(leaf, path->slots[0],
7198 struct btrfs_extent_item);
7199 btrfs_set_extent_refs(leaf, extent_item, 1);
7200 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7201 btrfs_set_extent_flags(leaf, extent_item,
7202 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7204 if (skinny_metadata) {
7205 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7207 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7208 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
7209 btrfs_set_tree_block_level(leaf, block_info, ref->level);
7210 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7213 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
7214 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7215 btrfs_set_extent_inline_ref_type(leaf, iref,
7216 BTRFS_SHARED_BLOCK_REF_KEY);
7217 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
7219 btrfs_set_extent_inline_ref_type(leaf, iref,
7220 BTRFS_TREE_BLOCK_REF_KEY);
7221 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
7224 btrfs_mark_buffer_dirty(leaf);
7225 btrfs_free_path(path);
7227 ret = remove_from_free_space_tree(trans, extent_key.objectid,
7232 ret = update_block_group(trans, extent_key.objectid,
7233 fs_info->nodesize, 1);
7234 if (ret) { /* -ENOENT, logic error */
7235 btrfs_err(fs_info, "update block group failed for %llu %llu",
7236 extent_key.objectid, extent_key.offset);
7240 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
7245 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7246 struct btrfs_root *root, u64 owner,
7247 u64 offset, u64 ram_bytes,
7248 struct btrfs_key *ins)
7250 struct btrfs_ref generic_ref = { 0 };
7253 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
7255 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
7256 ins->objectid, ins->offset, 0);
7257 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
7258 btrfs_ref_tree_mod(root->fs_info, &generic_ref);
7259 ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
7260 ram_bytes, NULL, NULL);
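
/*
 * Illustrative pairing, not from the original source: a data write path first
 * reserves the on-disk range and later queues the delayed ref for it once the
 * file extent item is about to be inserted.  The objectid and file_offset
 * values below are hypothetical placeholders.
 *
 *	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
 *				   0, 0, &ins, 1, 1);
 *	(submit the data write into [ins.objectid, ins.objectid + ins.offset))
 *	ret = btrfs_alloc_reserved_file_extent(trans, root, objectid,
 *					       file_offset, len, &ins);
 */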
7265 * this is used by the tree logging recovery code. It records that
7266 * an extent has been allocated and makes sure to clear the free
7267 * space cache bits as well
7269 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7270 u64 root_objectid, u64 owner, u64 offset,
7271 struct btrfs_key *ins)
7273 struct btrfs_fs_info *fs_info = trans->fs_info;
7275 struct btrfs_block_group_cache *block_group;
7276 struct btrfs_space_info *space_info;
7279 * Mixed block groups will exclude before processing the log so we only
7280 * need to do the exclude dance if this fs isn't mixed.
7282 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
7283 ret = __exclude_logged_extent(fs_info, ins->objectid,
7289 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
7293 space_info = block_group->space_info;
7294 spin_lock(&space_info->lock);
7295 spin_lock(&block_group->lock);
7296 space_info->bytes_reserved += ins->offset;
7297 block_group->reserved += ins->offset;
7298 spin_unlock(&block_group->lock);
7299 spin_unlock(&space_info->lock);
7301 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
7303 btrfs_put_block_group(block_group);
7307 static struct extent_buffer *
7308 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7309 u64 bytenr, int level, u64 owner)
7311 struct btrfs_fs_info *fs_info = root->fs_info;
7312 struct extent_buffer *buf;
7314 buf = btrfs_find_create_tree_block(fs_info, bytenr);
7319 * Extra safety check in case the extent tree is corrupted and extent
7320 * allocator chooses to use a tree block which is already used and locked.
7323 if (buf->lock_owner == current->pid) {
7324 btrfs_err_rl(fs_info,
7325 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
7326 buf->start, btrfs_header_owner(buf), current->pid);
7327 free_extent_buffer(buf);
7328 return ERR_PTR(-EUCLEAN);
7331 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7332 btrfs_tree_lock(buf);
7333 btrfs_clean_tree_block(buf);
7334 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7336 btrfs_set_lock_blocking_write(buf);
7337 set_extent_buffer_uptodate(buf);
7339 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
7340 btrfs_set_header_level(buf, level);
7341 btrfs_set_header_bytenr(buf, buf->start);
7342 btrfs_set_header_generation(buf, trans->transid);
7343 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
7344 btrfs_set_header_owner(buf, owner);
7345 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
7346 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
7347 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7348 buf->log_index = root->log_transid % 2;
7350 * we allow two log transactions at a time, use different
7351 * EXTENT bit to differentiate dirty pages.
7353 if (buf->log_index == 0)
7354 set_extent_dirty(&root->dirty_log_pages, buf->start,
7355 buf->start + buf->len - 1, GFP_NOFS);
7357 set_extent_new(&root->dirty_log_pages, buf->start,
7358 buf->start + buf->len - 1);
7360 buf->log_index = -1;
7361 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7362 buf->start + buf->len - 1, GFP_NOFS);
7364 trans->dirty = true;
7365 /* this returns a buffer locked for blocking */
7369 static struct btrfs_block_rsv *
7370 use_block_rsv(struct btrfs_trans_handle *trans,
7371 struct btrfs_root *root, u32 blocksize)
7373 struct btrfs_fs_info *fs_info = root->fs_info;
7374 struct btrfs_block_rsv *block_rsv;
7375 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
7377 bool global_updated = false;
7379 block_rsv = get_block_rsv(trans, root);
7381 if (unlikely(block_rsv->size == 0))
7384 ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
7388 if (block_rsv->failfast)
7389 return ERR_PTR(ret);
7391 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7392 global_updated = true;
7393 update_global_block_rsv(fs_info);
7398 * The global reserve still exists to save us from ourselves, so don't
7399 * warn_on if we are short on our delayed refs reserve.
7401 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
7402 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
7403 static DEFINE_RATELIMIT_STATE(_rs,
7404 DEFAULT_RATELIMIT_INTERVAL * 10,
7405 /*DEFAULT_RATELIMIT_BURST*/ 1);
7406 if (__ratelimit(&_rs))
7408 "BTRFS: block rsv returned %d\n", ret);
7411 ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
7412 BTRFS_RESERVE_NO_FLUSH);
7416 * If we couldn't reserve metadata bytes try and use some from
7417 * the global reserve if its space type is the same as that of the global reserve.
7420 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7421 block_rsv->space_info == global_rsv->space_info) {
7422 ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
7426 return ERR_PTR(ret);
7429 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7430 struct btrfs_block_rsv *block_rsv, u32 blocksize)
7432 block_rsv_add_bytes(block_rsv, blocksize, false);
7433 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
7437 * finds a free extent and does all the dirty work required for allocation.
7438 * returns the tree buffer or an ERR_PTR on error.
7440 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7441 struct btrfs_root *root,
7442 u64 parent, u64 root_objectid,
7443 const struct btrfs_disk_key *key,
7444 int level, u64 hint,
7447 struct btrfs_fs_info *fs_info = root->fs_info;
7448 struct btrfs_key ins;
7449 struct btrfs_block_rsv *block_rsv;
7450 struct extent_buffer *buf;
7451 struct btrfs_delayed_extent_op *extent_op;
7452 struct btrfs_ref generic_ref = { 0 };
7455 u32 blocksize = fs_info->nodesize;
7456 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
7458 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7459 if (btrfs_is_testing(fs_info)) {
7460 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7461 level, root_objectid);
7463 root->alloc_bytenr += blocksize;
7468 block_rsv = use_block_rsv(trans, root, blocksize);
7469 if (IS_ERR(block_rsv))
7470 return ERR_CAST(block_rsv);
7472 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
7473 empty_size, hint, &ins, 0, 0);
7477 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
7481 goto out_free_reserved;
7484 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7486 parent = ins.objectid;
7487 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7491 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
7492 extent_op = btrfs_alloc_delayed_extent_op();
7498 memcpy(&extent_op->key, key, sizeof(extent_op->key));
7500 memset(&extent_op->key, 0, sizeof(extent_op->key));
7501 extent_op->flags_to_set = flags;
7502 extent_op->update_key = skinny_metadata ? false : true;
7503 extent_op->update_flags = true;
7504 extent_op->is_data = false;
7505 extent_op->level = level;
7507 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
7508 ins.objectid, ins.offset, parent);
7509 generic_ref.real_root = root->root_key.objectid;
7510 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
7511 btrfs_ref_tree_mod(fs_info, &generic_ref);
7512 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
7513 extent_op, NULL, NULL);
7515 goto out_free_delayed;
7520 btrfs_free_delayed_extent_op(extent_op);
7522 free_extent_buffer(buf);
7524 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
7526 unuse_block_rsv(fs_info, block_rsv, blocksize);
7527 return ERR_PTR(ret);
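
/*
 * Illustrative consumption of the return value, not from the original source.
 * The trailing argument is assumed to be the empty_size hint that this
 * function forwards to btrfs_reserve_extent(); check the real prototype
 * before relying on it.
 *
 *	eb = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
 *				    &disk_key, level, hint, 0);
 *	if (IS_ERR(eb))
 *		return PTR_ERR(eb);
 */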
7530 struct walk_control {
7531 u64 refs[BTRFS_MAX_LEVEL];
7532 u64 flags[BTRFS_MAX_LEVEL];
7533 struct btrfs_key update_progress;
7534 struct btrfs_key drop_progress;
7546 #define DROP_REFERENCE 1
7547 #define UPDATE_BACKREF 2
7549 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7550 struct btrfs_root *root,
7551 struct walk_control *wc,
7552 struct btrfs_path *path)
7554 struct btrfs_fs_info *fs_info = root->fs_info;
7560 struct btrfs_key key;
7561 struct extent_buffer *eb;
7566 if (path->slots[wc->level] < wc->reada_slot) {
7567 wc->reada_count = wc->reada_count * 2 / 3;
7568 wc->reada_count = max(wc->reada_count, 2);
7570 wc->reada_count = wc->reada_count * 3 / 2;
7571 wc->reada_count = min_t(int, wc->reada_count,
7572 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
7575 eb = path->nodes[wc->level];
7576 nritems = btrfs_header_nritems(eb);
7578 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7579 if (nread >= wc->reada_count)
7583 bytenr = btrfs_node_blockptr(eb, slot);
7584 generation = btrfs_node_ptr_generation(eb, slot);
7586 if (slot == path->slots[wc->level])
7589 if (wc->stage == UPDATE_BACKREF &&
7590 generation <= root->root_key.offset)
7593 /* We don't lock the tree block, it's OK to be racy here */
7594 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
7595 wc->level - 1, 1, &refs,
7597 /* We don't care about errors in readahead. */
7602 if (wc->stage == DROP_REFERENCE) {
7606 if (wc->level == 1 &&
7607 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7609 if (!wc->update_ref ||
7610 generation <= root->root_key.offset)
7612 btrfs_node_key_to_cpu(eb, &key, slot);
7613 ret = btrfs_comp_cpu_keys(&key,
7614 &wc->update_progress);
7618 if (wc->level == 1 &&
7619 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7623 readahead_tree_block(fs_info, bytenr);
7626 wc->reada_slot = slot;
7630 * helper to process tree block while walking down the tree.
7632 * when wc->stage == UPDATE_BACKREF, this function updates
7633 * back refs for pointers in the block.
7635 * NOTE: return value 1 means we should stop walking down.
7637 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7638 struct btrfs_root *root,
7639 struct btrfs_path *path,
7640 struct walk_control *wc, int lookup_info)
7642 struct btrfs_fs_info *fs_info = root->fs_info;
7643 int level = wc->level;
7644 struct extent_buffer *eb = path->nodes[level];
7645 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7648 if (wc->stage == UPDATE_BACKREF &&
7649 btrfs_header_owner(eb) != root->root_key.objectid)
7653 * when reference count of tree block is 1, it won't increase
7654 * again. once full backref flag is set, we never clear it.
7657 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
7658 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
7659 BUG_ON(!path->locks[level]);
7660 ret = btrfs_lookup_extent_info(trans, fs_info,
7661 eb->start, level, 1,
7664 BUG_ON(ret == -ENOMEM);
7667 BUG_ON(wc->refs[level] == 0);
7670 if (wc->stage == DROP_REFERENCE) {
7671 if (wc->refs[level] > 1)
7674 if (path->locks[level] && !wc->keep_locks) {
7675 btrfs_tree_unlock_rw(eb, path->locks[level]);
7676 path->locks[level] = 0;
7681 /* wc->stage == UPDATE_BACKREF */
7682 if (!(wc->flags[level] & flag)) {
7683 BUG_ON(!path->locks[level]);
7684 ret = btrfs_inc_ref(trans, root, eb, 1);
7685 BUG_ON(ret); /* -ENOMEM */
7686 ret = btrfs_dec_ref(trans, root, eb, 0);
7687 BUG_ON(ret); /* -ENOMEM */
7688 ret = btrfs_set_disk_extent_flags(trans, eb->start,
7690 btrfs_header_level(eb), 0);
7691 BUG_ON(ret); /* -ENOMEM */
7692 wc->flags[level] |= flag;
7696 * the block is shared by multiple trees, so it's not good to
7697 * keep the tree lock
7699 if (path->locks[level] && level > 0) {
7700 btrfs_tree_unlock_rw(eb, path->locks[level]);
7701 path->locks[level] = 0;
7707 * This is used to verify a ref exists for this root to deal with a bug where we
7708 * would have a drop_progress key that hadn't been updated properly.
7710 static int check_ref_exists(struct btrfs_trans_handle *trans,
7711 struct btrfs_root *root, u64 bytenr, u64 parent,
7714 struct btrfs_path *path;
7715 struct btrfs_extent_inline_ref *iref;
7718 path = btrfs_alloc_path();
7722 ret = lookup_extent_backref(trans, path, &iref, bytenr,
7723 root->fs_info->nodesize, parent,
7724 root->root_key.objectid, level, 0);
7725 btrfs_free_path(path);
7734 * helper to process tree block pointer.
7736 * when wc->stage == DROP_REFERENCE, this function checks the
7737 * reference count of the block pointed to. if the block
7738 * is shared and we need to update back refs for the subtree
7739 * rooted at the block, this function changes wc->stage to
7740 * UPDATE_BACKREF. if the block is shared and there is no
7741 * need to update back refs, this function drops the reference to the block.
7744 * NOTE: return value 1 means we should stop walking down.
7746 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7747 struct btrfs_root *root,
7748 struct btrfs_path *path,
7749 struct walk_control *wc, int *lookup_info)
7751 struct btrfs_fs_info *fs_info = root->fs_info;
7755 struct btrfs_key key;
7756 struct btrfs_key first_key;
7757 struct btrfs_ref ref = { 0 };
7758 struct extent_buffer *next;
7759 int level = wc->level;
7762 bool need_account = false;
7764 generation = btrfs_node_ptr_generation(path->nodes[level],
7765 path->slots[level]);
7767 * if the lower level block was created before the snapshot
7768 * was created, we know there is no need to update back refs for the pointer.
7771 if (wc->stage == UPDATE_BACKREF &&
7772 generation <= root->root_key.offset) {
7777 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7778 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
7779 path->slots[level]);
7781 next = find_extent_buffer(fs_info, bytenr);
7783 next = btrfs_find_create_tree_block(fs_info, bytenr);
7785 return PTR_ERR(next);
7787 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
7791 btrfs_tree_lock(next);
7792 btrfs_set_lock_blocking_write(next);
7794 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
7795 &wc->refs[level - 1],
7796 &wc->flags[level - 1]);
7800 if (unlikely(wc->refs[level - 1] == 0)) {
7801 btrfs_err(fs_info, "Missing references.");
7807 if (wc->stage == DROP_REFERENCE) {
7808 if (wc->refs[level - 1] > 1) {
7809 need_account = true;
7811 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7814 if (!wc->update_ref ||
7815 generation <= root->root_key.offset)
7818 btrfs_node_key_to_cpu(path->nodes[level], &key,
7819 path->slots[level]);
7820 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7824 wc->stage = UPDATE_BACKREF;
7825 wc->shared_level = level - 1;
7829 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7833 if (!btrfs_buffer_uptodate(next, generation, 0)) {
7834 btrfs_tree_unlock(next);
7835 free_extent_buffer(next);
7841 if (reada && level == 1)
7842 reada_walk_down(trans, root, wc, path);
7843 next = read_tree_block(fs_info, bytenr, generation, level - 1,
7846 return PTR_ERR(next);
7847 } else if (!extent_buffer_uptodate(next)) {
7848 free_extent_buffer(next);
7851 btrfs_tree_lock(next);
7852 btrfs_set_lock_blocking_write(next);
7856 ASSERT(level == btrfs_header_level(next));
7857 if (level != btrfs_header_level(next)) {
7858 btrfs_err(root->fs_info, "mismatched level");
7862 path->nodes[level] = next;
7863 path->slots[level] = 0;
7864 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7870 wc->refs[level - 1] = 0;
7871 wc->flags[level - 1] = 0;
7872 if (wc->stage == DROP_REFERENCE) {
7873 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7874 parent = path->nodes[level]->start;
7876 ASSERT(root->root_key.objectid ==
7877 btrfs_header_owner(path->nodes[level]));
7878 if (root->root_key.objectid !=
7879 btrfs_header_owner(path->nodes[level])) {
7880 btrfs_err(root->fs_info,
7881 "mismatched block owner");
7889 * If we had a drop_progress we need to verify the refs are set
7890 * as expected. If we find our ref then we know that from here
7891 * on out everything should be correct, and we can clear the ->restarted flag.
7894 if (wc->restarted) {
7895 ret = check_ref_exists(trans, root, bytenr, parent,
7906 * Reloc tree doesn't contribute to qgroup numbers, and we have
7907 * already accounted them at merge time (replace_path),
7908 * thus we could skip expensive subtree trace here.
7910 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
7912 ret = btrfs_qgroup_trace_subtree(trans, next,
7913 generation, level - 1);
7915 btrfs_err_rl(fs_info,
7916 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
7922 * We need to update the next key in our walk control so we can
7923 * update the drop_progress key accordingly. We don't care if
7924 * find_next_key doesn't find a key because that means we're at
7925 * the end and are going to clean up now.
7927 wc->drop_level = level;
7928 find_next_key(path, level, &wc->drop_progress);
7930 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
7931 fs_info->nodesize, parent);
7932 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
7933 ret = btrfs_free_extent(trans, &ref);
7942 btrfs_tree_unlock(next);
7943 free_extent_buffer(next);
7949 * helper to process tree block while walking up the tree.
7951 * when wc->stage == DROP_REFERENCE, this function drops
7952 * reference count on the block.
7954 * when wc->stage == UPDATE_BACKREF, this function changes
7955 * wc->stage back to DROP_REFERENCE if we changed wc->stage
7956 * to UPDATE_BACKREF previously while processing the block.
7958 * NOTE: return value 1 means we should stop walking up.
7960 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7961 struct btrfs_root *root,
7962 struct btrfs_path *path,
7963 struct walk_control *wc)
7965 struct btrfs_fs_info *fs_info = root->fs_info;
7967 int level = wc->level;
7968 struct extent_buffer *eb = path->nodes[level];
7971 if (wc->stage == UPDATE_BACKREF) {
7972 BUG_ON(wc->shared_level < level);
7973 if (level < wc->shared_level)
7976 ret = find_next_key(path, level + 1, &wc->update_progress);
7980 wc->stage = DROP_REFERENCE;
7981 wc->shared_level = -1;
7982 path->slots[level] = 0;
7985 * check reference count again if the block isn't locked.
7986 * we should start walking down the tree again if the reference count is one.
7989 if (!path->locks[level]) {
7991 btrfs_tree_lock(eb);
7992 btrfs_set_lock_blocking_write(eb);
7993 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7995 ret = btrfs_lookup_extent_info(trans, fs_info,
7996 eb->start, level, 1,
8000 btrfs_tree_unlock_rw(eb, path->locks[level]);
8001 path->locks[level] = 0;
8004 BUG_ON(wc->refs[level] == 0);
8005 if (wc->refs[level] == 1) {
8006 btrfs_tree_unlock_rw(eb, path->locks[level]);
8007 path->locks[level] = 0;
8013 /* wc->stage == DROP_REFERENCE */
8014 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8016 if (wc->refs[level] == 1) {
8018 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8019 ret = btrfs_dec_ref(trans, root, eb, 1);
8021 ret = btrfs_dec_ref(trans, root, eb, 0);
8022 BUG_ON(ret); /* -ENOMEM */
8023 if (is_fstree(root->root_key.objectid)) {
8024 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
8026 btrfs_err_rl(fs_info,
8027 "error %d accounting leaf items, quota is out of sync, rescan required",
8032 /* make block locked assertion in btrfs_clean_tree_block happy */
8033 if (!path->locks[level] &&
8034 btrfs_header_generation(eb) == trans->transid) {
8035 btrfs_tree_lock(eb);
8036 btrfs_set_lock_blocking_write(eb);
8037 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8039 btrfs_clean_tree_block(eb);
8042 if (eb == root->node) {
8043 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8045 else if (root->root_key.objectid != btrfs_header_owner(eb))
8046 goto owner_mismatch;
8048 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8049 parent = path->nodes[level + 1]->start;
8050 else if (root->root_key.objectid !=
8051 btrfs_header_owner(path->nodes[level + 1]))
8052 goto owner_mismatch;
8055 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8057 wc->refs[level] = 0;
8058 wc->flags[level] = 0;
8062 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
8063 btrfs_header_owner(eb), root->root_key.objectid);
8067 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8068 struct btrfs_root *root,
8069 struct btrfs_path *path,
8070 struct walk_control *wc)
8072 int level = wc->level;
8073 int lookup_info = 1;
8076 while (level >= 0) {
8077 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8084 if (path->slots[level] >=
8085 btrfs_header_nritems(path->nodes[level]))
8088 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8090 path->slots[level]++;
8099 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8100 struct btrfs_root *root,
8101 struct btrfs_path *path,
8102 struct walk_control *wc, int max_level)
8104 int level = wc->level;
8107 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8108 while (level < max_level && path->nodes[level]) {
8110 if (path->slots[level] + 1 <
8111 btrfs_header_nritems(path->nodes[level])) {
8112 path->slots[level]++;
8115 ret = walk_up_proc(trans, root, path, wc);
8121 if (path->locks[level]) {
8122 btrfs_tree_unlock_rw(path->nodes[level],
8123 path->locks[level]);
8124 path->locks[level] = 0;
8126 free_extent_buffer(path->nodes[level]);
8127 path->nodes[level] = NULL;
8135 * drop a subvolume tree.
8137 * this function traverses the tree freeing any blocks that are only
8138 * referenced by the tree.
8140 * when a shared tree block is found, this function decreases its
8141 * reference count by one. if update_ref is true, this function
8142 * also makes sure backrefs for the shared block and all lower level
8143 * blocks are properly updated.
8145 * If called with for_reloc == 0, may exit early with -EAGAIN
8147 int btrfs_drop_snapshot(struct btrfs_root *root,
8148 struct btrfs_block_rsv *block_rsv, int update_ref,
8151 struct btrfs_fs_info *fs_info = root->fs_info;
8152 struct btrfs_path *path;
8153 struct btrfs_trans_handle *trans;
8154 struct btrfs_root *tree_root = fs_info->tree_root;
8155 struct btrfs_root_item *root_item = &root->root_item;
8156 struct walk_control *wc;
8157 struct btrfs_key key;
8161 bool root_dropped = false;
8163 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
8165 path = btrfs_alloc_path();
8171 wc = kzalloc(sizeof(*wc), GFP_NOFS);
8173 btrfs_free_path(path);
8178 trans = btrfs_start_transaction(tree_root, 0);
8179 if (IS_ERR(trans)) {
8180 err = PTR_ERR(trans);
8184 err = btrfs_run_delayed_items(trans);
8189 trans->block_rsv = block_rsv;
8192 * This will help us catch people modifying the fs tree while we're
8193 * dropping it. It is unsafe to mess with the fs tree while it's being
8194 * dropped as we unlock the root node and parent nodes as we walk down
8195 * the tree, assuming nothing will change. If something does change
8196 * then we'll have stale information and drop references to blocks we've already dropped.
8199 set_bit(BTRFS_ROOT_DELETING, &root->state);
8200 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
8201 level = btrfs_header_level(root->node);
8202 path->nodes[level] = btrfs_lock_root_node(root);
8203 btrfs_set_lock_blocking_write(path->nodes[level]);
8204 path->slots[level] = 0;
8205 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8206 memset(&wc->update_progress, 0,
8207 sizeof(wc->update_progress));
8209 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
8210 memcpy(&wc->update_progress, &key,
8211 sizeof(wc->update_progress));
8213 level = root_item->drop_level;
8215 path->lowest_level = level;
8216 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8217 path->lowest_level = 0;
8225 * unlock our path, this is safe because only this
8226 * function is allowed to delete this snapshot
8228 btrfs_unlock_up_safe(path, 0);
8230 level = btrfs_header_level(root->node);
8232 btrfs_tree_lock(path->nodes[level]);
8233 btrfs_set_lock_blocking_write(path->nodes[level]);
8234 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8236 ret = btrfs_lookup_extent_info(trans, fs_info,
8237 path->nodes[level]->start,
8238 level, 1, &wc->refs[level],
8244 BUG_ON(wc->refs[level] == 0);
8246 if (level == root_item->drop_level)
8249 btrfs_tree_unlock(path->nodes[level]);
8250 path->locks[level] = 0;
8251 WARN_ON(wc->refs[level] != 1);
8256 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
8258 wc->shared_level = -1;
8259 wc->stage = DROP_REFERENCE;
8260 wc->update_ref = update_ref;
8262 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
8266 ret = walk_down_tree(trans, root, path, wc);
8272 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
8279 BUG_ON(wc->stage != DROP_REFERENCE);
8283 if (wc->stage == DROP_REFERENCE) {
8284 wc->drop_level = wc->level;
8285 btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
8287 path->slots[wc->drop_level]);
8289 btrfs_cpu_key_to_disk(&root_item->drop_progress,
8290 &wc->drop_progress);
8291 root_item->drop_level = wc->drop_level;
8293 BUG_ON(wc->level == 0);
8294 if (btrfs_should_end_transaction(trans) ||
8295 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
8296 ret = btrfs_update_root(trans, tree_root,
8300 btrfs_abort_transaction(trans, ret);
8305 btrfs_end_transaction_throttle(trans);
8306 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
8307 btrfs_debug(fs_info,
8308 "drop snapshot early exit");
8313 trans = btrfs_start_transaction(tree_root, 0);
8314 if (IS_ERR(trans)) {
8315 err = PTR_ERR(trans);
8319 trans->block_rsv = block_rsv;
8322 btrfs_release_path(path);
8326 ret = btrfs_del_root(trans, &root->root_key);
8328 btrfs_abort_transaction(trans, ret);
8333 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8334 ret = btrfs_find_root(tree_root, &root->root_key, path,
8337 btrfs_abort_transaction(trans, ret);
8340 } else if (ret > 0) {
8341 /* if we fail to delete the orphan item this time
8342 * around, it'll get picked up the next time.
8344 * The most common failure here is just -ENOENT.
8346 btrfs_del_orphan_item(trans, tree_root,
8347 root->root_key.objectid);
8351 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
8352 btrfs_add_dropped_root(trans, root);
8354 free_extent_buffer(root->node);
8355 free_extent_buffer(root->commit_root);
8356 btrfs_put_fs_root(root);
8358 root_dropped = true;
8360 btrfs_end_transaction_throttle(trans);
8363 btrfs_free_path(path);
8366 * So if we need to stop dropping the snapshot for whatever reason we
8367 * need to make sure to add it back to the dead root list so that we
8368 * keep trying to do the work later. This also cleans up roots we don't
8369 * have in the radix (like when we recover after a power fail
8370 * or unmount) so we don't leak memory.
8372 if (!for_reloc && !root_dropped)
8373 btrfs_add_dead_root(root);
8374 if (err && err != -EAGAIN)
8375 btrfs_handle_fs_error(fs_info, err, NULL);
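
/*
 * Illustrative caller, not from the original source: dropping a dead
 * subvolume with no extra reservation and no relocation semantics, which is
 * the pattern a cleaner-style caller is expected to use.
 *
 *	ret = btrfs_drop_snapshot(root, NULL, 0, 0);
 *	if (ret == -EAGAIN)
 *		(the root stays on the dead roots list and is retried later)
 */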
8380 * drop subtree rooted at tree block 'node'.
8382 * NOTE: this function will unlock and release tree block 'node'.
8383 * It is only used by the relocation code.
8385 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
8386 struct btrfs_root *root,
8387 struct extent_buffer *node,
8388 struct extent_buffer *parent)
8390 struct btrfs_fs_info *fs_info = root->fs_info;
8391 struct btrfs_path *path;
8392 struct walk_control *wc;
8398 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
8400 path = btrfs_alloc_path();
8404 wc = kzalloc(sizeof(*wc), GFP_NOFS);
8406 btrfs_free_path(path);
8410 btrfs_assert_tree_locked(parent);
8411 parent_level = btrfs_header_level(parent);
8412 extent_buffer_get(parent);
8413 path->nodes[parent_level] = parent;
8414 path->slots[parent_level] = btrfs_header_nritems(parent);
8416 btrfs_assert_tree_locked(node);
8417 level = btrfs_header_level(node);
8418 path->nodes[level] = node;
8419 path->slots[level] = 0;
8420 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8422 wc->refs[parent_level] = 1;
8423 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8425 wc->shared_level = -1;
8426 wc->stage = DROP_REFERENCE;
8429 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
8432 wret = walk_down_tree(trans, root, path, wc);
8438 wret = walk_up_tree(trans, root, path, wc, parent_level);
8446 btrfs_free_path(path);
8450 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
8456 * if restripe for this chunk_type is on, pick the target profile and
8457 * return; otherwise do the usual balance
8459 stripped = get_restripe_target(fs_info, flags);
8461 return extended_to_chunk(stripped);
8463 num_devices = fs_info->fs_devices->rw_devices;
8465 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
8466 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
8468 if (num_devices == 1) {
8469 stripped |= BTRFS_BLOCK_GROUP_DUP;
8470 stripped = flags & ~stripped;
8472 /* turn raid0 into single device chunks */
8473 if (flags & BTRFS_BLOCK_GROUP_RAID0)
8476 /* turn mirroring into duplication */
8477 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
8478 BTRFS_BLOCK_GROUP_RAID10))
8479 return stripped | BTRFS_BLOCK_GROUP_DUP;
8481 /* they already had raid on here, just return */
8482 if (flags & stripped)
8485 stripped |= BTRFS_BLOCK_GROUP_DUP;
8486 stripped = flags & ~stripped;
8488 /* switch duplicated blocks with raid1 */
8489 if (flags & BTRFS_BLOCK_GROUP_DUP)
8490 return stripped | BTRFS_BLOCK_GROUP_RAID1;
8492 /* this is drive concat, leave it alone */
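
/*
 * Worked example, illustrative only: on a filesystem reduced to a single rw
 * device mirroring cannot be maintained, so a RAID1 block group's profile
 * bits are stripped and replaced with BTRFS_BLOCK_GROUP_DUP.  With two or
 * more rw devices the final branch converts DUP back to
 * BTRFS_BLOCK_GROUP_RAID1, and flags that already carry a RAID bit are
 * returned unchanged.
 */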
8498 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
8500 struct btrfs_space_info *sinfo = cache->space_info;
8503 u64 min_allocable_bytes;
8507 * We need some metadata space and system metadata space for
8508 * allocating chunks in some corner cases before we force the block
8509 * group to be read-only.
8512 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
8514 min_allocable_bytes = SZ_1M;
8516 min_allocable_bytes = 0;
8518 spin_lock(&sinfo->lock);
8519 spin_lock(&cache->lock);
8527 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8528 cache->bytes_super - btrfs_block_group_used(&cache->item);
8529 sinfo_used = btrfs_space_info_used(sinfo, true);
8531 if (sinfo_used + num_bytes + min_allocable_bytes <=
8532 sinfo->total_bytes) {
8533 sinfo->bytes_readonly += num_bytes;
8535 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
8539 spin_unlock(&cache->lock);
8540 spin_unlock(&sinfo->lock);
8541 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
8542 btrfs_info(cache->fs_info,
8543 "unable to make block group %llu ro",
8544 cache->key.objectid);
8545 btrfs_info(cache->fs_info,
8546 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
8547 sinfo_used, num_bytes, min_allocable_bytes);
8548 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
8553 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
8556 struct btrfs_fs_info *fs_info = cache->fs_info;
8557 struct btrfs_trans_handle *trans;
8562 trans = btrfs_join_transaction(fs_info->extent_root);
8564 return PTR_ERR(trans);
8567 * we're not allowed to set block groups readonly after the dirty
8568 * block groups cache has started writing. If it already started,
8569 * back off and let this transaction commit
8571 mutex_lock(&fs_info->ro_block_group_mutex);
8572 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
8573 u64 transid = trans->transid;
8575 mutex_unlock(&fs_info->ro_block_group_mutex);
8576 btrfs_end_transaction(trans);
8578 ret = btrfs_wait_for_commit(fs_info, transid);
8585 * if we are changing raid levels, try to allocate a corresponding
8586 * block group with the new raid level.
8588 alloc_flags = update_block_group_flags(fs_info, cache->flags);
8589 if (alloc_flags != cache->flags) {
8590 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
8592 * ENOSPC is allowed here, we may have enough space
8593 * already allocated at the new raid level to carry on.
8602 ret = inc_block_group_ro(cache, 0);
8605 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
8606 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
8609 ret = inc_block_group_ro(cache, 0);
8611 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
8612 alloc_flags = update_block_group_flags(fs_info, cache->flags);
8613 mutex_lock(&fs_info->chunk_mutex);
8614 check_system_chunk(trans, alloc_flags);
8615 mutex_unlock(&fs_info->chunk_mutex);
8617 mutex_unlock(&fs_info->ro_block_group_mutex);
8619 btrfs_end_transaction(trans);
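
/*
 * Illustrative usage, not from the original source: callers such as
 * relocation or scrub mark a block group read-only around the operation and
 * drop the read-only count again afterwards.
 *
 *	ret = btrfs_inc_block_group_ro(cache);
 *	if (ret)
 *		return ret;
 *	(move or verify the extents in this block group)
 *	btrfs_dec_block_group_ro(cache);
 */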
8623 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
8625 u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
8627 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
8631 * helper to account the unused space of all the readonly block groups in the
8632 * space_info. takes mirrors into account.
8634 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8636 struct btrfs_block_group_cache *block_group;
8640 /* It's df, we don't care if it's racy */
8641 if (list_empty(&sinfo->ro_bgs))
8644 spin_lock(&sinfo->lock);
8645 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
8646 spin_lock(&block_group->lock);
8648 if (!block_group->ro) {
8649 spin_unlock(&block_group->lock);
8653 factor = btrfs_bg_type_to_factor(block_group->flags);
8654 free_bytes += (block_group->key.offset -
8655 btrfs_block_group_used(&block_group->item)) *
8658 spin_unlock(&block_group->lock);
8660 spin_unlock(&sinfo->lock);
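
/*
 * Worked example, illustrative only: a read-only RAID1 block group with
 * key.offset == 1GiB and 256MiB used has a factor of 2 (two mirrored copies),
 * so it contributes (1GiB - 256MiB) * 2 = 1.5GiB of raw free space to the
 * total this helper returns.
 */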
8665 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
8667 struct btrfs_space_info *sinfo = cache->space_info;
8672 spin_lock(&sinfo->lock);
8673 spin_lock(&cache->lock);
8675 num_bytes = cache->key.offset - cache->reserved -
8676 cache->pinned - cache->bytes_super -
8677 btrfs_block_group_used(&cache->item);
8678 sinfo->bytes_readonly -= num_bytes;
8679 list_del_init(&cache->ro_list);
8681 spin_unlock(&cache->lock);
8682 spin_unlock(&sinfo->lock);
8686 * Checks to see if it's even possible to relocate this block group.
8688 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
8689 * ok to go ahead and try.
8691 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
8693 struct btrfs_block_group_cache *block_group;
8694 struct btrfs_space_info *space_info;
8695 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
8696 struct btrfs_device *device;
8706 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
8708 block_group = btrfs_lookup_block_group(fs_info, bytenr);
8710 /* odd, couldn't find the block group, leave it alone */
8714 "can't find block group for bytenr %llu",
8719 min_free = btrfs_block_group_used(&block_group->item);
8721 /* no bytes used, we're good */
8725 space_info = block_group->space_info;
8726 spin_lock(&space_info->lock);
8728 full = space_info->full;
8731 * if this is the last block group we have in this space, we can't
8732 * relocate it unless we're able to allocate a new chunk below.
8734 * Otherwise, we need to make sure we have room in the space to handle
8735 * all of the extents from this block group. If we can, we're good
8737 if ((space_info->total_bytes != block_group->key.offset) &&
8738 (btrfs_space_info_used(space_info, false) + min_free <
8739 space_info->total_bytes)) {
8740 spin_unlock(&space_info->lock);
8743 spin_unlock(&space_info->lock);
8746 * ok we don't have enough space, but maybe we have free space on our
8747 * devices to allocate new chunks for relocation, so loop through our
8748 * alloc devices and guess if we have enough space. if this block
8749 * group is going to be restriped, run checks against the target
8750 * profile instead of the current one.
8762 target = get_restripe_target(fs_info, block_group->flags);
8764 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
8767 * this is just a balance, so if we were marked as full
8768 * we know there is no space for a new chunk
8773 "no space to alloc new chunk for block group %llu",
8774 block_group->key.objectid);
8778 index = btrfs_bg_flags_to_raid_index(block_group->flags);
8781 if (index == BTRFS_RAID_RAID10) {
8785 } else if (index == BTRFS_RAID_RAID1) {
8787 } else if (index == BTRFS_RAID_DUP) {
8790 } else if (index == BTRFS_RAID_RAID0) {
8791 dev_min = fs_devices->rw_devices;
8792 min_free = div64_u64(min_free, dev_min);
8795 mutex_lock(&fs_info->chunk_mutex);
8796 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8800 * check to make sure we can actually find a chunk with enough
8801 * space to fit our block group in.
8803 if (device->total_bytes > device->bytes_used + min_free &&
8804 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
8805 ret = find_free_dev_extent(device, min_free,
8810 if (dev_nr >= dev_min)
8816 if (debug && ret == -1)
8818 "no space to allocate a new chunk for block group %llu",
8819 block_group->key.objectid);
8820 mutex_unlock(&fs_info->chunk_mutex);
8822 btrfs_put_block_group(block_group);
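
/*
 * Illustrative caller, not from the original source: balance-style code is
 * expected to skip block groups this helper rejects.
 *
 *	if (btrfs_can_relocate(fs_info, block_group->key.objectid) < 0)
 *		continue;	(try the next block group instead)
 */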
8826 static int find_first_block_group(struct btrfs_fs_info *fs_info,
8827 struct btrfs_path *path,
8828 struct btrfs_key *key)
8830 struct btrfs_root *root = fs_info->extent_root;
8832 struct btrfs_key found_key;
8833 struct extent_buffer *leaf;
8834 struct btrfs_block_group_item bg;
8838 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8843 slot = path->slots[0];
8844 leaf = path->nodes[0];
8845 if (slot >= btrfs_header_nritems(leaf)) {
8846 ret = btrfs_next_leaf(root, path);
8853 btrfs_item_key_to_cpu(leaf, &found_key, slot);
8855 if (found_key.objectid >= key->objectid &&
8856 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
8857 struct extent_map_tree *em_tree;
8858 struct extent_map *em;
8860 em_tree = &root->fs_info->mapping_tree;
8861 read_lock(&em_tree->lock);
8862 em = lookup_extent_mapping(em_tree, found_key.objectid,
8864 read_unlock(&em_tree->lock);
8867 "logical %llu len %llu found bg but no related chunk",
8868 found_key.objectid, found_key.offset);
8870 } else if (em->start != found_key.objectid ||
8871 em->len != found_key.offset) {
8873 "block group %llu len %llu mismatch with chunk %llu len %llu",
8874 found_key.objectid, found_key.offset,
8875 em->start, em->len);
8878 read_extent_buffer(leaf, &bg,
8879 btrfs_item_ptr_offset(leaf, slot),
8881 flags = btrfs_block_group_flags(&bg) &
8882 BTRFS_BLOCK_GROUP_TYPE_MASK;
8884 if (flags != (em->map_lookup->type &
8885 BTRFS_BLOCK_GROUP_TYPE_MASK)) {
8887 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
8889 found_key.offset, flags,
8890 (BTRFS_BLOCK_GROUP_TYPE_MASK &
8891 em->map_lookup->type));
8897 free_extent_map(em);
8906 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8908 struct btrfs_block_group_cache *block_group;
8912 struct inode *inode;
8914 block_group = btrfs_lookup_first_block_group(info, last);
8915 while (block_group) {
8916 wait_block_group_cache_done(block_group);
8917 spin_lock(&block_group->lock);
8918 if (block_group->iref)
8920 spin_unlock(&block_group->lock);
8921 block_group = next_block_group(block_group);
8930 inode = block_group->inode;
8931 block_group->iref = 0;
8932 block_group->inode = NULL;
8933 spin_unlock(&block_group->lock);
8934 ASSERT(block_group->io_ctl.inode == NULL);
8936 last = block_group->key.objectid + block_group->key.offset;
8937 btrfs_put_block_group(block_group);
8942 * Must be called only after stopping all workers, since we could have block
8943 * group caching kthreads running, and therefore they could race with us if we
8944 * freed the block groups before stopping them.
8946 int btrfs_free_block_groups(struct btrfs_fs_info *info)
8948 struct btrfs_block_group_cache *block_group;
8949 struct btrfs_space_info *space_info;
8950 struct btrfs_caching_control *caching_ctl;
8953 down_write(&info->commit_root_sem);
8954 while (!list_empty(&info->caching_block_groups)) {
8955 caching_ctl = list_entry(info->caching_block_groups.next,
8956 struct btrfs_caching_control, list);
8957 list_del(&caching_ctl->list);
8958 put_caching_control(caching_ctl);
8960 up_write(&info->commit_root_sem);
8962 spin_lock(&info->unused_bgs_lock);
8963 while (!list_empty(&info->unused_bgs)) {
8964 block_group = list_first_entry(&info->unused_bgs,
8965 struct btrfs_block_group_cache,
8967 list_del_init(&block_group->bg_list);
8968 btrfs_put_block_group(block_group);
8970 spin_unlock(&info->unused_bgs_lock);
8972 spin_lock(&info->block_group_cache_lock);
8973 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8974 block_group = rb_entry(n, struct btrfs_block_group_cache,
8976 rb_erase(&block_group->cache_node,
8977 &info->block_group_cache_tree);
8978 RB_CLEAR_NODE(&block_group->cache_node);
8979 spin_unlock(&info->block_group_cache_lock);
8981 down_write(&block_group->space_info->groups_sem);
8982 list_del(&block_group->list);
8983 up_write(&block_group->space_info->groups_sem);
8986 * We haven't cached this block group, which means we could
8987 * possibly have excluded extents on this block group.
8989 if (block_group->cached == BTRFS_CACHE_NO ||
8990 block_group->cached == BTRFS_CACHE_ERROR)
8991 free_excluded_extents(block_group);
8993 btrfs_remove_free_space_cache(block_group);
8994 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
8995 ASSERT(list_empty(&block_group->dirty_list));
8996 ASSERT(list_empty(&block_group->io_list));
8997 ASSERT(list_empty(&block_group->bg_list));
8998 ASSERT(atomic_read(&block_group->count) == 1);
8999 btrfs_put_block_group(block_group);
9001 spin_lock(&info->block_group_cache_lock);
9003 spin_unlock(&info->block_group_cache_lock);
9005 /* now that all the block groups are freed, go through and
9006 * free all the space_info structs. This is only called during
9007 * the final stages of unmount, and so we know nobody is
9008 * using them. We call synchronize_rcu() once before we start,
9009 * just to be on the safe side.
9013 release_global_block_rsv(info);
9015 while (!list_empty(&info->space_info)) {
9018 space_info = list_entry(info->space_info.next,
9019 struct btrfs_space_info,
9023 * Do not hide this behind enospc_debug, this is actually
9024 * important and indicates a real bug if this happens.
9026 if (WARN_ON(space_info->bytes_pinned > 0 ||
9027 space_info->bytes_reserved > 0 ||
9028 space_info->bytes_may_use > 0))
9029 btrfs_dump_space_info(info, space_info, 0, 0);
9030 list_del(&space_info->list);
9031 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9032 struct kobject *kobj;
9033 kobj = space_info->block_group_kobjs[i];
9034 space_info->block_group_kobjs[i] = NULL;
9040 kobject_del(&space_info->kobj);
9041 kobject_put(&space_info->kobj);
9046 /* link_block_group will queue up kobjects to add when we're reclaim-safe */
9047 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9049 struct btrfs_space_info *space_info;
9050 struct raid_kobject *rkobj;
9054 spin_lock(&fs_info->pending_raid_kobjs_lock);
9055 list_splice_init(&fs_info->pending_raid_kobjs, &list);
9056 spin_unlock(&fs_info->pending_raid_kobjs_lock);
9058 list_for_each_entry(rkobj, &list, list) {
9059 space_info = btrfs_find_space_info(fs_info, rkobj->flags);
9061 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9062 "%s", btrfs_bg_type_to_raid_name(rkobj->flags));
9064 kobject_put(&rkobj->kobj);
9070 "failed to add kobject for block cache, ignoring");
9073 static void link_block_group(struct btrfs_block_group_cache *cache)
9075 struct btrfs_space_info *space_info = cache->space_info;
9076 struct btrfs_fs_info *fs_info = cache->fs_info;
9077 int index = btrfs_bg_flags_to_raid_index(cache->flags);
9080 down_write(&space_info->groups_sem);
9081 if (list_empty(&space_info->block_groups[index]))
9083 list_add_tail(&cache->list, &space_info->block_groups[index]);
9084 up_write(&space_info->groups_sem);
9087 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9089 btrfs_warn(cache->fs_info,
9090 "couldn't alloc memory for raid level kobject");
9093 rkobj->flags = cache->flags;
9094 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9096 spin_lock(&fs_info->pending_raid_kobjs_lock);
9097 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
9098 spin_unlock(&fs_info->pending_raid_kobjs_lock);
9099 space_info->block_group_kobjs[index] = &rkobj->kobj;
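/*
 * Allocate and initialize the in-memory cache structure for a block group
 * covering [start, start + size). The caller is responsible for inserting it
 * into the block group rbtree and assigning its space_info.
 */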
9103 static struct btrfs_block_group_cache *
9104 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9105 u64 start, u64 size)
9107 struct btrfs_block_group_cache *cache;
9109 cache = kzalloc(sizeof(*cache), GFP_NOFS);
9113 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9115 if (!cache->free_space_ctl) {
9120 cache->key.objectid = start;
9121 cache->key.offset = size;
9122 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9124 cache->fs_info = fs_info;
9125 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
9126 set_free_space_tree_thresholds(cache);
9128 atomic_set(&cache->count, 1);
9129 spin_lock_init(&cache->lock);
9130 init_rwsem(&cache->data_rwsem);
9131 INIT_LIST_HEAD(&cache->list);
9132 INIT_LIST_HEAD(&cache->cluster_list);
9133 INIT_LIST_HEAD(&cache->bg_list);
9134 INIT_LIST_HEAD(&cache->ro_list);
9135 INIT_LIST_HEAD(&cache->dirty_list);
9136 INIT_LIST_HEAD(&cache->io_list);
9137 btrfs_init_free_space_ctl(cache);
9138 atomic_set(&cache->trimming, 0);
9139 mutex_init(&cache->free_space_lock);
9140 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
9147 * Iterate all chunks and verify that each of them has the corresponding block group.
9150 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
9152 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
9153 struct extent_map *em;
9154 struct btrfs_block_group_cache *bg;
9159 read_lock(&map_tree->lock);
9161 * lookup_extent_mapping will return the first extent map
9162 * intersecting the range, so setting @len to 1 is enough to
9163 * get the first chunk.
9165 em = lookup_extent_mapping(map_tree, start, 1);
9166 read_unlock(&map_tree->lock);
9170 bg = btrfs_lookup_block_group(fs_info, em->start);
9173 "chunk start=%llu len=%llu doesn't have corresponding block group",
9174 em->start, em->len);
9176 free_extent_map(em);
9179 if (bg->key.objectid != em->start ||
9180 bg->key.offset != em->len ||
9181 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
9182 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9184 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
9186 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
9187 bg->key.objectid, bg->key.offset,
9188 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
9190 free_extent_map(em);
9191 btrfs_put_block_group(bg);
9194 start = em->start + em->len;
9195 free_extent_map(em);
9196 btrfs_put_block_group(bg);
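/*
 * Read all block group items from the extent tree at mount time, create the
 * in-memory caches, account their space usage in the corresponding
 * space_infos and mark empty or read-only block groups accordingly.
 */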
9201 int btrfs_read_block_groups(struct btrfs_fs_info *info)
9203 struct btrfs_path *path;
9205 struct btrfs_block_group_cache *cache;
9206 struct btrfs_space_info *space_info;
9207 struct btrfs_key key;
9208 struct btrfs_key found_key;
9209 struct extent_buffer *leaf;
9215 feature = btrfs_super_incompat_flags(info->super_copy);
9216 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
9220 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9221 path = btrfs_alloc_path();
9224 path->reada = READA_FORWARD;
9226 cache_gen = btrfs_super_cache_generation(info->super_copy);
9227 if (btrfs_test_opt(info, SPACE_CACHE) &&
9228 btrfs_super_generation(info->super_copy) != cache_gen)
9230 if (btrfs_test_opt(info, CLEAR_CACHE))
9234 ret = find_first_block_group(info, path, &key);
9240 leaf = path->nodes[0];
9241 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9243 cache = btrfs_create_block_group_cache(info, found_key.objectid,
9252 * When we mount with old space cache, we need to
9253 * set BTRFS_DC_CLEAR and set dirty flag.
9255 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9256 * truncate the old free space cache inode and set up a new one.
9258 * b) Setting 'dirty flag' makes sure that we flush
9259 * the new space cache info onto disk.
9261 if (btrfs_test_opt(info, SPACE_CACHE))
9262 cache->disk_cache_state = BTRFS_DC_CLEAR;
9265 read_extent_buffer(leaf, &cache->item,
9266 btrfs_item_ptr_offset(leaf, path->slots[0]),
9267 sizeof(cache->item));
9268 cache->flags = btrfs_block_group_flags(&cache->item);
9270 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
9271 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
9273 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
9274 cache->key.objectid);
9279 key.objectid = found_key.objectid + found_key.offset;
9280 btrfs_release_path(path);
9283 * We need to exclude the super stripes now so that the space
9284 * info has super bytes accounted for, otherwise we'll think
9285 * we have more space than we actually do.
9287 ret = exclude_super_stripes(cache);
9290 * We may have excluded something, so call this just in case.
9293 free_excluded_extents(cache);
9294 btrfs_put_block_group(cache);
9299 * check for two cases, either we are full, and therefore
9300 * don't need to bother with the caching work since we won't
9301 * find any space, or we are empty, and we can just add all
9302 * the space in and be done with it. This saves us _a_lot_ of
9303 * time, particularly in the full case.
9305 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
9306 cache->last_byte_to_unpin = (u64)-1;
9307 cache->cached = BTRFS_CACHE_FINISHED;
9308 free_excluded_extents(cache);
9309 } else if (btrfs_block_group_used(&cache->item) == 0) {
9310 cache->last_byte_to_unpin = (u64)-1;
9311 cache->cached = BTRFS_CACHE_FINISHED;
9312 add_new_free_space(cache, found_key.objectid,
9313 found_key.objectid +
9315 free_excluded_extents(cache);
9318 ret = btrfs_add_block_group_cache(info, cache);
9320 btrfs_remove_free_space_cache(cache);
9321 btrfs_put_block_group(cache);
9325 trace_btrfs_add_block_group(info, cache, 0);
9326 btrfs_update_space_info(info, cache->flags, found_key.offset,
9327 btrfs_block_group_used(&cache->item),
9328 cache->bytes_super, &space_info);
9330 cache->space_info = space_info;
9332 link_block_group(cache);
9334 set_avail_alloc_bits(info, cache->flags);
9335 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
9336 inc_block_group_ro(cache, 1);
9337 } else if (btrfs_block_group_used(&cache->item) == 0) {
9338 ASSERT(list_empty(&cache->bg_list));
9339 btrfs_mark_bg_unused(cache);
9343 list_for_each_entry_rcu(space_info, &info->space_info, list) {
9344 if (!(get_alloc_profile(info, space_info->flags) &
9345 (BTRFS_BLOCK_GROUP_RAID10 |
9346 BTRFS_BLOCK_GROUP_RAID1_MASK |
9347 BTRFS_BLOCK_GROUP_RAID56_MASK |
9348 BTRFS_BLOCK_GROUP_DUP)))
9351 * avoid allocating from un-mirrored block group if there are
9352 * mirrored block groups.
9354 list_for_each_entry(cache,
9355 &space_info->block_groups[BTRFS_RAID_RAID0],
9357 inc_block_group_ro(cache, 1);
9358 list_for_each_entry(cache,
9359 &space_info->block_groups[BTRFS_RAID_SINGLE],
9361 inc_block_group_ro(cache, 1);
9364 btrfs_add_raid_kobjects(info);
9365 init_global_block_rsv(info);
9366 ret = check_chunk_block_group_mappings(info);
9368 btrfs_free_path(path);
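/*
 * Insert the block group items for all block groups created during the
 * current transaction (queued on trans->new_bgs) into the extent tree and
 * finish their chunk allocation.
 */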
9372 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
9374 struct btrfs_fs_info *fs_info = trans->fs_info;
9375 struct btrfs_block_group_cache *block_group;
9376 struct btrfs_root *extent_root = fs_info->extent_root;
9377 struct btrfs_block_group_item item;
9378 struct btrfs_key key;
9381 if (!trans->can_flush_pending_bgs)
9384 while (!list_empty(&trans->new_bgs)) {
9385 block_group = list_first_entry(&trans->new_bgs,
9386 struct btrfs_block_group_cache,
9391 spin_lock(&block_group->lock);
9392 memcpy(&item, &block_group->item, sizeof(item));
9393 memcpy(&key, &block_group->key, sizeof(key));
9394 spin_unlock(&block_group->lock);
9396 ret = btrfs_insert_item(trans, extent_root, &key, &item,
9399 btrfs_abort_transaction(trans, ret);
9400 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
9402 btrfs_abort_transaction(trans, ret);
9403 add_block_group_free_space(trans, block_group);
9404 /* already aborted the transaction if it failed. */
9406 btrfs_delayed_refs_rsv_release(fs_info, 1);
9407 list_del_init(&block_group->bg_list);
9409 btrfs_trans_release_chunk_metadata(trans);
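/*
 * Create a new block group for the chunk at @chunk_offset. The group is
 * added to the in-memory rbtree and to trans->new_bgs; its on-disk item is
 * inserted later by btrfs_create_pending_block_groups().
 */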
9412 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
9413 u64 type, u64 chunk_offset, u64 size)
9415 struct btrfs_fs_info *fs_info = trans->fs_info;
9416 struct btrfs_block_group_cache *cache;
9419 btrfs_set_log_full_commit(trans);
9421 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
9425 btrfs_set_block_group_used(&cache->item, bytes_used);
9426 btrfs_set_block_group_chunk_objectid(&cache->item,
9427 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
9428 btrfs_set_block_group_flags(&cache->item, type);
9430 cache->flags = type;
9431 cache->last_byte_to_unpin = (u64)-1;
9432 cache->cached = BTRFS_CACHE_FINISHED;
9433 cache->needs_free_space = 1;
9434 ret = exclude_super_stripes(cache);
9437 * We may have excluded something, so call this just in case.
9440 free_excluded_extents(cache);
9441 btrfs_put_block_group(cache);
9445 add_new_free_space(cache, chunk_offset, chunk_offset + size);
9447 free_excluded_extents(cache);
9449 #ifdef CONFIG_BTRFS_DEBUG
9450 if (btrfs_should_fragment_free_space(cache)) {
9451 u64 new_bytes_used = size - bytes_used;
9453 bytes_used += new_bytes_used >> 1;
9454 fragment_free_space(cache);
9458 * Ensure the corresponding space_info object is created and
9459 * assigned to our block group. We want our bg to be added to the rbtree
9460 * with its ->space_info set.
9462 cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
9463 ASSERT(cache->space_info);
9465 ret = btrfs_add_block_group_cache(fs_info, cache);
9467 btrfs_remove_free_space_cache(cache);
9468 btrfs_put_block_group(cache);
9473 * Now that our block group has its ->space_info set and is inserted in
9474 * the rbtree, update the space info's counters.
9476 trace_btrfs_add_block_group(fs_info, cache, 1);
9477 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
9478 cache->bytes_super, &cache->space_info);
9479 update_global_block_rsv(fs_info);
9481 link_block_group(cache);
9483 list_add_tail(&cache->bg_list, &trans->new_bgs);
9484 trans->delayed_ref_updates++;
9485 btrfs_update_delayed_refs_rsv(trans);
9487 set_avail_alloc_bits(fs_info, type);
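/*
 * Clear the extended profile bits of @flags from the avail_*_alloc_bits
 * masks; called when the last block group of a given profile is removed.
 */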
9491 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9493 u64 extra_flags = chunk_to_extended(flags) &
9494 BTRFS_EXTENDED_PROFILE_MASK;
9496 write_seqlock(&fs_info->profiles_lock);
9497 if (flags & BTRFS_BLOCK_GROUP_DATA)
9498 fs_info->avail_data_alloc_bits &= ~extra_flags;
9499 if (flags & BTRFS_BLOCK_GROUP_METADATA)
9500 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
9501 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
9502 fs_info->avail_system_alloc_bits &= ~extra_flags;
9503 write_sequnlock(&fs_info->profiles_lock);
9507 * Clear incompat bits for the following feature(s):
9509 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
9510 * in the whole filesystem
9512 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
9514 if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
9515 struct list_head *head = &fs_info->space_info;
9516 struct btrfs_space_info *sinfo;
9518 list_for_each_entry_rcu(sinfo, head, list) {
9521 down_read(&sinfo->groups_sem);
9522 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
9524 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
9526 up_read(&sinfo->groups_sem);
9531 btrfs_clear_fs_incompat(fs_info, RAID56);
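/*
 * Remove the block group starting at @group_start: drop its free space
 * cache, unlink it from all in-memory structures and delete its item from
 * the extent tree. The block group must already be read-only.
 */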
9535 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9536 u64 group_start, struct extent_map *em)
9538 struct btrfs_fs_info *fs_info = trans->fs_info;
9539 struct btrfs_root *root = fs_info->extent_root;
9540 struct btrfs_path *path;
9541 struct btrfs_block_group_cache *block_group;
9542 struct btrfs_free_cluster *cluster;
9543 struct btrfs_root *tree_root = fs_info->tree_root;
9544 struct btrfs_key key;
9545 struct inode *inode;
9546 struct kobject *kobj = NULL;
9550 struct btrfs_caching_control *caching_ctl = NULL;
9552 bool remove_rsv = false;
9554 block_group = btrfs_lookup_block_group(fs_info, group_start);
9555 BUG_ON(!block_group);
9556 BUG_ON(!block_group->ro);
9558 trace_btrfs_remove_block_group(block_group);
9560 * Free the reserved super bytes from this block group before it's deleted.
9563 free_excluded_extents(block_group);
9564 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
9565 block_group->key.offset);
9567 memcpy(&key, &block_group->key, sizeof(key));
9568 index = btrfs_bg_flags_to_raid_index(block_group->flags);
9569 factor = btrfs_bg_type_to_factor(block_group->flags);
9571 /* make sure this block group isn't part of an allocation cluster */
9572 cluster = &fs_info->data_alloc_cluster;
9573 spin_lock(&cluster->refill_lock);
9574 btrfs_return_cluster_to_free_space(block_group, cluster);
9575 spin_unlock(&cluster->refill_lock);
9578 * make sure this block group isn't part of a metadata
9579 * allocation cluster
9581 cluster = &fs_info->meta_alloc_cluster;
9582 spin_lock(&cluster->refill_lock);
9583 btrfs_return_cluster_to_free_space(block_group, cluster);
9584 spin_unlock(&cluster->refill_lock);
9586 path = btrfs_alloc_path();
9593 * get the inode first so any iput calls done for the io_list
9594 * aren't the final iput (no unlinks allowed now)
9596 inode = lookup_free_space_inode(block_group, path);
9598 mutex_lock(&trans->transaction->cache_write_mutex);
9600 * Make sure our free space cache IO is done before removing the free space inode.
9603 spin_lock(&trans->transaction->dirty_bgs_lock);
9604 if (!list_empty(&block_group->io_list)) {
9605 list_del_init(&block_group->io_list);
9607 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
9609 spin_unlock(&trans->transaction->dirty_bgs_lock);
9610 btrfs_wait_cache_io(trans, block_group, path);
9611 btrfs_put_block_group(block_group);
9612 spin_lock(&trans->transaction->dirty_bgs_lock);
9615 if (!list_empty(&block_group->dirty_list)) {
9616 list_del_init(&block_group->dirty_list);
9618 btrfs_put_block_group(block_group);
9620 spin_unlock(&trans->transaction->dirty_bgs_lock);
9621 mutex_unlock(&trans->transaction->cache_write_mutex);
9623 if (!IS_ERR(inode)) {
9624 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
9626 btrfs_add_delayed_iput(inode);
9630 /* One for the block groups ref */
9631 spin_lock(&block_group->lock);
9632 if (block_group->iref) {
9633 block_group->iref = 0;
9634 block_group->inode = NULL;
9635 spin_unlock(&block_group->lock);
9638 spin_unlock(&block_group->lock);
9640 /* One for our lookup ref */
9641 btrfs_add_delayed_iput(inode);
9644 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
9645 key.offset = block_group->key.objectid;
9648 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9652 btrfs_release_path(path);
9654 ret = btrfs_del_item(trans, tree_root, path);
9657 btrfs_release_path(path);
9660 spin_lock(&fs_info->block_group_cache_lock);
9661 rb_erase(&block_group->cache_node,
9662 &fs_info->block_group_cache_tree);
9663 RB_CLEAR_NODE(&block_group->cache_node);
9665 if (fs_info->first_logical_byte == block_group->key.objectid)
9666 fs_info->first_logical_byte = (u64)-1;
9667 spin_unlock(&fs_info->block_group_cache_lock);
9669 down_write(&block_group->space_info->groups_sem);
9671 * we must use list_del_init so people can check to see if they
9672 * are still on the list after taking the semaphore
9674 list_del_init(&block_group->list);
9675 if (list_empty(&block_group->space_info->block_groups[index])) {
9676 kobj = block_group->space_info->block_group_kobjs[index];
9677 block_group->space_info->block_group_kobjs[index] = NULL;
9678 clear_avail_alloc_bits(fs_info, block_group->flags);
9680 up_write(&block_group->space_info->groups_sem);
9681 clear_incompat_bg_bits(fs_info, block_group->flags);
9687 if (block_group->has_caching_ctl)
9688 caching_ctl = get_caching_control(block_group);
9689 if (block_group->cached == BTRFS_CACHE_STARTED)
9690 wait_block_group_cache_done(block_group);
9691 if (block_group->has_caching_ctl) {
9692 down_write(&fs_info->commit_root_sem);
9694 struct btrfs_caching_control *ctl;
9696 list_for_each_entry(ctl,
9697 &fs_info->caching_block_groups, list)
9698 if (ctl->block_group == block_group) {
9700 refcount_inc(&caching_ctl->count);
9705 list_del_init(&caching_ctl->list);
9706 up_write(&fs_info->commit_root_sem);
9708 /* Once for the caching bgs list and once for us. */
9709 put_caching_control(caching_ctl);
9710 put_caching_control(caching_ctl);
9714 spin_lock(&trans->transaction->dirty_bgs_lock);
9715 WARN_ON(!list_empty(&block_group->dirty_list));
9716 WARN_ON(!list_empty(&block_group->io_list));
9717 spin_unlock(&trans->transaction->dirty_bgs_lock);
9719 btrfs_remove_free_space_cache(block_group);
9721 spin_lock(&block_group->space_info->lock);
9722 list_del_init(&block_group->ro_list);
9724 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
9725 WARN_ON(block_group->space_info->total_bytes
9726 < block_group->key.offset);
9727 WARN_ON(block_group->space_info->bytes_readonly
9728 < block_group->key.offset);
9729 WARN_ON(block_group->space_info->disk_total
9730 < block_group->key.offset * factor);
9732 block_group->space_info->total_bytes -= block_group->key.offset;
9733 block_group->space_info->bytes_readonly -= block_group->key.offset;
9734 block_group->space_info->disk_total -= block_group->key.offset * factor;
9736 spin_unlock(&block_group->space_info->lock);
9738 memcpy(&key, &block_group->key, sizeof(key));
9740 mutex_lock(&fs_info->chunk_mutex);
9741 spin_lock(&block_group->lock);
9742 block_group->removed = 1;
9744 * At this point trimming can't start on this block group, because we
9745 * removed the block group from the tree fs_info->block_group_cache_tree
9746 * so no one can find it anymore, and even if someone already got this
9747 * block group before we removed it from the rbtree, they have already
9748 * incremented block_group->trimming - if they didn't, they won't find
9749 * any free space entries because we already removed them all when we
9750 * called btrfs_remove_free_space_cache().
9752 * And we must not remove the extent map from the fs_info->mapping_tree
9753 * to prevent the same logical address range and physical device space
9754 * ranges from being reused for a new block group. This is because our
9755 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9756 * completely transactionless, so while it is trimming a range the
9757 * currently running transaction might finish and a new one start,
9758 * allowing for new block groups to be created that can reuse the same
9759 * physical device locations unless we take this special care.
9761 * There may also be an implicit trim operation if the file system
9762 * is mounted with -odiscard. The same protections must remain
9763 * in place until the extents have been discarded completely when
9764 * the transaction commit has completed.
9766 remove_em = (atomic_read(&block_group->trimming) == 0);
9767 spin_unlock(&block_group->lock);
9769 mutex_unlock(&fs_info->chunk_mutex);
9771 ret = remove_block_group_free_space(trans, block_group);
9775 btrfs_put_block_group(block_group);
9776 btrfs_put_block_group(block_group);
9778 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
9784 ret = btrfs_del_item(trans, root, path);
9789 struct extent_map_tree *em_tree;
9791 em_tree = &fs_info->mapping_tree;
9792 write_lock(&em_tree->lock);
9793 remove_extent_mapping(em_tree, em);
9794 write_unlock(&em_tree->lock);
9795 /* once for the tree */
9796 free_extent_map(em);
9800 btrfs_delayed_refs_rsv_release(fs_info, 1);
9801 btrfs_free_path(path);
9805 struct btrfs_trans_handle *
9806 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
9807 const u64 chunk_offset)
9809 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
9810 struct extent_map *em;
9811 struct map_lookup *map;
9812 unsigned int num_items;
9814 read_lock(&em_tree->lock);
9815 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
9816 read_unlock(&em_tree->lock);
9817 ASSERT(em && em->start == chunk_offset);
9820 * We need to reserve 3 + N units from the metadata space info in order
9821 * to remove a block group (done at btrfs_remove_chunk() and at
9822 * btrfs_remove_block_group()), which are used for:
9824 * 1 unit for adding the free space inode's orphan (located in the tree of tree roots).
9826 * 1 unit for deleting the block group item (located in the extent tree).
9828 * 1 unit for deleting the free space item (located in the tree of tree roots).
9830 * N units for deleting N device extent items corresponding to each
9831 * stripe (located in the device tree).
9833 * In order to remove a block group we also need to reserve units in the
9834 * system space info in order to update the chunk tree (update one or
9835 * more device items and remove one chunk item), but this is done at
9836 * btrfs_remove_chunk() through a call to check_system_chunk().
9838 map = em->map_lookup;
9839 num_items = 3 + map->num_stripes;
9840 free_extent_map(em);
9842 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
9847 * Process the unused_bgs list and remove any that don't have any allocated
9848 * space inside of them.
9850 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9852 struct btrfs_block_group_cache *block_group;
9853 struct btrfs_space_info *space_info;
9854 struct btrfs_trans_handle *trans;
9857 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
9860 spin_lock(&fs_info->unused_bgs_lock);
9861 while (!list_empty(&fs_info->unused_bgs)) {
9865 block_group = list_first_entry(&fs_info->unused_bgs,
9866 struct btrfs_block_group_cache,
9868 list_del_init(&block_group->bg_list);
9870 space_info = block_group->space_info;
9872 if (ret || btrfs_mixed_space_info(space_info)) {
9873 btrfs_put_block_group(block_group);
9876 spin_unlock(&fs_info->unused_bgs_lock);
9878 mutex_lock(&fs_info->delete_unused_bgs_mutex);
9880 /* Don't want to race with allocators so take the groups_sem */
9881 down_write(&space_info->groups_sem);
9882 spin_lock(&block_group->lock);
9883 if (block_group->reserved || block_group->pinned ||
9884 btrfs_block_group_used(&block_group->item) ||
9886 list_is_singular(&block_group->list)) {
9888 * We want to bail if we made new allocations or have
9889 * outstanding allocations in this block group. We do
9890 * the ro check in case balance is currently acting on this block group.
9893 trace_btrfs_skip_unused_block_group(block_group);
9894 spin_unlock(&block_group->lock);
9895 up_write(&space_info->groups_sem);
9898 spin_unlock(&block_group->lock);
9900 /* We don't want to force the issue, only flip if it's ok. */
9901 ret = inc_block_group_ro(block_group, 0);
9902 up_write(&space_info->groups_sem);
9909 * Want to do this before we do anything else so we can recover
9910 * properly if we fail to join the transaction.
9912 trans = btrfs_start_trans_remove_block_group(fs_info,
9913 block_group->key.objectid);
9914 if (IS_ERR(trans)) {
9915 btrfs_dec_block_group_ro(block_group);
9916 ret = PTR_ERR(trans);
9921 * We could have pending pinned extents for this block group;
9922 * just delete them, we don't care about them anymore.
9924 start = block_group->key.objectid;
9925 end = start + block_group->key.offset - 1;
9927 * Hold the unused_bg_unpin_mutex lock to avoid racing with
9928 * btrfs_finish_extent_commit(). If we are at transaction N,
9929 * another task might be running finish_extent_commit() for the
9930 * previous transaction N - 1, and have seen a range belonging
9931 * to the block group in freed_extents[] before we were able to
9932 * clear the whole block group range from freed_extents[]. This
9933 * means that task can lookup for the block group after we
9934 * unpinned it from freed_extents[] and removed it, leading to
9935 * a BUG_ON() at btrfs_unpin_extent_range().
9937 mutex_lock(&fs_info->unused_bg_unpin_mutex);
9938 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
9941 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9942 btrfs_dec_block_group_ro(block_group);
9945 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
9948 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9949 btrfs_dec_block_group_ro(block_group);
9952 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9954 /* Reset pinned so btrfs_put_block_group doesn't complain */
9955 spin_lock(&space_info->lock);
9956 spin_lock(&block_group->lock);
9958 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
9959 -block_group->pinned);
9960 space_info->bytes_readonly += block_group->pinned;
9961 percpu_counter_add_batch(&space_info->total_bytes_pinned,
9962 -block_group->pinned,
9963 BTRFS_TOTAL_BYTES_PINNED_BATCH);
9964 block_group->pinned = 0;
9966 spin_unlock(&block_group->lock);
9967 spin_unlock(&space_info->lock);
9969 /* DISCARD can flip during remount */
9970 trimming = btrfs_test_opt(fs_info, DISCARD);
9972 /* Implicit trim during transaction commit. */
9974 btrfs_get_block_group_trimming(block_group);
9977 * btrfs_remove_chunk() will abort the transaction if things go horribly wrong.
9980 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
9984 btrfs_put_block_group_trimming(block_group);
9989 * If we're not mounted with -odiscard, we can just forget
9990 * about this block group. Otherwise we'll need to wait
9991 * until transaction commit to do the actual discard.
9994 spin_lock(&fs_info->unused_bgs_lock);
9996 * A concurrent scrub might have added us to the list
9997 * fs_info->unused_bgs, so use a list_move operation
9998 * to add the block group to the deleted_bgs list.
10000 list_move(&block_group->bg_list,
10001 &trans->transaction->deleted_bgs);
10002 spin_unlock(&fs_info->unused_bgs_lock);
10003 btrfs_get_block_group(block_group);
10006 btrfs_end_transaction(trans);
10008 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10009 btrfs_put_block_group(block_group);
10010 spin_lock(&fs_info->unused_bgs_lock);
10012 spin_unlock(&fs_info->unused_bgs_lock);
10015 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10016 u64 start, u64 end)
10018 return unpin_extent_range(fs_info, start, end, false);
10022 * It used to be that old block groups would be left around forever.
10023 * Iterating over them would be enough to trim unused space. Since we
10024 * now automatically remove them, we also need to iterate over unallocated space.
10027 * We don't want a transaction for this since the discard may take a
10028 * substantial amount of time. We don't require that a transaction be
10029 * running, but we do need to take a running transaction into account
10030 * to ensure that we're not discarding chunks that were released or
10031 * allocated in the current transaction.
10033 * Holding the chunks lock will prevent other threads from allocating
10034 * or releasing chunks, but it won't prevent a running transaction
10035 * from committing and releasing the memory that the pending chunks
10036 * list head uses. For that, we need to take a reference to the
10037 * transaction and hold the commit root sem. We only need to hold
10038 * it while performing the free space search since we have already
10039 * held back allocations.
10041 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
10043 u64 start = SZ_1M, len = 0, end = 0;
10048 /* Discard not supported = nothing to do. */
10049 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
10052 /* Not writable = nothing to do. */
10053 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
10056 /* No free space = nothing to do. */
10057 if (device->total_bytes <= device->bytes_used)
10063 struct btrfs_fs_info *fs_info = device->fs_info;
10066 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10070 find_first_clear_extent_bit(&device->alloc_state, start,
10072 CHUNK_TRIMMED | CHUNK_ALLOCATED);
10074 /* Ensure we skip the reserved area in the first 1M */
10075 start = max_t(u64, start, SZ_1M);
10078 * If find_first_clear_extent_bit finds a range that spans the
10079 * end of the device, it will set end to -1; in this case it's up
10080 * to the caller to trim the value to the size of the device.
10082 end = min(end, device->total_bytes - 1);
10084 len = end - start + 1;
10086 /* We didn't find any extents */
10088 mutex_unlock(&fs_info->chunk_mutex);
10093 ret = btrfs_issue_discard(device->bdev, start, len,
10096 set_extent_bits(&device->alloc_state, start,
10099 mutex_unlock(&fs_info->chunk_mutex);
10107 if (fatal_signal_pending(current)) {
10108 ret = -ERESTARTSYS;
10119 * Trim the whole filesystem by:
10120 * 1) trimming the free space in each block group
10121 * 2) trimming the unallocated space on each device
10123 * This will also continue trimming even if a block group or device encounters
10124 * an error. The return value will be the last error, or 0 if nothing bad happened.
10127 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
10129 struct btrfs_block_group_cache *cache = NULL;
10130 struct btrfs_device *device;
10131 struct list_head *devices;
10137 u64 dev_failed = 0;
10142 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10143 for (; cache; cache = next_block_group(cache)) {
10144 if (cache->key.objectid >= (range->start + range->len)) {
10145 btrfs_put_block_group(cache);
10149 start = max(range->start, cache->key.objectid);
10150 end = min(range->start + range->len,
10151 cache->key.objectid + cache->key.offset);
10153 if (end - start >= range->minlen) {
10154 if (!block_group_cache_done(cache)) {
10155 ret = cache_block_group(cache, 0);
10161 ret = wait_block_group_cache_done(cache);
10168 ret = btrfs_trim_block_group(cache,
10174 trimmed += group_trimmed;
10184 btrfs_warn(fs_info,
10185 "failed to trim %llu block group(s), last error %d",
10186 bg_failed, bg_ret);
10187 mutex_lock(&fs_info->fs_devices->device_list_mutex);
10188 devices = &fs_info->fs_devices->devices;
10189 list_for_each_entry(device, devices, dev_list) {
10190 ret = btrfs_trim_free_extents(device, &group_trimmed);
10197 trimmed += group_trimmed;
10199 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
10202 btrfs_warn(fs_info,
10203 "failed to trim %llu device(s), last error %d",
10204 dev_failed, dev_ret);
10205 range->len = trimmed;
10212 * btrfs_{start,end}_write_no_snapshotting() are similar to
10213 * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
10214 * data into the page cache through nocow before the subvolume is snapshotted,
10215 * but to flush the data to disk after the snapshot creation, or to prevent
10216 * operations while snapshotting is ongoing that would cause the snapshot to be
10217 * inconsistent (writes followed by expanding truncates, for example).
10219 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
10221 percpu_counter_dec(&root->subv_writers->counter);
10222 cond_wake_up(&root->subv_writers->wait);
10225 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
10227 if (atomic_read(&root->will_be_snapshotted))
10230 percpu_counter_inc(&root->subv_writers->counter);
10232 * Make sure counter is updated before we check for snapshot creation.
10235 if (atomic_read(&root->will_be_snapshotted)) {
10236 btrfs_end_write_no_snapshotting(root);
10242 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
10247 ret = btrfs_start_write_no_snapshotting(root);
10250 wait_var_event(&root->will_be_snapshotted,
10251 !atomic_read(&root->will_be_snapshotted));
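/*
 * Queue a block group on fs_info->unused_bgs (taking an extra reference) so
 * that btrfs_delete_unused_bgs() can consider removing it.
 */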
10255 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
10257 struct btrfs_fs_info *fs_info = bg->fs_info;
10259 spin_lock(&fs_info->unused_bgs_lock);
10260 if (list_empty(&bg->bg_list)) {
10261 btrfs_get_block_group(bg);
10262 trace_btrfs_add_unused_block_group(bg);
10263 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
10265 spin_unlock(&fs_info->unused_bgs_lock);