// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "print-tree.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "ref-verify.h"
#include "space-info.h"

#undef SCRAMBLE_DELAYED_REFS
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);

		/*
		 * If not empty, someone is still holding the mutex of
		 * full_stripe_lock, which can only be released by the caller,
		 * and freeing the block group now would cause a use-after-free
		 * when that caller tries to release the full stripe lock.
		 *
		 * There is no better way to resolve this, so just warn.
		 */
		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}
/*
 * This adds the block group to the fs_info rb tree for the block group
 * cache.
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);
/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr.
 */
141 static struct btrfs_block_group_cache *
142 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
145 struct btrfs_block_group_cache *cache, *ret = NULL;
149 spin_lock(&info->block_group_cache_lock);
150 n = info->block_group_cache_tree.rb_node;
153 cache = rb_entry(n, struct btrfs_block_group_cache,
155 end = cache->key.objectid + cache->key.offset - 1;
156 start = cache->key.objectid;
158 if (bytenr < start) {
159 if (!contains && (!ret || start < ret->key.objectid))
162 } else if (bytenr > start) {
163 if (contains && bytenr <= end) {
174 btrfs_get_block_group(ret);
175 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
176 info->first_logical_byte = ret->key.objectid;
178 spin_unlock(&info->block_group_cache_lock);
183 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
184 u64 start, u64 num_bytes)
186 u64 end = start + num_bytes - 1;
187 set_extent_bits(&fs_info->freed_extents[0],
188 start, end, EXTENT_UPTODATE);
189 set_extent_bits(&fs_info->freed_extents[1],
190 start, end, EXTENT_UPTODATE);
194 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
196 struct btrfs_fs_info *fs_info = cache->fs_info;
199 start = cache->key.objectid;
200 end = start + cache->key.offset - 1;
202 clear_extent_bits(&fs_info->freed_extents[0],
203 start, end, EXTENT_UPTODATE);
204 clear_extent_bits(&fs_info->freed_extents[1],
205 start, end, EXTENT_UPTODATE);
208 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
210 struct btrfs_fs_info *fs_info = cache->fs_info;
216 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
217 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
218 cache->bytes_super += stripe_len;
219 ret = add_excluded_extent(fs_info, cache->key.objectid,
225 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
226 bytenr = btrfs_sb_offset(i);
227 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
228 bytenr, &logical, &nr, &stripe_len);
235 if (logical[nr] > cache->key.objectid +
239 if (logical[nr] + stripe_len <= cache->key.objectid)
243 if (start < cache->key.objectid) {
244 start = cache->key.objectid;
245 len = (logical[nr] + stripe_len) - start;
247 len = min_t(u64, stripe_len,
248 cache->key.objectid +
249 cache->key.offset - start);
252 cache->bytes_super += len;
253 ret = add_excluded_extent(fs_info, start, len);
265 static struct btrfs_caching_control *
266 get_caching_control(struct btrfs_block_group_cache *cache)
268 struct btrfs_caching_control *ctl;
270 spin_lock(&cache->lock);
271 if (!cache->caching_ctl) {
272 spin_unlock(&cache->lock);
276 ctl = cache->caching_ctl;
277 refcount_inc(&ctl->count);
278 spin_unlock(&cache->lock);
282 static void put_caching_control(struct btrfs_caching_control *ctl)
284 if (refcount_dec_and_test(&ctl->count))
288 #ifdef CONFIG_BTRFS_DEBUG
289 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
291 struct btrfs_fs_info *fs_info = block_group->fs_info;
292 u64 start = block_group->key.objectid;
293 u64 len = block_group->key.offset;
294 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
295 fs_info->nodesize : fs_info->sectorsize;
296 u64 step = chunk << 1;
298 while (len > chunk) {
299 btrfs_remove_free_space(block_group, start, chunk);
/*
 * This is only called by cache_block_group.  Since we could have freed
 * extents, we need to check the pinned_extents tree for any extents that
 * can't be used yet, because their free space will only be released when
 * the transaction commits.
 */
314 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
317 struct btrfs_fs_info *info = block_group->fs_info;
318 u64 extent_start, extent_end, size, total_added = 0;
321 while (start < end) {
322 ret = find_first_extent_bit(info->pinned_extents, start,
323 &extent_start, &extent_end,
324 EXTENT_DIRTY | EXTENT_UPTODATE,
329 if (extent_start <= start) {
330 start = extent_end + 1;
331 } else if (extent_start > start && extent_start < end) {
332 size = extent_start - start;
334 ret = btrfs_add_free_space(block_group, start,
336 BUG_ON(ret); /* -ENOMEM or logic error */
337 start = extent_end + 1;
346 ret = btrfs_add_free_space(block_group, start, size);
347 BUG_ON(ret); /* -ENOMEM or logic error */
353 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
355 struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
356 struct btrfs_fs_info *fs_info = block_group->fs_info;
357 struct btrfs_root *extent_root = fs_info->extent_root;
358 struct btrfs_path *path;
359 struct extent_buffer *leaf;
360 struct btrfs_key key;
367 path = btrfs_alloc_path();
371 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
373 #ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
388 path->skip_locking = 1;
389 path->search_commit_root = 1;
390 path->reada = READA_FORWARD;
394 key.type = BTRFS_EXTENT_ITEM_KEY;
397 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
401 leaf = path->nodes[0];
402 nritems = btrfs_header_nritems(leaf);
405 if (btrfs_fs_closing(fs_info) > 1) {
410 if (path->slots[0] < nritems) {
411 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
413 ret = find_next_key(path, 0, &key);
417 if (need_resched() ||
418 rwsem_is_contended(&fs_info->commit_root_sem)) {
420 caching_ctl->progress = last;
421 btrfs_release_path(path);
422 up_read(&fs_info->commit_root_sem);
423 mutex_unlock(&caching_ctl->mutex);
425 mutex_lock(&caching_ctl->mutex);
426 down_read(&fs_info->commit_root_sem);
430 ret = btrfs_next_leaf(extent_root, path);
435 leaf = path->nodes[0];
436 nritems = btrfs_header_nritems(leaf);
440 if (key.objectid < last) {
443 key.type = BTRFS_EXTENT_ITEM_KEY;
446 caching_ctl->progress = last;
447 btrfs_release_path(path);
451 if (key.objectid < block_group->key.objectid) {
456 if (key.objectid >= block_group->key.objectid +
457 block_group->key.offset)
460 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
461 key.type == BTRFS_METADATA_ITEM_KEY) {
462 total_found += add_new_free_space(block_group, last,
464 if (key.type == BTRFS_METADATA_ITEM_KEY)
465 last = key.objectid +
468 last = key.objectid + key.offset;
470 if (total_found > CACHING_CTL_WAKE_UP) {
473 wake_up(&caching_ctl->wait);
480 total_found += add_new_free_space(block_group, last,
481 block_group->key.objectid +
482 block_group->key.offset);
483 caching_ctl->progress = (u64)-1;
486 btrfs_free_path(path);
490 static noinline void caching_thread(struct btrfs_work *work)
492 struct btrfs_block_group_cache *block_group;
493 struct btrfs_fs_info *fs_info;
494 struct btrfs_caching_control *caching_ctl;
497 caching_ctl = container_of(work, struct btrfs_caching_control, work);
498 block_group = caching_ctl->block_group;
499 fs_info = block_group->fs_info;
501 mutex_lock(&caching_ctl->mutex);
502 down_read(&fs_info->commit_root_sem);
504 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
505 ret = load_free_space_tree(caching_ctl);
507 ret = load_extent_tree_free(caching_ctl);
509 spin_lock(&block_group->lock);
510 block_group->caching_ctl = NULL;
511 block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
512 spin_unlock(&block_group->lock);
514 #ifdef CONFIG_BTRFS_DEBUG
515 if (btrfs_should_fragment_free_space(block_group)) {
518 spin_lock(&block_group->space_info->lock);
519 spin_lock(&block_group->lock);
520 bytes_used = block_group->key.offset -
521 btrfs_block_group_used(&block_group->item);
522 block_group->space_info->bytes_used += bytes_used >> 1;
523 spin_unlock(&block_group->lock);
524 spin_unlock(&block_group->space_info->lock);
525 fragment_free_space(block_group);
529 caching_ctl->progress = (u64)-1;
531 up_read(&fs_info->commit_root_sem);
532 free_excluded_extents(block_group);
533 mutex_unlock(&caching_ctl->mutex);
535 wake_up(&caching_ctl->wait);
537 put_caching_control(caching_ctl);
538 btrfs_put_block_group(block_group);
541 static int cache_block_group(struct btrfs_block_group_cache *cache,
545 struct btrfs_fs_info *fs_info = cache->fs_info;
546 struct btrfs_caching_control *caching_ctl;
549 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
553 INIT_LIST_HEAD(&caching_ctl->list);
554 mutex_init(&caching_ctl->mutex);
555 init_waitqueue_head(&caching_ctl->wait);
556 caching_ctl->block_group = cache;
557 caching_ctl->progress = cache->key.objectid;
558 refcount_set(&caching_ctl->count, 1);
559 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
560 caching_thread, NULL, NULL);
562 spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
	 */
575 while (cache->cached == BTRFS_CACHE_FAST) {
576 struct btrfs_caching_control *ctl;
578 ctl = cache->caching_ctl;
579 refcount_inc(&ctl->count);
580 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
581 spin_unlock(&cache->lock);
585 finish_wait(&ctl->wait, &wait);
586 put_caching_control(ctl);
587 spin_lock(&cache->lock);
590 if (cache->cached != BTRFS_CACHE_NO) {
591 spin_unlock(&cache->lock);
595 WARN_ON(cache->caching_ctl);
596 cache->caching_ctl = caching_ctl;
597 cache->cached = BTRFS_CACHE_FAST;
598 spin_unlock(&cache->lock);
600 if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
601 mutex_lock(&caching_ctl->mutex);
602 ret = load_free_space_cache(cache);
604 spin_lock(&cache->lock);
606 cache->caching_ctl = NULL;
607 cache->cached = BTRFS_CACHE_FINISHED;
608 cache->last_byte_to_unpin = (u64)-1;
609 caching_ctl->progress = (u64)-1;
611 if (load_cache_only) {
612 cache->caching_ctl = NULL;
613 cache->cached = BTRFS_CACHE_NO;
615 cache->cached = BTRFS_CACHE_STARTED;
616 cache->has_caching_ctl = 1;
619 spin_unlock(&cache->lock);
620 #ifdef CONFIG_BTRFS_DEBUG
622 btrfs_should_fragment_free_space(cache)) {
625 spin_lock(&cache->space_info->lock);
626 spin_lock(&cache->lock);
627 bytes_used = cache->key.offset -
628 btrfs_block_group_used(&cache->item);
629 cache->space_info->bytes_used += bytes_used >> 1;
630 spin_unlock(&cache->lock);
631 spin_unlock(&cache->space_info->lock);
632 fragment_free_space(cache);
635 mutex_unlock(&caching_ctl->mutex);
637 wake_up(&caching_ctl->wait);
639 put_caching_control(caching_ctl);
640 free_excluded_extents(cache);
	/*
	 * We're either using the free space tree or no caching at all.
	 * Set cached to the appropriate value and wake up any waiters.
	 */
648 spin_lock(&cache->lock);
649 if (load_cache_only) {
650 cache->caching_ctl = NULL;
651 cache->cached = BTRFS_CACHE_NO;
653 cache->cached = BTRFS_CACHE_STARTED;
654 cache->has_caching_ctl = 1;
656 spin_unlock(&cache->lock);
657 wake_up(&caching_ctl->wait);
660 if (load_cache_only) {
661 put_caching_control(caching_ctl);
665 down_write(&fs_info->commit_root_sem);
666 refcount_inc(&caching_ctl->count);
667 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
668 up_write(&fs_info->commit_root_sem);
670 btrfs_get_block_group(cache);
672 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
/*
 * Return the block group that starts at or after bytenr.
 */
680 static struct btrfs_block_group_cache *
681 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
683 return block_group_cache_tree_search(info, bytenr, 0);
/*
 * Return the block group that contains the given bytenr.
 */
689 struct btrfs_block_group_cache *btrfs_lookup_block_group(
690 struct btrfs_fs_info *info,
693 return block_group_cache_tree_search(info, bytenr, 1);
696 static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
698 if (ref->type == BTRFS_REF_METADATA) {
699 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
700 return BTRFS_BLOCK_GROUP_SYSTEM;
702 return BTRFS_BLOCK_GROUP_METADATA;
704 return BTRFS_BLOCK_GROUP_DATA;
707 static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
708 struct btrfs_ref *ref)
710 struct btrfs_space_info *space_info;
711 u64 flags = generic_ref_to_space_flags(ref);
713 space_info = btrfs_find_space_info(fs_info, flags);
715 percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
716 BTRFS_TOTAL_BYTES_PINNED_BATCH);
719 static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
720 struct btrfs_ref *ref)
722 struct btrfs_space_info *space_info;
723 u64 flags = generic_ref_to_space_flags(ref);
725 space_info = btrfs_find_space_info(fs_info, flags);
727 percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
728 BTRFS_TOTAL_BYTES_PINNED_BATCH);
731 /* simple helper to search for an existing data extent at a given offset */
732 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
735 struct btrfs_key key;
736 struct btrfs_path *path;
738 path = btrfs_alloc_path();
742 key.objectid = start;
744 key.type = BTRFS_EXTENT_ITEM_KEY;
745 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
746 btrfs_free_path(path);
/*
 * Helper function to look up the reference count and flags of a tree block.
 *
 * The delayed ref head node is used to store the sum of all the reference
 * count modifications queued up in the rbtree.  The head node may also store
 * the extent flags to set.  This way you can check what the reference count
 * and extent flags will be once all of the queued delayed refs are run,
 * without actually running them.
 */
759 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
760 struct btrfs_fs_info *fs_info, u64 bytenr,
761 u64 offset, int metadata, u64 *refs, u64 *flags)
763 struct btrfs_delayed_ref_head *head;
764 struct btrfs_delayed_ref_root *delayed_refs;
765 struct btrfs_path *path;
766 struct btrfs_extent_item *ei;
767 struct extent_buffer *leaf;
768 struct btrfs_key key;
	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different.
	 */
778 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
779 offset = fs_info->nodesize;
783 path = btrfs_alloc_path();
788 path->skip_locking = 1;
789 path->search_commit_root = 1;
793 key.objectid = bytenr;
796 key.type = BTRFS_METADATA_ITEM_KEY;
798 key.type = BTRFS_EXTENT_ITEM_KEY;
800 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
804 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
805 if (path->slots[0]) {
807 btrfs_item_key_to_cpu(path->nodes[0], &key,
809 if (key.objectid == bytenr &&
810 key.type == BTRFS_EXTENT_ITEM_KEY &&
811 key.offset == fs_info->nodesize)
817 leaf = path->nodes[0];
818 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
819 if (item_size >= sizeof(*ei)) {
820 ei = btrfs_item_ptr(leaf, path->slots[0],
821 struct btrfs_extent_item);
822 num_refs = btrfs_extent_refs(leaf, ei);
823 extent_flags = btrfs_extent_flags(leaf, ei);
826 btrfs_print_v0_err(fs_info);
828 btrfs_abort_transaction(trans, ret);
830 btrfs_handle_fs_error(fs_info, ret, NULL);
835 BUG_ON(num_refs == 0);
845 delayed_refs = &trans->transaction->delayed_refs;
846 spin_lock(&delayed_refs->lock);
847 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
849 if (!mutex_trylock(&head->mutex)) {
850 refcount_inc(&head->refs);
851 spin_unlock(&delayed_refs->lock);
853 btrfs_release_path(path);
			/*
			 * Mutex was contended, block until it's released and
			 * try again.
			 */
859 mutex_lock(&head->mutex);
860 mutex_unlock(&head->mutex);
861 btrfs_put_delayed_ref_head(head);
864 spin_lock(&head->lock);
865 if (head->extent_op && head->extent_op->update_flags)
866 extent_flags |= head->extent_op->flags_to_set;
868 BUG_ON(num_refs == 0);
870 num_refs += head->ref_mod;
871 spin_unlock(&head->lock);
872 mutex_unlock(&head->mutex);
874 spin_unlock(&delayed_refs->lock);
876 WARN_ON(num_refs == 0);
880 *flags = extent_flags;
882 btrfs_free_path(path);
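
/*
 * Illustrative sketch, not part of the original file: how a caller might use
 * btrfs_lookup_extent_info() to read the effective reference count of a tree
 * block, combining the on-disk extent item with any pending delayed refs as
 * described above.  The helper name here is hypothetical; only
 * btrfs_lookup_extent_info() itself comes from this file.
 */
static inline bool example_tree_block_is_shared(struct btrfs_trans_handle *trans,
						struct btrfs_fs_info *fs_info,
						u64 bytenr, int level)
{
	u64 refs = 0;
	u64 flags = 0;

	/* For metadata, @offset carries the level and @metadata is non-zero. */
	if (btrfs_lookup_extent_info(trans, fs_info, bytenr, level, 1,
				     &refs, &flags))
		return false;

	/* More than one reference means another tree still points at it. */
	return refs > 1;
}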
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  The implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  The full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually the full back refs are generic, and
 * can be used in all cases where the implicit back refs are used.  The major
 * shortcoming of the full back refs is their overhead.  Every time a tree
 * block gets COWed, we have to update the back refs entry for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts.  The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used
 * and the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is the
 * objectid of the block's owner tree.  The key offset for the full back
 * refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */
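
/*
 * Illustrative sketch, not part of the original file: composing the b-tree
 * key for an implicit data back reference as described above.  The key
 * offset is the hash of (root objectid, inode objectid, file offset); the
 * real hash helper, hash_extent_data_ref(), is defined further down in this
 * file, so the hash value is simply passed in here and this wrapper itself
 * is hypothetical.
 */
static inline void example_implicit_data_ref_key(struct btrfs_key *key,
						 u64 extent_bytenr, u64 hash)
{
	/* objectid is the first byte of the extent being referenced */
	key->objectid = extent_bytenr;
	/* type distinguishes implicit data refs from the other backref kinds */
	key->type = BTRFS_EXTENT_DATA_REF_KEY;
	/* offset is the hash of (root objectid, inode objectid, file offset) */
	key->offset = hash;
}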
/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA,  data type is required,
 * is_data == BTRFS_REF_TYPE_ANY,   either type is OK.
 */
997 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
998 struct btrfs_extent_inline_ref *iref,
999 enum btrfs_inline_ref_type is_data)
1001 int type = btrfs_extent_inline_ref_type(eb, iref);
1002 u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1004 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1005 type == BTRFS_SHARED_BLOCK_REF_KEY ||
1006 type == BTRFS_SHARED_DATA_REF_KEY ||
1007 type == BTRFS_EXTENT_DATA_REF_KEY) {
1008 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1009 if (type == BTRFS_TREE_BLOCK_REF_KEY)
1011 if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1012 ASSERT(eb->fs_info);
1014 * Every shared one has parent tree
1015 * block, which must be aligned to
1019 IS_ALIGNED(offset, eb->fs_info->nodesize))
1022 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1023 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1025 if (type == BTRFS_SHARED_DATA_REF_KEY) {
1026 ASSERT(eb->fs_info);
1028 * Every shared one has parent tree
1029 * block, which must be aligned to
1033 IS_ALIGNED(offset, eb->fs_info->nodesize))
1037 ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1042 btrfs_print_leaf((struct extent_buffer *)eb);
1043 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1047 return BTRFS_REF_TYPE_INVALID;
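
/*
 * Illustrative sketch, not part of the original file: the expected caller
 * pattern for btrfs_get_extent_inline_ref_type().  Real callers further down
 * in this file either treat BTRFS_REF_TYPE_INVALID as corruption and bail out
 * (see lookup_inline_extent_backref()) or ASSERT that it cannot happen (see
 * extent_data_ref_count()); this hypothetical helper just shows the check.
 */
static inline int example_resolve_inline_ref_type(const struct extent_buffer *eb,
						  struct btrfs_extent_inline_ref *iref)
{
	int type;

	type = btrfs_get_extent_inline_ref_type(eb, iref, BTRFS_REF_TYPE_ANY);
	if (type == BTRFS_REF_TYPE_INVALID)
		return -EUCLEAN;	/* unknown or misplaced ref type: treat as corruption */

	return type;
}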
1050 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1052 u32 high_crc = ~(u32)0;
1053 u32 low_crc = ~(u32)0;
1056 lenum = cpu_to_le64(root_objectid);
1057 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1058 lenum = cpu_to_le64(owner);
1059 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1060 lenum = cpu_to_le64(offset);
1061 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1063 return ((u64)high_crc << 31) ^ (u64)low_crc;
1066 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1067 struct btrfs_extent_data_ref *ref)
1069 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1070 btrfs_extent_data_ref_objectid(leaf, ref),
1071 btrfs_extent_data_ref_offset(leaf, ref));
1074 static int match_extent_data_ref(struct extent_buffer *leaf,
1075 struct btrfs_extent_data_ref *ref,
1076 u64 root_objectid, u64 owner, u64 offset)
1078 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1079 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1080 btrfs_extent_data_ref_offset(leaf, ref) != offset)
1085 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1086 struct btrfs_path *path,
1087 u64 bytenr, u64 parent,
1089 u64 owner, u64 offset)
1091 struct btrfs_root *root = trans->fs_info->extent_root;
1092 struct btrfs_key key;
1093 struct btrfs_extent_data_ref *ref;
1094 struct extent_buffer *leaf;
1100 key.objectid = bytenr;
1102 key.type = BTRFS_SHARED_DATA_REF_KEY;
1103 key.offset = parent;
1105 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1106 key.offset = hash_extent_data_ref(root_objectid,
1111 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1123 leaf = path->nodes[0];
1124 nritems = btrfs_header_nritems(leaf);
1126 if (path->slots[0] >= nritems) {
1127 ret = btrfs_next_leaf(root, path);
1133 leaf = path->nodes[0];
1134 nritems = btrfs_header_nritems(leaf);
1138 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1139 if (key.objectid != bytenr ||
1140 key.type != BTRFS_EXTENT_DATA_REF_KEY)
1143 ref = btrfs_item_ptr(leaf, path->slots[0],
1144 struct btrfs_extent_data_ref);
1146 if (match_extent_data_ref(leaf, ref, root_objectid,
1149 btrfs_release_path(path);
1161 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1162 struct btrfs_path *path,
1163 u64 bytenr, u64 parent,
1164 u64 root_objectid, u64 owner,
1165 u64 offset, int refs_to_add)
1167 struct btrfs_root *root = trans->fs_info->extent_root;
1168 struct btrfs_key key;
1169 struct extent_buffer *leaf;
1174 key.objectid = bytenr;
1176 key.type = BTRFS_SHARED_DATA_REF_KEY;
1177 key.offset = parent;
1178 size = sizeof(struct btrfs_shared_data_ref);
1180 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1181 key.offset = hash_extent_data_ref(root_objectid,
1183 size = sizeof(struct btrfs_extent_data_ref);
1186 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1187 if (ret && ret != -EEXIST)
1190 leaf = path->nodes[0];
1192 struct btrfs_shared_data_ref *ref;
1193 ref = btrfs_item_ptr(leaf, path->slots[0],
1194 struct btrfs_shared_data_ref);
1196 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1198 num_refs = btrfs_shared_data_ref_count(leaf, ref);
1199 num_refs += refs_to_add;
1200 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1203 struct btrfs_extent_data_ref *ref;
1204 while (ret == -EEXIST) {
1205 ref = btrfs_item_ptr(leaf, path->slots[0],
1206 struct btrfs_extent_data_ref);
1207 if (match_extent_data_ref(leaf, ref, root_objectid,
1210 btrfs_release_path(path);
1212 ret = btrfs_insert_empty_item(trans, root, path, &key,
1214 if (ret && ret != -EEXIST)
1217 leaf = path->nodes[0];
1219 ref = btrfs_item_ptr(leaf, path->slots[0],
1220 struct btrfs_extent_data_ref);
1222 btrfs_set_extent_data_ref_root(leaf, ref,
1224 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1225 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1226 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1228 num_refs = btrfs_extent_data_ref_count(leaf, ref);
1229 num_refs += refs_to_add;
1230 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1233 btrfs_mark_buffer_dirty(leaf);
1236 btrfs_release_path(path);
1240 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1241 struct btrfs_path *path,
1242 int refs_to_drop, int *last_ref)
1244 struct btrfs_key key;
1245 struct btrfs_extent_data_ref *ref1 = NULL;
1246 struct btrfs_shared_data_ref *ref2 = NULL;
1247 struct extent_buffer *leaf;
1251 leaf = path->nodes[0];
1252 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1254 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1255 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1256 struct btrfs_extent_data_ref);
1257 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1258 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1259 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1260 struct btrfs_shared_data_ref);
1261 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1262 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1263 btrfs_print_v0_err(trans->fs_info);
1264 btrfs_abort_transaction(trans, -EINVAL);
1270 BUG_ON(num_refs < refs_to_drop);
1271 num_refs -= refs_to_drop;
1273 if (num_refs == 0) {
1274 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1277 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1278 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1279 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1280 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1281 btrfs_mark_buffer_dirty(leaf);
1286 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1287 struct btrfs_extent_inline_ref *iref)
1289 struct btrfs_key key;
1290 struct extent_buffer *leaf;
1291 struct btrfs_extent_data_ref *ref1;
1292 struct btrfs_shared_data_ref *ref2;
1296 leaf = path->nodes[0];
1297 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1299 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1302 * If type is invalid, we should have bailed out earlier than
1305 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1306 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1307 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1308 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1309 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1311 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1312 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1314 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1315 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1316 struct btrfs_extent_data_ref);
1317 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1318 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1319 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1320 struct btrfs_shared_data_ref);
1321 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1328 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1329 struct btrfs_path *path,
1330 u64 bytenr, u64 parent,
1333 struct btrfs_root *root = trans->fs_info->extent_root;
1334 struct btrfs_key key;
1337 key.objectid = bytenr;
1339 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1340 key.offset = parent;
1342 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1343 key.offset = root_objectid;
1346 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1352 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1353 struct btrfs_path *path,
1354 u64 bytenr, u64 parent,
1357 struct btrfs_key key;
1360 key.objectid = bytenr;
1362 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1363 key.offset = parent;
1365 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1366 key.offset = root_objectid;
1369 ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1371 btrfs_release_path(path);
1375 static inline int extent_ref_type(u64 parent, u64 owner)
1378 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1380 type = BTRFS_SHARED_BLOCK_REF_KEY;
1382 type = BTRFS_TREE_BLOCK_REF_KEY;
1385 type = BTRFS_SHARED_DATA_REF_KEY;
1387 type = BTRFS_EXTENT_DATA_REF_KEY;
1392 static int find_next_key(struct btrfs_path *path, int level,
1393 struct btrfs_key *key)
1396 for (; level < BTRFS_MAX_LEVEL; level++) {
1397 if (!path->nodes[level])
1399 if (path->slots[level] + 1 >=
1400 btrfs_header_nritems(path->nodes[level]))
1403 btrfs_item_key_to_cpu(path->nodes[level], key,
1404 path->slots[level] + 1);
1406 btrfs_node_key_to_cpu(path->nodes[level], key,
1407 path->slots[level] + 1);
/*
 * Look for an inline back ref.  If the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
1426 static noinline_for_stack
1427 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1428 struct btrfs_path *path,
1429 struct btrfs_extent_inline_ref **ref_ret,
1430 u64 bytenr, u64 num_bytes,
1431 u64 parent, u64 root_objectid,
1432 u64 owner, u64 offset, int insert)
1434 struct btrfs_fs_info *fs_info = trans->fs_info;
1435 struct btrfs_root *root = fs_info->extent_root;
1436 struct btrfs_key key;
1437 struct extent_buffer *leaf;
1438 struct btrfs_extent_item *ei;
1439 struct btrfs_extent_inline_ref *iref;
1449 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1452 key.objectid = bytenr;
1453 key.type = BTRFS_EXTENT_ITEM_KEY;
1454 key.offset = num_bytes;
1456 want = extent_ref_type(parent, owner);
1458 extra_size = btrfs_extent_inline_ref_size(want);
1459 path->keep_locks = 1;
1464 * Owner is our level, so we can just add one to get the level for the
1465 * block we are interested in.
1467 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1468 key.type = BTRFS_METADATA_ITEM_KEY;
1473 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1480 * We may be a newly converted file system which still has the old fat
1481 * extent entries for metadata, so try and see if we have one of those.
1483 if (ret > 0 && skinny_metadata) {
1484 skinny_metadata = false;
1485 if (path->slots[0]) {
1487 btrfs_item_key_to_cpu(path->nodes[0], &key,
1489 if (key.objectid == bytenr &&
1490 key.type == BTRFS_EXTENT_ITEM_KEY &&
1491 key.offset == num_bytes)
1495 key.objectid = bytenr;
1496 key.type = BTRFS_EXTENT_ITEM_KEY;
1497 key.offset = num_bytes;
1498 btrfs_release_path(path);
1503 if (ret && !insert) {
1506 } else if (WARN_ON(ret)) {
1511 leaf = path->nodes[0];
1512 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1513 if (unlikely(item_size < sizeof(*ei))) {
1515 btrfs_print_v0_err(fs_info);
1516 btrfs_abort_transaction(trans, err);
1520 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1521 flags = btrfs_extent_flags(leaf, ei);
1523 ptr = (unsigned long)(ei + 1);
1524 end = (unsigned long)ei + item_size;
1526 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1527 ptr += sizeof(struct btrfs_tree_block_info);
1531 if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1532 needed = BTRFS_REF_TYPE_DATA;
1534 needed = BTRFS_REF_TYPE_BLOCK;
1542 iref = (struct btrfs_extent_inline_ref *)ptr;
1543 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1544 if (type == BTRFS_REF_TYPE_INVALID) {
1552 ptr += btrfs_extent_inline_ref_size(type);
1556 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1557 struct btrfs_extent_data_ref *dref;
1558 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1559 if (match_extent_data_ref(leaf, dref, root_objectid,
1564 if (hash_extent_data_ref_item(leaf, dref) <
1565 hash_extent_data_ref(root_objectid, owner, offset))
1569 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1571 if (parent == ref_offset) {
1575 if (ref_offset < parent)
1578 if (root_objectid == ref_offset) {
1582 if (ref_offset < root_objectid)
1586 ptr += btrfs_extent_inline_ref_size(type);
1588 if (err == -ENOENT && insert) {
1589 if (item_size + extra_size >=
1590 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block.
		 */
1600 if (find_next_key(path, 0, &key) == 0 &&
1601 key.objectid == bytenr &&
1602 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1607 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1610 path->keep_locks = 0;
1611 btrfs_unlock_up_safe(path, 1);
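
/*
 * Illustrative sketch, not part of the original file: the three-way dispatch
 * on the return value of lookup_inline_extent_backref() that its comment
 * above describes.  insert_inline_extent_backref() below is the real user of
 * this contract; this helper is hypothetical and only labels the branches.
 */
static inline void example_inline_backref_contract(int ret)
{
	if (ret == 0) {
		/* Found: *ref_ret points at the existing inline back ref. */
	} else if (ret == -ENOENT) {
		/* Not found: *ref_ret points at the insertion position. */
	} else if (ret == -EAGAIN) {
		/* Item too large: fall back to a separate keyed backref item. */
	}
}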
1617 * helper to add new inline back ref
1619 static noinline_for_stack
1620 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1621 struct btrfs_path *path,
1622 struct btrfs_extent_inline_ref *iref,
1623 u64 parent, u64 root_objectid,
1624 u64 owner, u64 offset, int refs_to_add,
1625 struct btrfs_delayed_extent_op *extent_op)
1627 struct extent_buffer *leaf;
1628 struct btrfs_extent_item *ei;
1631 unsigned long item_offset;
1636 leaf = path->nodes[0];
1637 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1638 item_offset = (unsigned long)iref - (unsigned long)ei;
1640 type = extent_ref_type(parent, owner);
1641 size = btrfs_extent_inline_ref_size(type);
1643 btrfs_extend_item(path, size);
1645 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1646 refs = btrfs_extent_refs(leaf, ei);
1647 refs += refs_to_add;
1648 btrfs_set_extent_refs(leaf, ei, refs);
1650 __run_delayed_extent_op(extent_op, leaf, ei);
1652 ptr = (unsigned long)ei + item_offset;
1653 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1654 if (ptr < end - size)
1655 memmove_extent_buffer(leaf, ptr + size, ptr,
1658 iref = (struct btrfs_extent_inline_ref *)ptr;
1659 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1660 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1661 struct btrfs_extent_data_ref *dref;
1662 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1663 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1664 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1665 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1666 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1667 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1668 struct btrfs_shared_data_ref *sref;
1669 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1670 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1671 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1672 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1673 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1675 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1677 btrfs_mark_buffer_dirty(leaf);
1680 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1681 struct btrfs_path *path,
1682 struct btrfs_extent_inline_ref **ref_ret,
1683 u64 bytenr, u64 num_bytes, u64 parent,
1684 u64 root_objectid, u64 owner, u64 offset)
1688 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1689 num_bytes, parent, root_objectid,
1694 btrfs_release_path(path);
1697 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1698 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1701 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1702 root_objectid, owner, offset);
1708 * helper to update/remove inline back ref
1710 static noinline_for_stack
1711 void update_inline_extent_backref(struct btrfs_path *path,
1712 struct btrfs_extent_inline_ref *iref,
1714 struct btrfs_delayed_extent_op *extent_op,
1717 struct extent_buffer *leaf = path->nodes[0];
1718 struct btrfs_extent_item *ei;
1719 struct btrfs_extent_data_ref *dref = NULL;
1720 struct btrfs_shared_data_ref *sref = NULL;
1728 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1729 refs = btrfs_extent_refs(leaf, ei);
1730 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1731 refs += refs_to_mod;
1732 btrfs_set_extent_refs(leaf, ei, refs);
1734 __run_delayed_extent_op(extent_op, leaf, ei);
1737 * If type is invalid, we should have bailed out after
1738 * lookup_inline_extent_backref().
1740 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1741 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1743 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1744 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1745 refs = btrfs_extent_data_ref_count(leaf, dref);
1746 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1747 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1748 refs = btrfs_shared_data_ref_count(leaf, sref);
1751 BUG_ON(refs_to_mod != -1);
1754 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1755 refs += refs_to_mod;
1758 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1759 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1761 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1764 size = btrfs_extent_inline_ref_size(type);
1765 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1766 ptr = (unsigned long)iref;
1767 end = (unsigned long)ei + item_size;
1768 if (ptr + size < end)
1769 memmove_extent_buffer(leaf, ptr, ptr + size,
1772 btrfs_truncate_item(path, item_size, 1);
1774 btrfs_mark_buffer_dirty(leaf);
1777 static noinline_for_stack
1778 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1779 struct btrfs_path *path,
1780 u64 bytenr, u64 num_bytes, u64 parent,
1781 u64 root_objectid, u64 owner,
1782 u64 offset, int refs_to_add,
1783 struct btrfs_delayed_extent_op *extent_op)
1785 struct btrfs_extent_inline_ref *iref;
1788 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1789 num_bytes, parent, root_objectid,
1792 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1793 update_inline_extent_backref(path, iref, refs_to_add,
1795 } else if (ret == -ENOENT) {
1796 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1797 root_objectid, owner, offset,
1798 refs_to_add, extent_op);
1804 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1805 struct btrfs_path *path,
1806 u64 bytenr, u64 parent, u64 root_objectid,
1807 u64 owner, u64 offset, int refs_to_add)
1810 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1811 BUG_ON(refs_to_add != 1);
1812 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1815 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1816 root_objectid, owner, offset,
1822 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1823 struct btrfs_path *path,
1824 struct btrfs_extent_inline_ref *iref,
1825 int refs_to_drop, int is_data, int *last_ref)
1829 BUG_ON(!is_data && refs_to_drop != 1);
1831 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1833 } else if (is_data) {
1834 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1838 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1843 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1844 u64 *discarded_bytes)
1847 u64 bytes_left, end;
1848 u64 aligned_start = ALIGN(start, 1 << 9);
1850 if (WARN_ON(start != aligned_start)) {
1851 len -= aligned_start - start;
1852 len = round_down(len, 1 << 9);
1853 start = aligned_start;
1856 *discarded_bytes = 0;
1864 /* Skip any superblocks on this device. */
1865 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1866 u64 sb_start = btrfs_sb_offset(j);
1867 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1868 u64 size = sb_start - start;
1870 if (!in_range(sb_start, start, bytes_left) &&
1871 !in_range(sb_end, start, bytes_left) &&
1872 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			/*
			 * Superblock spans beginning of range.  Adjust start
			 * and bytes_left accordingly.
			 */
1879 if (sb_start <= start) {
1880 start += sb_end - start;
1885 bytes_left = end - start;
1890 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1893 *discarded_bytes += size;
1894 else if (ret != -EOPNOTSUPP)
1903 bytes_left = end - start;
1907 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1910 *discarded_bytes += bytes_left;
1915 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1916 u64 num_bytes, u64 *actual_bytes)
1919 u64 discarded_bytes = 0;
1920 struct btrfs_bio *bbio = NULL;
	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated with its stripes that don't go away while we are
	 * discarding.
	 */
1927 btrfs_bio_counter_inc_blocked(fs_info);
1928 /* Tell the block device(s) that the sectors can be discarded */
1929 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1931 /* Error condition is -ENOMEM */
1933 struct btrfs_bio_stripe *stripe = bbio->stripes;
1937 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1939 struct request_queue *req_q;
1941 if (!stripe->dev->bdev) {
1942 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1945 req_q = bdev_get_queue(stripe->dev->bdev);
1946 if (!blk_queue_discard(req_q))
1949 ret = btrfs_issue_discard(stripe->dev->bdev,
1954 discarded_bytes += bytes;
1955 else if (ret != -EOPNOTSUPP)
				break;	/* Logic errors or -ENOMEM; -EIO should not happen here */

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
1965 btrfs_put_bbio(bbio);
1967 btrfs_bio_counter_dec(fs_info);
1970 *actual_bytes = discarded_bytes;
1973 if (ret == -EOPNOTSUPP)
1978 /* Can return -ENOMEM */
1979 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1980 struct btrfs_ref *generic_ref)
1982 struct btrfs_fs_info *fs_info = trans->fs_info;
1983 int old_ref_mod, new_ref_mod;
1986 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1987 generic_ref->action);
1988 BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1989 generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
1991 if (generic_ref->type == BTRFS_REF_METADATA)
1992 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
1993 NULL, &old_ref_mod, &new_ref_mod);
1995 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
1996 &old_ref_mod, &new_ref_mod);
1998 btrfs_ref_tree_mod(fs_info, generic_ref);
2000 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2001 sub_pinned_bytes(fs_info, generic_ref);
/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 *		    be 0.
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the file offset
 *		    this extent belongs to.
 *
 * @refs_to_add:    Number of references to add
 *
 * @extent_op:      Pointer to a structure, holding information necessary when
 *		    updating a tree block's flags
 */
2040 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2041 struct btrfs_delayed_ref_node *node,
2042 u64 parent, u64 root_objectid,
2043 u64 owner, u64 offset, int refs_to_add,
2044 struct btrfs_delayed_extent_op *extent_op)
2046 struct btrfs_path *path;
2047 struct extent_buffer *leaf;
2048 struct btrfs_extent_item *item;
2049 struct btrfs_key key;
2050 u64 bytenr = node->bytenr;
2051 u64 num_bytes = node->num_bytes;
2055 path = btrfs_alloc_path();
2059 path->reada = READA_FORWARD;
2060 path->leave_spinning = 1;
2061 /* this will setup the path even if it fails to insert the back ref */
2062 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2063 parent, root_objectid, owner,
2064 offset, refs_to_add, extent_op);
2065 if ((ret < 0 && ret != -EAGAIN) || !ret)
	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
2073 leaf = path->nodes[0];
2074 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2075 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2076 refs = btrfs_extent_refs(leaf, item);
2077 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2079 __run_delayed_extent_op(extent_op, leaf, item);
2081 btrfs_mark_buffer_dirty(leaf);
2082 btrfs_release_path(path);
2084 path->reada = READA_FORWARD;
2085 path->leave_spinning = 1;
2086 /* now insert the actual backref */
2087 ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2088 owner, offset, refs_to_add);
2090 btrfs_abort_transaction(trans, ret);
2092 btrfs_free_path(path);
2096 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2097 struct btrfs_delayed_ref_node *node,
2098 struct btrfs_delayed_extent_op *extent_op,
2099 int insert_reserved)
2102 struct btrfs_delayed_data_ref *ref;
2103 struct btrfs_key ins;
2108 ins.objectid = node->bytenr;
2109 ins.offset = node->num_bytes;
2110 ins.type = BTRFS_EXTENT_ITEM_KEY;
2112 ref = btrfs_delayed_node_to_data_ref(node);
2113 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2115 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2116 parent = ref->parent;
2117 ref_root = ref->root;
2119 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2121 flags |= extent_op->flags_to_set;
2122 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2123 flags, ref->objectid,
2126 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2127 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2128 ref->objectid, ref->offset,
2129 node->ref_mod, extent_op);
2130 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2131 ret = __btrfs_free_extent(trans, node, parent,
2132 ref_root, ref->objectid,
2133 ref->offset, node->ref_mod,
2141 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2142 struct extent_buffer *leaf,
2143 struct btrfs_extent_item *ei)
2145 u64 flags = btrfs_extent_flags(leaf, ei);
2146 if (extent_op->update_flags) {
2147 flags |= extent_op->flags_to_set;
2148 btrfs_set_extent_flags(leaf, ei, flags);
2151 if (extent_op->update_key) {
2152 struct btrfs_tree_block_info *bi;
2153 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2154 bi = (struct btrfs_tree_block_info *)(ei + 1);
2155 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2159 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2160 struct btrfs_delayed_ref_head *head,
2161 struct btrfs_delayed_extent_op *extent_op)
2163 struct btrfs_fs_info *fs_info = trans->fs_info;
2164 struct btrfs_key key;
2165 struct btrfs_path *path;
2166 struct btrfs_extent_item *ei;
2167 struct extent_buffer *leaf;
2171 int metadata = !extent_op->is_data;
2176 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2179 path = btrfs_alloc_path();
2183 key.objectid = head->bytenr;
2186 key.type = BTRFS_METADATA_ITEM_KEY;
2187 key.offset = extent_op->level;
2189 key.type = BTRFS_EXTENT_ITEM_KEY;
2190 key.offset = head->num_bytes;
2194 path->reada = READA_FORWARD;
2195 path->leave_spinning = 1;
2196 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2203 if (path->slots[0] > 0) {
2205 btrfs_item_key_to_cpu(path->nodes[0], &key,
2207 if (key.objectid == head->bytenr &&
2208 key.type == BTRFS_EXTENT_ITEM_KEY &&
2209 key.offset == head->num_bytes)
2213 btrfs_release_path(path);
2216 key.objectid = head->bytenr;
2217 key.offset = head->num_bytes;
2218 key.type = BTRFS_EXTENT_ITEM_KEY;
2227 leaf = path->nodes[0];
2228 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2230 if (unlikely(item_size < sizeof(*ei))) {
2232 btrfs_print_v0_err(fs_info);
2233 btrfs_abort_transaction(trans, err);
2237 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2238 __run_delayed_extent_op(extent_op, leaf, ei);
2240 btrfs_mark_buffer_dirty(leaf);
2242 btrfs_free_path(path);
2246 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2247 struct btrfs_delayed_ref_node *node,
2248 struct btrfs_delayed_extent_op *extent_op,
2249 int insert_reserved)
2252 struct btrfs_delayed_tree_ref *ref;
2256 ref = btrfs_delayed_node_to_tree_ref(node);
2257 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2259 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2260 parent = ref->parent;
2261 ref_root = ref->root;
2263 if (node->ref_mod != 1) {
2264 btrfs_err(trans->fs_info,
2265 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2266 node->bytenr, node->ref_mod, node->action, ref_root,
2270 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2271 BUG_ON(!extent_op || !extent_op->update_flags);
2272 ret = alloc_reserved_tree_block(trans, node, extent_op);
2273 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2274 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2275 ref->level, 0, 1, extent_op);
2276 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2277 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2278 ref->level, 0, 1, extent_op);
2285 /* helper function to actually process a single delayed ref entry */
2286 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2287 struct btrfs_delayed_ref_node *node,
2288 struct btrfs_delayed_extent_op *extent_op,
2289 int insert_reserved)
2293 if (trans->aborted) {
2294 if (insert_reserved)
2295 btrfs_pin_extent(trans->fs_info, node->bytenr,
2296 node->num_bytes, 1);
2300 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2301 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2302 ret = run_delayed_tree_ref(trans, node, extent_op,
2304 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2305 node->type == BTRFS_SHARED_DATA_REF_KEY)
2306 ret = run_delayed_data_ref(trans, node, extent_op,
2310 if (ret && insert_reserved)
2311 btrfs_pin_extent(trans->fs_info, node->bytenr,
2312 node->num_bytes, 1);
2316 static inline struct btrfs_delayed_ref_node *
2317 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2319 struct btrfs_delayed_ref_node *ref;
2321 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
2330 if (!list_empty(&head->ref_add_list))
2331 return list_first_entry(&head->ref_add_list,
2332 struct btrfs_delayed_ref_node, add_list);
2334 ref = rb_entry(rb_first_cached(&head->ref_tree),
2335 struct btrfs_delayed_ref_node, ref_node);
2336 ASSERT(list_empty(&ref->add_list));
2340 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2341 struct btrfs_delayed_ref_head *head)
2343 spin_lock(&delayed_refs->lock);
2344 head->processing = 0;
2345 delayed_refs->num_heads_ready++;
2346 spin_unlock(&delayed_refs->lock);
2347 btrfs_delayed_ref_unlock(head);
2350 static struct btrfs_delayed_extent_op *cleanup_extent_op(
2351 struct btrfs_delayed_ref_head *head)
2353 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2358 if (head->must_insert_reserved) {
2359 head->extent_op = NULL;
2360 btrfs_free_delayed_extent_op(extent_op);
2366 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2367 struct btrfs_delayed_ref_head *head)
2369 struct btrfs_delayed_extent_op *extent_op;
2372 extent_op = cleanup_extent_op(head);
2375 head->extent_op = NULL;
2376 spin_unlock(&head->lock);
2377 ret = run_delayed_extent_op(trans, head, extent_op);
2378 btrfs_free_delayed_extent_op(extent_op);
2379 return ret ? ret : 1;
2382 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2383 struct btrfs_delayed_ref_root *delayed_refs,
2384 struct btrfs_delayed_ref_head *head)
2386 int nr_items = 1; /* Dropping this ref head update. */
2388 if (head->total_ref_mod < 0) {
2389 struct btrfs_space_info *space_info;
2393 flags = BTRFS_BLOCK_GROUP_DATA;
2394 else if (head->is_system)
2395 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2397 flags = BTRFS_BLOCK_GROUP_METADATA;
2398 space_info = btrfs_find_space_info(fs_info, flags);
2400 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2402 BTRFS_TOTAL_BYTES_PINNED_BATCH);
2405 * We had csum deletions accounted for in our delayed refs rsv,
2406 * we need to drop the csum leaves for this update from our
2409 if (head->is_data) {
2410 spin_lock(&delayed_refs->lock);
2411 delayed_refs->pending_csums -= head->num_bytes;
2412 spin_unlock(&delayed_refs->lock);
2413 nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2418 btrfs_delayed_refs_rsv_release(fs_info, nr_items);
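/*
 * Illustration (hypothetical head): dropping a 1 MiB data extent on a
 * 4 KiB sectorsize filesystem means dropping 256 csums, which fit easily
 * in a single leaf, so nr_items ends up as 2 here: the ref head update
 * itself plus one csum leaf worth of reservation.
 */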
2421 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2422 struct btrfs_delayed_ref_head *head)
2425 struct btrfs_fs_info *fs_info = trans->fs_info;
2426 struct btrfs_delayed_ref_root *delayed_refs;
2429 delayed_refs = &trans->transaction->delayed_refs;
2431 ret = run_and_cleanup_extent_op(trans, head);
2433 unselect_delayed_ref_head(delayed_refs, head);
2434 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2441 * Need to drop our head ref lock and re-acquire the delayed ref lock
2442 * and then re-check to make sure nobody got added.
2444 spin_unlock(&head->lock);
2445 spin_lock(&delayed_refs->lock);
2446 spin_lock(&head->lock);
2447 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2448 spin_unlock(&head->lock);
2449 spin_unlock(&delayed_refs->lock);
2452 btrfs_delete_ref_head(delayed_refs, head);
2453 spin_unlock(&head->lock);
2454 spin_unlock(&delayed_refs->lock);
2456 if (head->must_insert_reserved) {
2457 btrfs_pin_extent(fs_info, head->bytenr,
2458 head->num_bytes, 1);
2459 if (head->is_data) {
2460 ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2465 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2467 trace_run_delayed_ref_head(fs_info, head, 0);
2468 btrfs_delayed_ref_unlock(head);
2469 btrfs_put_delayed_ref_head(head);
2473 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2474 struct btrfs_trans_handle *trans)
2476 struct btrfs_delayed_ref_root *delayed_refs =
2477 &trans->transaction->delayed_refs;
2478 struct btrfs_delayed_ref_head *head = NULL;
2481 spin_lock(&delayed_refs->lock);
2482 head = btrfs_select_ref_head(delayed_refs);
2484 spin_unlock(&delayed_refs->lock);
2489 * Grab the lock that says we are going to process all the refs for
2492 ret = btrfs_delayed_ref_lock(delayed_refs, head);
2493 spin_unlock(&delayed_refs->lock);
2496 * We may have dropped the spin lock to get the head mutex lock, and
2497 * that might have given someone else time to free the head. If that's
2498 * true, it has been removed from our list and we can move on.
2501 head = ERR_PTR(-EAGAIN);
2506 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2507 struct btrfs_delayed_ref_head *locked_ref,
2508 unsigned long *run_refs)
2510 struct btrfs_fs_info *fs_info = trans->fs_info;
2511 struct btrfs_delayed_ref_root *delayed_refs;
2512 struct btrfs_delayed_extent_op *extent_op;
2513 struct btrfs_delayed_ref_node *ref;
2514 int must_insert_reserved = 0;
2517 delayed_refs = &trans->transaction->delayed_refs;
2519 lockdep_assert_held(&locked_ref->mutex);
2520 lockdep_assert_held(&locked_ref->lock);
2522 while ((ref = select_delayed_ref(locked_ref))) {
2524 btrfs_check_delayed_seq(fs_info, ref->seq)) {
2525 spin_unlock(&locked_ref->lock);
2526 unselect_delayed_ref_head(delayed_refs, locked_ref);
2532 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2533 RB_CLEAR_NODE(&ref->ref_node);
2534 if (!list_empty(&ref->add_list))
2535 list_del(&ref->add_list);
2537 * When we play the delayed ref, also correct the ref_mod on
2540 switch (ref->action) {
2541 case BTRFS_ADD_DELAYED_REF:
2542 case BTRFS_ADD_DELAYED_EXTENT:
2543 locked_ref->ref_mod -= ref->ref_mod;
2545 case BTRFS_DROP_DELAYED_REF:
2546 locked_ref->ref_mod += ref->ref_mod;
2551 atomic_dec(&delayed_refs->num_entries);
2554 * Record the must_insert_reserved flag before we drop the
2557 must_insert_reserved = locked_ref->must_insert_reserved;
2558 locked_ref->must_insert_reserved = 0;
2560 extent_op = locked_ref->extent_op;
2561 locked_ref->extent_op = NULL;
2562 spin_unlock(&locked_ref->lock);
2564 ret = run_one_delayed_ref(trans, ref, extent_op,
2565 must_insert_reserved);
2567 btrfs_free_delayed_extent_op(extent_op);
2569 unselect_delayed_ref_head(delayed_refs, locked_ref);
2570 btrfs_put_delayed_ref(ref);
2571 btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2576 btrfs_put_delayed_ref(ref);
2579 spin_lock(&locked_ref->lock);
2580 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2587 * Returns 0 on success or if called with an already aborted transaction.
2588 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2590 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2593 struct btrfs_fs_info *fs_info = trans->fs_info;
2594 struct btrfs_delayed_ref_root *delayed_refs;
2595 struct btrfs_delayed_ref_head *locked_ref = NULL;
2596 ktime_t start = ktime_get();
2598 unsigned long count = 0;
2599 unsigned long actual_count = 0;
2601 delayed_refs = &trans->transaction->delayed_refs;
2604 locked_ref = btrfs_obtain_ref_head(trans);
2605 if (IS_ERR_OR_NULL(locked_ref)) {
2606 if (PTR_ERR(locked_ref) == -EAGAIN) {
2615 * We need to try and merge add/drops of the same ref since we
2616 * can run into issues with relocate dropping the implicit ref
2617 * and then it being added back again before the drop can
2618 * finish. If we merged anything we need to re-loop so we can
2620 * Or we can get node references of the same type that weren't
2621 * merged when created due to bumps in the tree mod seq, and
2622 * we need to merge them to prevent adding an inline extent
2623 * backref before dropping it (triggering a BUG_ON at
2624 * insert_inline_extent_backref()).
2626 spin_lock(&locked_ref->lock);
2627 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2629 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2631 if (ret < 0 && ret != -EAGAIN) {
2633 * Error, btrfs_run_delayed_refs_for_head already
2634 * unlocked everything so just bail out
2639 * Success, perform the usual cleanup of a processed
2642 ret = cleanup_ref_head(trans, locked_ref);
2644 /* We dropped our lock, we need to loop. */
2653 * Either success case or btrfs_run_delayed_refs_for_head
2654 * returned -EAGAIN, meaning we need to select another head
2659 } while ((nr != -1 && count < nr) || locked_ref);
2662 * We don't want to include ref heads since we can have empty ref heads
2663 * and those will drastically skew our runtime down since we just do
2664 * accounting, no actual extent tree updates.
2666 if (actual_count > 0) {
2667 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2671 * We weigh the current average higher than our current runtime
2672 * to avoid large swings in the average.
2674 spin_lock(&delayed_refs->lock);
2675 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2676 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
2677 spin_unlock(&delayed_refs->lock);
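/*
 * Illustration (hypothetical numbers, not from the original code): the
 * update above is a 3:1 weighted moving average.  With a previous
 * avg_delayed_ref_runtime of 400ns and a runtime of 800ns for this batch,
 * the new average is (400 * 3 + 800) >> 2 = 500ns, so one slow batch only
 * nudges the estimate instead of replacing it.
 */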
2682 #ifdef SCRAMBLE_DELAYED_REFS
2684 * Normally delayed refs get processed in ascending bytenr order. This
2685 * correlates in most cases to the order added. To expose dependencies on this
2686 * order, we start to process the tree in the middle instead of the beginning
2688 static u64 find_middle(struct rb_root *root)
2690 struct rb_node *n = root->rb_node;
2691 struct btrfs_delayed_ref_node *entry;
2694 u64 first = 0, last = 0;
2698 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2699 first = entry->bytenr;
2703 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2704 last = entry->bytenr;
2709 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2710 WARN_ON(!entry->in_tree);
2712 middle = entry->bytenr;
2725 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2729 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2730 sizeof(struct btrfs_extent_inline_ref));
2731 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2732 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2735 * We don't ever fill up leaves all the way so multiply by 2 just to be
2736 * closer to what we're really going to want to use.
2738 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2742 * Takes the number of bytes to be checksummed and figures out how many leaves it
2743 * would require to store the csums for that many bytes.
2745 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2748 u64 num_csums_per_leaf;
2751 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2752 num_csums_per_leaf = div64_u64(csum_size,
2753 (u64)btrfs_super_csum_size(fs_info->super_copy));
2754 num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2755 num_csums += num_csums_per_leaf - 1;
2756 num_csums = div64_u64(num_csums, num_csums_per_leaf);
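/*
 * Rough illustration with typical values (16 KiB nodes, 4 KiB sectorsize,
 * 4-byte crc32c checksums): one leaf holds roughly 4000 csums, so 1 GiB of
 * data (262144 sectors) needs around 65 csum leaves.  The exact numbers
 * depend on the superblock csum size and the node size.
 */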
2760 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2762 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2763 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2767 spin_lock(&global_rsv->lock);
2768 reserved = global_rsv->reserved;
2769 spin_unlock(&global_rsv->lock);
2772 * Since the global reserve is just kind of magic we don't really want
2773 * to rely on it to save our bacon, so if our size is more than the
2774 * delayed_refs_rsv and the global rsv then it's time to think about
2777 spin_lock(&delayed_refs_rsv->lock);
2778 reserved += delayed_refs_rsv->reserved;
2779 if (delayed_refs_rsv->size >= reserved)
2781 spin_unlock(&delayed_refs_rsv->lock);
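/*
 * Illustration (made-up numbers): if the delayed refs rsv wants 10 MiB
 * (->size) but only has 4 MiB reserved, and the global rsv holds another
 * 8 MiB, the combined 12 MiB covers the size and we don't report pressure;
 * if the global rsv held only 2 MiB we would.
 */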
2785 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2788 atomic_read(&trans->transaction->delayed_refs.num_entries);
2793 avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2794 val = num_entries * avg_runtime;
2795 if (val >= NSEC_PER_SEC)
2797 if (val >= NSEC_PER_SEC / 2)
2800 return btrfs_check_space_for_delayed_refs(trans->fs_info);
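/*
 * Rough illustration: with an average runtime of 1ms per delayed ref,
 * ~1000 queued entries already represent a full second of work and ~500
 * entries half a second, which is where the checks above start telling
 * the caller to throttle; below that we fall back to the space check.
 */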
2804 * this starts processing the delayed reference count updates and
2805 * extent insertions we have queued up so far. count can be
2806 * 0, which means to process everything in the tree at the start
2807 * of the run (but not newly added entries), or it can be some target
2808 * number you'd like to process.
2810 * Returns 0 on success or if called with an aborted transaction
2811 * Returns <0 on error and aborts the transaction
2813 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2814 unsigned long count)
2816 struct btrfs_fs_info *fs_info = trans->fs_info;
2817 struct rb_node *node;
2818 struct btrfs_delayed_ref_root *delayed_refs;
2819 struct btrfs_delayed_ref_head *head;
2821 int run_all = count == (unsigned long)-1;
2823 /* We'll clean this up in btrfs_cleanup_transaction */
2827 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2830 delayed_refs = &trans->transaction->delayed_refs;
2832 count = atomic_read(&delayed_refs->num_entries) * 2;
2835 #ifdef SCRAMBLE_DELAYED_REFS
2836 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2838 ret = __btrfs_run_delayed_refs(trans, count);
2840 btrfs_abort_transaction(trans, ret);
2845 btrfs_create_pending_block_groups(trans);
2847 spin_lock(&delayed_refs->lock);
2848 node = rb_first_cached(&delayed_refs->href_root);
2850 spin_unlock(&delayed_refs->lock);
2853 head = rb_entry(node, struct btrfs_delayed_ref_head,
2855 refcount_inc(&head->refs);
2856 spin_unlock(&delayed_refs->lock);
2858 /* Mutex was contended, block until it's released and retry. */
2859 mutex_lock(&head->mutex);
2860 mutex_unlock(&head->mutex);
2862 btrfs_put_delayed_ref_head(head);
2870 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2871 u64 bytenr, u64 num_bytes, u64 flags,
2872 int level, int is_data)
2874 struct btrfs_delayed_extent_op *extent_op;
2877 extent_op = btrfs_alloc_delayed_extent_op();
2881 extent_op->flags_to_set = flags;
2882 extent_op->update_flags = true;
2883 extent_op->update_key = false;
2884 extent_op->is_data = is_data ? true : false;
2885 extent_op->level = level;
2887 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2889 btrfs_free_delayed_extent_op(extent_op);
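/*
 * Usage sketch (not taken from this file): a caller that wants to mark a
 * tree block as carrying full backrefs could do something like
 *
 *	btrfs_set_disk_extent_flags(trans, eb->start, eb->len,
 *				    BTRFS_BLOCK_FLAG_FULL_BACKREF,
 *				    btrfs_header_level(eb), 0);
 *
 * which only queues a delayed extent op; the flag is written to the extent
 * item when the delayed refs are run.
 */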
2893 static noinline int check_delayed_ref(struct btrfs_root *root,
2894 struct btrfs_path *path,
2895 u64 objectid, u64 offset, u64 bytenr)
2897 struct btrfs_delayed_ref_head *head;
2898 struct btrfs_delayed_ref_node *ref;
2899 struct btrfs_delayed_data_ref *data_ref;
2900 struct btrfs_delayed_ref_root *delayed_refs;
2901 struct btrfs_transaction *cur_trans;
2902 struct rb_node *node;
2905 spin_lock(&root->fs_info->trans_lock);
2906 cur_trans = root->fs_info->running_transaction;
2908 refcount_inc(&cur_trans->use_count);
2909 spin_unlock(&root->fs_info->trans_lock);
2913 delayed_refs = &cur_trans->delayed_refs;
2914 spin_lock(&delayed_refs->lock);
2915 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2917 spin_unlock(&delayed_refs->lock);
2918 btrfs_put_transaction(cur_trans);
2922 if (!mutex_trylock(&head->mutex)) {
2923 refcount_inc(&head->refs);
2924 spin_unlock(&delayed_refs->lock);
2926 btrfs_release_path(path);
2929 * Mutex was contended, block until it's released and let
2932 mutex_lock(&head->mutex);
2933 mutex_unlock(&head->mutex);
2934 btrfs_put_delayed_ref_head(head);
2935 btrfs_put_transaction(cur_trans);
2938 spin_unlock(&delayed_refs->lock);
2940 spin_lock(&head->lock);
2942 * XXX: We should replace this with a proper search function in the
2945 for (node = rb_first_cached(&head->ref_tree); node;
2946 node = rb_next(node)) {
2947 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2948 /* If it's a shared ref we know a cross reference exists */
2949 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2954 data_ref = btrfs_delayed_node_to_data_ref(ref);
2957 * If our ref doesn't match the one we're currently looking at
2958 * then we have a cross reference.
2960 if (data_ref->root != root->root_key.objectid ||
2961 data_ref->objectid != objectid ||
2962 data_ref->offset != offset) {
2967 spin_unlock(&head->lock);
2968 mutex_unlock(&head->mutex);
2969 btrfs_put_transaction(cur_trans);
2973 static noinline int check_committed_ref(struct btrfs_root *root,
2974 struct btrfs_path *path,
2975 u64 objectid, u64 offset, u64 bytenr)
2977 struct btrfs_fs_info *fs_info = root->fs_info;
2978 struct btrfs_root *extent_root = fs_info->extent_root;
2979 struct extent_buffer *leaf;
2980 struct btrfs_extent_data_ref *ref;
2981 struct btrfs_extent_inline_ref *iref;
2982 struct btrfs_extent_item *ei;
2983 struct btrfs_key key;
2988 key.objectid = bytenr;
2989 key.offset = (u64)-1;
2990 key.type = BTRFS_EXTENT_ITEM_KEY;
2992 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2995 BUG_ON(ret == 0); /* Corruption */
2998 if (path->slots[0] == 0)
3002 leaf = path->nodes[0];
3003 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3005 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3009 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3010 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3012 if (item_size != sizeof(*ei) +
3013 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3016 if (btrfs_extent_generation(leaf, ei) <=
3017 btrfs_root_last_snapshot(&root->root_item))
3020 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3022 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3023 if (type != BTRFS_EXTENT_DATA_REF_KEY)
3026 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3027 if (btrfs_extent_refs(leaf, ei) !=
3028 btrfs_extent_data_ref_count(leaf, ref) ||
3029 btrfs_extent_data_ref_root(leaf, ref) !=
3030 root->root_key.objectid ||
3031 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3032 btrfs_extent_data_ref_offset(leaf, ref) != offset)
3040 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3043 struct btrfs_path *path;
3046 path = btrfs_alloc_path();
3051 ret = check_committed_ref(root, path, objectid,
3053 if (ret && ret != -ENOENT)
3056 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3057 } while (ret == -EAGAIN);
3060 btrfs_free_path(path);
3061 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3066 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3067 struct btrfs_root *root,
3068 struct extent_buffer *buf,
3069 int full_backref, int inc)
3071 struct btrfs_fs_info *fs_info = root->fs_info;
3077 struct btrfs_key key;
3078 struct btrfs_file_extent_item *fi;
3079 struct btrfs_ref generic_ref = { 0 };
3080 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
3086 if (btrfs_is_testing(fs_info))
3089 ref_root = btrfs_header_owner(buf);
3090 nritems = btrfs_header_nritems(buf);
3091 level = btrfs_header_level(buf);
3093 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3097 parent = buf->start;
3101 action = BTRFS_ADD_DELAYED_REF;
3103 action = BTRFS_DROP_DELAYED_REF;
3105 for (i = 0; i < nritems; i++) {
3107 btrfs_item_key_to_cpu(buf, &key, i);
3108 if (key.type != BTRFS_EXTENT_DATA_KEY)
3110 fi = btrfs_item_ptr(buf, i,
3111 struct btrfs_file_extent_item);
3112 if (btrfs_file_extent_type(buf, fi) ==
3113 BTRFS_FILE_EXTENT_INLINE)
3115 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3119 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3120 key.offset -= btrfs_file_extent_offset(buf, fi);
3121 btrfs_init_generic_ref(&generic_ref, action, bytenr,
3123 generic_ref.real_root = root->root_key.objectid;
3124 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
3126 generic_ref.skip_qgroup = for_reloc;
3128 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3130 ret = btrfs_free_extent(trans, &generic_ref);
3134 bytenr = btrfs_node_blockptr(buf, i);
3135 num_bytes = fs_info->nodesize;
3136 btrfs_init_generic_ref(&generic_ref, action, bytenr,
3138 generic_ref.real_root = root->root_key.objectid;
3139 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
3140 generic_ref.skip_qgroup = for_reloc;
3142 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3144 ret = btrfs_free_extent(trans, &generic_ref);
3154 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3155 struct extent_buffer *buf, int full_backref)
3157 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3160 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3161 struct extent_buffer *buf, int full_backref)
3163 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3166 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3167 struct btrfs_path *path,
3168 struct btrfs_block_group_cache *cache)
3170 struct btrfs_fs_info *fs_info = trans->fs_info;
3172 struct btrfs_root *extent_root = fs_info->extent_root;
3174 struct extent_buffer *leaf;
3176 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3183 leaf = path->nodes[0];
3184 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3185 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3186 btrfs_mark_buffer_dirty(leaf);
3188 btrfs_release_path(path);
3193 static struct btrfs_block_group_cache *next_block_group(
3194 struct btrfs_block_group_cache *cache)
3196 struct btrfs_fs_info *fs_info = cache->fs_info;
3197 struct rb_node *node;
3199 spin_lock(&fs_info->block_group_cache_lock);
3201 /* If our block group was removed, we need a full search. */
3202 if (RB_EMPTY_NODE(&cache->cache_node)) {
3203 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3205 spin_unlock(&fs_info->block_group_cache_lock);
3206 btrfs_put_block_group(cache);
3207 return btrfs_lookup_first_block_group(fs_info, next_bytenr);
3209 node = rb_next(&cache->cache_node);
3210 btrfs_put_block_group(cache);
3212 cache = rb_entry(node, struct btrfs_block_group_cache,
3214 btrfs_get_block_group(cache);
3217 spin_unlock(&fs_info->block_group_cache_lock);
3221 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3222 struct btrfs_trans_handle *trans,
3223 struct btrfs_path *path)
3225 struct btrfs_fs_info *fs_info = block_group->fs_info;
3226 struct btrfs_root *root = fs_info->tree_root;
3227 struct inode *inode = NULL;
3228 struct extent_changeset *data_reserved = NULL;
3230 int dcs = BTRFS_DC_ERROR;
3236 * If this block group is smaller than 100 megs, don't bother caching the
3239 if (block_group->key.offset < (100 * SZ_1M)) {
3240 spin_lock(&block_group->lock);
3241 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3242 spin_unlock(&block_group->lock);
3249 inode = lookup_free_space_inode(block_group, path);
3250 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3251 ret = PTR_ERR(inode);
3252 btrfs_release_path(path);
3256 if (IS_ERR(inode)) {
3260 if (block_group->ro)
3263 ret = create_free_space_inode(trans, block_group, path);
3270 * We want to set the generation to 0, that way if anything goes wrong
3271 * from here on out we know not to trust this cache when we load up next
3274 BTRFS_I(inode)->generation = 0;
3275 ret = btrfs_update_inode(trans, root, inode);
3278 * So theoretically we could recover from this by simply setting the
3279 * super cache generation to 0 so we know to invalidate the
3280 * cache, but then we'd have to keep track of the block groups
3281 * that fail this way so we know we _have_ to reset this cache
3282 * before the next commit or risk reading stale cache. So to
3283 * limit our exposure to horrible edge cases, let's just abort the
3284 * transaction; this only happens in really bad situations
3287 btrfs_abort_transaction(trans, ret);
3292 /* We've already setup this transaction, go ahead and exit */
3293 if (block_group->cache_generation == trans->transid &&
3294 i_size_read(inode)) {
3295 dcs = BTRFS_DC_SETUP;
3299 if (i_size_read(inode) > 0) {
3300 ret = btrfs_check_trunc_cache_free_space(fs_info,
3301 &fs_info->global_block_rsv);
3305 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3310 spin_lock(&block_group->lock);
3311 if (block_group->cached != BTRFS_CACHE_FINISHED ||
3312 !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3314 * don't bother trying to write stuff out _if_
3315 * a) we're not cached,
3316 * b) we're mounted with the nospace_cache option,
3317 * c) we're using the v2 space cache (FREE_SPACE_TREE).
3319 dcs = BTRFS_DC_WRITTEN;
3320 spin_unlock(&block_group->lock);
3323 spin_unlock(&block_group->lock);
3326 * We hit an ENOSPC when setting up the cache in this transaction, so just
3327 * skip doing the setup; we've already cleared the cache so we're safe.
3329 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3335 * Try to preallocate enough space based on how big the block group is.
3336 * Keep in mind this has to include any pinned space which could end up
3337 * taking up quite a bit since it's not folded into the other space
3340 num_pages = div_u64(block_group->key.offset, SZ_256M);
3345 num_pages *= PAGE_SIZE;
3347 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3351 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3352 num_pages, num_pages,
3355 * Our cache requires contiguous chunks so that we don't modify a bunch
3356 * of metadata or split extents when writing the cache out, which means
3357 * we can hit ENOSPC if we are heavily fragmented in addition to just normal
3358 * out of space conditions. So if we hit this, just skip setting up any
3359 * other block groups for this transaction, maybe we'll unpin enough
3360 * space the next time around.
3363 dcs = BTRFS_DC_SETUP;
3364 else if (ret == -ENOSPC)
3365 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3370 btrfs_release_path(path);
3372 spin_lock(&block_group->lock);
3373 if (!ret && dcs == BTRFS_DC_SETUP)
3374 block_group->cache_generation = trans->transid;
3375 block_group->disk_cache_state = dcs;
3376 spin_unlock(&block_group->lock);
3378 extent_changeset_free(data_reserved);
3382 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3384 struct btrfs_fs_info *fs_info = trans->fs_info;
3385 struct btrfs_block_group_cache *cache, *tmp;
3386 struct btrfs_transaction *cur_trans = trans->transaction;
3387 struct btrfs_path *path;
3389 if (list_empty(&cur_trans->dirty_bgs) ||
3390 !btrfs_test_opt(fs_info, SPACE_CACHE))
3393 path = btrfs_alloc_path();
3397 /* Could add new block groups, use _safe just in case */
3398 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3400 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3401 cache_save_setup(cache, trans, path);
3404 btrfs_free_path(path);
3409 * transaction commit does final block group cache writeback during a
3410 * critical section where nothing is allowed to change the FS. This is
3411 * required in order for the cache to actually match the block group,
3412 * but can introduce a lot of latency into the commit.
3414 * So, btrfs_start_dirty_block_groups is here to kick off block group
3415 * cache IO. There's a chance we'll have to redo some of it if the
3416 * block group changes again during the commit, but it greatly reduces
3417 * the commit latency by getting rid of the easy block groups while
3418 * we're still allowing others to join the commit.
3420 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3422 struct btrfs_fs_info *fs_info = trans->fs_info;
3423 struct btrfs_block_group_cache *cache;
3424 struct btrfs_transaction *cur_trans = trans->transaction;
3427 struct btrfs_path *path = NULL;
3429 struct list_head *io = &cur_trans->io_bgs;
3430 int num_started = 0;
3433 spin_lock(&cur_trans->dirty_bgs_lock);
3434 if (list_empty(&cur_trans->dirty_bgs)) {
3435 spin_unlock(&cur_trans->dirty_bgs_lock);
3438 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3439 spin_unlock(&cur_trans->dirty_bgs_lock);
3443 * make sure all the block groups on our dirty list actually
3446 btrfs_create_pending_block_groups(trans);
3449 path = btrfs_alloc_path();
3455 * cache_write_mutex is here only to save us from balance or automatic
3456 * removal of empty block groups deleting this block group while we are
3457 * writing out the cache
3459 mutex_lock(&trans->transaction->cache_write_mutex);
3460 while (!list_empty(&dirty)) {
3461 bool drop_reserve = true;
3463 cache = list_first_entry(&dirty,
3464 struct btrfs_block_group_cache,
3467 * this can happen if something re-dirties a block
3468 * group that is already under IO. Just wait for it to
3469 * finish and then do it all again
3471 if (!list_empty(&cache->io_list)) {
3472 list_del_init(&cache->io_list);
3473 btrfs_wait_cache_io(trans, cache, path);
3474 btrfs_put_block_group(cache);
3479 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3480 * if it should update the cache_state. Don't delete
3481 * until after we wait.
3483 * Since we're not running in the commit critical section
3484 * we need the dirty_bgs_lock to protect from update_block_group
3486 spin_lock(&cur_trans->dirty_bgs_lock);
3487 list_del_init(&cache->dirty_list);
3488 spin_unlock(&cur_trans->dirty_bgs_lock);
3492 cache_save_setup(cache, trans, path);
3494 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3495 cache->io_ctl.inode = NULL;
3496 ret = btrfs_write_out_cache(trans, cache, path);
3497 if (ret == 0 && cache->io_ctl.inode) {
3502 * The cache_write_mutex is protecting the
3503 * io_list, also refer to the definition of
3504 * btrfs_transaction::io_bgs for more details
3506 list_add_tail(&cache->io_list, io);
3509 * if we failed to write the cache, the
3510 * generation will be bad and life goes on
3516 ret = write_one_cache_group(trans, path, cache);
3518 * Our block group might still be attached to the list
3519 * of new block groups in the transaction handle of some
3520 * other task (struct btrfs_trans_handle->new_bgs). This
3521 * means its block group item isn't yet in the extent
3522 * tree. If this happens ignore the error, as we will
3523 * try again later in the critical section of the
3524 * transaction commit.
3526 if (ret == -ENOENT) {
3528 spin_lock(&cur_trans->dirty_bgs_lock);
3529 if (list_empty(&cache->dirty_list)) {
3530 list_add_tail(&cache->dirty_list,
3531 &cur_trans->dirty_bgs);
3532 btrfs_get_block_group(cache);
3533 drop_reserve = false;
3535 spin_unlock(&cur_trans->dirty_bgs_lock);
3537 btrfs_abort_transaction(trans, ret);
3541 /* if it's not on the io list, we need to put the block group */
3543 btrfs_put_block_group(cache);
3545 btrfs_delayed_refs_rsv_release(fs_info, 1);
3551 * Avoid blocking other tasks for too long. It might even save
3552 * us from writing caches for block groups that are going to be
3555 mutex_unlock(&trans->transaction->cache_write_mutex);
3556 mutex_lock(&trans->transaction->cache_write_mutex);
3558 mutex_unlock(&trans->transaction->cache_write_mutex);
3561 * go through delayed refs for all the stuff we've just kicked off
3562 * and then loop back (just once)
3564 ret = btrfs_run_delayed_refs(trans, 0);
3565 if (!ret && loops == 0) {
3567 spin_lock(&cur_trans->dirty_bgs_lock);
3568 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3570 * dirty_bgs_lock protects us from concurrent block group
3571 * deletes too (not just cache_write_mutex).
3573 if (!list_empty(&dirty)) {
3574 spin_unlock(&cur_trans->dirty_bgs_lock);
3577 spin_unlock(&cur_trans->dirty_bgs_lock);
3578 } else if (ret < 0) {
3579 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3582 btrfs_free_path(path);
3586 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3588 struct btrfs_fs_info *fs_info = trans->fs_info;
3589 struct btrfs_block_group_cache *cache;
3590 struct btrfs_transaction *cur_trans = trans->transaction;
3593 struct btrfs_path *path;
3594 struct list_head *io = &cur_trans->io_bgs;
3595 int num_started = 0;
3597 path = btrfs_alloc_path();
3602 * Even though we are in the critical section of the transaction commit,
3603 * we can still have concurrent tasks adding elements to this
3604 * transaction's list of dirty block groups. These tasks correspond to
3605 * endio free space workers started when writeback finishes for a
3606 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3607 * allocate new block groups as a result of COWing nodes of the root
3608 * tree when updating the free space inode. The writeback for the space
3609 * caches is triggered by an earlier call to
3610 * btrfs_start_dirty_block_groups() and iterations of the following
3612 * Also we want to do the cache_save_setup first and then run the
3613 * delayed refs to make sure we have the best chance at doing this all
3616 spin_lock(&cur_trans->dirty_bgs_lock);
3617 while (!list_empty(&cur_trans->dirty_bgs)) {
3618 cache = list_first_entry(&cur_trans->dirty_bgs,
3619 struct btrfs_block_group_cache,
3623 * this can happen if cache_save_setup re-dirties a block
3624 * group that is already under IO. Just wait for it to
3625 * finish and then do it all again
3627 if (!list_empty(&cache->io_list)) {
3628 spin_unlock(&cur_trans->dirty_bgs_lock);
3629 list_del_init(&cache->io_list);
3630 btrfs_wait_cache_io(trans, cache, path);
3631 btrfs_put_block_group(cache);
3632 spin_lock(&cur_trans->dirty_bgs_lock);
3636 * don't remove from the dirty list until after we've waited
3639 list_del_init(&cache->dirty_list);
3640 spin_unlock(&cur_trans->dirty_bgs_lock);
3643 cache_save_setup(cache, trans, path);
3646 ret = btrfs_run_delayed_refs(trans,
3647 (unsigned long) -1);
3649 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3650 cache->io_ctl.inode = NULL;
3651 ret = btrfs_write_out_cache(trans, cache, path);
3652 if (ret == 0 && cache->io_ctl.inode) {
3655 list_add_tail(&cache->io_list, io);
3658 * if we failed to write the cache, the
3659 * generation will be bad and life goes on
3665 ret = write_one_cache_group(trans, path, cache);
3667 * One of the free space endio workers might have
3668 * created a new block group while updating a free space
3669 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3670 * and hasn't released its transaction handle yet, in
3671 * which case the new block group is still attached to
3672 * its transaction handle and its creation has not
3673 * finished yet (no block group item in the extent tree
3674 * yet, etc). If this is the case, wait for all free
3675 * space endio workers to finish and retry. This is a
3676 * very rare case so no need for a more efficient and
3679 if (ret == -ENOENT) {
3680 wait_event(cur_trans->writer_wait,
3681 atomic_read(&cur_trans->num_writers) == 1);
3682 ret = write_one_cache_group(trans, path, cache);
3685 btrfs_abort_transaction(trans, ret);
3688 /* if it's not on the io list, we need to put the block group */
3690 btrfs_put_block_group(cache);
3691 btrfs_delayed_refs_rsv_release(fs_info, 1);
3692 spin_lock(&cur_trans->dirty_bgs_lock);
3694 spin_unlock(&cur_trans->dirty_bgs_lock);
3697 * Refer to the definition of io_bgs member for details why it's safe
3698 * to use it without any locking
3700 while (!list_empty(io)) {
3701 cache = list_first_entry(io, struct btrfs_block_group_cache,
3703 list_del_init(&cache->io_list);
3704 btrfs_wait_cache_io(trans, cache, path);
3705 btrfs_put_block_group(cache);
3708 btrfs_free_path(path);
3712 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3714 struct btrfs_block_group_cache *block_group;
3717 block_group = btrfs_lookup_block_group(fs_info, bytenr);
3718 if (!block_group || block_group->ro)
3721 btrfs_put_block_group(block_group);
3725 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3727 struct btrfs_block_group_cache *bg;
3730 bg = btrfs_lookup_block_group(fs_info, bytenr);
3734 spin_lock(&bg->lock);
3738 atomic_inc(&bg->nocow_writers);
3739 spin_unlock(&bg->lock);
3741 /* no put on block group, done by btrfs_dec_nocow_writers */
3743 btrfs_put_block_group(bg);
3749 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3751 struct btrfs_block_group_cache *bg;
3753 bg = btrfs_lookup_block_group(fs_info, bytenr);
3755 if (atomic_dec_and_test(&bg->nocow_writers))
3756 wake_up_var(&bg->nocow_writers);
3758 * Once for our lookup and once for the lookup done by a previous call
3759 * to btrfs_inc_nocow_writers()
3761 btrfs_put_block_group(bg);
3762 btrfs_put_block_group(bg);
3765 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3767 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3770 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3772 u64 extra_flags = chunk_to_extended(flags) &
3773 BTRFS_EXTENDED_PROFILE_MASK;
3775 write_seqlock(&fs_info->profiles_lock);
3776 if (flags & BTRFS_BLOCK_GROUP_DATA)
3777 fs_info->avail_data_alloc_bits |= extra_flags;
3778 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3779 fs_info->avail_metadata_alloc_bits |= extra_flags;
3780 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3781 fs_info->avail_system_alloc_bits |= extra_flags;
3782 write_sequnlock(&fs_info->profiles_lock);
3786 * returns target flags in extended format or 0 if restripe for this
3787 * chunk_type is not in progress
3789 * should be called with balance_lock held
3791 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3793 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3799 if (flags & BTRFS_BLOCK_GROUP_DATA &&
3800 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3801 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3802 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3803 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3804 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3805 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3806 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3807 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3814 * @flags: available profiles in extended format (see ctree.h)
3816 * Returns reduced profile in chunk format. If profile changing is in
3817 * progress (either running or paused) picks the target profile (if it's
3818 * already available), otherwise falls back to plain reducing.
3820 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
3822 u64 num_devices = fs_info->fs_devices->rw_devices;
3828 * see if restripe for this chunk_type is in progress, if so
3829 * try to reduce to the target profile
3831 spin_lock(&fs_info->balance_lock);
3832 target = get_restripe_target(fs_info, flags);
3834 /* pick target profile only if it's already available */
3835 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3836 spin_unlock(&fs_info->balance_lock);
3837 return extended_to_chunk(target);
3840 spin_unlock(&fs_info->balance_lock);
3842 /* First, mask out the RAID levels which aren't possible */
3843 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3844 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
3845 allowed |= btrfs_raid_array[raid_type].bg_flag;
3849 if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3850 allowed = BTRFS_BLOCK_GROUP_RAID6;
3851 else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3852 allowed = BTRFS_BLOCK_GROUP_RAID5;
3853 else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3854 allowed = BTRFS_BLOCK_GROUP_RAID10;
3855 else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3856 allowed = BTRFS_BLOCK_GROUP_RAID1;
3857 else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3858 allowed = BTRFS_BLOCK_GROUP_RAID0;
3860 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3862 return extended_to_chunk(flags | allowed);
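/*
 * Illustration: the if/else chain above keeps only the most preferred
 * profile that is both requested and possible with the current device
 * count, using the order raid6 > raid5 > raid10 > raid1 > raid0.  E.g. if
 * RAID10 and RAID1 are both candidates we reduce to RAID10; single/dup
 * fall out naturally when no RAID bit survives.
 */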
3865 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
3872 seq = read_seqbegin(&fs_info->profiles_lock);
3874 if (flags & BTRFS_BLOCK_GROUP_DATA)
3875 flags |= fs_info->avail_data_alloc_bits;
3876 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3877 flags |= fs_info->avail_system_alloc_bits;
3878 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3879 flags |= fs_info->avail_metadata_alloc_bits;
3880 } while (read_seqretry(&fs_info->profiles_lock, seq));
3882 return btrfs_reduce_alloc_profile(fs_info, flags);
3885 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
3887 struct btrfs_fs_info *fs_info = root->fs_info;
3892 flags = BTRFS_BLOCK_GROUP_DATA;
3893 else if (root == fs_info->chunk_root)
3894 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3896 flags = BTRFS_BLOCK_GROUP_METADATA;
3898 ret = get_alloc_profile(fs_info, flags);
3902 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
3904 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
3907 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
3909 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3912 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
3914 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3917 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
3919 struct btrfs_root *root = inode->root;
3920 struct btrfs_fs_info *fs_info = root->fs_info;
3921 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
3924 int need_commit = 2;
3925 int have_pinned_space;
3927 /* make sure bytes are sectorsize aligned */
3928 bytes = ALIGN(bytes, fs_info->sectorsize);
3930 if (btrfs_is_free_space_inode(inode)) {
3932 ASSERT(current->journal_info);
3936 /* make sure we have enough space to handle the data first */
3937 spin_lock(&data_sinfo->lock);
3938 used = btrfs_space_info_used(data_sinfo, true);
3940 if (used + bytes > data_sinfo->total_bytes) {
3941 struct btrfs_trans_handle *trans;
3944 * if we don't have enough free bytes in this space then we need
3945 * to alloc a new chunk.
3947 if (!data_sinfo->full) {
3950 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3951 spin_unlock(&data_sinfo->lock);
3953 alloc_target = btrfs_data_alloc_profile(fs_info);
3955 * It is ugly that we don't call a nolock join
3956 * transaction for the free space inode case here.
3957 * But it is safe because we only do the data space
3958 * reservation for the free space cache in the
3959 * transaction context; the common join transaction
3960 * just increases the counter of the current transaction
3961 * handle and doesn't try to acquire the trans_lock of
3964 trans = btrfs_join_transaction(root);
3966 return PTR_ERR(trans);
3968 ret = btrfs_chunk_alloc(trans, alloc_target,
3969 CHUNK_ALLOC_NO_FORCE);
3970 btrfs_end_transaction(trans);
3975 have_pinned_space = 1;
3984 * If we don't have enough pinned space to deal with this
3985 * allocation, and no chunk was removed in the current transaction,
3986 * don't bother committing the transaction.
3988 have_pinned_space = __percpu_counter_compare(
3989 &data_sinfo->total_bytes_pinned,
3990 used + bytes - data_sinfo->total_bytes,
3991 BTRFS_TOTAL_BYTES_PINNED_BATCH);
3992 spin_unlock(&data_sinfo->lock);
3994 /* commit the current transaction and try again */
3999 if (need_commit > 0) {
4000 btrfs_start_delalloc_roots(fs_info, -1);
4001 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4005 trans = btrfs_join_transaction(root);
4007 return PTR_ERR(trans);
4008 if (have_pinned_space >= 0 ||
4009 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4010 &trans->transaction->flags) ||
4012 ret = btrfs_commit_transaction(trans);
4016 * The cleaner kthread might still be doing iput
4017 * operations. Wait for it to finish so that
4018 * more space is released. We don't need to
4019 * explicitly run the delayed iputs here because
4020 * the commit_transaction would have woken up
4023 ret = btrfs_wait_on_delayed_iputs(fs_info);
4028 btrfs_end_transaction(trans);
4032 trace_btrfs_space_reservation(fs_info,
4033 "space_info:enospc",
4034 data_sinfo->flags, bytes, 1);
4037 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
4038 trace_btrfs_space_reservation(fs_info, "space_info",
4039 data_sinfo->flags, bytes, 1);
4040 spin_unlock(&data_sinfo->lock);
4045 int btrfs_check_data_free_space(struct inode *inode,
4046 struct extent_changeset **reserved, u64 start, u64 len)
4048 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4051 /* align the range */
4052 len = round_up(start + len, fs_info->sectorsize) -
4053 round_down(start, fs_info->sectorsize);
4054 start = round_down(start, fs_info->sectorsize);
4056 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4060 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4061 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4063 btrfs_free_reserved_data_space_noquota(inode, start, len);
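/*
 * Illustration of the alignment above (assuming a 4 KiB sectorsize): a
 * request for start = 6000, len = 3000 is widened to the sector-aligned
 * range [4096, 12288), i.e. start = 4096 and len = 8192, before any space
 * or qgroup reservation is made.
 */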
4070 * Called if we need to clear a data reservation for this inode
4071 * Normally in an error case.
4073 * This one will *NOT* use the accurate qgroup reserved space API; it is for
4074 * cases where we can't sleep and are sure it won't affect qgroup reserved space.
4075 * Like clear_bit_hook().
4077 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4080 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4081 struct btrfs_space_info *data_sinfo;
4083 /* Make sure the range is aligned to sectorsize */
4084 len = round_up(start + len, fs_info->sectorsize) -
4085 round_down(start, fs_info->sectorsize);
4086 start = round_down(start, fs_info->sectorsize);
4088 data_sinfo = fs_info->data_sinfo;
4089 spin_lock(&data_sinfo->lock);
4090 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
4091 trace_btrfs_space_reservation(fs_info, "space_info",
4092 data_sinfo->flags, len, 0);
4093 spin_unlock(&data_sinfo->lock);
4097 * Called if we need to clear a data reservation for this inode
4098 * Normally in an error case.
4100 * This one will handle the per-inode data rsv map for accurate reserved
4103 void btrfs_free_reserved_data_space(struct inode *inode,
4104 struct extent_changeset *reserved, u64 start, u64 len)
4106 struct btrfs_root *root = BTRFS_I(inode)->root;
4108 /* Make sure the range is aligned to sectorsize */
4109 len = round_up(start + len, root->fs_info->sectorsize) -
4110 round_down(start, root->fs_info->sectorsize);
4111 start = round_down(start, root->fs_info->sectorsize);
4113 btrfs_free_reserved_data_space_noquota(inode, start, len);
4114 btrfs_qgroup_free_data(inode, reserved, start, len);
4117 static void force_metadata_allocation(struct btrfs_fs_info *info)
4119 struct list_head *head = &info->space_info;
4120 struct btrfs_space_info *found;
4123 list_for_each_entry_rcu(found, head, list) {
4124 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4125 found->force_alloc = CHUNK_ALLOC_FORCE;
4130 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4131 struct btrfs_space_info *sinfo, int force)
4133 u64 bytes_used = btrfs_space_info_used(sinfo, false);
4136 if (force == CHUNK_ALLOC_FORCE)
4140 * in limited mode, we want to have some free space up to
4141 * about 1% of the FS size.
4143 if (force == CHUNK_ALLOC_LIMITED) {
4144 thresh = btrfs_super_total_bytes(fs_info->super_copy);
4145 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4147 if (sinfo->total_bytes - bytes_used < thresh)
4151 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
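/*
 * Illustration (hypothetical numbers): on a 100 GiB filesystem the
 * CHUNK_ALLOC_LIMITED threshold above is max(64 MiB, 1% of the fs), i.e.
 * 1 GiB of free space in this space_info, while the final check only asks
 * for a new chunk once bytes_used plus 2 MiB reaches 80% of the
 * space_info's total_bytes.
 */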
4156 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4160 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4162 num_dev = fs_info->fs_devices->rw_devices;
4168 * Reserve space in the SYSTEM space_info, if needed, so that the device
4169 * items can be updated and a chunk item added or removed in this transaction.
4172 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4174 struct btrfs_fs_info *fs_info = trans->fs_info;
4175 struct btrfs_space_info *info;
4182 * Needed because we can end up allocating a system chunk and for an
4183 * atomic and race-free space reservation in the chunk block reserve.
4185 lockdep_assert_held(&fs_info->chunk_mutex);
4187 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4188 spin_lock(&info->lock);
4189 left = info->total_bytes - btrfs_space_info_used(info, true);
4190 spin_unlock(&info->lock);
4192 num_devs = get_profile_num_devs(fs_info, type);
4194 /* num_devs device items to update and 1 chunk item to add or remove */
4195 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4196 btrfs_calc_trans_metadata_size(fs_info, 1);
4198 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4199 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4200 left, thresh, type);
4201 dump_space_info(fs_info, info, 0, 0);
4204 if (left < thresh) {
4205 u64 flags = btrfs_system_alloc_profile(fs_info);
4208 * Ignore failure to create system chunk. We might end up not
4209 * needing it, as we might not need to COW all nodes/leafs from
4210 * the paths we visit in the chunk tree (they were already COWed
4211 * or created in the current transaction for example).
4213 ret = btrfs_alloc_chunk(trans, flags);
4217 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4218 &fs_info->chunk_block_rsv,
4219 thresh, BTRFS_RESERVE_NO_FLUSH);
4221 trans->chunk_bytes_reserved += thresh;
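/*
 * Illustration (hypothetical, assuming 16 KiB nodes and two devices): the
 * threshold above amounts to a few full tree paths worth of nodes, on the
 * order of half a megabyte, reserved in the chunk block reserve so the
 * device items and the new chunk item can be COWed without hitting ENOSPC.
 */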
4226 * If force is CHUNK_ALLOC_FORCE:
4227 * - return 1 if it successfully allocates a chunk,
4228 * - return errors including -ENOSPC otherwise.
4229 * If force is NOT CHUNK_ALLOC_FORCE:
4230 * - return 0 if it doesn't need to allocate a new chunk,
4231 * - return 1 if it successfully allocates a chunk,
4232 * - return errors including -ENOSPC otherwise.
4234 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4235 enum btrfs_chunk_alloc_enum force)
4237 struct btrfs_fs_info *fs_info = trans->fs_info;
4238 struct btrfs_space_info *space_info;
4239 bool wait_for_alloc = false;
4240 bool should_alloc = false;
4243 /* Don't re-enter if we're already allocating a chunk */
4244 if (trans->allocating_chunk)
4247 space_info = btrfs_find_space_info(fs_info, flags);
4251 spin_lock(&space_info->lock);
4252 if (force < space_info->force_alloc)
4253 force = space_info->force_alloc;
4254 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4255 if (space_info->full) {
4256 /* No more free physical space */
4261 spin_unlock(&space_info->lock);
4263 } else if (!should_alloc) {
4264 spin_unlock(&space_info->lock);
4266 } else if (space_info->chunk_alloc) {
4268 * Someone is already allocating, so we need to block
4269 * until this someone is finished and then loop to
4270 * recheck if we should continue with our allocation
4273 wait_for_alloc = true;
4274 spin_unlock(&space_info->lock);
4275 mutex_lock(&fs_info->chunk_mutex);
4276 mutex_unlock(&fs_info->chunk_mutex);
4278 /* Proceed with allocation */
4279 space_info->chunk_alloc = 1;
4280 wait_for_alloc = false;
4281 spin_unlock(&space_info->lock);
4285 } while (wait_for_alloc);
4287 mutex_lock(&fs_info->chunk_mutex);
4288 trans->allocating_chunk = true;
4291 * If we have mixed data/metadata chunks we want to make sure we keep
4292 * allocating mixed chunks instead of individual chunks.
4294 if (btrfs_mixed_space_info(space_info))
4295 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4298 * if we're doing a data chunk, go ahead and make sure that
4299 * we keep a reasonable number of metadata chunks allocated in the
4302 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4303 fs_info->data_chunk_allocations++;
4304 if (!(fs_info->data_chunk_allocations %
4305 fs_info->metadata_ratio))
4306 force_metadata_allocation(fs_info);
4310 * Check if we have enough space in SYSTEM chunk because we may need
4311 * to update devices.
4313 check_system_chunk(trans, flags);
4315 ret = btrfs_alloc_chunk(trans, flags);
4316 trans->allocating_chunk = false;
4318 spin_lock(&space_info->lock);
4321 space_info->full = 1;
4326 space_info->max_extent_size = 0;
4329 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4331 space_info->chunk_alloc = 0;
4332 spin_unlock(&space_info->lock);
4333 mutex_unlock(&fs_info->chunk_mutex);
4335 * When we allocate a new chunk we reserve space in the chunk block
4336 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4337 * add new nodes/leafs to it if we end up needing to do it when
4338 * inserting the chunk item and updating device items as part of the
4339 * second phase of chunk allocation, performed by
4340 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4341 * large number of new block groups to create in our transaction
4342 * handle's new_bgs list to avoid exhausting the chunk block reserve
4343 * in extreme cases - like having a single transaction create many new
4344 * block groups when starting to write out the free space caches of all
4345 * the block groups that were made dirty during the lifetime of the
4348 if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4349 btrfs_create_pending_block_groups(trans);
4354 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4355 unsigned long nr_pages, int nr_items)
4357 struct super_block *sb = fs_info->sb;
4359 if (down_read_trylock(&sb->s_umount)) {
4360 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4361 up_read(&sb->s_umount);
4364 * We needn't worry about the filesystem going from r/w to r/o even
4365 * though we don't acquire the ->s_umount mutex, because the filesystem
4366 * should guarantee that the delalloc inode list is empty after the
4367 * filesystem becomes read-only (all dirty pages are written to
4370 btrfs_start_delalloc_roots(fs_info, nr_items);
4371 if (!current->journal_info)
4372 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4376 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4382 bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4383 nr = div64_u64(to_reclaim, bytes);
4389 #define EXTENT_SIZE_PER_ITEM SZ_256K
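/*
 * Illustration (assuming 16 KiB nodes): btrfs_calc_trans_metadata_size()
 * for a single item is 2 * BTRFS_MAX_LEVEL * nodesize = 256 KiB, so asking
 * calc_reclaim_items_nr() above to reclaim 4 MiB yields 16 items, and
 * shrink_delalloc() below will then try to flush
 * 16 * EXTENT_SIZE_PER_ITEM = 4 MiB of delalloc.
 */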
4392 * shrink metadata reservation for delalloc
4394 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4395 u64 orig, bool wait_ordered)
4397 struct btrfs_space_info *space_info;
4398 struct btrfs_trans_handle *trans;
4404 unsigned long nr_pages;
4407 /* Calc the number of pages we need to flush for space reservation */
4408 items = calc_reclaim_items_nr(fs_info, to_reclaim);
4409 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4411 trans = (struct btrfs_trans_handle *)current->journal_info;
4412 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4414 delalloc_bytes = percpu_counter_sum_positive(
4415 &fs_info->delalloc_bytes);
4416 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
4417 if (delalloc_bytes == 0 && dio_bytes == 0) {
4421 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4426 * If we are doing more ordered than delalloc we need to just wait on
4427 * ordered extents, otherwise we'll waste time trying to flush delalloc
4428 * that likely won't give us the space back we need.
4430 if (dio_bytes > delalloc_bytes)
4431 wait_ordered = true;
4434 while ((delalloc_bytes || dio_bytes) && loops < 3) {
4435 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
4438 * Triggers inode writeback for up to nr_pages. This will invoke
4439 * ->writepages callback and trigger delalloc filling
4440 * (btrfs_run_delalloc_range()).
4442 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4445 * We need to wait for the compressed pages to start before
4448 async_pages = atomic_read(&fs_info->async_delalloc_pages);
4453 * Calculate how many compressed pages we want to be written
4454 * before we continue. I.e. if there are more async pages than we
4455 * require, wait_event will wait until nr_pages are written.
4457 if (async_pages <= nr_pages)
4460 async_pages -= nr_pages;
4462 wait_event(fs_info->async_submit_wait,
4463 atomic_read(&fs_info->async_delalloc_pages) <=
4466 spin_lock(&space_info->lock);
4467 if (list_empty(&space_info->tickets) &&
4468 list_empty(&space_info->priority_tickets)) {
4469 spin_unlock(&space_info->lock);
4472 spin_unlock(&space_info->lock);
4475 if (wait_ordered && !trans) {
4476 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4478 time_left = schedule_timeout_killable(1);
4482 delalloc_bytes = percpu_counter_sum_positive(
4483 &fs_info->delalloc_bytes);
4484 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
4488 struct reserve_ticket {
4492 struct list_head list;
4493 wait_queue_head_t wait;
4497 * may_commit_transaction - possibly commit the transaction if it's ok to
4498 * @fs_info - the filesystem we are flushing space for
4499 * @space_info - the space_info we are trying to satisfy
4502 * This will check to make sure that committing the transaction will actually
4503 * get us somewhere and then commit the transaction if it does. Otherwise it
4504 * will return -ENOSPC.
4506 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4507 struct btrfs_space_info *space_info)
4509 struct reserve_ticket *ticket = NULL;
4510 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4511 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4512 struct btrfs_trans_handle *trans;
4514 u64 reclaim_bytes = 0;
4516 trans = (struct btrfs_trans_handle *)current->journal_info;
4520 spin_lock(&space_info->lock);
4521 if (!list_empty(&space_info->priority_tickets))
4522 ticket = list_first_entry(&space_info->priority_tickets,
4523 struct reserve_ticket, list);
4524 else if (!list_empty(&space_info->tickets))
4525 ticket = list_first_entry(&space_info->tickets,
4526 struct reserve_ticket, list);
4527 bytes_needed = (ticket) ? ticket->bytes : 0;
4528 spin_unlock(&space_info->lock);
4533 trans = btrfs_join_transaction(fs_info->extent_root);
4535 return PTR_ERR(trans);
4538 * See if there is enough pinned space to make this reservation, or if
4539 * we have block groups that are going to be freed, allowing us to
4540 * possibly do a chunk allocation the next loop through.
4542 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
4543 __percpu_counter_compare(&space_info->total_bytes_pinned,
4545 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4549 * See if there is some space in the delayed insertion reservation for
4552 if (space_info != delayed_rsv->space_info)
4555 spin_lock(&delayed_rsv->lock);
4556 reclaim_bytes += delayed_rsv->reserved;
4557 spin_unlock(&delayed_rsv->lock);
4559 spin_lock(&delayed_refs_rsv->lock);
4560 reclaim_bytes += delayed_refs_rsv->reserved;
4561 spin_unlock(&delayed_refs_rsv->lock);
4562 if (reclaim_bytes >= bytes_needed)
4564 bytes_needed -= reclaim_bytes;
4566 if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4568 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
4572 return btrfs_commit_transaction(trans);
4574 btrfs_end_transaction(trans);
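/*
 * Put differently: the commit is only attempted when the pinned bytes it
 * would release, plus whatever can be reclaimed from the delayed item and
 * delayed refs rsvs above, are expected to cover at least the first
 * waiting ticket; otherwise we return -ENOSPC rather than pay for a
 * commit that cannot satisfy the reservation anyway.
 */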
4579 * Try to flush some data based on policy set by @state. This is only advisory
4580 * and may fail for various reasons. The caller is supposed to examine the
4581 * state of @space_info to detect the outcome.
4583 static void flush_space(struct btrfs_fs_info *fs_info,
4584 struct btrfs_space_info *space_info, u64 num_bytes,
4587 struct btrfs_root *root = fs_info->extent_root;
4588 struct btrfs_trans_handle *trans;
4593 case FLUSH_DELAYED_ITEMS_NR:
4594 case FLUSH_DELAYED_ITEMS:
4595 if (state == FLUSH_DELAYED_ITEMS_NR)
4596 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4600 trans = btrfs_join_transaction(root);
4601 if (IS_ERR(trans)) {
4602 ret = PTR_ERR(trans);
4605 ret = btrfs_run_delayed_items_nr(trans, nr);
4606 btrfs_end_transaction(trans);
4608 case FLUSH_DELALLOC:
4609 case FLUSH_DELALLOC_WAIT:
4610 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4611 state == FLUSH_DELALLOC_WAIT);
4613 case FLUSH_DELAYED_REFS_NR:
4614 case FLUSH_DELAYED_REFS:
4615 trans = btrfs_join_transaction(root);
4616 if (IS_ERR(trans)) {
4617 ret = PTR_ERR(trans);
4620 if (state == FLUSH_DELAYED_REFS_NR)
4621 nr = calc_reclaim_items_nr(fs_info, num_bytes);
4624 btrfs_run_delayed_refs(trans, nr);
4625 btrfs_end_transaction(trans);
4628 case ALLOC_CHUNK_FORCE:
4629 trans = btrfs_join_transaction(root);
4630 if (IS_ERR(trans)) {
4631 ret = PTR_ERR(trans);
4634 ret = btrfs_chunk_alloc(trans,
4635 btrfs_metadata_alloc_profile(fs_info),
4636 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
4638 btrfs_end_transaction(trans);
4639 if (ret > 0 || ret == -ENOSPC)
4644 * If we have pending delayed iputs then we could free up a
4645 * bunch of pinned space, so make sure we run the iputs before
4646 * we do our pinned bytes check below.
4648 btrfs_run_delayed_iputs(fs_info);
4649 btrfs_wait_on_delayed_iputs(fs_info);
4651 ret = may_commit_transaction(fs_info, space_info);
4658 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4664 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4665 struct btrfs_space_info *space_info,
4668 struct reserve_ticket *ticket;
4673 list_for_each_entry(ticket, &space_info->tickets, list)
4674 to_reclaim += ticket->bytes;
4675 list_for_each_entry(ticket, &space_info->priority_tickets, list)
4676 to_reclaim += ticket->bytes;
4680 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4681 if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
4682 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4685 used = btrfs_space_info_used(space_info, true);
4687 if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
4688 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4689 expected = div_factor_fine(space_info->total_bytes, 95);
4691 expected = div_factor_fine(space_info->total_bytes, 90);
4693 if (used > expected)
4694 to_reclaim = used - expected;
4697 to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4698 space_info->bytes_reserved);
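/*
 * Worked example for the no-ticket case above: on an 8-CPU machine the
 * preemptive reclaim target starts at min(8 * SZ_1M, SZ_16M) = 8MiB; if
 * that much can still be overcommitted there is nothing to reclaim,
 * otherwise the target becomes the amount by which "used" exceeds
 * 90-95% of total_bytes, clamped to bytes_may_use + bytes_reserved.
 */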
4702 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4703 struct btrfs_space_info *space_info,
4704 u64 used, bool system_chunk)
4706 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4708 /* If we're just plain full then async reclaim just slows us down. */
4709 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4712 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4716 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4717 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4720 static bool wake_all_tickets(struct list_head *head)
4722 struct reserve_ticket *ticket;
4724 while (!list_empty(head)) {
4725 ticket = list_first_entry(head, struct reserve_ticket, list);
4726 list_del_init(&ticket->list);
4727 ticket->error = -ENOSPC;
4728 wake_up(&ticket->wait);
4729 if (ticket->bytes != ticket->orig_bytes)
4736 * This is for normal flushers, we can wait all goddamned day if we want to. We
4737 * will loop and continuously try to flush as long as we are making progress.
4738 * We count progress as clearing off tickets each time we have to loop.
4740 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4742 struct btrfs_fs_info *fs_info;
4743 struct btrfs_space_info *space_info;
4746 int commit_cycles = 0;
4747 u64 last_tickets_id;
4749 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4750 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4752 spin_lock(&space_info->lock);
4753 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4756 space_info->flush = 0;
4757 spin_unlock(&space_info->lock);
4760 last_tickets_id = space_info->tickets_id;
4761 spin_unlock(&space_info->lock);
4763 flush_state = FLUSH_DELAYED_ITEMS_NR;
4765 flush_space(fs_info, space_info, to_reclaim, flush_state);
4766 spin_lock(&space_info->lock);
4767 if (list_empty(&space_info->tickets)) {
4768 space_info->flush = 0;
4769 spin_unlock(&space_info->lock);
4772 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
4775 if (last_tickets_id == space_info->tickets_id) {
4778 last_tickets_id = space_info->tickets_id;
4779 flush_state = FLUSH_DELAYED_ITEMS_NR;
4785 * We don't want to force a chunk allocation until we've tried
4786 * pretty hard to reclaim space. Think of the case where we
4787 * freed up a bunch of space and so have a lot of pinned space
4788 * to reclaim. We would rather use that than possibly create an
4789 * underutilized metadata chunk. So if this is our first run
4790 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
4791 * commit the transaction. If nothing has changed the next go
4792 * around then we can force a chunk allocation.
4794 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
4797 if (flush_state > COMMIT_TRANS) {
4799 if (commit_cycles > 2) {
4800 if (wake_all_tickets(&space_info->tickets)) {
4801 flush_state = FLUSH_DELAYED_ITEMS_NR;
4804 space_info->flush = 0;
4807 flush_state = FLUSH_DELAYED_ITEMS_NR;
4810 spin_unlock(&space_info->lock);
4811 } while (flush_state <= COMMIT_TRANS);
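/*
 * The do/while loop above walks the btrfs_flush_state values in ascending
 * order: first the cheap states (running some or all delayed items), then
 * flushing delalloc, running delayed refs, forcing a chunk allocation and
 * finally committing the transaction.  Whenever an iteration sees that
 * tickets were satisfied in the meantime (tickets_id moved) it restarts
 * from the cheapest state instead of escalating further.
 */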
4814 void btrfs_init_async_reclaim_work(struct work_struct *work)
4816 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
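/*
 * Priority flushers (e.g. BTRFS_RESERVE_FLUSH_LIMIT) never queue the
 * async worker above.  They add themselves to the priority ticket list
 * and run only the short list of states below directly in their own
 * context, which keeps callers that already hold a transaction handle
 * from deadlocking on a full flush.
 */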
4819 static const enum btrfs_flush_state priority_flush_states[] = {
4820 FLUSH_DELAYED_ITEMS_NR,
4821 FLUSH_DELAYED_ITEMS,
4825 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
4826 struct btrfs_space_info *space_info,
4827 struct reserve_ticket *ticket)
4832 spin_lock(&space_info->lock);
4833 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4836 spin_unlock(&space_info->lock);
4839 spin_unlock(&space_info->lock);
4843 flush_space(fs_info, space_info, to_reclaim,
4844 priority_flush_states[flush_state]);
4846 spin_lock(&space_info->lock);
4847 if (ticket->bytes == 0) {
4848 spin_unlock(&space_info->lock);
4851 spin_unlock(&space_info->lock);
4852 } while (flush_state < ARRAY_SIZE(priority_flush_states));
4855 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
4856 struct btrfs_space_info *space_info,
4857 struct reserve_ticket *ticket)
4861 u64 reclaim_bytes = 0;
4864 spin_lock(&space_info->lock);
4865 while (ticket->bytes > 0 && ticket->error == 0) {
4866 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
4871 spin_unlock(&space_info->lock);
4875 finish_wait(&ticket->wait, &wait);
4876 spin_lock(&space_info->lock);
4879 ret = ticket->error;
4880 if (!list_empty(&ticket->list))
4881 list_del_init(&ticket->list);
4882 if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
4883 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
4884 spin_unlock(&space_info->lock);
4887 btrfs_space_info_add_old_bytes(fs_info, space_info,
4893 * __reserve_metadata_bytes - try to reserve bytes from the space_info
4894 * @fs_info - the fs_info for our fs
4895 * @space_info - the space info we want to allocate from
4896 * @orig_bytes - the number of bytes we want
4897 * @flush - whether or not we can flush to make our reservation
4899 * This will reserve orig_bytes number of bytes from the space info associated
4900 * with the block_rsv. If there is not enough space it will make an attempt to
4901 * flush out space to make room. It will do this by flushing delalloc if
4902 * possible or committing the transaction. If flush is 0 then no attempts to
4903 * regain reservations will be made and this will fail if there is not enough
4906 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
4907 struct btrfs_space_info *space_info,
4909 enum btrfs_reserve_flush_enum flush,
4912 struct reserve_ticket ticket;
4914 u64 reclaim_bytes = 0;
4918 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
4920 spin_lock(&space_info->lock);
4922 used = btrfs_space_info_used(space_info, true);
4925 * If we have enough space then hooray, make our reservation and carry
4926 * on. If not see if we can overcommit, and if we can, hooray carry on.
4927 * If not things get more complicated.
4929 if (used + orig_bytes <= space_info->total_bytes) {
4930 btrfs_space_info_update_bytes_may_use(fs_info, space_info,
4932 trace_btrfs_space_reservation(fs_info, "space_info",
4933 space_info->flags, orig_bytes, 1);
4935 } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush,
4937 btrfs_space_info_update_bytes_may_use(fs_info, space_info,
4939 trace_btrfs_space_reservation(fs_info, "space_info",
4940 space_info->flags, orig_bytes, 1);
4945 * If we couldn't make a reservation then setup our reservation ticket
4946 * and kick the async worker if it's not already running.
4948 * If we are a priority flusher then we just need to add our ticket to
4949 * the list and we will do our own flushing further down.
4951 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4952 ticket.orig_bytes = orig_bytes;
4953 ticket.bytes = orig_bytes;
4955 init_waitqueue_head(&ticket.wait);
4956 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
4957 list_add_tail(&ticket.list, &space_info->tickets);
4958 if (!space_info->flush) {
4959 space_info->flush = 1;
4960 trace_btrfs_trigger_flush(fs_info,
4964 queue_work(system_unbound_wq,
4965 &fs_info->async_reclaim_work);
4968 list_add_tail(&ticket.list,
4969 &space_info->priority_tickets);
4971 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4974 * We will do the space reservation dance during log replay,
4975 * which means we won't have fs_info->fs_root set, so don't do
4976 * the async reclaim as we will panic.
4978 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
4979 need_do_async_reclaim(fs_info, space_info,
4980 used, system_chunk) &&
4981 !work_busy(&fs_info->async_reclaim_work)) {
4982 trace_btrfs_trigger_flush(fs_info, space_info->flags,
4983 orig_bytes, flush, "preempt");
4984 queue_work(system_unbound_wq,
4985 &fs_info->async_reclaim_work);
4988 spin_unlock(&space_info->lock);
4989 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4992 if (flush == BTRFS_RESERVE_FLUSH_ALL)
4993 return wait_reserve_ticket(fs_info, space_info, &ticket);
4996 priority_reclaim_metadata_space(fs_info, space_info, &ticket);
4997 spin_lock(&space_info->lock);
4999 if (ticket.bytes < orig_bytes)
5000 reclaim_bytes = orig_bytes - ticket.bytes;
5001 list_del_init(&ticket.list);
5004 spin_unlock(&space_info->lock);
5007 btrfs_space_info_add_old_bytes(fs_info, space_info,
5009 ASSERT(list_empty(&ticket.list));
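/*
 * Note that the ticket above lives on this function's stack: by the time
 * we return, either a flusher granted the whole reservation
 * (ticket.bytes == 0), or we removed the ticket ourselves and handed any
 * partially granted bytes back through btrfs_space_info_add_old_bytes().
 */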
5014 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5015 * @root - the root we're allocating for
5016 * @block_rsv - the block_rsv we're allocating for
5017 * @orig_bytes - the number of bytes we want
5018 * @flush - whether or not we can flush to make our reservation
5020 * This will reserve orig_bytes number of bytes from the space info associated
5021 * with the block_rsv. If there is not enough space it will make an attempt to
5022 * flush out space to make room. It will do this by flushing delalloc if
5023 * possible or committing the transaction. If flush is 0 then no attempts to
5024 * regain reservations will be made and this will fail if there is not enough
5027 static int reserve_metadata_bytes(struct btrfs_root *root,
5028 struct btrfs_block_rsv *block_rsv,
5030 enum btrfs_reserve_flush_enum flush)
5032 struct btrfs_fs_info *fs_info = root->fs_info;
5033 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5035 bool system_chunk = (root == fs_info->chunk_root);
5037 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5038 orig_bytes, flush, system_chunk);
5039 if (ret == -ENOSPC &&
5040 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5041 if (block_rsv != global_rsv &&
5042 !block_rsv_use_bytes(global_rsv, orig_bytes))
5045 if (ret == -ENOSPC) {
5046 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5047 block_rsv->space_info->flags,
5050 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5051 dump_space_info(fs_info, block_rsv->space_info,
5057 static struct btrfs_block_rsv *get_block_rsv(
5058 const struct btrfs_trans_handle *trans,
5059 const struct btrfs_root *root)
5061 struct btrfs_fs_info *fs_info = root->fs_info;
5062 struct btrfs_block_rsv *block_rsv = NULL;
5064 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5065 (root == fs_info->csum_root && trans->adding_csums) ||
5066 (root == fs_info->uuid_root))
5067 block_rsv = trans->block_rsv;
5070 block_rsv = root->block_rsv;
5073 block_rsv = &fs_info->empty_block_rsv;
5078 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5082 spin_lock(&block_rsv->lock);
5083 if (block_rsv->reserved >= num_bytes) {
5084 block_rsv->reserved -= num_bytes;
5085 if (block_rsv->reserved < block_rsv->size)
5086 block_rsv->full = 0;
5089 spin_unlock(&block_rsv->lock);
5093 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5094 u64 num_bytes, bool update_size)
5096 spin_lock(&block_rsv->lock);
5097 block_rsv->reserved += num_bytes;
5099 block_rsv->size += num_bytes;
5100 else if (block_rsv->reserved >= block_rsv->size)
5101 block_rsv->full = 1;
5102 spin_unlock(&block_rsv->lock);
5105 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5106 struct btrfs_block_rsv *dest, u64 num_bytes,
5109 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5112 if (global_rsv->space_info != dest->space_info)
5115 spin_lock(&global_rsv->lock);
5116 min_bytes = div_factor(global_rsv->size, min_factor);
5117 if (global_rsv->reserved < min_bytes + num_bytes) {
5118 spin_unlock(&global_rsv->lock);
5121 global_rsv->reserved -= num_bytes;
5122 if (global_rsv->reserved < global_rsv->size)
5123 global_rsv->full = 0;
5124 spin_unlock(&global_rsv->lock);
5126 block_rsv_add_bytes(dest, num_bytes, true);
5131 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
5132 * @fs_info - the fs info for our fs.
5133 * @src - the source block rsv to transfer from.
5134 * @num_bytes - the number of bytes to transfer.
5136 * This transfers up to the num_bytes amount from the src rsv to the
5137 * delayed_refs_rsv. Any extra bytes are returned to the space info.
5139 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
5140 struct btrfs_block_rsv *src,
5143 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5146 spin_lock(&src->lock);
5147 src->reserved -= num_bytes;
5148 src->size -= num_bytes;
5149 spin_unlock(&src->lock);
5151 spin_lock(&delayed_refs_rsv->lock);
5152 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
5153 u64 delta = delayed_refs_rsv->size -
5154 delayed_refs_rsv->reserved;
5155 if (num_bytes > delta) {
5156 to_free = num_bytes - delta;
5160 to_free = num_bytes;
5165 delayed_refs_rsv->reserved += num_bytes;
5166 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
5167 delayed_refs_rsv->full = 1;
5168 spin_unlock(&delayed_refs_rsv->lock);
5171 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5174 btrfs_space_info_add_old_bytes(fs_info,
5175 delayed_refs_rsv->space_info, to_free);
5179 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
5180 * @fs_info - the fs_info for our fs.
5181 * @flush - control how we can flush for this reservation.
5183 * This will refill the delayed refs rsv up to 1 item's size worth of space and
5184 * will return -ENOSPC if we can't make the reservation.
5186 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
5187 enum btrfs_reserve_flush_enum flush)
5189 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5190 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
5194 spin_lock(&block_rsv->lock);
5195 if (block_rsv->reserved < block_rsv->size) {
5196 num_bytes = block_rsv->size - block_rsv->reserved;
5197 num_bytes = min(num_bytes, limit);
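		/*
		 * "limit" is one item's worth of metadata,
		 * btrfs_calc_trans_metadata_size(fs_info, 1), i.e. enough
		 * space to CoW a full tree path (roughly nodesize * 2 *
		 * BTRFS_MAX_LEVEL, so about 256KiB with the default 16KiB
		 * nodesize), which is why the refill is capped to that
		 * amount per call rather than to a handful of bytes.
		 */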
5199 spin_unlock(&block_rsv->lock);
5204 ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
5208 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5209 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5215 * This is for space we already have accounted in space_info->bytes_may_use, so
5216 * basically when we're returning space from block_rsvs.
5218 void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5219 struct btrfs_space_info *space_info,
5222 struct reserve_ticket *ticket;
5223 struct list_head *head;
5225 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5226 bool check_overcommit = false;
5228 spin_lock(&space_info->lock);
5229 head = &space_info->priority_tickets;
5232 * If we are over our limit then we need to check and see if we can
5233 * overcommit, and if we can't then we just need to free up our space
5234 * and not satisfy any requests.
5236 used = btrfs_space_info_used(space_info, true);
5237 if (used - num_bytes >= space_info->total_bytes)
5238 check_overcommit = true;
5240 while (!list_empty(head) && num_bytes) {
5241 ticket = list_first_entry(head, struct reserve_ticket,
5244 * We use 0 bytes because this space is already reserved, so
5245 * adding the ticket space would be a double count.
5247 if (check_overcommit &&
5248 !btrfs_can_overcommit(fs_info, space_info, 0, flush, false))
5250 if (num_bytes >= ticket->bytes) {
5251 list_del_init(&ticket->list);
5252 num_bytes -= ticket->bytes;
5254 space_info->tickets_id++;
5255 wake_up(&ticket->wait);
5257 ticket->bytes -= num_bytes;
5262 if (num_bytes && head == &space_info->priority_tickets) {
5263 head = &space_info->tickets;
5264 flush = BTRFS_RESERVE_FLUSH_ALL;
5267 btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
5268 trace_btrfs_space_reservation(fs_info, "space_info",
5269 space_info->flags, num_bytes, 0);
5270 spin_unlock(&space_info->lock);
5274 * This is for newly allocated space that isn't accounted in
5275 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
5276 * we use this helper.
5278 void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5279 struct btrfs_space_info *space_info,
5282 struct reserve_ticket *ticket;
5283 struct list_head *head = &space_info->priority_tickets;
5286 while (!list_empty(head) && num_bytes) {
5287 ticket = list_first_entry(head, struct reserve_ticket,
5289 if (num_bytes >= ticket->bytes) {
5290 trace_btrfs_space_reservation(fs_info, "space_info",
5293 list_del_init(&ticket->list);
5294 num_bytes -= ticket->bytes;
5295 btrfs_space_info_update_bytes_may_use(fs_info,
5296 space_info, ticket->bytes);
5298 space_info->tickets_id++;
5299 wake_up(&ticket->wait);
5301 trace_btrfs_space_reservation(fs_info, "space_info",
5304 btrfs_space_info_update_bytes_may_use(fs_info,
5305 space_info, num_bytes);
5306 ticket->bytes -= num_bytes;
5311 if (num_bytes && head == &space_info->priority_tickets) {
5312 head = &space_info->tickets;
5317 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5318 struct btrfs_block_rsv *block_rsv,
5319 struct btrfs_block_rsv *dest, u64 num_bytes,
5320 u64 *qgroup_to_release_ret)
5322 struct btrfs_space_info *space_info = block_rsv->space_info;
5323 u64 qgroup_to_release = 0;
5326 spin_lock(&block_rsv->lock);
5327 if (num_bytes == (u64)-1) {
5328 num_bytes = block_rsv->size;
5329 qgroup_to_release = block_rsv->qgroup_rsv_size;
5331 block_rsv->size -= num_bytes;
5332 if (block_rsv->reserved >= block_rsv->size) {
5333 num_bytes = block_rsv->reserved - block_rsv->size;
5334 block_rsv->reserved = block_rsv->size;
5335 block_rsv->full = 1;
5339 if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5340 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5341 block_rsv->qgroup_rsv_size;
5342 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5344 qgroup_to_release = 0;
5346 spin_unlock(&block_rsv->lock);
5349 if (num_bytes > 0) {
5351 spin_lock(&dest->lock);
5355 bytes_to_add = dest->size - dest->reserved;
5356 bytes_to_add = min(num_bytes, bytes_to_add);
5357 dest->reserved += bytes_to_add;
5358 if (dest->reserved >= dest->size)
5360 num_bytes -= bytes_to_add;
5362 spin_unlock(&dest->lock);
5365 btrfs_space_info_add_old_bytes(fs_info, space_info,
5368 if (qgroup_to_release_ret)
5369 *qgroup_to_release_ret = qgroup_to_release;
5373 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5374 struct btrfs_block_rsv *dst, u64 num_bytes,
5379 ret = block_rsv_use_bytes(src, num_bytes);
5383 block_rsv_add_bytes(dst, num_bytes, update_size);
5387 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5389 memset(rsv, 0, sizeof(*rsv));
5390 spin_lock_init(&rsv->lock);
5394 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5395 struct btrfs_block_rsv *rsv,
5396 unsigned short type)
5398 btrfs_init_block_rsv(rsv, type);
5399 rsv->space_info = btrfs_find_space_info(fs_info,
5400 BTRFS_BLOCK_GROUP_METADATA);
5403 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5404 unsigned short type)
5406 struct btrfs_block_rsv *block_rsv;
5408 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5412 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5416 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5417 struct btrfs_block_rsv *rsv)
5421 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5425 int btrfs_block_rsv_add(struct btrfs_root *root,
5426 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5427 enum btrfs_reserve_flush_enum flush)
5434 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5436 block_rsv_add_bytes(block_rsv, num_bytes, true);
5441 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5449 spin_lock(&block_rsv->lock);
5450 num_bytes = div_factor(block_rsv->size, min_factor);
5451 if (block_rsv->reserved >= num_bytes)
5453 spin_unlock(&block_rsv->lock);
5458 int btrfs_block_rsv_refill(struct btrfs_root *root,
5459 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5460 enum btrfs_reserve_flush_enum flush)
5468 spin_lock(&block_rsv->lock);
5469 num_bytes = min_reserved;
5470 if (block_rsv->reserved >= num_bytes)
5473 num_bytes -= block_rsv->reserved;
5474 spin_unlock(&block_rsv->lock);
5479 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5481 block_rsv_add_bytes(block_rsv, num_bytes, false);
5488 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5489 struct btrfs_block_rsv *block_rsv,
5490 u64 num_bytes, u64 *qgroup_to_release)
5492 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5493 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5494 struct btrfs_block_rsv *target = delayed_rsv;
5496 if (target->full || target == block_rsv)
5497 target = global_rsv;
5499 if (block_rsv->space_info != target->space_info)
5502 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
5506 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5507 struct btrfs_block_rsv *block_rsv,
5510 __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
5514 * btrfs_inode_rsv_release - release any excessive reservation.
5515 * @inode - the inode we need to release from.
5516 * @qgroup_free - free or convert qgroup meta.
5517 * Unlike normal operation, qgroup meta reservation needs to know if we are
5518 * freeing qgroup reservation or just converting it into per-trans. Normally
5519 * @qgroup_free is true for error handling, and false for normal release.
5521 * This is the same as btrfs_block_rsv_release, except that it handles the
5522 * tracepoint for the reservation.
5524 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5526 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5527 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5529 u64 qgroup_to_release = 0;
5532 * Since we statically set the block_rsv->size we just want to say we
5533 * are releasing 0 bytes, and then we'll just get the reservation over
5536 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
5537 &qgroup_to_release);
5539 trace_btrfs_space_reservation(fs_info, "delalloc",
5540 btrfs_ino(inode), released, 0);
5542 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5544 btrfs_qgroup_convert_reserved_meta(inode->root,
5549 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
5550 * @fs_info - the fs_info for our fs.
5551 * @nr - the number of items to drop.
5553 * This drops the delayed ref head's count from the delayed refs rsv and frees
5554 * any excess reservation we had.
5556 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
5558 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5559 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5560 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
5563 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
5566 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5570 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5572 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5573 struct btrfs_space_info *sinfo = block_rsv->space_info;
5577 * The global block rsv is based on the size of the extent tree, the
5578 * checksum tree and the root tree. If the fs is empty we want to set
5579 * it to a minimal amount for safety.
5581 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5582 btrfs_root_used(&fs_info->csum_root->root_item) +
5583 btrfs_root_used(&fs_info->tree_root->root_item);
5584 num_bytes = max_t(u64, num_bytes, SZ_16M);
5586 spin_lock(&sinfo->lock);
5587 spin_lock(&block_rsv->lock);
5589 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5591 if (block_rsv->reserved < block_rsv->size) {
5592 num_bytes = btrfs_space_info_used(sinfo, true);
5593 if (sinfo->total_bytes > num_bytes) {
5594 num_bytes = sinfo->total_bytes - num_bytes;
5595 num_bytes = min(num_bytes,
5596 block_rsv->size - block_rsv->reserved);
5597 block_rsv->reserved += num_bytes;
5598 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
5600 trace_btrfs_space_reservation(fs_info, "space_info",
5601 sinfo->flags, num_bytes,
5604 } else if (block_rsv->reserved > block_rsv->size) {
5605 num_bytes = block_rsv->reserved - block_rsv->size;
5606 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
5608 trace_btrfs_space_reservation(fs_info, "space_info",
5609 sinfo->flags, num_bytes, 0);
5610 block_rsv->reserved = block_rsv->size;
5613 if (block_rsv->reserved == block_rsv->size)
5614 block_rsv->full = 1;
5616 block_rsv->full = 0;
5618 spin_unlock(&block_rsv->lock);
5619 spin_unlock(&sinfo->lock);
5622 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5624 struct btrfs_space_info *space_info;
5626 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5627 fs_info->chunk_block_rsv.space_info = space_info;
5629 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5630 fs_info->global_block_rsv.space_info = space_info;
5631 fs_info->trans_block_rsv.space_info = space_info;
5632 fs_info->empty_block_rsv.space_info = space_info;
5633 fs_info->delayed_block_rsv.space_info = space_info;
5634 fs_info->delayed_refs_rsv.space_info = space_info;
5636 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
5637 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
5638 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5639 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5640 if (fs_info->quota_root)
5641 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5642 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5644 update_global_block_rsv(fs_info);
5647 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5649 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5651 WARN_ON(fs_info->trans_block_rsv.size > 0);
5652 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5653 WARN_ON(fs_info->chunk_block_rsv.size > 0);
5654 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5655 WARN_ON(fs_info->delayed_block_rsv.size > 0);
5656 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5657 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
5658 WARN_ON(fs_info->delayed_refs_rsv.size > 0);
5662 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
5663 * @trans - the trans that may have generated delayed refs
5665 * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
5666 * it'll calculate the additional size and add it to the delayed_refs_rsv.
5668 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
5670 struct btrfs_fs_info *fs_info = trans->fs_info;
5671 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5674 if (!trans->delayed_ref_updates)
5677 num_bytes = btrfs_calc_trans_metadata_size(fs_info,
5678 trans->delayed_ref_updates);
5679 spin_lock(&delayed_rsv->lock);
5680 delayed_rsv->size += num_bytes;
5681 delayed_rsv->full = 0;
5682 spin_unlock(&delayed_rsv->lock);
5683 trans->delayed_ref_updates = 0;
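/*
 * Typical pairing for the delayed refs rsv elsewhere in this file:
 * callers bump trans->delayed_ref_updates while queueing ref updates and
 * then call btrfs_update_delayed_refs_rsv() to grow the rsv size; running
 * the ref heads later calls btrfs_delayed_refs_rsv_release() to shrink it
 * again and return the excess, while btrfs_delayed_refs_rsv_refill() and
 * btrfs_migrate_to_delayed_refs_rsv() top it back up when it runs short.
 */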
5687 * To be called after all the new block groups attached to the transaction
5688 * handle have been created (btrfs_create_pending_block_groups()).
5690 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5692 struct btrfs_fs_info *fs_info = trans->fs_info;
5694 if (!trans->chunk_bytes_reserved)
5697 WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5699 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5700 trans->chunk_bytes_reserved, NULL);
5701 trans->chunk_bytes_reserved = 0;
5705 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5706 * root: the root of the parent directory
5707 * rsv: block reservation
5708 * items: the number of items that we need do reservation
5709 * use_global_rsv: allow fallback to the global block reservation
5711 * This function is used to reserve the space for snapshot/subvolume
5712 * creation and deletion. Those operations are different from the
5713 * common file/directory operations, as they change two fs/file trees
5714 * and the root tree; the number of items that the qgroup reserves is
5715 * different from the free space reservation. So we cannot use
5716 * the space reservation mechanism in start_transaction().
5718 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5719 struct btrfs_block_rsv *rsv, int items,
5720 bool use_global_rsv)
5722 u64 qgroup_num_bytes = 0;
5725 struct btrfs_fs_info *fs_info = root->fs_info;
5726 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5728 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5729 /* One for parent inode, two for dir entries */
5730 qgroup_num_bytes = 3 * fs_info->nodesize;
5731 ret = btrfs_qgroup_reserve_meta_prealloc(root,
5732 qgroup_num_bytes, true);
5737 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5738 rsv->space_info = btrfs_find_space_info(fs_info,
5739 BTRFS_BLOCK_GROUP_METADATA);
5740 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5741 BTRFS_RESERVE_FLUSH_ALL);
5743 if (ret == -ENOSPC && use_global_rsv)
5744 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
5746 if (ret && qgroup_num_bytes)
5747 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5752 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5753 struct btrfs_block_rsv *rsv)
5755 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5758 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5759 struct btrfs_inode *inode)
5761 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5762 u64 reserve_size = 0;
5763 u64 qgroup_rsv_size = 0;
5765 unsigned outstanding_extents;
5767 lockdep_assert_held(&inode->lock);
5768 outstanding_extents = inode->outstanding_extents;
5769 if (outstanding_extents)
5770 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
5771 outstanding_extents + 1);
5772 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
5774 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
5777 * For qgroup rsv, the calculation is very simple:
5778 * account one nodesize for each outstanding extent
5780 * This is overestimating in most cases.
5782 qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
5784 spin_lock(&block_rsv->lock);
5785 block_rsv->size = reserve_size;
5786 block_rsv->qgroup_rsv_size = qgroup_rsv_size;
5787 spin_unlock(&block_rsv->lock);
5790 static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
5791 u64 num_bytes, u64 *meta_reserve,
5792 u64 *qgroup_reserve)
5794 u64 nr_extents = count_max_extents(num_bytes);
5795 u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
5797 /* We add one for the inode update at finish ordered time */
5798 *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
5799 nr_extents + csum_leaves + 1);
5800 *qgroup_reserve = nr_extents * fs_info->nodesize;
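/*
 * Illustrative numbers for the helper above (the exact leaf count depends
 * on the csum and leaf sizes): a 1MiB buffered write is a single
 * outstanding extent, its checksums fit in one csum leaf, and one more
 * item is added for the inode update at finish-ordered time, so roughly
 * three items' worth of metadata plus one nodesize of qgroup space get
 * reserved.
 */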
5803 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
5805 struct btrfs_root *root = inode->root;
5806 struct btrfs_fs_info *fs_info = root->fs_info;
5807 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5808 u64 meta_reserve, qgroup_reserve;
5809 unsigned nr_extents;
5810 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5812 bool delalloc_lock = true;
5814 /* If we are a free space inode we need to not flush since we will be in
5815 * the middle of a transaction commit. We also don't need the delalloc
5816 * mutex since we won't race with anybody. We need this mostly to make
5817 * lockdep shut its filthy mouth.
5819 * If we have a transaction open (can happen if we call truncate_block
5820 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5822 if (btrfs_is_free_space_inode(inode)) {
5823 flush = BTRFS_RESERVE_NO_FLUSH;
5824 delalloc_lock = false;
5826 if (current->journal_info)
5827 flush = BTRFS_RESERVE_FLUSH_LIMIT;
5829 if (btrfs_transaction_in_commit(fs_info))
5830 schedule_timeout(1);
5834 mutex_lock(&inode->delalloc_mutex);
5836 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5839 * We always want to do it this way, every other way is wrong and ends
5840 * in tears. Pre-reserving the amount we are going to add will always
5841 * be the right way, because otherwise if we have enough parallelism we
5842 * could end up with thousands of inodes all holding little bits of
5843 * reservations they were able to make previously and the only way to
5844 * reclaim that space is to ENOSPC out the operations and clear
5845 * everything out and try again, which is bad. This way we just
5846 * over-reserve slightly, and clean up the mess when we are done.
5848 calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
5850 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
5853 ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
5858 * Now we need to update our outstanding extents and csum bytes _first_
5859 * and then add the reservation to the block_rsv. This keeps us from
5860 * racing with an ordered completion or some such that would think it
5861 * needs to free the reservation we just made.
5863 spin_lock(&inode->lock);
5864 nr_extents = count_max_extents(num_bytes);
5865 btrfs_mod_outstanding_extents(inode, nr_extents);
5866 inode->csum_bytes += num_bytes;
5867 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5868 spin_unlock(&inode->lock);
5870 /* Now we can safely add our space to our block rsv */
5871 block_rsv_add_bytes(block_rsv, meta_reserve, false);
5872 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5873 btrfs_ino(inode), meta_reserve, 1);
5875 spin_lock(&block_rsv->lock);
5876 block_rsv->qgroup_rsv_reserved += qgroup_reserve;
5877 spin_unlock(&block_rsv->lock);
5880 mutex_unlock(&inode->delalloc_mutex);
5883 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
5885 btrfs_inode_rsv_release(inode, true);
5887 mutex_unlock(&inode->delalloc_mutex);
5892 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5893 * @inode: the inode to release the reservation for.
5894 * @num_bytes: the number of bytes we are releasing.
5895 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
5897 * This will release the metadata reservation for an inode. This can be called
5898 * once we complete IO for a given set of bytes to release their metadata
5899 * reservations, or on error for the same reason.
5901 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
5904 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5906 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5907 spin_lock(&inode->lock);
5908 inode->csum_bytes -= num_bytes;
5909 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5910 spin_unlock(&inode->lock);
5912 if (btrfs_is_testing(fs_info))
5915 btrfs_inode_rsv_release(inode, qgroup_free);
5919 * btrfs_delalloc_release_extents - release our outstanding_extents
5920 * @inode: the inode to balance the reservation for.
5921 * @num_bytes: the number of bytes we originally reserved with
5922 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
5924 * When we reserve space we increase outstanding_extents for the extents we may
5925 * add. Once we've set the range as delalloc or created our ordered extents we
5926 * have outstanding_extents to track the real usage, so we use this to free our
5927 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
5928 * with btrfs_delalloc_reserve_metadata.
5930 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
5933 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5934 unsigned num_extents;
5936 spin_lock(&inode->lock);
5937 num_extents = count_max_extents(num_bytes);
5938 btrfs_mod_outstanding_extents(inode, -num_extents);
5939 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5940 spin_unlock(&inode->lock);
5942 if (btrfs_is_testing(fs_info))
5945 btrfs_inode_rsv_release(inode, qgroup_free);
5949 * btrfs_delalloc_reserve_space - reserve data and metadata space for
5951 * @inode: inode we're writing to
5952 * @start: start range we are writing to
5953 * @len: how long the range we are writing to
5954 * @reserved: mandatory parameter, records the actually reserved qgroup ranges of
5955 * the current reservation.
5957 * This will do the following things
5959 * o reserve space in data space info for num bytes
5960 * and reserve precious corresponding qgroup space
5961 * (Done in check_data_free_space)
5963 * o reserve space for the metadata, based on the number of outstanding
5964 * extents and how many csum items will be needed;
5965 * also reserve metadata space in a per-root over-reserve method.
5966 * o add to the inodes->delalloc_bytes
5967 * o add it to the fs_info's delalloc inodes list.
5968 * (Above 3 all done in delalloc_reserve_metadata)
5970 * Return 0 for success
5971 * Return <0 for error (-ENOSPC or -EDQUOT)
5973 int btrfs_delalloc_reserve_space(struct inode *inode,
5974 struct extent_changeset **reserved, u64 start, u64 len)
5978 ret = btrfs_check_data_free_space(inode, reserved, start, len);
5981 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
5983 btrfs_free_reserved_data_space(inode, *reserved, start, len);
5988 * btrfs_delalloc_release_space - release data and metadata space for delalloc
5989 * @inode: inode we're releasing space for
5990 * @start: start position of the space already reserved
5991 * @len: the len of the space already reserved
5992 * @release_bytes: the len of the space we consumed or didn't use
5994 * This function will release the metadata space that was not used and will
5995 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5996 * list if there are no delalloc bytes left.
5997 * Also it will handle the qgroup reserved space.
5999 void btrfs_delalloc_release_space(struct inode *inode,
6000 struct extent_changeset *reserved,
6001 u64 start, u64 len, bool qgroup_free)
6003 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6004 btrfs_free_reserved_data_space(inode, reserved, start, len);
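/*
 * Putting the delalloc helpers together, a write path uses them roughly
 * like this (error handling, locking and the qgroup_free choices omitted):
 *
 *	btrfs_delalloc_reserve_space(inode, &reserved, pos, len);
 *	... dirty the pages and mark the range delalloc ...
 *	btrfs_delalloc_release_extents(BTRFS_I(inode), len, ...);
 *
 * and if the write fails before the range was marked delalloc,
 * btrfs_delalloc_release_space(inode, reserved, pos, len, ...) hands back
 * both the data and the metadata reservation.
 */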
6007 static int update_block_group(struct btrfs_trans_handle *trans,
6008 u64 bytenr, u64 num_bytes, int alloc)
6010 struct btrfs_fs_info *info = trans->fs_info;
6011 struct btrfs_block_group_cache *cache = NULL;
6012 u64 total = num_bytes;
6018 /* block accounting for super block */
6019 spin_lock(&info->delalloc_root_lock);
6020 old_val = btrfs_super_bytes_used(info->super_copy);
6022 old_val += num_bytes;
6024 old_val -= num_bytes;
6025 btrfs_set_super_bytes_used(info->super_copy, old_val);
6026 spin_unlock(&info->delalloc_root_lock);
6029 cache = btrfs_lookup_block_group(info, bytenr);
6034 factor = btrfs_bg_type_to_factor(cache->flags);
6037 * If this block group has free space cache written out, we
6038 * need to make sure to load it if we are removing space. This
6039 * is because we need the unpinning stage to actually add the
6040 * space back to the block group, otherwise we will leak space.
6042 if (!alloc && cache->cached == BTRFS_CACHE_NO)
6043 cache_block_group(cache, 1);
6045 byte_in_group = bytenr - cache->key.objectid;
6046 WARN_ON(byte_in_group > cache->key.offset);
6048 spin_lock(&cache->space_info->lock);
6049 spin_lock(&cache->lock);
6051 if (btrfs_test_opt(info, SPACE_CACHE) &&
6052 cache->disk_cache_state < BTRFS_DC_CLEAR)
6053 cache->disk_cache_state = BTRFS_DC_CLEAR;
6055 old_val = btrfs_block_group_used(&cache->item);
6056 num_bytes = min(total, cache->key.offset - byte_in_group);
6058 old_val += num_bytes;
6059 btrfs_set_block_group_used(&cache->item, old_val);
6060 cache->reserved -= num_bytes;
6061 cache->space_info->bytes_reserved -= num_bytes;
6062 cache->space_info->bytes_used += num_bytes;
6063 cache->space_info->disk_used += num_bytes * factor;
6064 spin_unlock(&cache->lock);
6065 spin_unlock(&cache->space_info->lock);
6067 old_val -= num_bytes;
6068 btrfs_set_block_group_used(&cache->item, old_val);
6069 cache->pinned += num_bytes;
6070 btrfs_space_info_update_bytes_pinned(info,
6071 cache->space_info, num_bytes);
6072 cache->space_info->bytes_used -= num_bytes;
6073 cache->space_info->disk_used -= num_bytes * factor;
6074 spin_unlock(&cache->lock);
6075 spin_unlock(&cache->space_info->lock);
6077 trace_btrfs_space_reservation(info, "pinned",
6078 cache->space_info->flags,
6080 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6082 BTRFS_TOTAL_BYTES_PINNED_BATCH);
6083 set_extent_dirty(info->pinned_extents,
6084 bytenr, bytenr + num_bytes - 1,
6085 GFP_NOFS | __GFP_NOFAIL);
6088 spin_lock(&trans->transaction->dirty_bgs_lock);
6089 if (list_empty(&cache->dirty_list)) {
6090 list_add_tail(&cache->dirty_list,
6091 &trans->transaction->dirty_bgs);
6092 trans->delayed_ref_updates++;
6093 btrfs_get_block_group(cache);
6095 spin_unlock(&trans->transaction->dirty_bgs_lock);
6098 * No longer have used bytes in this block group, queue it for
6099 * deletion. We do this after adding the block group to the
6100 * dirty list to avoid races between cleaner kthread and space
6103 if (!alloc && old_val == 0)
6104 btrfs_mark_bg_unused(cache);
6106 btrfs_put_block_group(cache);
6108 bytenr += num_bytes;
6111 /* Modified block groups are accounted for in the delayed_refs_rsv. */
6112 btrfs_update_delayed_refs_rsv(trans);
6116 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6118 struct btrfs_block_group_cache *cache;
6121 spin_lock(&fs_info->block_group_cache_lock);
6122 bytenr = fs_info->first_logical_byte;
6123 spin_unlock(&fs_info->block_group_cache_lock);
6125 if (bytenr < (u64)-1)
6128 cache = btrfs_lookup_first_block_group(fs_info, search_start);
6132 bytenr = cache->key.objectid;
6133 btrfs_put_block_group(cache);
6138 static int pin_down_extent(struct btrfs_block_group_cache *cache,
6139 u64 bytenr, u64 num_bytes, int reserved)
6141 struct btrfs_fs_info *fs_info = cache->fs_info;
6143 spin_lock(&cache->space_info->lock);
6144 spin_lock(&cache->lock);
6145 cache->pinned += num_bytes;
6146 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
6149 cache->reserved -= num_bytes;
6150 cache->space_info->bytes_reserved -= num_bytes;
6152 spin_unlock(&cache->lock);
6153 spin_unlock(&cache->space_info->lock);
6155 trace_btrfs_space_reservation(fs_info, "pinned",
6156 cache->space_info->flags, num_bytes, 1);
6157 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6158 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6159 set_extent_dirty(fs_info->pinned_extents, bytenr,
6160 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6165 * this function must be called within transaction
6167 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6168 u64 bytenr, u64 num_bytes, int reserved)
6170 struct btrfs_block_group_cache *cache;
6172 cache = btrfs_lookup_block_group(fs_info, bytenr);
6173 BUG_ON(!cache); /* Logic error */
6175 pin_down_extent(cache, bytenr, num_bytes, reserved);
6177 btrfs_put_block_group(cache);
6182 * this function must be called within transaction
6184 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6185 u64 bytenr, u64 num_bytes)
6187 struct btrfs_block_group_cache *cache;
6190 cache = btrfs_lookup_block_group(fs_info, bytenr);
6195 * pull in the free space cache (if any) so that our pin
6196 * removes the free space from the cache. We have load_only set
6197 * to one because the slow code to read in the free extents does check
6198 * the pinned extents.
6200 cache_block_group(cache, 1);
6202 pin_down_extent(cache, bytenr, num_bytes, 0);
6204 /* remove us from the free space cache (if we're there at all) */
6205 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6206 btrfs_put_block_group(cache);
6210 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6211 u64 start, u64 num_bytes)
6214 struct btrfs_block_group_cache *block_group;
6215 struct btrfs_caching_control *caching_ctl;
6217 block_group = btrfs_lookup_block_group(fs_info, start);
6221 cache_block_group(block_group, 0);
6222 caching_ctl = get_caching_control(block_group);
6226 BUG_ON(!block_group_cache_done(block_group));
6227 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6229 mutex_lock(&caching_ctl->mutex);
6231 if (start >= caching_ctl->progress) {
6232 ret = add_excluded_extent(fs_info, start, num_bytes);
6233 } else if (start + num_bytes <= caching_ctl->progress) {
6234 ret = btrfs_remove_free_space(block_group,
6237 num_bytes = caching_ctl->progress - start;
6238 ret = btrfs_remove_free_space(block_group,
6243 num_bytes = (start + num_bytes) -
6244 caching_ctl->progress;
6245 start = caching_ctl->progress;
6246 ret = add_excluded_extent(fs_info, start, num_bytes);
6249 mutex_unlock(&caching_ctl->mutex);
6250 put_caching_control(caching_ctl);
6252 btrfs_put_block_group(block_group);
6256 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
6258 struct btrfs_fs_info *fs_info = eb->fs_info;
6259 struct btrfs_file_extent_item *item;
6260 struct btrfs_key key;
6265 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6268 for (i = 0; i < btrfs_header_nritems(eb); i++) {
6269 btrfs_item_key_to_cpu(eb, &key, i);
6270 if (key.type != BTRFS_EXTENT_DATA_KEY)
6272 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6273 found_type = btrfs_file_extent_type(eb, item);
6274 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6276 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6278 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6279 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6280 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
6289 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6291 atomic_inc(&bg->reservations);
6294 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6297 struct btrfs_block_group_cache *bg;
6299 bg = btrfs_lookup_block_group(fs_info, start);
6301 if (atomic_dec_and_test(&bg->reservations))
6302 wake_up_var(&bg->reservations);
6303 btrfs_put_block_group(bg);
6306 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6308 struct btrfs_space_info *space_info = bg->space_info;
6312 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6316 * Our block group is read only but before we set it to read only,
6318 * some task might have already allocated an extent from it, but it
6318 * has not yet created a respective ordered extent (and added it to a
6319 * root's list of ordered extents).
6320 * Therefore wait for any task currently allocating extents, since the
6321 * block group's reservations counter is incremented while a read lock
6322 * on the groups' semaphore is held and decremented after releasing
6323 * the read access on that semaphore and creating the ordered extent.
6325 down_write(&space_info->groups_sem);
6326 up_write(&space_info->groups_sem);
6328 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6332 * btrfs_add_reserved_bytes - update the block_group and space info counters
6333 * @cache: The cache we are manipulating
6334 * @ram_bytes: The number of bytes of file content, and will be the same as
6335 * @num_bytes except for the compression path.
6336 * @num_bytes: The number of bytes in question
6337 * @delalloc: The blocks are allocated for the delalloc write
6339 * This is called by the allocator when it reserves space. If this is a
6340 * reservation and the block group has become read only we cannot make the
6341 * reservation and return -EAGAIN, otherwise this function always succeeds.
6343 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6344 u64 ram_bytes, u64 num_bytes, int delalloc)
6346 struct btrfs_space_info *space_info = cache->space_info;
6349 spin_lock(&space_info->lock);
6350 spin_lock(&cache->lock);
6354 cache->reserved += num_bytes;
6355 space_info->bytes_reserved += num_bytes;
6356 btrfs_space_info_update_bytes_may_use(cache->fs_info,
6357 space_info, -ram_bytes);
6359 cache->delalloc_bytes += num_bytes;
6361 spin_unlock(&cache->lock);
6362 spin_unlock(&space_info->lock);
6367 * btrfs_free_reserved_bytes - update the block_group and space info counters
6368 * @cache: The cache we are manipulating
6369 * @num_bytes: The number of bytes in question
6370 * @delalloc: The blocks are allocated for the delalloc write
6372 * This is called by somebody who is freeing space that was never actually used
6373 * on disk. For example if you reserve some space for a new leaf in transaction
6374 * A and before transaction A commits you free that leaf, you call this to
6375 * clear the reservation.
6378 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6379 u64 num_bytes, int delalloc)
6381 struct btrfs_space_info *space_info = cache->space_info;
6383 spin_lock(&space_info->lock);
6384 spin_lock(&cache->lock);
6386 space_info->bytes_readonly += num_bytes;
6387 cache->reserved -= num_bytes;
6388 space_info->bytes_reserved -= num_bytes;
6389 space_info->max_extent_size = 0;
6392 cache->delalloc_bytes -= num_bytes;
6393 spin_unlock(&cache->lock);
6394 spin_unlock(&space_info->lock);
6396 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6398 struct btrfs_caching_control *next;
6399 struct btrfs_caching_control *caching_ctl;
6400 struct btrfs_block_group_cache *cache;
6402 down_write(&fs_info->commit_root_sem);
6404 list_for_each_entry_safe(caching_ctl, next,
6405 &fs_info->caching_block_groups, list) {
6406 cache = caching_ctl->block_group;
6407 if (block_group_cache_done(cache)) {
6408 cache->last_byte_to_unpin = (u64)-1;
6409 list_del_init(&caching_ctl->list);
6410 put_caching_control(caching_ctl);
6412 cache->last_byte_to_unpin = caching_ctl->progress;
6416 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6417 fs_info->pinned_extents = &fs_info->freed_extents[1];
6419 fs_info->pinned_extents = &fs_info->freed_extents[0];
6421 up_write(&fs_info->commit_root_sem);
6423 update_global_block_rsv(fs_info);
6427 * Returns the free cluster for the given space info and sets empty_cluster to
6428 * what it should be based on the mount options.
6430 static struct btrfs_free_cluster *
6431 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6432 struct btrfs_space_info *space_info, u64 *empty_cluster)
6434 struct btrfs_free_cluster *ret = NULL;
6437 if (btrfs_mixed_space_info(space_info))
6440 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6441 ret = &fs_info->meta_alloc_cluster;
6442 if (btrfs_test_opt(fs_info, SSD))
6443 *empty_cluster = SZ_2M;
6445 *empty_cluster = SZ_64K;
6446 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6447 btrfs_test_opt(fs_info, SSD_SPREAD)) {
6448 *empty_cluster = SZ_2M;
6449 ret = &fs_info->data_alloc_cluster;
6455 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6457 const bool return_free_space)
6459 struct btrfs_block_group_cache *cache = NULL;
6460 struct btrfs_space_info *space_info;
6461 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6462 struct btrfs_free_cluster *cluster = NULL;
6464 u64 total_unpinned = 0;
6465 u64 empty_cluster = 0;
6468 while (start <= end) {
6471 start >= cache->key.objectid + cache->key.offset) {
6473 btrfs_put_block_group(cache);
6475 cache = btrfs_lookup_block_group(fs_info, start);
6476 BUG_ON(!cache); /* Logic error */
6478 cluster = fetch_cluster_info(fs_info,
6481 empty_cluster <<= 1;
6484 len = cache->key.objectid + cache->key.offset - start;
6485 len = min(len, end + 1 - start);
6487 if (start < cache->last_byte_to_unpin) {
6488 len = min(len, cache->last_byte_to_unpin - start);
6489 if (return_free_space)
6490 btrfs_add_free_space(cache, start, len);
6494 total_unpinned += len;
6495 space_info = cache->space_info;
6498 * If this space cluster has been marked as fragmented and we've
6499 * unpinned enough in this block group to potentially allow a
6500 * cluster to be created inside of it go ahead and clear the
6503 if (cluster && cluster->fragmented &&
6504 total_unpinned > empty_cluster) {
6505 spin_lock(&cluster->lock);
6506 cluster->fragmented = 0;
6507 spin_unlock(&cluster->lock);
6510 spin_lock(&space_info->lock);
6511 spin_lock(&cache->lock);
6512 cache->pinned -= len;
6513 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
6515 trace_btrfs_space_reservation(fs_info, "pinned",
6516 space_info->flags, len, 0);
6517 space_info->max_extent_size = 0;
6518 percpu_counter_add_batch(&space_info->total_bytes_pinned,
6519 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6521 space_info->bytes_readonly += len;
6524 spin_unlock(&cache->lock);
6525 if (!readonly && return_free_space &&
6526 global_rsv->space_info == space_info) {
6529 spin_lock(&global_rsv->lock);
6530 if (!global_rsv->full) {
6531 to_add = min(len, global_rsv->size -
6532 global_rsv->reserved);
6533 global_rsv->reserved += to_add;
6534 btrfs_space_info_update_bytes_may_use(fs_info,
6535 space_info, to_add);
6536 if (global_rsv->reserved >= global_rsv->size)
6537 global_rsv->full = 1;
6538 trace_btrfs_space_reservation(fs_info,
6544 spin_unlock(&global_rsv->lock);
6545 /* Add to any tickets we may have */
6547 btrfs_space_info_add_new_bytes(fs_info,
6550 spin_unlock(&space_info->lock);
6554 btrfs_put_block_group(cache);
6558 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6560 struct btrfs_fs_info *fs_info = trans->fs_info;
6561 struct btrfs_block_group_cache *block_group, *tmp;
6562 struct list_head *deleted_bgs;
6563 struct extent_io_tree *unpin;
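/*
 * pinned_extents alternates between the two freed_extents trees from
 * one transaction to the next; the tree it no longer points to holds
 * the extents pinned by the transaction being finished, and that is
 * the tree we unpin here.
 */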
6568 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6569 unpin = &fs_info->freed_extents[1];
6571 unpin = &fs_info->freed_extents[0];
6573 while (!trans->aborted) {
6574 struct extent_state *cached_state = NULL;
6576 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6577 ret = find_first_extent_bit(unpin, 0, &start, &end,
6578 EXTENT_DIRTY, &cached_state);
6580 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6584 if (btrfs_test_opt(fs_info, DISCARD))
6585 ret = btrfs_discard_extent(fs_info, start,
6586 end + 1 - start, NULL);
6588 clear_extent_dirty(unpin, start, end, &cached_state);
6589 unpin_extent_range(fs_info, start, end, true);
6590 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6591 free_extent_state(cached_state);
6596 * Transaction is finished. We don't need the lock anymore. We
6597 * do need to clean up the block groups in case of a transaction abort.
6600 deleted_bgs = &trans->transaction->deleted_bgs;
6601 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6605 if (!trans->aborted)
6606 ret = btrfs_discard_extent(fs_info,
6607 block_group->key.objectid,
6608 block_group->key.offset,
6611 list_del_init(&block_group->bg_list);
6612 btrfs_put_block_group_trimming(block_group);
6613 btrfs_put_block_group(block_group);
6616 const char *errstr = btrfs_decode_error(ret);
6618 "discard failed while removing blockgroup: errno=%d %s",
6626 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6627 struct btrfs_delayed_ref_node *node, u64 parent,
6628 u64 root_objectid, u64 owner_objectid,
6629 u64 owner_offset, int refs_to_drop,
6630 struct btrfs_delayed_extent_op *extent_op)
6632 struct btrfs_fs_info *info = trans->fs_info;
6633 struct btrfs_key key;
6634 struct btrfs_path *path;
6635 struct btrfs_root *extent_root = info->extent_root;
6636 struct extent_buffer *leaf;
6637 struct btrfs_extent_item *ei;
6638 struct btrfs_extent_inline_ref *iref;
6641 int extent_slot = 0;
6642 int found_extent = 0;
6646 u64 bytenr = node->bytenr;
6647 u64 num_bytes = node->num_bytes;
6649 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6651 path = btrfs_alloc_path();
6655 path->reada = READA_FORWARD;
6656 path->leave_spinning = 1;
6658 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6659 BUG_ON(!is_data && refs_to_drop != 1);
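/*
 * Only data extents can have more than one reference dropped in a
 * single delayed ref (their backrefs carry a ref count); freeing a
 * tree block always drops exactly one reference, which is what the
 * assertion above checks.
 */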
6662 skinny_metadata = false;
6664 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
6665 parent, root_objectid, owner_objectid,
6668 extent_slot = path->slots[0];
6669 while (extent_slot >= 0) {
6670 btrfs_item_key_to_cpu(path->nodes[0], &key,
6672 if (key.objectid != bytenr)
6674 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6675 key.offset == num_bytes) {
6679 if (key.type == BTRFS_METADATA_ITEM_KEY &&
6680 key.offset == owner_objectid) {
6684 if (path->slots[0] - extent_slot > 5)
6689 if (!found_extent) {
6691 ret = remove_extent_backref(trans, path, NULL,
6693 is_data, &last_ref);
6695 btrfs_abort_transaction(trans, ret);
6698 btrfs_release_path(path);
6699 path->leave_spinning = 1;
6701 key.objectid = bytenr;
6702 key.type = BTRFS_EXTENT_ITEM_KEY;
6703 key.offset = num_bytes;
6705 if (!is_data && skinny_metadata) {
6706 key.type = BTRFS_METADATA_ITEM_KEY;
6707 key.offset = owner_objectid;
6710 ret = btrfs_search_slot(trans, extent_root,
6712 if (ret > 0 && skinny_metadata && path->slots[0]) {
6714 * Couldn't find our skinny metadata item,
6715 * see if we have ye olde extent item.
6718 btrfs_item_key_to_cpu(path->nodes[0], &key,
6720 if (key.objectid == bytenr &&
6721 key.type == BTRFS_EXTENT_ITEM_KEY &&
6722 key.offset == num_bytes)
6726 if (ret > 0 && skinny_metadata) {
6727 skinny_metadata = false;
6728 key.objectid = bytenr;
6729 key.type = BTRFS_EXTENT_ITEM_KEY;
6730 key.offset = num_bytes;
6731 btrfs_release_path(path);
6732 ret = btrfs_search_slot(trans, extent_root,
6738 "umm, got %d back from search, was looking for %llu",
6741 btrfs_print_leaf(path->nodes[0]);
6744 btrfs_abort_transaction(trans, ret);
6747 extent_slot = path->slots[0];
6749 } else if (WARN_ON(ret == -ENOENT)) {
6750 btrfs_print_leaf(path->nodes[0]);
6752 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
6753 bytenr, parent, root_objectid, owner_objectid,
6755 btrfs_abort_transaction(trans, ret);
6758 btrfs_abort_transaction(trans, ret);
6762 leaf = path->nodes[0];
6763 item_size = btrfs_item_size_nr(leaf, extent_slot);
6764 if (unlikely(item_size < sizeof(*ei))) {
6766 btrfs_print_v0_err(info);
6767 btrfs_abort_transaction(trans, ret);
6770 ei = btrfs_item_ptr(leaf, extent_slot,
6771 struct btrfs_extent_item);
6772 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6773 key.type == BTRFS_EXTENT_ITEM_KEY) {
6774 struct btrfs_tree_block_info *bi;
6775 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6776 bi = (struct btrfs_tree_block_info *)(ei + 1);
6777 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6780 refs = btrfs_extent_refs(leaf, ei);
6781 if (refs < refs_to_drop) {
6783 "trying to drop %d refs but we only have %Lu for bytenr %Lu",
6784 refs_to_drop, refs, bytenr);
6786 btrfs_abort_transaction(trans, ret);
6789 refs -= refs_to_drop;
6793 __run_delayed_extent_op(extent_op, leaf, ei);
6795 * In the case of inline back ref, reference count will
6796 * be updated by remove_extent_backref
6799 BUG_ON(!found_extent);
6801 btrfs_set_extent_refs(leaf, ei, refs);
6802 btrfs_mark_buffer_dirty(leaf);
6805 ret = remove_extent_backref(trans, path, iref,
6806 refs_to_drop, is_data,
6809 btrfs_abort_transaction(trans, ret);
6815 BUG_ON(is_data && refs_to_drop !=
6816 extent_data_ref_count(path, iref));
6818 BUG_ON(path->slots[0] != extent_slot);
6820 BUG_ON(path->slots[0] != extent_slot + 1);
6821 path->slots[0] = extent_slot;
6827 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6830 btrfs_abort_transaction(trans, ret);
6833 btrfs_release_path(path);
6836 ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
6838 btrfs_abort_transaction(trans, ret);
6843 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
6845 btrfs_abort_transaction(trans, ret);
6849 ret = update_block_group(trans, bytenr, num_bytes, 0);
6851 btrfs_abort_transaction(trans, ret);
6855 btrfs_release_path(path);
6858 btrfs_free_path(path);
6863 * when we free a block, it is possible (and likely) that we free the last
6864 * delayed ref for that extent as well. This searches the delayed ref tree for
6865 * a given extent, and if there are no other delayed refs to be processed, it
6866 * removes it from the tree.
6868 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6871 struct btrfs_delayed_ref_head *head;
6872 struct btrfs_delayed_ref_root *delayed_refs;
6875 delayed_refs = &trans->transaction->delayed_refs;
6876 spin_lock(&delayed_refs->lock);
6877 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
6879 goto out_delayed_unlock;
6881 spin_lock(&head->lock);
6882 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
6885 if (cleanup_extent_op(head) != NULL)
6889 * waiting for the lock here would deadlock. If someone else has it
6890 * locked they are already in the process of dropping it anyway
6892 if (!mutex_trylock(&head->mutex))
6895 btrfs_delete_ref_head(delayed_refs, head);
6896 head->processing = 0;
6898 spin_unlock(&head->lock);
6899 spin_unlock(&delayed_refs->lock);
6901 BUG_ON(head->extent_op);
6902 if (head->must_insert_reserved)
6905 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
6906 mutex_unlock(&head->mutex);
6907 btrfs_put_delayed_ref_head(head);
6910 spin_unlock(&head->lock);
6913 spin_unlock(&delayed_refs->lock);
6917 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6918 struct btrfs_root *root,
6919 struct extent_buffer *buf,
6920 u64 parent, int last_ref)
6922 struct btrfs_fs_info *fs_info = root->fs_info;
6923 struct btrfs_ref generic_ref = { 0 };
6927 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
6928 buf->start, buf->len, parent);
6929 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
6930 root->root_key.objectid);
6932 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6933 int old_ref_mod, new_ref_mod;
6935 btrfs_ref_tree_mod(fs_info, &generic_ref);
6936 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
6937 &old_ref_mod, &new_ref_mod);
6938 BUG_ON(ret); /* -ENOMEM */
6939 pin = old_ref_mod >= 0 && new_ref_mod < 0;
6942 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
6943 struct btrfs_block_group_cache *cache;
6945 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6946 ret = check_ref_cleanup(trans, buf->start);
6952 cache = btrfs_lookup_block_group(fs_info, buf->start);
6954 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6955 pin_down_extent(cache, buf->start, buf->len, 1);
6956 btrfs_put_block_group(cache);
6960 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6962 btrfs_add_free_space(cache, buf->start, buf->len);
6963 btrfs_free_reserved_bytes(cache, buf->len, 0);
6964 btrfs_put_block_group(cache);
6965 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
6969 add_pinned_bytes(fs_info, &generic_ref);
6973 * Deleting the buffer, clear the corrupt flag since it doesn't matter anymore.
6976 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6980 /* Can return -ENOMEM */
6981 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
6983 struct btrfs_fs_info *fs_info = trans->fs_info;
6984 int old_ref_mod, new_ref_mod;
6987 if (btrfs_is_testing(fs_info))
6991 * tree log blocks never actually go into the extent allocation
6992 * tree, just update pinning info and exit early.
6994 if ((ref->type == BTRFS_REF_METADATA &&
6995 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
6996 (ref->type == BTRFS_REF_DATA &&
6997 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
6998 /* unlocks the pinned mutex */
6999 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
7000 old_ref_mod = new_ref_mod = 0;
7002 } else if (ref->type == BTRFS_REF_METADATA) {
7003 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
7004 &old_ref_mod, &new_ref_mod);
7006 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
7007 &old_ref_mod, &new_ref_mod);
7010 if (!((ref->type == BTRFS_REF_METADATA &&
7011 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
7012 (ref->type == BTRFS_REF_DATA &&
7013 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
7014 btrfs_ref_tree_mod(fs_info, ref);
7016 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
7017 add_pinned_bytes(fs_info, ref);
7023 * when we wait for progress in the block group caching, it's because
7024 * our allocation attempt failed at least once. So, we must sleep
7025 * and let some progress happen before we try again.
7027 * This function will sleep at least once waiting for new free space to
7028 * show up, and then it will check the block group free space numbers
7029 * for our min num_bytes. Another option is to have it go ahead
7030 * and look in the rbtree for a free extent of a given size, but this is a good start.
7033 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7034 * any of the information in this block group.
7036 static noinline void
7037 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7040 struct btrfs_caching_control *caching_ctl;
7042 caching_ctl = get_caching_control(cache);
7046 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7047 (cache->free_space_ctl->free_space >= num_bytes));
7049 put_caching_control(caching_ctl);
7053 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7055 struct btrfs_caching_control *caching_ctl;
7058 caching_ctl = get_caching_control(cache);
7060 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7062 wait_event(caching_ctl->wait, block_group_cache_done(cache));
7063 if (cache->cached == BTRFS_CACHE_ERROR)
7065 put_caching_control(caching_ctl);
7069 enum btrfs_loop_type {
7070 LOOP_CACHING_NOWAIT,
LOOP_CACHING_WAIT,
LOOP_ALLOC_CHUNK,
LOOP_NO_EMPTY_SIZE,
};
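/*
 * The allocator starts at LOOP_CACHING_NOWAIT and
 * find_free_extent_update_loop() escalates it one stage at a time
 * whenever a full pass over the block groups turned up nothing.
 */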
7077 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7081 down_read(&cache->data_rwsem);
7085 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7088 btrfs_get_block_group(cache);
7090 down_read(&cache->data_rwsem);
7093 static struct btrfs_block_group_cache *
7094 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7095 struct btrfs_free_cluster *cluster,
7098 struct btrfs_block_group_cache *used_bg = NULL;
7100 spin_lock(&cluster->refill_lock);
7102 used_bg = cluster->block_group;
7106 if (used_bg == block_group)
7109 btrfs_get_block_group(used_bg);
7114 if (down_read_trylock(&used_bg->data_rwsem))
7117 spin_unlock(&cluster->refill_lock);
7119 /* We should only have one level of nesting. */
7120 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7122 spin_lock(&cluster->refill_lock);
7123 if (used_bg == cluster->block_group)
7126 up_read(&used_bg->data_rwsem);
7127 btrfs_put_block_group(used_bg);
7132 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7136 up_read(&cache->data_rwsem);
7137 btrfs_put_block_group(cache);
7141 * Structure used internally for find_free_extent() function. Wraps needed parameters.
7144 struct find_free_extent_ctl {
7145 /* Basic allocation info */
7152 /* Where to start the search inside the bg */
7155 /* For clustered allocation */
7158 bool have_caching_bg;
7159 bool orig_have_caching_bg;
7161 /* RAID index, converted from flags */
7165 * Current loop number, check find_free_extent_update_loop() for details
7170 * Whether we're refilling a cluster, if true we need to re-search
7171 * current block group but don't try to refill the cluster again.
7173 bool retry_clustered;
7176 * Whether we're updating free space cache, if true we need to re-search
7177 * current block group but don't try updating free space cache again.
7179 bool retry_unclustered;
7181 /* If current block group is cached */
7184 /* Max contiguous hole found */
7185 u64 max_extent_size;
7187 /* Total free space from free space cache, not always contiguous */
7188 u64 total_free_space;
7196 * Helper function for find_free_extent().
7198 * Return -ENOENT to inform caller that we need to fall back to unclustered mode.
7199 * Return -EAGAIN to inform caller that we need to re-search this block group
7200 * Return >0 to inform caller that we found nothing
7201 * Return 0 means we have found a location and set ffe_ctl->found_offset.
7203 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
7204 struct btrfs_free_cluster *last_ptr,
7205 struct find_free_extent_ctl *ffe_ctl,
7206 struct btrfs_block_group_cache **cluster_bg_ret)
7208 struct btrfs_block_group_cache *cluster_bg;
7209 u64 aligned_cluster;
7213 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
7215 goto refill_cluster;
7216 if (cluster_bg != bg && (cluster_bg->ro ||
7217 !block_group_bits(cluster_bg, ffe_ctl->flags)))
7218 goto release_cluster;
7220 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
7221 ffe_ctl->num_bytes, cluster_bg->key.objectid,
7222 &ffe_ctl->max_extent_size);
7224 /* We have a block, we're done */
7225 spin_unlock(&last_ptr->refill_lock);
7226 trace_btrfs_reserve_extent_cluster(cluster_bg,
7227 ffe_ctl->search_start, ffe_ctl->num_bytes);
7228 *cluster_bg_ret = cluster_bg;
7229 ffe_ctl->found_offset = offset;
7232 WARN_ON(last_ptr->block_group != cluster_bg);
7236 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
7237 * let's just skip it and let the allocator find whatever block it can
7238 * find. If we reach this point, we will have tried the cluster
7239 * allocator plenty of times and not have found anything, so we are
7240 * likely way too fragmented for the clustering stuff to find anything.
7242 * However, if the cluster is taken from the current block group,
7243 * release the cluster first, so that we stand a better chance of
7244 * succeeding in the unclustered allocation.
7246 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
7247 spin_unlock(&last_ptr->refill_lock);
7248 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7252 /* This cluster didn't work out, free it and start over */
7253 btrfs_return_cluster_to_free_space(NULL, last_ptr);
7255 if (cluster_bg != bg)
7256 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7259 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
7260 spin_unlock(&last_ptr->refill_lock);
7264 aligned_cluster = max_t(u64,
7265 ffe_ctl->empty_cluster + ffe_ctl->empty_size,
7266 bg->full_stripe_len);
7267 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
7268 ffe_ctl->num_bytes, aligned_cluster);
7270 /* Now pull our allocation out of this cluster */
7271 offset = btrfs_alloc_from_cluster(bg, last_ptr,
7272 ffe_ctl->num_bytes, ffe_ctl->search_start,
7273 &ffe_ctl->max_extent_size);
7275 /* We found one, proceed */
7276 spin_unlock(&last_ptr->refill_lock);
7277 trace_btrfs_reserve_extent_cluster(bg,
7278 ffe_ctl->search_start,
7279 ffe_ctl->num_bytes);
7280 ffe_ctl->found_offset = offset;
7283 } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
7284 !ffe_ctl->retry_clustered) {
7285 spin_unlock(&last_ptr->refill_lock);
7287 ffe_ctl->retry_clustered = true;
7288 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7289 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
7293 * At this point we either didn't find a cluster or we weren't able to
7294 * allocate a block from our cluster. Free the cluster we've been
7295 * trying to use, and go to the next block group.
7297 btrfs_return_cluster_to_free_space(NULL, last_ptr);
7298 spin_unlock(&last_ptr->refill_lock);
7303 * Return >0 to inform caller that we found nothing
7304 * Return 0 when we find a free extent and set ffe_ctl->found_offset
7305 * Return -EAGAIN to inform caller that we need to re-search this block group
7307 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
7308 struct btrfs_free_cluster *last_ptr,
7309 struct find_free_extent_ctl *ffe_ctl)
7314 * We are doing an unclustered allocation, set the fragmented flag so
7315 * we don't bother trying to set up a cluster again until we get more space.
7318 if (unlikely(last_ptr)) {
7319 spin_lock(&last_ptr->lock);
7320 last_ptr->fragmented = 1;
7321 spin_unlock(&last_ptr->lock);
7323 if (ffe_ctl->cached) {
7324 struct btrfs_free_space_ctl *free_space_ctl;
7326 free_space_ctl = bg->free_space_ctl;
7327 spin_lock(&free_space_ctl->tree_lock);
7328 if (free_space_ctl->free_space <
7329 ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
7330 ffe_ctl->empty_size) {
7331 ffe_ctl->total_free_space = max_t(u64,
7332 ffe_ctl->total_free_space,
7333 free_space_ctl->free_space);
7334 spin_unlock(&free_space_ctl->tree_lock);
7337 spin_unlock(&free_space_ctl->tree_lock);
7340 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
7341 ffe_ctl->num_bytes, ffe_ctl->empty_size,
7342 &ffe_ctl->max_extent_size);
7345 * If we didn't find a chunk, and we haven't failed on this block group
7346 * before, and this block group is in the middle of caching and we are
7347 * ok with waiting, then go ahead and wait for progress to be made, and
7348 * set @retry_unclustered to true.
7350 * If @retry_unclustered is true then we've already waited on this
7351 * block group once and should move on to the next block group.
7353 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
7354 ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
7355 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7356 ffe_ctl->empty_size);
7357 ffe_ctl->retry_unclustered = true;
7359 } else if (!offset) {
7362 ffe_ctl->found_offset = offset;
7367 * Return >0 means the caller needs to re-search for a free extent
7368 * Return 0 means we have the needed free extent.
7369 * Return <0 means we failed to locate any free extent.
7371 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
7372 struct btrfs_free_cluster *last_ptr,
7373 struct btrfs_key *ins,
7374 struct find_free_extent_ctl *ffe_ctl,
7375 int full_search, bool use_cluster)
7377 struct btrfs_root *root = fs_info->extent_root;
7380 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
7381 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
7382 ffe_ctl->orig_have_caching_bg = true;
7384 if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
7385 ffe_ctl->have_caching_bg)
7388 if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
7391 if (ins->objectid) {
7392 if (!use_cluster && last_ptr) {
7393 spin_lock(&last_ptr->lock);
7394 last_ptr->window_start = ins->objectid;
7395 spin_unlock(&last_ptr->lock);
7401 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7402 * caching kthreads as we move along
7403 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7404 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7405 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try again
7408 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
7410 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
7412 * We want to skip the LOOP_CACHING_WAIT step if we
7413 * don't have any uncached bgs and we've already done a
7414 * full search through.
7416 if (ffe_ctl->orig_have_caching_bg || !full_search)
7417 ffe_ctl->loop = LOOP_CACHING_WAIT;
7419 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
7424 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
7425 struct btrfs_trans_handle *trans;
7428 trans = current->journal_info;
7432 trans = btrfs_join_transaction(root);
7434 if (IS_ERR(trans)) {
7435 ret = PTR_ERR(trans);
7439 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
7443 * If we can't allocate a new chunk we've already looped
7444 * through at least once, move on to the NO_EMPTY_SIZE case.
7448 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
7450 /* Do not bail out on ENOSPC since we can do more. */
7451 if (ret < 0 && ret != -ENOSPC)
7452 btrfs_abort_transaction(trans, ret);
7456 btrfs_end_transaction(trans);
7461 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
7463 * Don't loop again if we already have no empty_size and no empty_cluster.
7466 if (ffe_ctl->empty_size == 0 &&
7467 ffe_ctl->empty_cluster == 0)
7469 ffe_ctl->empty_size = 0;
7470 ffe_ctl->empty_cluster = 0;
7478 * walks the btree of allocated extents and finds a hole of a given size.
7479 * The key ins is changed to record the hole:
7480 * ins->objectid == start position
7481 * ins->flags = BTRFS_EXTENT_ITEM_KEY
7482 * ins->offset == the size of the hole.
7483 * Any available blocks before search_start are skipped.
7485 * If there is no suitable free space, we will record the max size of
7486 * the currently available free space extent.
7488 * The overall logic and call chain:
7490 * find_free_extent()
7491 * |- Iterate through all block groups
7492 * | |- Get a valid block group
7493 * | |- Try to do clustered allocation in that block group
7494 * | |- Try to do unclustered allocation in that block group
7495 * | |- Check if the result is valid
7496 * | | |- If valid, then exit
7497 * | |- Jump to next block group
7499 * |- Push harder to find free extents
7500 * |- If not found, re-iterate all block groups
7502 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7503 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7504 u64 hint_byte, struct btrfs_key *ins,
7505 u64 flags, int delalloc)
7508 struct btrfs_free_cluster *last_ptr = NULL;
7509 struct btrfs_block_group_cache *block_group = NULL;
7510 struct find_free_extent_ctl ffe_ctl = {0};
7511 struct btrfs_space_info *space_info;
7512 bool use_cluster = true;
7513 bool full_search = false;
7515 WARN_ON(num_bytes < fs_info->sectorsize);
7517 ffe_ctl.ram_bytes = ram_bytes;
7518 ffe_ctl.num_bytes = num_bytes;
7519 ffe_ctl.empty_size = empty_size;
7520 ffe_ctl.flags = flags;
7521 ffe_ctl.search_start = 0;
7522 ffe_ctl.retry_clustered = false;
7523 ffe_ctl.retry_unclustered = false;
7524 ffe_ctl.delalloc = delalloc;
7525 ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
7526 ffe_ctl.have_caching_bg = false;
7527 ffe_ctl.orig_have_caching_bg = false;
7528 ffe_ctl.found_offset = 0;
7530 ins->type = BTRFS_EXTENT_ITEM_KEY;
7534 trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7536 space_info = btrfs_find_space_info(fs_info, flags);
7538 btrfs_err(fs_info, "No space info for %llu", flags);
7543 * If our free space is heavily fragmented we may not be able to make
7544 * big contiguous allocations, so instead of doing the expensive search
7545 * for free space, simply return ENOSPC with our max_extent_size so we
7546 * can go ahead and search for a more manageable chunk.
7548 * If our max_extent_size is large enough for our allocation simply
7549 * disable clustering since we will likely not be able to find enough
7550 * space to create a cluster and induce latency trying.
7552 if (unlikely(space_info->max_extent_size)) {
7553 spin_lock(&space_info->lock);
7554 if (space_info->max_extent_size &&
7555 num_bytes > space_info->max_extent_size) {
7556 ins->offset = space_info->max_extent_size;
7557 spin_unlock(&space_info->lock);
7559 } else if (space_info->max_extent_size) {
7560 use_cluster = false;
7562 spin_unlock(&space_info->lock);
7565 last_ptr = fetch_cluster_info(fs_info, space_info,
7566 &ffe_ctl.empty_cluster);
7568 spin_lock(&last_ptr->lock);
7569 if (last_ptr->block_group)
7570 hint_byte = last_ptr->window_start;
7571 if (last_ptr->fragmented) {
7573 * We still set window_start so we can keep track of the
7574 * last place we found an allocation, to try and save some time.
7577 hint_byte = last_ptr->window_start;
7578 use_cluster = false;
7580 spin_unlock(&last_ptr->lock);
7583 ffe_ctl.search_start = max(ffe_ctl.search_start,
7584 first_logical_byte(fs_info, 0));
7585 ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
7586 if (ffe_ctl.search_start == hint_byte) {
7587 block_group = btrfs_lookup_block_group(fs_info,
7588 ffe_ctl.search_start);
7590 * we don't want to use the block group if it doesn't match our
7591 * allocation bits, or if it's not cached.
7593 * However if we are re-searching with an ideal block group
7594 * picked out then we don't care that the block group is cached.
7596 if (block_group && block_group_bits(block_group, flags) &&
7597 block_group->cached != BTRFS_CACHE_NO) {
7598 down_read(&space_info->groups_sem);
7599 if (list_empty(&block_group->list) ||
7602 * someone is removing this block group,
7603 * we can't jump into the have_block_group
7604 * target because our list pointers are not valid.
7607 btrfs_put_block_group(block_group);
7608 up_read(&space_info->groups_sem);
7610 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
7611 block_group->flags);
7612 btrfs_lock_block_group(block_group, delalloc);
7613 goto have_block_group;
7615 } else if (block_group) {
7616 btrfs_put_block_group(block_group);
7620 ffe_ctl.have_caching_bg = false;
7621 if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
7624 down_read(&space_info->groups_sem);
7625 list_for_each_entry(block_group,
7626 &space_info->block_groups[ffe_ctl.index], list) {
7627 /* If the block group is read-only, we can skip it entirely. */
7628 if (unlikely(block_group->ro))
7631 btrfs_grab_block_group(block_group, delalloc);
7632 ffe_ctl.search_start = block_group->key.objectid;
7635 * this can happen if we end up cycling through all the
7636 * raid types, but we want to make sure we only allocate
7637 * for the proper type.
7639 if (!block_group_bits(block_group, flags)) {
7640 u64 extra = BTRFS_BLOCK_GROUP_DUP |
7641 BTRFS_BLOCK_GROUP_RAID1_MASK |
7642 BTRFS_BLOCK_GROUP_RAID56_MASK |
7643 BTRFS_BLOCK_GROUP_RAID10;
7646 * if they asked for extra copies and this block group
7647 * doesn't provide them, bail. This does allow us to
7648 * fill raid0 from raid1.
7650 if ((flags & extra) && !(block_group->flags & extra))
7655 ffe_ctl.cached = block_group_cache_done(block_group);
7656 if (unlikely(!ffe_ctl.cached)) {
7657 ffe_ctl.have_caching_bg = true;
7658 ret = cache_block_group(block_group, 0);
7663 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7667 * Ok we want to try and use the cluster allocator, so
7670 if (last_ptr && use_cluster) {
7671 struct btrfs_block_group_cache *cluster_bg = NULL;
7673 ret = find_free_extent_clustered(block_group, last_ptr,
7674 &ffe_ctl, &cluster_bg);
7677 if (cluster_bg && cluster_bg != block_group) {
7678 btrfs_release_block_group(block_group,
7680 block_group = cluster_bg;
7683 } else if (ret == -EAGAIN) {
7684 goto have_block_group;
7685 } else if (ret > 0) {
7688 /* ret == -ENOENT case falls through */
7691 ret = find_free_extent_unclustered(block_group, last_ptr,
7694 goto have_block_group;
7697 /* ret == 0 case falls through */
7699 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
7700 fs_info->stripesize);
7702 /* move on to the next group */
7703 if (ffe_ctl.search_start + num_bytes >
7704 block_group->key.objectid + block_group->key.offset) {
7705 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7710 if (ffe_ctl.found_offset < ffe_ctl.search_start)
7711 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7712 ffe_ctl.search_start - ffe_ctl.found_offset);
7714 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7715 num_bytes, delalloc);
7716 if (ret == -EAGAIN) {
7717 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7721 btrfs_inc_block_group_reservations(block_group);
7723 /* we are all good, let's return */
7724 ins->objectid = ffe_ctl.search_start;
7725 ins->offset = num_bytes;
7727 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
7729 btrfs_release_block_group(block_group, delalloc);
7732 ffe_ctl.retry_clustered = false;
7733 ffe_ctl.retry_unclustered = false;
7734 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7736 btrfs_release_block_group(block_group, delalloc);
7739 up_read(&space_info->groups_sem);
7741 ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
7742 full_search, use_cluster);
7746 if (ret == -ENOSPC) {
7748 * Use ffe_ctl->total_free_space as fallback if we can't find
7749 * any contiguous hole.
7751 if (!ffe_ctl.max_extent_size)
7752 ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
7753 spin_lock(&space_info->lock);
7754 space_info->max_extent_size = ffe_ctl.max_extent_size;
7755 spin_unlock(&space_info->lock);
7756 ins->offset = ffe_ctl.max_extent_size;
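/*
 * Handing the largest free extent we saw back through ins->offset lets
 * btrfs_reserve_extent() retry with a smaller num_bytes instead of
 * giving up immediately on -ENOSPC.
 */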
7761 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \
do { \
7763 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
7764 spin_lock(&__rsv->lock); \
7765 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
7766 __rsv->size, __rsv->reserved); \
7767 spin_unlock(&__rsv->lock); \
} while (0)
7770 static void dump_space_info(struct btrfs_fs_info *fs_info,
7771 struct btrfs_space_info *info, u64 bytes,
7772 int dump_block_groups)
7774 struct btrfs_block_group_cache *cache;
7777 spin_lock(&info->lock);
7778 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7780 info->total_bytes - btrfs_space_info_used(info, true),
7781 info->full ? "" : "not ");
7783 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7784 info->total_bytes, info->bytes_used, info->bytes_pinned,
7785 info->bytes_reserved, info->bytes_may_use,
7786 info->bytes_readonly);
7787 spin_unlock(&info->lock);
7789 DUMP_BLOCK_RSV(fs_info, global_block_rsv);
7790 DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
7791 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
7792 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
7793 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
7795 if (!dump_block_groups)
7798 down_read(&info->groups_sem);
7800 list_for_each_entry(cache, &info->block_groups[index], list) {
7801 spin_lock(&cache->lock);
7803 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7804 cache->key.objectid, cache->key.offset,
7805 btrfs_block_group_used(&cache->item), cache->pinned,
7806 cache->reserved, cache->ro ? "[readonly]" : "");
7807 btrfs_dump_free_space(cache, bytes);
7808 spin_unlock(&cache->lock);
7810 if (++index < BTRFS_NR_RAID_TYPES)
7812 up_read(&info->groups_sem);
7816 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
7817 * hole that is at least as big as @num_bytes.
7819 * @root - The root that will contain this extent
7821 * @ram_bytes - The amount of space in ram that @num_bytes take. This
7822 * is used for accounting purposes. This value differs
7823 * from @num_bytes only in the case of compressed extents.
7825 * @num_bytes - Number of bytes to allocate on-disk.
7827 * @min_alloc_size - Indicates the minimum amount of space that the
7828 * allocator should try to satisfy. In some cases
7829 * @num_bytes may be larger than what is required and if
7830 * the filesystem is fragmented then allocation fails.
7831 * However, the presence of @min_alloc_size gives a
7832 * chance to try and satisfy the smaller allocation.
7834 * @empty_size - A hint that you plan on doing more COW. This is the
7835 * size in bytes the allocator should try to find free
7836 * next to the block it returns. This is just a hint and
7837 * may be ignored by the allocator.
7839 * @hint_byte - Hint to the allocator to start searching above the byte
7840 * address passed. It might be ignored.
7842 * @ins - This key is modified to record the found hole. It will
7843 * have the following values:
7844 * ins->objectid == start position
7845 * ins->flags = BTRFS_EXTENT_ITEM_KEY
7846 * ins->offset == the size of the hole.
7848 * @is_data - Boolean flag indicating whether an extent is
7849 * allocated for data (true) or metadata (false)
7851 * @delalloc - Boolean flag indicating whether this allocation is for
7852 * delalloc or not. If 'true' data_rwsem of block groups
7853 * is going to be acquired.
7856 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
7857 * case -ENOSPC is returned then @ins->offset will contain the size of the
7858 * largest available hole the allocator managed to find.
7860 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7861 u64 num_bytes, u64 min_alloc_size,
7862 u64 empty_size, u64 hint_byte,
7863 struct btrfs_key *ins, int is_data, int delalloc)
7865 struct btrfs_fs_info *fs_info = root->fs_info;
7866 bool final_tried = num_bytes == min_alloc_size;
7870 flags = get_alloc_profile_by_root(root, is_data);
7872 WARN_ON(num_bytes < fs_info->sectorsize);
7873 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
7874 hint_byte, ins, flags, delalloc);
7875 if (!ret && !is_data) {
7876 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
7877 } else if (ret == -ENOSPC) {
7878 if (!final_tried && ins->offset) {
7879 num_bytes = min(num_bytes >> 1, ins->offset);
7880 num_bytes = round_down(num_bytes,
7881 fs_info->sectorsize);
7882 num_bytes = max(num_bytes, min_alloc_size);
7883 ram_bytes = num_bytes;
7884 if (num_bytes == min_alloc_size)
7887 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
7888 struct btrfs_space_info *sinfo;
7890 sinfo = btrfs_find_space_info(fs_info, flags);
7892 "allocation failed flags %llu, wanted %llu",
7895 dump_space_info(fs_info, sinfo, num_bytes, 1);
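/*
 * Illustrative usage sketch (not part of the original file): a metadata
 * caller such as btrfs_alloc_tree_block() below reserves a single node with
 *
 *	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
 *				   empty_size, hint, &ins, 0, 0);
 *
 * On success ins.objectid/ins.offset describe the reservation; on -ENOSPC
 * ins.offset holds the largest hole the allocator could find, which the
 * retry loop above uses to shrink num_bytes before trying again.
 */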
7902 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7904 int pin, int delalloc)
7906 struct btrfs_block_group_cache *cache;
7909 cache = btrfs_lookup_block_group(fs_info, start);
7911 btrfs_err(fs_info, "Unable to find block group for %llu",
7917 pin_down_extent(cache, start, len, 1);
7919 if (btrfs_test_opt(fs_info, DISCARD))
7920 ret = btrfs_discard_extent(fs_info, start, len, NULL);
7921 btrfs_add_free_space(cache, start, len);
7922 btrfs_free_reserved_bytes(cache, len, delalloc);
7923 trace_btrfs_reserved_extent_free(fs_info, start, len);
7926 btrfs_put_block_group(cache);
7930 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7931 u64 start, u64 len, int delalloc)
7933 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
7936 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
7939 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
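/*
 * Insert the EXTENT_ITEM for a newly allocated data extent together with a
 * single inline backref: a SHARED_DATA_REF keyed on the parent block when
 * 'parent' is set, otherwise an EXTENT_DATA_REF keyed on (root, owner,
 * offset).
 */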
7942 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7943 u64 parent, u64 root_objectid,
7944 u64 flags, u64 owner, u64 offset,
7945 struct btrfs_key *ins, int ref_mod)
7947 struct btrfs_fs_info *fs_info = trans->fs_info;
7949 struct btrfs_extent_item *extent_item;
7950 struct btrfs_extent_inline_ref *iref;
7951 struct btrfs_path *path;
7952 struct extent_buffer *leaf;
7957 type = BTRFS_SHARED_DATA_REF_KEY;
7959 type = BTRFS_EXTENT_DATA_REF_KEY;
7961 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7963 path = btrfs_alloc_path();
7967 path->leave_spinning = 1;
7968 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7971 btrfs_free_path(path);
7975 leaf = path->nodes[0];
7976 extent_item = btrfs_item_ptr(leaf, path->slots[0],
7977 struct btrfs_extent_item);
7978 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7979 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7980 btrfs_set_extent_flags(leaf, extent_item,
7981 flags | BTRFS_EXTENT_FLAG_DATA);
7983 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7984 btrfs_set_extent_inline_ref_type(leaf, iref, type);
7986 struct btrfs_shared_data_ref *ref;
7987 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7988 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7989 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7991 struct btrfs_extent_data_ref *ref;
7992 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7993 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7994 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7995 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7996 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7999 btrfs_mark_buffer_dirty(path->nodes[0]);
8000 btrfs_free_path(path);
8002 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
8006 ret = update_block_group(trans, ins->objectid, ins->offset, 1);
8007 if (ret) { /* -ENOENT, logic error */
8008 btrfs_err(fs_info, "update block group failed for %llu %llu",
8009 ins->objectid, ins->offset);
8012 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
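/*
 * Insert the extent item for a newly allocated tree block. With the
 * SKINNY_METADATA feature a METADATA_ITEM keyed on the level is used;
 * otherwise a full EXTENT_ITEM plus btrfs_tree_block_info is written.
 * Either way a single inline shared or keyed tree block ref follows.
 */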
8016 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8017 struct btrfs_delayed_ref_node *node,
8018 struct btrfs_delayed_extent_op *extent_op)
8020 struct btrfs_fs_info *fs_info = trans->fs_info;
8022 struct btrfs_extent_item *extent_item;
8023 struct btrfs_key extent_key;
8024 struct btrfs_tree_block_info *block_info;
8025 struct btrfs_extent_inline_ref *iref;
8026 struct btrfs_path *path;
8027 struct extent_buffer *leaf;
8028 struct btrfs_delayed_tree_ref *ref;
8029 u32 size = sizeof(*extent_item) + sizeof(*iref);
8031 u64 flags = extent_op->flags_to_set;
8032 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8034 ref = btrfs_delayed_node_to_tree_ref(node);
8036 extent_key.objectid = node->bytenr;
8037 if (skinny_metadata) {
8038 extent_key.offset = ref->level;
8039 extent_key.type = BTRFS_METADATA_ITEM_KEY;
8040 num_bytes = fs_info->nodesize;
8042 extent_key.offset = node->num_bytes;
8043 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
8044 size += sizeof(*block_info);
8045 num_bytes = node->num_bytes;
8048 path = btrfs_alloc_path();
8052 path->leave_spinning = 1;
8053 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8056 btrfs_free_path(path);
8060 leaf = path->nodes[0];
8061 extent_item = btrfs_item_ptr(leaf, path->slots[0],
8062 struct btrfs_extent_item);
8063 btrfs_set_extent_refs(leaf, extent_item, 1);
8064 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8065 btrfs_set_extent_flags(leaf, extent_item,
8066 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8068 if (skinny_metadata) {
8069 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8071 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8072 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
8073 btrfs_set_tree_block_level(leaf, block_info, ref->level);
8074 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8077 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
8078 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8079 btrfs_set_extent_inline_ref_type(leaf, iref,
8080 BTRFS_SHARED_BLOCK_REF_KEY);
8081 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
8083 btrfs_set_extent_inline_ref_type(leaf, iref,
8084 BTRFS_TREE_BLOCK_REF_KEY);
8085 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
8088 btrfs_mark_buffer_dirty(leaf);
8089 btrfs_free_path(path);
8091 ret = remove_from_free_space_tree(trans, extent_key.objectid,
8096 ret = update_block_group(trans, extent_key.objectid,
8097 fs_info->nodesize, 1);
8098 if (ret) { /* -ENOENT, logic error */
8099 btrfs_err(fs_info, "update block group failed for %llu %llu",
8100 extent_key.objectid, extent_key.offset);
8104 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
8109 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8110 struct btrfs_root *root, u64 owner,
8111 u64 offset, u64 ram_bytes,
8112 struct btrfs_key *ins)
8114 struct btrfs_ref generic_ref = { 0 };
8117 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8119 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
8120 ins->objectid, ins->offset, 0);
8121 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
8122 btrfs_ref_tree_mod(root->fs_info, &generic_ref);
8123 ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
8124 ram_bytes, NULL, NULL);
8129 * this is used by the tree logging recovery code. It records that
8130 * an extent has been allocated and makes sure to clear the free
8131 * space cache bits as well
8133 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8134 u64 root_objectid, u64 owner, u64 offset,
8135 struct btrfs_key *ins)
8137 struct btrfs_fs_info *fs_info = trans->fs_info;
8139 struct btrfs_block_group_cache *block_group;
8140 struct btrfs_space_info *space_info;
8143 * Mixed block groups will exclude before processing the log so we only
8144 * need to do the exclude dance if this fs isn't mixed.
8146 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8147 ret = __exclude_logged_extent(fs_info, ins->objectid,
8153 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8157 space_info = block_group->space_info;
8158 spin_lock(&space_info->lock);
8159 spin_lock(&block_group->lock);
8160 space_info->bytes_reserved += ins->offset;
8161 block_group->reserved += ins->offset;
8162 spin_unlock(&block_group->lock);
8163 spin_unlock(&space_info->lock);
8165 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
8167 btrfs_put_block_group(block_group);
8171 static struct extent_buffer *
8172 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8173 u64 bytenr, int level, u64 owner)
8175 struct btrfs_fs_info *fs_info = root->fs_info;
8176 struct extent_buffer *buf;
8178 buf = btrfs_find_create_tree_block(fs_info, bytenr);
8183 * Extra safety check in case the extent tree is corrupted and extent
8184 * allocator chooses to use a tree block which is already used and locked.
8187 if (buf->lock_owner == current->pid) {
8188 btrfs_err_rl(fs_info,
8189 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
8190 buf->start, btrfs_header_owner(buf), current->pid);
8191 free_extent_buffer(buf);
8192 return ERR_PTR(-EUCLEAN);
8195 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8196 btrfs_tree_lock(buf);
8197 btrfs_clean_tree_block(buf);
8198 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8200 btrfs_set_lock_blocking_write(buf);
8201 set_extent_buffer_uptodate(buf);
8203 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
8204 btrfs_set_header_level(buf, level);
8205 btrfs_set_header_bytenr(buf, buf->start);
8206 btrfs_set_header_generation(buf, trans->transid);
8207 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
8208 btrfs_set_header_owner(buf, owner);
8209 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
8210 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
8211 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8212 buf->log_index = root->log_transid % 2;
8214 * we allow two log transactions at a time, use different
8215 * EXTENT bits to differentiate dirty pages.
8217 if (buf->log_index == 0)
8218 set_extent_dirty(&root->dirty_log_pages, buf->start,
8219 buf->start + buf->len - 1, GFP_NOFS);
8221 set_extent_new(&root->dirty_log_pages, buf->start,
8222 buf->start + buf->len - 1);
8224 buf->log_index = -1;
8225 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8226 buf->start + buf->len - 1, GFP_NOFS);
8228 trans->dirty = true;
8229 /* this returns a buffer locked for blocking */
8233 static struct btrfs_block_rsv *
8234 use_block_rsv(struct btrfs_trans_handle *trans,
8235 struct btrfs_root *root, u32 blocksize)
8237 struct btrfs_fs_info *fs_info = root->fs_info;
8238 struct btrfs_block_rsv *block_rsv;
8239 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8241 bool global_updated = false;
8243 block_rsv = get_block_rsv(trans, root);
8245 if (unlikely(block_rsv->size == 0))
8248 ret = block_rsv_use_bytes(block_rsv, blocksize);
8252 if (block_rsv->failfast)
8253 return ERR_PTR(ret);
8255 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8256 global_updated = true;
8257 update_global_block_rsv(fs_info);
8262 * The global reserve still exists to save us from ourselves, so don't
8263 * warn_on if we are short on our delayed refs reserve.
8265 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
8266 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8267 static DEFINE_RATELIMIT_STATE(_rs,
8268 DEFAULT_RATELIMIT_INTERVAL * 10,
8269 /*DEFAULT_RATELIMIT_BURST*/ 1);
8270 if (__ratelimit(&_rs))
8272 "BTRFS: block rsv returned %d\n", ret);
8275 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8276 BTRFS_RESERVE_NO_FLUSH);
8280 * If we couldn't reserve metadata bytes try and use some from
8281 * the global reserve if its space type is the same as the global reserve's.
8284 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8285 block_rsv->space_info == global_rsv->space_info) {
8286 ret = block_rsv_use_bytes(global_rsv, blocksize);
8290 return ERR_PTR(ret);
8293 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8294 struct btrfs_block_rsv *block_rsv, u32 blocksize)
8296 block_rsv_add_bytes(block_rsv, blocksize, false);
8297 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8301 * finds a free extent and does all the dirty work required for allocation;
8302 * returns the tree buffer or an ERR_PTR on error.
8304 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8305 struct btrfs_root *root,
8306 u64 parent, u64 root_objectid,
8307 const struct btrfs_disk_key *key,
8308 int level, u64 hint,
8311 struct btrfs_fs_info *fs_info = root->fs_info;
8312 struct btrfs_key ins;
8313 struct btrfs_block_rsv *block_rsv;
8314 struct extent_buffer *buf;
8315 struct btrfs_delayed_extent_op *extent_op;
8316 struct btrfs_ref generic_ref = { 0 };
8319 u32 blocksize = fs_info->nodesize;
8320 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8322 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8323 if (btrfs_is_testing(fs_info)) {
8324 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8325 level, root_objectid);
8327 root->alloc_bytenr += blocksize;
8332 block_rsv = use_block_rsv(trans, root, blocksize);
8333 if (IS_ERR(block_rsv))
8334 return ERR_CAST(block_rsv);
8336 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8337 empty_size, hint, &ins, 0, 0);
8341 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8345 goto out_free_reserved;
8348 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8350 parent = ins.objectid;
8351 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8355 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8356 extent_op = btrfs_alloc_delayed_extent_op();
8362 memcpy(&extent_op->key, key, sizeof(extent_op->key));
8364 memset(&extent_op->key, 0, sizeof(extent_op->key));
8365 extent_op->flags_to_set = flags;
8366 extent_op->update_key = skinny_metadata ? false : true;
8367 extent_op->update_flags = true;
8368 extent_op->is_data = false;
8369 extent_op->level = level;
8371 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
8372 ins.objectid, ins.offset, parent);
8373 generic_ref.real_root = root->root_key.objectid;
8374 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
8375 btrfs_ref_tree_mod(fs_info, &generic_ref);
8376 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
8377 extent_op, NULL, NULL);
8379 goto out_free_delayed;
8384 btrfs_free_delayed_extent_op(extent_op);
8386 free_extent_buffer(buf);
8388 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8390 unuse_block_rsv(fs_info, block_rsv, blocksize);
8391 return ERR_PTR(ret);
8394 struct walk_control {
8395 u64 refs[BTRFS_MAX_LEVEL];
8396 u64 flags[BTRFS_MAX_LEVEL];
8397 struct btrfs_key update_progress;
8398 struct btrfs_key drop_progress;
8410 #define DROP_REFERENCE 1
8411 #define UPDATE_BACKREF 2
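/*
 * Snapshot deletion runs in two stages: DROP_REFERENCE drops our reference
 * on each block while walking down, and UPDATE_BACKREF is entered for a
 * shared subtree to update the back refs (setting the FULL_BACKREF flag)
 * before DROP_REFERENCE resumes.
 */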
8413 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8414 struct btrfs_root *root,
8415 struct walk_control *wc,
8416 struct btrfs_path *path)
8418 struct btrfs_fs_info *fs_info = root->fs_info;
8424 struct btrfs_key key;
8425 struct extent_buffer *eb;
8430 if (path->slots[wc->level] < wc->reada_slot) {
8431 wc->reada_count = wc->reada_count * 2 / 3;
8432 wc->reada_count = max(wc->reada_count, 2);
8434 wc->reada_count = wc->reada_count * 3 / 2;
8435 wc->reada_count = min_t(int, wc->reada_count,
8436 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8439 eb = path->nodes[wc->level];
8440 nritems = btrfs_header_nritems(eb);
8442 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8443 if (nread >= wc->reada_count)
8447 bytenr = btrfs_node_blockptr(eb, slot);
8448 generation = btrfs_node_ptr_generation(eb, slot);
8450 if (slot == path->slots[wc->level])
8453 if (wc->stage == UPDATE_BACKREF &&
8454 generation <= root->root_key.offset)
8457 /* We don't lock the tree block, it's OK to be racy here */
8458 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8459 wc->level - 1, 1, &refs,
8461 /* We don't care about errors in readahead. */
8466 if (wc->stage == DROP_REFERENCE) {
8470 if (wc->level == 1 &&
8471 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8473 if (!wc->update_ref ||
8474 generation <= root->root_key.offset)
8476 btrfs_node_key_to_cpu(eb, &key, slot);
8477 ret = btrfs_comp_cpu_keys(&key,
8478 &wc->update_progress);
8482 if (wc->level == 1 &&
8483 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8487 readahead_tree_block(fs_info, bytenr);
8490 wc->reada_slot = slot;
8494 * helper to process tree block while walking down the tree.
8496 * when wc->stage == UPDATE_BACKREF, this function updates
8497 * back refs for pointers in the block.
8499 * NOTE: return value 1 means we should stop walking down.
8501 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8502 struct btrfs_root *root,
8503 struct btrfs_path *path,
8504 struct walk_control *wc, int lookup_info)
8506 struct btrfs_fs_info *fs_info = root->fs_info;
8507 int level = wc->level;
8508 struct extent_buffer *eb = path->nodes[level];
8509 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8512 if (wc->stage == UPDATE_BACKREF &&
8513 btrfs_header_owner(eb) != root->root_key.objectid)
8517 * when the reference count of a tree block is 1, it won't increase
8518 * again. Once the full backref flag is set, we never clear it.
8521 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8522 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8523 BUG_ON(!path->locks[level]);
8524 ret = btrfs_lookup_extent_info(trans, fs_info,
8525 eb->start, level, 1,
8528 BUG_ON(ret == -ENOMEM);
8531 BUG_ON(wc->refs[level] == 0);
8534 if (wc->stage == DROP_REFERENCE) {
8535 if (wc->refs[level] > 1)
8538 if (path->locks[level] && !wc->keep_locks) {
8539 btrfs_tree_unlock_rw(eb, path->locks[level]);
8540 path->locks[level] = 0;
8545 /* wc->stage == UPDATE_BACKREF */
8546 if (!(wc->flags[level] & flag)) {
8547 BUG_ON(!path->locks[level]);
8548 ret = btrfs_inc_ref(trans, root, eb, 1);
8549 BUG_ON(ret); /* -ENOMEM */
8550 ret = btrfs_dec_ref(trans, root, eb, 0);
8551 BUG_ON(ret); /* -ENOMEM */
8552 ret = btrfs_set_disk_extent_flags(trans, eb->start,
8554 btrfs_header_level(eb), 0);
8555 BUG_ON(ret); /* -ENOMEM */
8556 wc->flags[level] |= flag;
8560 * the block is shared by multiple trees, so it's not good to
8561 * keep the tree lock
8563 if (path->locks[level] && level > 0) {
8564 btrfs_tree_unlock_rw(eb, path->locks[level]);
8565 path->locks[level] = 0;
8571 * This is used to verify a ref exists for this root to deal with a bug where we
8572 * would have a drop_progress key that hadn't been updated properly.
8574 static int check_ref_exists(struct btrfs_trans_handle *trans,
8575 struct btrfs_root *root, u64 bytenr, u64 parent,
8578 struct btrfs_path *path;
8579 struct btrfs_extent_inline_ref *iref;
8582 path = btrfs_alloc_path();
8586 ret = lookup_extent_backref(trans, path, &iref, bytenr,
8587 root->fs_info->nodesize, parent,
8588 root->root_key.objectid, level, 0);
8589 btrfs_free_path(path);
8598 * helper to process tree block pointer.
8600 * when wc->stage == DROP_REFERENCE, this function checks
8601 * reference count of the block pointed to. if the block
8602 * is shared and we need to update back refs for the subtree
8603 * rooted at the block, this function changes wc->stage to
8604 * UPDATE_BACKREF. if the block is shared and there is no
8605 * need to update back refs, this function drops the reference to the block.
8608 * NOTE: return value 1 means we should stop walking down.
8610 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8611 struct btrfs_root *root,
8612 struct btrfs_path *path,
8613 struct walk_control *wc, int *lookup_info)
8615 struct btrfs_fs_info *fs_info = root->fs_info;
8619 struct btrfs_key key;
8620 struct btrfs_key first_key;
8621 struct btrfs_ref ref = { 0 };
8622 struct extent_buffer *next;
8623 int level = wc->level;
8626 bool need_account = false;
8628 generation = btrfs_node_ptr_generation(path->nodes[level],
8629 path->slots[level]);
8631 * if the lower level block was created before the snapshot
8632 * was created, we know there is no need to update back refs
8635 if (wc->stage == UPDATE_BACKREF &&
8636 generation <= root->root_key.offset) {
8641 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8642 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
8643 path->slots[level]);
8645 next = find_extent_buffer(fs_info, bytenr);
8647 next = btrfs_find_create_tree_block(fs_info, bytenr);
8649 return PTR_ERR(next);
8651 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8655 btrfs_tree_lock(next);
8656 btrfs_set_lock_blocking_write(next);
8658 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8659 &wc->refs[level - 1],
8660 &wc->flags[level - 1]);
8664 if (unlikely(wc->refs[level - 1] == 0)) {
8665 btrfs_err(fs_info, "Missing references.");
8671 if (wc->stage == DROP_REFERENCE) {
8672 if (wc->refs[level - 1] > 1) {
8673 need_account = true;
8675 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8678 if (!wc->update_ref ||
8679 generation <= root->root_key.offset)
8682 btrfs_node_key_to_cpu(path->nodes[level], &key,
8683 path->slots[level]);
8684 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8688 wc->stage = UPDATE_BACKREF;
8689 wc->shared_level = level - 1;
8693 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8697 if (!btrfs_buffer_uptodate(next, generation, 0)) {
8698 btrfs_tree_unlock(next);
8699 free_extent_buffer(next);
8705 if (reada && level == 1)
8706 reada_walk_down(trans, root, wc, path);
8707 next = read_tree_block(fs_info, bytenr, generation, level - 1,
8710 return PTR_ERR(next);
8711 } else if (!extent_buffer_uptodate(next)) {
8712 free_extent_buffer(next);
8715 btrfs_tree_lock(next);
8716 btrfs_set_lock_blocking_write(next);
8720 ASSERT(level == btrfs_header_level(next));
8721 if (level != btrfs_header_level(next)) {
8722 btrfs_err(root->fs_info, "mismatched level");
8726 path->nodes[level] = next;
8727 path->slots[level] = 0;
8728 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8734 wc->refs[level - 1] = 0;
8735 wc->flags[level - 1] = 0;
8736 if (wc->stage == DROP_REFERENCE) {
8737 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8738 parent = path->nodes[level]->start;
8740 ASSERT(root->root_key.objectid ==
8741 btrfs_header_owner(path->nodes[level]));
8742 if (root->root_key.objectid !=
8743 btrfs_header_owner(path->nodes[level])) {
8744 btrfs_err(root->fs_info,
8745 "mismatched block owner");
8753 * If we had a drop_progress we need to verify the refs are set
8754 * as expected. If we find our ref then we know that from here
8755 * on out everything should be correct, and we can clear the ->restarted flag.
8758 if (wc->restarted) {
8759 ret = check_ref_exists(trans, root, bytenr, parent,
8770 * Reloc tree doesn't contribute to qgroup numbers, and we have
8771 * already accounted them at merge time (replace_path),
8772 * so we can skip the expensive subtree trace here.
8774 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
8776 ret = btrfs_qgroup_trace_subtree(trans, next,
8777 generation, level - 1);
8779 btrfs_err_rl(fs_info,
8780 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8786 * We need to update the next key in our walk control so we can
8787 * update the drop_progress key accordingly. We don't care if
8788 * find_next_key doesn't find a key because that means we're at
8789 * the end and are going to clean up now.
8791 wc->drop_level = level;
8792 find_next_key(path, level, &wc->drop_progress);
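/*
 * Drop our reference on the child block via the delayed ref machinery:
 * the btrfs_ref below describes the tree block (bytenr, nodesize and,
 * for full backref trees, the parent), plus its level and owning root,
 * and btrfs_free_extent() queues it as a BTRFS_DROP_DELAYED_REF to be
 * run later in the transaction.
 */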
8794 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
8795 fs_info->nodesize, parent);
8796 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
8797 ret = btrfs_free_extent(trans, &ref);
8806 btrfs_tree_unlock(next);
8807 free_extent_buffer(next);
8813 * helper to process tree block while walking up the tree.
8815 * when wc->stage == DROP_REFERENCE, this function drops
8816 * the reference count on the block.
8818 * when wc->stage == UPDATE_BACKREF, this function changes
8819 * wc->stage back to DROP_REFERENCE if we changed wc->stage
8820 * to UPDATE_BACKREF previously while processing the block.
8822 * NOTE: return value 1 means we should stop walking up.
8824 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8825 struct btrfs_root *root,
8826 struct btrfs_path *path,
8827 struct walk_control *wc)
8829 struct btrfs_fs_info *fs_info = root->fs_info;
8831 int level = wc->level;
8832 struct extent_buffer *eb = path->nodes[level];
8835 if (wc->stage == UPDATE_BACKREF) {
8836 BUG_ON(wc->shared_level < level);
8837 if (level < wc->shared_level)
8840 ret = find_next_key(path, level + 1, &wc->update_progress);
8844 wc->stage = DROP_REFERENCE;
8845 wc->shared_level = -1;
8846 path->slots[level] = 0;
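/*
 * Back refs for the shared subtree rooted at this level have been
 * updated, so flip back to DROP_REFERENCE and restart from slot 0 of
 * this node so the subtree can now actually be dropped.
 */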
8849 * check reference count again if the block isn't locked.
8850 * we should start walking down the tree again if the reference count is one.
8853 if (!path->locks[level]) {
8855 btrfs_tree_lock(eb);
8856 btrfs_set_lock_blocking_write(eb);
8857 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8859 ret = btrfs_lookup_extent_info(trans, fs_info,
8860 eb->start, level, 1,
8864 btrfs_tree_unlock_rw(eb, path->locks[level]);
8865 path->locks[level] = 0;
8868 BUG_ON(wc->refs[level] == 0);
8869 if (wc->refs[level] == 1) {
8870 btrfs_tree_unlock_rw(eb, path->locks[level]);
8871 path->locks[level] = 0;
8877 /* wc->stage == DROP_REFERENCE */
8878 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8880 if (wc->refs[level] == 1) {
8882 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8883 ret = btrfs_dec_ref(trans, root, eb, 1);
8885 ret = btrfs_dec_ref(trans, root, eb, 0);
8886 BUG_ON(ret); /* -ENOMEM */
8887 if (is_fstree(root->root_key.objectid)) {
8888 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
8890 btrfs_err_rl(fs_info,
8891 "error %d accounting leaf items, quota is out of sync, rescan required",
8896 /* make block locked assertion in btrfs_clean_tree_block happy */
8897 if (!path->locks[level] &&
8898 btrfs_header_generation(eb) == trans->transid) {
8899 btrfs_tree_lock(eb);
8900 btrfs_set_lock_blocking_write(eb);
8901 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8903 btrfs_clean_tree_block(eb);
8906 if (eb == root->node) {
8907 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8909 else if (root->root_key.objectid != btrfs_header_owner(eb))
8910 goto owner_mismatch;
8912 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8913 parent = path->nodes[level + 1]->start;
8914 else if (root->root_key.objectid !=
8915 btrfs_header_owner(path->nodes[level + 1]))
8916 goto owner_mismatch;
8919 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
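/*
 * The last argument is the "last reference" hint: wc->refs[level] == 1
 * means we are dropping the final reference and the block itself can be
 * reclaimed, otherwise only our reference on it goes away.
 */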
8921 wc->refs[level] = 0;
8922 wc->flags[level] = 0;
8926 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
8927 btrfs_header_owner(eb), root->root_key.objectid);
8931 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8932 struct btrfs_root *root,
8933 struct btrfs_path *path,
8934 struct walk_control *wc)
8936 int level = wc->level;
8937 int lookup_info = 1;
8940 while (level >= 0) {
8941 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8948 if (path->slots[level] >=
8949 btrfs_header_nritems(path->nodes[level]))
8952 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8954 path->slots[level]++;
8963 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8964 struct btrfs_root *root,
8965 struct btrfs_path *path,
8966 struct walk_control *wc, int max_level)
8968 int level = wc->level;
8971 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8972 while (level < max_level && path->nodes[level]) {
8974 if (path->slots[level] + 1 <
8975 btrfs_header_nritems(path->nodes[level])) {
8976 path->slots[level]++;
8979 ret = walk_up_proc(trans, root, path, wc);
8985 if (path->locks[level]) {
8986 btrfs_tree_unlock_rw(path->nodes[level],
8987 path->locks[level]);
8988 path->locks[level] = 0;
8990 free_extent_buffer(path->nodes[level]);
8991 path->nodes[level] = NULL;
8999 * drop a subvolume tree.
9001 * this function traverses the tree freeing any blocks that are only
9002 * referenced by the tree.
9004 * when a shared tree block is found, this function decreases its
9005 * reference count by one. if update_ref is true, this function
9006 * also makes sure backrefs for the shared block and all lower level
9007 * blocks are properly updated.
9009 * If called with for_reloc == 0, may exit early with -EAGAIN
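*
* Illustrative call (a sketch of typical cleaner-style usage, not taken
* from this file): dropping a deleted subvolume with no dedicated block
* reservation, no back ref updates and not on behalf of relocation
* would look like
*
*	btrfs_drop_snapshot(root, NULL, 0, 0);
*
* and such a caller must be prepared for an early -EAGAIN exit.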
9011 int btrfs_drop_snapshot(struct btrfs_root *root,
9012 struct btrfs_block_rsv *block_rsv, int update_ref,
9015 struct btrfs_fs_info *fs_info = root->fs_info;
9016 struct btrfs_path *path;
9017 struct btrfs_trans_handle *trans;
9018 struct btrfs_root *tree_root = fs_info->tree_root;
9019 struct btrfs_root_item *root_item = &root->root_item;
9020 struct walk_control *wc;
9021 struct btrfs_key key;
9025 bool root_dropped = false;
9027 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
9029 path = btrfs_alloc_path();
9035 wc = kzalloc(sizeof(*wc), GFP_NOFS);
9037 btrfs_free_path(path);
9042 trans = btrfs_start_transaction(tree_root, 0);
9043 if (IS_ERR(trans)) {
9044 err = PTR_ERR(trans);
9048 err = btrfs_run_delayed_items(trans);
9053 trans->block_rsv = block_rsv;
9056 * This will help us catch people modifying the fs tree while we're
9057 * dropping it. It is unsafe to mess with the fs tree while it's being
9058 * dropped as we unlock the root node and parent nodes as we walk down
9059 * the tree, assuming nothing will change. If something does change
9060 * then we'll have stale information and drop references to blocks we've already dropped.
9063 set_bit(BTRFS_ROOT_DELETING, &root->state);
9064 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9065 level = btrfs_header_level(root->node);
9066 path->nodes[level] = btrfs_lock_root_node(root);
9067 btrfs_set_lock_blocking_write(path->nodes[level]);
9068 path->slots[level] = 0;
9069 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9070 memset(&wc->update_progress, 0,
9071 sizeof(wc->update_progress));
9073 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9074 memcpy(&wc->update_progress, &key,
9075 sizeof(wc->update_progress));
9077 level = root_item->drop_level;
9079 path->lowest_level = level;
9080 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9081 path->lowest_level = 0;
9089 * unlock our path, this is safe because only this
9090 * function is allowed to delete this snapshot
9092 btrfs_unlock_up_safe(path, 0);
9094 level = btrfs_header_level(root->node);
9096 btrfs_tree_lock(path->nodes[level]);
9097 btrfs_set_lock_blocking_write(path->nodes[level]);
9098 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9100 ret = btrfs_lookup_extent_info(trans, fs_info,
9101 path->nodes[level]->start,
9102 level, 1, &wc->refs[level],
9108 BUG_ON(wc->refs[level] == 0);
9110 if (level == root_item->drop_level)
9113 btrfs_tree_unlock(path->nodes[level]);
9114 path->locks[level] = 0;
9115 WARN_ON(wc->refs[level] != 1);
9120 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
9122 wc->shared_level = -1;
9123 wc->stage = DROP_REFERENCE;
9124 wc->update_ref = update_ref;
9126 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9130 ret = walk_down_tree(trans, root, path, wc);
9136 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9143 BUG_ON(wc->stage != DROP_REFERENCE);
9147 if (wc->stage == DROP_REFERENCE) {
9148 wc->drop_level = wc->level;
9149 btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
9151 path->slots[wc->drop_level]);
9153 btrfs_cpu_key_to_disk(&root_item->drop_progress,
9154 &wc->drop_progress);
9155 root_item->drop_level = wc->drop_level;
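/*
 * The drop_progress/drop_level written into the root item here are
 * persisted by the btrfs_update_root() call below, so if we have to
 * stop (transaction end, cleaner sleep or a crash) a later run can
 * resume the drop from this key -- see the resume path near the top of
 * this function that reads root_item->drop_progress.
 */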
9157 BUG_ON(wc->level == 0);
9158 if (btrfs_should_end_transaction(trans) ||
9159 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9160 ret = btrfs_update_root(trans, tree_root,
9164 btrfs_abort_transaction(trans, ret);
9169 btrfs_end_transaction_throttle(trans);
9170 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9171 btrfs_debug(fs_info,
9172 "drop snapshot early exit");
9177 trans = btrfs_start_transaction(tree_root, 0);
9178 if (IS_ERR(trans)) {
9179 err = PTR_ERR(trans);
9183 trans->block_rsv = block_rsv;
9186 btrfs_release_path(path);
9190 ret = btrfs_del_root(trans, &root->root_key);
9192 btrfs_abort_transaction(trans, ret);
9197 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9198 ret = btrfs_find_root(tree_root, &root->root_key, path,
9201 btrfs_abort_transaction(trans, ret);
9204 } else if (ret > 0) {
9205 /* if we fail to delete the orphan item this time
9206 * around, it'll get picked up the next time.
9208 * The most common failure here is just -ENOENT.
9210 btrfs_del_orphan_item(trans, tree_root,
9211 root->root_key.objectid);
9215 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9216 btrfs_add_dropped_root(trans, root);
9218 free_extent_buffer(root->node);
9219 free_extent_buffer(root->commit_root);
9220 btrfs_put_fs_root(root);
9222 root_dropped = true;
9224 btrfs_end_transaction_throttle(trans);
9227 btrfs_free_path(path);
9230 * So if we need to stop dropping the snapshot for whatever reason we
9231 * need to make sure to add it back to the dead root list so that we
9232 * keep trying to do the work later. This also cleans up roots that are
9233 * not in the radix (like when we recover after a power fail or unmount),
9234 * so we don't leak memory.
9236 if (!for_reloc && !root_dropped)
9237 btrfs_add_dead_root(root);
9238 if (err && err != -EAGAIN)
9239 btrfs_handle_fs_error(fs_info, err, NULL);
9244 * drop subtree rooted at tree block 'node'.
9246 * NOTE: this function will unlock and release tree block 'node'.
9247 * it is only used by the relocation code.
9249 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9250 struct btrfs_root *root,
9251 struct extent_buffer *node,
9252 struct extent_buffer *parent)
9254 struct btrfs_fs_info *fs_info = root->fs_info;
9255 struct btrfs_path *path;
9256 struct walk_control *wc;
9262 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9264 path = btrfs_alloc_path();
9268 wc = kzalloc(sizeof(*wc), GFP_NOFS);
9270 btrfs_free_path(path);
9274 btrfs_assert_tree_locked(parent);
9275 parent_level = btrfs_header_level(parent);
9276 extent_buffer_get(parent);
9277 path->nodes[parent_level] = parent;
9278 path->slots[parent_level] = btrfs_header_nritems(parent);
9280 btrfs_assert_tree_locked(node);
9281 level = btrfs_header_level(node);
9282 path->nodes[level] = node;
9283 path->slots[level] = 0;
9284 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9286 wc->refs[parent_level] = 1;
9287 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9289 wc->shared_level = -1;
9290 wc->stage = DROP_REFERENCE;
9293 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9296 wret = walk_down_tree(trans, root, path, wc);
9302 wret = walk_up_tree(trans, root, path, wc, parent_level);
9310 btrfs_free_path(path);
9314 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9320 * if restripe for this chunk_type is on, pick the target profile and
9321 * return, otherwise do the usual balance
9323 stripped = get_restripe_target(fs_info, flags);
9325 return extended_to_chunk(stripped);
9327 num_devices = fs_info->fs_devices->rw_devices;
9329 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
9330 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
9332 if (num_devices == 1) {
9333 stripped |= BTRFS_BLOCK_GROUP_DUP;
9334 stripped = flags & ~stripped;
9336 /* turn raid0 into single device chunks */
9337 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9340 /* turn mirroring into duplication */
9341 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
9342 BTRFS_BLOCK_GROUP_RAID10))
9343 return stripped | BTRFS_BLOCK_GROUP_DUP;
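/*
 * Example of the single-device fallback above: a RAID1 or RAID10 block
 * group degrades to DUP, and a RAID0 block group degrades to the
 * single-device profile, since there is no second device left to
 * mirror or stripe across.
 */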
9345 /* they already had raid on here, just return */
9346 if (flags & stripped)
9349 stripped |= BTRFS_BLOCK_GROUP_DUP;
9350 stripped = flags & ~stripped;
9352 /* switch duplicated blocks with raid1 */
9353 if (flags & BTRFS_BLOCK_GROUP_DUP)
9354 return stripped | BTRFS_BLOCK_GROUP_RAID1;
9356 /* this is drive concat, leave it alone */
9362 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9364 struct btrfs_space_info *sinfo = cache->space_info;
9367 u64 min_allocable_bytes;
9371 * We need some metadata space and system metadata space for
9372 * allocating chunks in some corner cases, so keep some slack available
9373 * unless we are forced to set the block group read-only.
9376 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9378 min_allocable_bytes = SZ_1M;
9380 min_allocable_bytes = 0;
9382 spin_lock(&sinfo->lock);
9383 spin_lock(&cache->lock);
9391 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9392 cache->bytes_super - btrfs_block_group_used(&cache->item);
9393 sinfo_used = btrfs_space_info_used(sinfo, true);
9395 if (sinfo_used + num_bytes + min_allocable_bytes <=
9396 sinfo->total_bytes) {
9397 sinfo->bytes_readonly += num_bytes;
9399 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9403 spin_unlock(&cache->lock);
9404 spin_unlock(&sinfo->lock);
9405 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
9406 btrfs_info(cache->fs_info,
9407 "unable to make block group %llu ro",
9408 cache->key.objectid);
9409 btrfs_info(cache->fs_info,
9410 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
9411 sinfo_used, num_bytes, min_allocable_bytes);
9412 dump_space_info(cache->fs_info, cache->space_info, 0, 0);
9417 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9420 struct btrfs_fs_info *fs_info = cache->fs_info;
9421 struct btrfs_trans_handle *trans;
9426 trans = btrfs_join_transaction(fs_info->extent_root);
9428 return PTR_ERR(trans);
9431 * we're not allowed to set block groups readonly after the dirty
9432 * block groups cache has started writing. If it already started,
9433 * back off and let this transaction commit
9435 mutex_lock(&fs_info->ro_block_group_mutex);
9436 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9437 u64 transid = trans->transid;
9439 mutex_unlock(&fs_info->ro_block_group_mutex);
9440 btrfs_end_transaction(trans);
9442 ret = btrfs_wait_for_commit(fs_info, transid);
9449 * if we are changing raid levels, try to allocate a corresponding
9450 * block group with the new raid level.
9452 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9453 if (alloc_flags != cache->flags) {
9454 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9456 * ENOSPC is allowed here, we may have enough space
9457 * already allocated at the new raid level to
9466 ret = inc_block_group_ro(cache, 0);
9469 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9470 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9473 ret = inc_block_group_ro(cache, 0);
9475 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9476 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9477 mutex_lock(&fs_info->chunk_mutex);
9478 check_system_chunk(trans, alloc_flags);
9479 mutex_unlock(&fs_info->chunk_mutex);
9481 mutex_unlock(&fs_info->ro_block_group_mutex);
9483 btrfs_end_transaction(trans);
9487 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9489 u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9491 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9495 * helper to account the unused space of all the readonly block groups in the
9496 * space_info. takes mirrors into account.
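* For example, a read-only RAID1 block group with N unused bytes contributes
* roughly 2 * N here, since the raw-space factor for mirrored profiles is 2
* (see the btrfs_bg_type_to_factor() call below).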
9498 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9500 struct btrfs_block_group_cache *block_group;
9504 /* It's df, we don't care if it's racy */
9505 if (list_empty(&sinfo->ro_bgs))
9508 spin_lock(&sinfo->lock);
9509 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9510 spin_lock(&block_group->lock);
9512 if (!block_group->ro) {
9513 spin_unlock(&block_group->lock);
9517 factor = btrfs_bg_type_to_factor(block_group->flags);
9518 free_bytes += (block_group->key.offset -
9519 btrfs_block_group_used(&block_group->item)) *
9522 spin_unlock(&block_group->lock);
9524 spin_unlock(&sinfo->lock);
9529 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9531 struct btrfs_space_info *sinfo = cache->space_info;
9536 spin_lock(&sinfo->lock);
9537 spin_lock(&cache->lock);
9539 num_bytes = cache->key.offset - cache->reserved -
9540 cache->pinned - cache->bytes_super -
9541 btrfs_block_group_used(&cache->item);
9542 sinfo->bytes_readonly -= num_bytes;
9543 list_del_init(&cache->ro_list);
9545 spin_unlock(&cache->lock);
9546 spin_unlock(&sinfo->lock);
9550 * Checks to see if it's even possible to relocate this block group.
9552 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
9553 * ok to go ahead and try.
9555 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9557 struct btrfs_block_group_cache *block_group;
9558 struct btrfs_space_info *space_info;
9559 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9560 struct btrfs_device *device;
9570 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9572 block_group = btrfs_lookup_block_group(fs_info, bytenr);
9574 /* odd, couldn't find the block group, leave it alone */
9578 "can't find block group for bytenr %llu",
9583 min_free = btrfs_block_group_used(&block_group->item);
9585 /* no bytes used, we're good */
9589 space_info = block_group->space_info;
9590 spin_lock(&space_info->lock);
9592 full = space_info->full;
9595 * if this is the last block group we have in this space, we can't
9596 * relocate it unless we're able to allocate a new chunk below.
9598 * Otherwise, we need to make sure we have room in the space to handle
9599 * all of the extents from this block group. If we can, we're good
9601 if ((space_info->total_bytes != block_group->key.offset) &&
9602 (btrfs_space_info_used(space_info, false) + min_free <
9603 space_info->total_bytes)) {
9604 spin_unlock(&space_info->lock);
9607 spin_unlock(&space_info->lock);
9610 * ok we don't have enough space, but maybe we have free space on our
9611 * devices to allocate new chunks for relocation, so loop through our
9612 * alloc devices and guess if we have enough space. if this block
9613 * group is going to be restriped, run checks against the target
9614 * profile instead of the current one.
9626 target = get_restripe_target(fs_info, block_group->flags);
9628 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9631 * this is just a balance, so if we were marked as full
9632 * we know there is no space for a new chunk
9637 "no space to alloc new chunk for block group %llu",
9638 block_group->key.objectid);
9642 index = btrfs_bg_flags_to_raid_index(block_group->flags);
9645 if (index == BTRFS_RAID_RAID10) {
9649 } else if (index == BTRFS_RAID_RAID1) {
9651 } else if (index == BTRFS_RAID_DUP) {
9654 } else if (index == BTRFS_RAID_RAID0) {
9655 dev_min = fs_devices->rw_devices;
9656 min_free = div64_u64(min_free, dev_min);
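/*
 * With min_free and dev_min adjusted for the RAID profile above, the
 * loop below counts devices that still have at least min_free bytes of
 * unallocated space (checked via find_free_dev_extent()); relocation
 * is considered possible once dev_nr reaches dev_min.
 */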
9659 mutex_lock(&fs_info->chunk_mutex);
9660 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9664 * check to make sure we can actually find a chunk with enough
9665 * space to fit our block group in.
9667 if (device->total_bytes > device->bytes_used + min_free &&
9668 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9669 ret = find_free_dev_extent(device, min_free,
9674 if (dev_nr >= dev_min)
9680 if (debug && ret == -1)
9682 "no space to allocate a new chunk for block group %llu",
9683 block_group->key.objectid);
9684 mutex_unlock(&fs_info->chunk_mutex);
9686 btrfs_put_block_group(block_group);
9690 static int find_first_block_group(struct btrfs_fs_info *fs_info,
9691 struct btrfs_path *path,
9692 struct btrfs_key *key)
9694 struct btrfs_root *root = fs_info->extent_root;
9696 struct btrfs_key found_key;
9697 struct extent_buffer *leaf;
9698 struct btrfs_block_group_item bg;
9702 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9707 slot = path->slots[0];
9708 leaf = path->nodes[0];
9709 if (slot >= btrfs_header_nritems(leaf)) {
9710 ret = btrfs_next_leaf(root, path);
9717 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9719 if (found_key.objectid >= key->objectid &&
9720 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9721 struct extent_map_tree *em_tree;
9722 struct extent_map *em;
9724 em_tree = &root->fs_info->mapping_tree;
9725 read_lock(&em_tree->lock);
9726 em = lookup_extent_mapping(em_tree, found_key.objectid,
9728 read_unlock(&em_tree->lock);
9731 "logical %llu len %llu found bg but no related chunk",
9732 found_key.objectid, found_key.offset);
9734 } else if (em->start != found_key.objectid ||
9735 em->len != found_key.offset) {
9737 "block group %llu len %llu mismatch with chunk %llu len %llu",
9738 found_key.objectid, found_key.offset,
9739 em->start, em->len);
9742 read_extent_buffer(leaf, &bg,
9743 btrfs_item_ptr_offset(leaf, slot),
9745 flags = btrfs_block_group_flags(&bg) &
9746 BTRFS_BLOCK_GROUP_TYPE_MASK;
9748 if (flags != (em->map_lookup->type &
9749 BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9751 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9753 found_key.offset, flags,
9754 (BTRFS_BLOCK_GROUP_TYPE_MASK &
9755 em->map_lookup->type));
9761 free_extent_map(em);
9770 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9772 struct btrfs_block_group_cache *block_group;
9776 struct inode *inode;
9778 block_group = btrfs_lookup_first_block_group(info, last);
9779 while (block_group) {
9780 wait_block_group_cache_done(block_group);
9781 spin_lock(&block_group->lock);
9782 if (block_group->iref)
9784 spin_unlock(&block_group->lock);
9785 block_group = next_block_group(block_group);
9794 inode = block_group->inode;
9795 block_group->iref = 0;
9796 block_group->inode = NULL;
9797 spin_unlock(&block_group->lock);
9798 ASSERT(block_group->io_ctl.inode == NULL);
9800 last = block_group->key.objectid + block_group->key.offset;
9801 btrfs_put_block_group(block_group);
9806 * Must be called only after stopping all workers, since we could have block
9807 * group caching kthreads running, and therefore they could race with us if we
9808 * freed the block groups before stopping them.
9810 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9812 struct btrfs_block_group_cache *block_group;
9813 struct btrfs_space_info *space_info;
9814 struct btrfs_caching_control *caching_ctl;
9817 down_write(&info->commit_root_sem);
9818 while (!list_empty(&info->caching_block_groups)) {
9819 caching_ctl = list_entry(info->caching_block_groups.next,
9820 struct btrfs_caching_control, list);
9821 list_del(&caching_ctl->list);
9822 put_caching_control(caching_ctl);
9824 up_write(&info->commit_root_sem);
9826 spin_lock(&info->unused_bgs_lock);
9827 while (!list_empty(&info->unused_bgs)) {
9828 block_group = list_first_entry(&info->unused_bgs,
9829 struct btrfs_block_group_cache,
9831 list_del_init(&block_group->bg_list);
9832 btrfs_put_block_group(block_group);
9834 spin_unlock(&info->unused_bgs_lock);
9836 spin_lock(&info->block_group_cache_lock);
9837 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9838 block_group = rb_entry(n, struct btrfs_block_group_cache,
9840 rb_erase(&block_group->cache_node,
9841 &info->block_group_cache_tree);
9842 RB_CLEAR_NODE(&block_group->cache_node);
9843 spin_unlock(&info->block_group_cache_lock);
9845 down_write(&block_group->space_info->groups_sem);
9846 list_del(&block_group->list);
9847 up_write(&block_group->space_info->groups_sem);
9850 * We haven't cached this block group, which means we could
9851 * possibly have excluded extents on this block group.
9853 if (block_group->cached == BTRFS_CACHE_NO ||
9854 block_group->cached == BTRFS_CACHE_ERROR)
9855 free_excluded_extents(block_group);
9857 btrfs_remove_free_space_cache(block_group);
9858 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9859 ASSERT(list_empty(&block_group->dirty_list));
9860 ASSERT(list_empty(&block_group->io_list));
9861 ASSERT(list_empty(&block_group->bg_list));
9862 ASSERT(atomic_read(&block_group->count) == 1);
9863 btrfs_put_block_group(block_group);
9865 spin_lock(&info->block_group_cache_lock);
9867 spin_unlock(&info->block_group_cache_lock);
9869 /* now that all the block groups are freed, go through and
9870 * free all the space_info structs. This is only called during
9871 * the final stages of unmount, and so we know nobody is
9872 * using them. We call synchronize_rcu() once before we start,
9873 * just to be on the safe side.
9877 release_global_block_rsv(info);
9879 while (!list_empty(&info->space_info)) {
9882 space_info = list_entry(info->space_info.next,
9883 struct btrfs_space_info,
9887 * Do not hide this behind enospc_debug, this is actually
9888 * important and indicates a real bug if this happens.
9890 if (WARN_ON(space_info->bytes_pinned > 0 ||
9891 space_info->bytes_reserved > 0 ||
9892 space_info->bytes_may_use > 0))
9893 dump_space_info(info, space_info, 0, 0);
9894 list_del(&space_info->list);
9895 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9896 struct kobject *kobj;
9897 kobj = space_info->block_group_kobjs[i];
9898 space_info->block_group_kobjs[i] = NULL;
9904 kobject_del(&space_info->kobj);
9905 kobject_put(&space_info->kobj);
9910 /* link_block_group will queue up kobjects to add when we're reclaim-safe */
9911 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9913 struct btrfs_space_info *space_info;
9914 struct raid_kobject *rkobj;
9918 spin_lock(&fs_info->pending_raid_kobjs_lock);
9919 list_splice_init(&fs_info->pending_raid_kobjs, &list);
9920 spin_unlock(&fs_info->pending_raid_kobjs_lock);
9922 list_for_each_entry(rkobj, &list, list) {
9923 space_info = btrfs_find_space_info(fs_info, rkobj->flags);
9925 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9926 "%s", btrfs_bg_type_to_raid_name(rkobj->flags));
9928 kobject_put(&rkobj->kobj);
9934 "failed to add kobject for block cache, ignoring");
9937 static void link_block_group(struct btrfs_block_group_cache *cache)
9939 struct btrfs_space_info *space_info = cache->space_info;
9940 struct btrfs_fs_info *fs_info = cache->fs_info;
9941 int index = btrfs_bg_flags_to_raid_index(cache->flags);
9944 down_write(&space_info->groups_sem);
9945 if (list_empty(&space_info->block_groups[index]))
9947 list_add_tail(&cache->list, &space_info->block_groups[index]);
9948 up_write(&space_info->groups_sem);
9951 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9953 btrfs_warn(cache->fs_info,
9954 "couldn't alloc memory for raid level kobject");
9957 rkobj->flags = cache->flags;
9958 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9960 spin_lock(&fs_info->pending_raid_kobjs_lock);
9961 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
9962 spin_unlock(&fs_info->pending_raid_kobjs_lock);
9963 space_info->block_group_kobjs[index] = &rkobj->kobj;
9967 static struct btrfs_block_group_cache *
9968 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9969 u64 start, u64 size)
9971 struct btrfs_block_group_cache *cache;
9973 cache = kzalloc(sizeof(*cache), GFP_NOFS);
9977 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9979 if (!cache->free_space_ctl) {
9984 cache->key.objectid = start;
9985 cache->key.offset = size;
9986 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9988 cache->fs_info = fs_info;
9989 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
9990 set_free_space_tree_thresholds(cache);
9992 atomic_set(&cache->count, 1);
9993 spin_lock_init(&cache->lock);
9994 init_rwsem(&cache->data_rwsem);
9995 INIT_LIST_HEAD(&cache->list);
9996 INIT_LIST_HEAD(&cache->cluster_list);
9997 INIT_LIST_HEAD(&cache->bg_list);
9998 INIT_LIST_HEAD(&cache->ro_list);
9999 INIT_LIST_HEAD(&cache->dirty_list);
10000 INIT_LIST_HEAD(&cache->io_list);
10001 btrfs_init_free_space_ctl(cache);
10002 atomic_set(&cache->trimming, 0);
10003 mutex_init(&cache->free_space_lock);
10004 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10011 * Iterate all chunks and verify that each of them has the corresponding block group
10014 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
10016 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
10017 struct extent_map *em;
10018 struct btrfs_block_group_cache *bg;
10023 read_lock(&map_tree->lock);
10025 * lookup_extent_mapping will return the first extent map
10026 * intersecting the range, so setting @len to 1 is enough to
10027 * get the first chunk.
10029 em = lookup_extent_mapping(map_tree, start, 1);
10030 read_unlock(&map_tree->lock);
10034 bg = btrfs_lookup_block_group(fs_info, em->start);
10037 "chunk start=%llu len=%llu doesn't have corresponding block group",
10038 em->start, em->len);
10040 free_extent_map(em);
10043 if (bg->key.objectid != em->start ||
10044 bg->key.offset != em->len ||
10045 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
10046 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10048 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
10049 em->start, em->len,
10050 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
10051 bg->key.objectid, bg->key.offset,
10052 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
10054 free_extent_map(em);
10055 btrfs_put_block_group(bg);
10058 start = em->start + em->len;
10059 free_extent_map(em);
10060 btrfs_put_block_group(bg);
10065 int btrfs_read_block_groups(struct btrfs_fs_info *info)
10067 struct btrfs_path *path;
10069 struct btrfs_block_group_cache *cache;
10070 struct btrfs_space_info *space_info;
10071 struct btrfs_key key;
10072 struct btrfs_key found_key;
10073 struct extent_buffer *leaf;
10074 int need_clear = 0;
10079 feature = btrfs_super_incompat_flags(info->super_copy);
10080 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10084 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10085 path = btrfs_alloc_path();
10088 path->reada = READA_FORWARD;
10090 cache_gen = btrfs_super_cache_generation(info->super_copy);
10091 if (btrfs_test_opt(info, SPACE_CACHE) &&
10092 btrfs_super_generation(info->super_copy) != cache_gen)
10094 if (btrfs_test_opt(info, CLEAR_CACHE))
10098 ret = find_first_block_group(info, path, &key);
10104 leaf = path->nodes[0];
10105 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10107 cache = btrfs_create_block_group_cache(info, found_key.objectid,
10116 * When we mount with old space cache, we need to
10117 * set BTRFS_DC_CLEAR and set dirty flag.
10119 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10120 * truncate the old free space cache inode and set up a new one.
10122 * b) Setting 'dirty flag' makes sure that we flush
10123 * the new space cache info onto disk.
10125 if (btrfs_test_opt(info, SPACE_CACHE))
10126 cache->disk_cache_state = BTRFS_DC_CLEAR;
10129 read_extent_buffer(leaf, &cache->item,
10130 btrfs_item_ptr_offset(leaf, path->slots[0]),
10131 sizeof(cache->item));
10132 cache->flags = btrfs_block_group_flags(&cache->item);
10134 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10135 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10137 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10138 cache->key.objectid);
10143 key.objectid = found_key.objectid + found_key.offset;
10144 btrfs_release_path(path);
10147 * We need to exclude the super stripes now so that the space
10148 * info has super bytes accounted for, otherwise we'll think
10149 * we have more space than we actually do.
10151 ret = exclude_super_stripes(cache);
10154 * We may have excluded something, so call this just in case.
10157 free_excluded_extents(cache);
10158 btrfs_put_block_group(cache);
10163 * check for two cases, either we are full, and therefore
10164 * don't need to bother with the caching work since we won't
10165 * find any space, or we are empty, and we can just add all
10166 * the space in and be done with it. This saves us _a_lot_ of
10167 * time, particularly in the full case.
10169 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10170 cache->last_byte_to_unpin = (u64)-1;
10171 cache->cached = BTRFS_CACHE_FINISHED;
10172 free_excluded_extents(cache);
10173 } else if (btrfs_block_group_used(&cache->item) == 0) {
10174 cache->last_byte_to_unpin = (u64)-1;
10175 cache->cached = BTRFS_CACHE_FINISHED;
10176 add_new_free_space(cache, found_key.objectid,
10177 found_key.objectid +
10179 free_excluded_extents(cache);
10182 ret = btrfs_add_block_group_cache(info, cache);
10184 btrfs_remove_free_space_cache(cache);
10185 btrfs_put_block_group(cache);
10189 trace_btrfs_add_block_group(info, cache, 0);
10190 btrfs_update_space_info(info, cache->flags, found_key.offset,
10191 btrfs_block_group_used(&cache->item),
10192 cache->bytes_super, &space_info);
10194 cache->space_info = space_info;
10196 link_block_group(cache);
10198 set_avail_alloc_bits(info, cache->flags);
10199 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10200 inc_block_group_ro(cache, 1);
10201 } else if (btrfs_block_group_used(&cache->item) == 0) {
10202 ASSERT(list_empty(&cache->bg_list));
10203 btrfs_mark_bg_unused(cache);
10207 list_for_each_entry_rcu(space_info, &info->space_info, list) {
10208 if (!(get_alloc_profile(info, space_info->flags) &
10209 (BTRFS_BLOCK_GROUP_RAID10 |
10210 BTRFS_BLOCK_GROUP_RAID1_MASK |
10211 BTRFS_BLOCK_GROUP_RAID56_MASK |
10212 BTRFS_BLOCK_GROUP_DUP)))
10215 * avoid allocating from un-mirrored block group if there are
10216 * mirrored block groups.
10218 list_for_each_entry(cache,
10219 &space_info->block_groups[BTRFS_RAID_RAID0],
10221 inc_block_group_ro(cache, 1);
10222 list_for_each_entry(cache,
10223 &space_info->block_groups[BTRFS_RAID_SINGLE],
10225 inc_block_group_ro(cache, 1);
10228 btrfs_add_raid_kobjects(info);
10229 init_global_block_rsv(info);
10230 ret = check_chunk_block_group_mappings(info);
10232 btrfs_free_path(path);
10236 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10238 struct btrfs_fs_info *fs_info = trans->fs_info;
10239 struct btrfs_block_group_cache *block_group;
10240 struct btrfs_root *extent_root = fs_info->extent_root;
10241 struct btrfs_block_group_item item;
10242 struct btrfs_key key;
10245 if (!trans->can_flush_pending_bgs)
10248 while (!list_empty(&trans->new_bgs)) {
10249 block_group = list_first_entry(&trans->new_bgs,
10250 struct btrfs_block_group_cache,
10255 spin_lock(&block_group->lock);
10256 memcpy(&item, &block_group->item, sizeof(item));
10257 memcpy(&key, &block_group->key, sizeof(key));
10258 spin_unlock(&block_group->lock);
10260 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10263 btrfs_abort_transaction(trans, ret);
10264 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10266 btrfs_abort_transaction(trans, ret);
10267 add_block_group_free_space(trans, block_group);
10268 /* already aborted the transaction if it failed. */
10270 btrfs_delayed_refs_rsv_release(fs_info, 1);
10271 list_del_init(&block_group->bg_list);
10273 btrfs_trans_release_chunk_metadata(trans);
10276 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10277 u64 type, u64 chunk_offset, u64 size)
10279 struct btrfs_fs_info *fs_info = trans->fs_info;
10280 struct btrfs_block_group_cache *cache;
10283 btrfs_set_log_full_commit(trans);
10285 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10289 btrfs_set_block_group_used(&cache->item, bytes_used);
10290 btrfs_set_block_group_chunk_objectid(&cache->item,
10291 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10292 btrfs_set_block_group_flags(&cache->item, type);
10294 cache->flags = type;
10295 cache->last_byte_to_unpin = (u64)-1;
10296 cache->cached = BTRFS_CACHE_FINISHED;
10297 cache->needs_free_space = 1;
10298 ret = exclude_super_stripes(cache);
10301 * We may have excluded something, so call this just in case.
10304 free_excluded_extents(cache);
10305 btrfs_put_block_group(cache);
10309 add_new_free_space(cache, chunk_offset, chunk_offset + size);
10311 free_excluded_extents(cache);
10313 #ifdef CONFIG_BTRFS_DEBUG
10314 if (btrfs_should_fragment_free_space(cache)) {
10315 u64 new_bytes_used = size - bytes_used;
10317 bytes_used += new_bytes_used >> 1;
10318 fragment_free_space(cache);
10322 * Ensure the corresponding space_info object is created and
10323 * assigned to our block group. We want our bg to be added to the rbtree
10324 * with its ->space_info set.
10326 cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
10327 ASSERT(cache->space_info);
10329 ret = btrfs_add_block_group_cache(fs_info, cache);
10331 btrfs_remove_free_space_cache(cache);
10332 btrfs_put_block_group(cache);
10337 * Now that our block group has its ->space_info set and is inserted in
10338 * the rbtree, update the space info's counters.
10340 trace_btrfs_add_block_group(fs_info, cache, 1);
10341 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
10342 cache->bytes_super, &cache->space_info);
10343 update_global_block_rsv(fs_info);
10345 link_block_group(cache);
10347 list_add_tail(&cache->bg_list, &trans->new_bgs);
10348 trans->delayed_ref_updates++;
10349 btrfs_update_delayed_refs_rsv(trans);
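/*
 * Note that the block group item is not written to the extent tree
 * here; the new block group is only queued on trans->new_bgs and the
 * item is inserted later by btrfs_create_pending_block_groups() (see
 * above) when pending block groups are flushed.
 */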
10351 set_avail_alloc_bits(fs_info, type);
10355 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10357 u64 extra_flags = chunk_to_extended(flags) &
10358 BTRFS_EXTENDED_PROFILE_MASK;
10360 write_seqlock(&fs_info->profiles_lock);
10361 if (flags & BTRFS_BLOCK_GROUP_DATA)
10362 fs_info->avail_data_alloc_bits &= ~extra_flags;
10363 if (flags & BTRFS_BLOCK_GROUP_METADATA)
10364 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10365 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10366 fs_info->avail_system_alloc_bits &= ~extra_flags;
10367 write_sequnlock(&fs_info->profiles_lock);
10371 * Clear incompat bits for the following feature(s):
10373 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
10374 * in the whole filesystem
10376 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
10378 if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
10379 struct list_head *head = &fs_info->space_info;
10380 struct btrfs_space_info *sinfo;
10382 list_for_each_entry_rcu(sinfo, head, list) {
10383 bool found = false;
10385 down_read(&sinfo->groups_sem);
10386 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
10388 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
10390 up_read(&sinfo->groups_sem);
10395 btrfs_clear_fs_incompat(fs_info, RAID56);
10399 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10400 u64 group_start, struct extent_map *em)
10402 struct btrfs_fs_info *fs_info = trans->fs_info;
10403 struct btrfs_root *root = fs_info->extent_root;
10404 struct btrfs_path *path;
10405 struct btrfs_block_group_cache *block_group;
10406 struct btrfs_free_cluster *cluster;
10407 struct btrfs_root *tree_root = fs_info->tree_root;
10408 struct btrfs_key key;
10409 struct inode *inode;
10410 struct kobject *kobj = NULL;
10414 struct btrfs_caching_control *caching_ctl = NULL;
10416 bool remove_rsv = false;
10418 block_group = btrfs_lookup_block_group(fs_info, group_start);
10419 BUG_ON(!block_group);
10420 BUG_ON(!block_group->ro);
10422 trace_btrfs_remove_block_group(block_group);
10424 * Free the reserved super bytes from this block group before it's removed.
10427 free_excluded_extents(block_group);
10428 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10429 block_group->key.offset);
10431 memcpy(&key, &block_group->key, sizeof(key));
10432 index = btrfs_bg_flags_to_raid_index(block_group->flags);
10433 factor = btrfs_bg_type_to_factor(block_group->flags);
10435 /* make sure this block group isn't part of an allocation cluster */
10436 cluster = &fs_info->data_alloc_cluster;
10437 spin_lock(&cluster->refill_lock);
10438 btrfs_return_cluster_to_free_space(block_group, cluster);
10439 spin_unlock(&cluster->refill_lock);
10442 * make sure this block group isn't part of a metadata
10443 * allocation cluster
10445 cluster = &fs_info->meta_alloc_cluster;
10446 spin_lock(&cluster->refill_lock);
10447 btrfs_return_cluster_to_free_space(block_group, cluster);
10448 spin_unlock(&cluster->refill_lock);
10450 path = btrfs_alloc_path();
10457 * get the inode first so any iput calls done for the io_list
10458 * aren't the final iput (no unlinks allowed now)
10460 inode = lookup_free_space_inode(block_group, path);
10462 mutex_lock(&trans->transaction->cache_write_mutex);
10464 * Make sure our free space cache IO is done before removing the free space inode.
10467 spin_lock(&trans->transaction->dirty_bgs_lock);
10468 if (!list_empty(&block_group->io_list)) {
10469 list_del_init(&block_group->io_list);
10471 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10473 spin_unlock(&trans->transaction->dirty_bgs_lock);
10474 btrfs_wait_cache_io(trans, block_group, path);
10475 btrfs_put_block_group(block_group);
10476 spin_lock(&trans->transaction->dirty_bgs_lock);
10479 if (!list_empty(&block_group->dirty_list)) {
10480 list_del_init(&block_group->dirty_list);
10482 btrfs_put_block_group(block_group);
10484 spin_unlock(&trans->transaction->dirty_bgs_lock);
10485 mutex_unlock(&trans->transaction->cache_write_mutex);
10487 if (!IS_ERR(inode)) {
10488 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10490 btrfs_add_delayed_iput(inode);
10493 clear_nlink(inode);
10494 /* One for the block groups ref */
10495 spin_lock(&block_group->lock);
10496 if (block_group->iref) {
10497 block_group->iref = 0;
10498 block_group->inode = NULL;
10499 spin_unlock(&block_group->lock);
10502 spin_unlock(&block_group->lock);
10504 /* One for our lookup ref */
10505 btrfs_add_delayed_iput(inode);
10508 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10509 key.offset = block_group->key.objectid;
10512 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10516 btrfs_release_path(path);
10518 ret = btrfs_del_item(trans, tree_root, path);
10521 btrfs_release_path(path);
10524 spin_lock(&fs_info->block_group_cache_lock);
10525 rb_erase(&block_group->cache_node,
10526 &fs_info->block_group_cache_tree);
10527 RB_CLEAR_NODE(&block_group->cache_node);
10529 if (fs_info->first_logical_byte == block_group->key.objectid)
10530 fs_info->first_logical_byte = (u64)-1;
10531 spin_unlock(&fs_info->block_group_cache_lock);
10533 down_write(&block_group->space_info->groups_sem);
10535 * we must use list_del_init so people can check to see if they
10536 * are still on the list after taking the semaphore
10538 list_del_init(&block_group->list);
10539 if (list_empty(&block_group->space_info->block_groups[index])) {
10540 kobj = block_group->space_info->block_group_kobjs[index];
10541 block_group->space_info->block_group_kobjs[index] = NULL;
10542 clear_avail_alloc_bits(fs_info, block_group->flags);
10544 up_write(&block_group->space_info->groups_sem);
10545 clear_incompat_bg_bits(fs_info, block_group->flags);
10551 if (block_group->has_caching_ctl)
10552 caching_ctl = get_caching_control(block_group);
10553 if (block_group->cached == BTRFS_CACHE_STARTED)
10554 wait_block_group_cache_done(block_group);
10555 if (block_group->has_caching_ctl) {
10556 down_write(&fs_info->commit_root_sem);
10557 if (!caching_ctl) {
10558 struct btrfs_caching_control *ctl;
10560 list_for_each_entry(ctl,
10561 &fs_info->caching_block_groups, list)
10562 if (ctl->block_group == block_group) {
10564 refcount_inc(&caching_ctl->count);
10569 list_del_init(&caching_ctl->list);
10570 up_write(&fs_info->commit_root_sem);
10572 /* Once for the caching bgs list and once for us. */
10573 put_caching_control(caching_ctl);
10574 put_caching_control(caching_ctl);
10578 spin_lock(&trans->transaction->dirty_bgs_lock);
10579 WARN_ON(!list_empty(&block_group->dirty_list));
10580 WARN_ON(!list_empty(&block_group->io_list));
10581 spin_unlock(&trans->transaction->dirty_bgs_lock);
10583 btrfs_remove_free_space_cache(block_group);
10585 spin_lock(&block_group->space_info->lock);
10586 list_del_init(&block_group->ro_list);
10588 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10589 WARN_ON(block_group->space_info->total_bytes
10590 < block_group->key.offset);
10591 WARN_ON(block_group->space_info->bytes_readonly
10592 < block_group->key.offset);
10593 WARN_ON(block_group->space_info->disk_total
10594 < block_group->key.offset * factor);
10596 block_group->space_info->total_bytes -= block_group->key.offset;
10597 block_group->space_info->bytes_readonly -= block_group->key.offset;
10598 block_group->space_info->disk_total -= block_group->key.offset * factor;
10600 spin_unlock(&block_group->space_info->lock);
10602 memcpy(&key, &block_group->key, sizeof(key));
10604 mutex_lock(&fs_info->chunk_mutex);
10605 spin_lock(&block_group->lock);
10606 block_group->removed = 1;
10608 * At this point trimming can't start on this block group, because we
10609 * removed the block group from the tree fs_info->block_group_cache_tree
10610 * so no one can find it anymore, and even if someone already got this
10611 * block group before we removed it from the rbtree, they have already
10612 * incremented block_group->trimming - if they didn't, they won't find
10613 * any free space entries because we already removed them all when we
10614 * called btrfs_remove_free_space_cache().
10616 * And we must not remove the extent map from the fs_info->mapping_tree
10617 * to prevent the same logical address range and physical device space
10618 * ranges from being reused for a new block group. This is because our
10619 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10620 * completely transactionless, so while it is trimming a range the
10621 * currently running transaction might finish and a new one start,
10622 * allowing for new block groups to be created that can reuse the same
10623 * physical device locations unless we take this special care.
10625 * There may also be an implicit trim operation if the file system
10626 * is mounted with -odiscard. The same protections must remain
10627 * in place until the extents have been discarded completely when
10628 * the transaction commit has completed.
10630 remove_em = (atomic_read(&block_group->trimming) == 0);
10631 spin_unlock(&block_group->lock);
10633 mutex_unlock(&fs_info->chunk_mutex);
10635 ret = remove_block_group_free_space(trans, block_group);
10639 btrfs_put_block_group(block_group);
10640 btrfs_put_block_group(block_group);
10642 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10648 ret = btrfs_del_item(trans, root, path);
10653 struct extent_map_tree *em_tree;
10655 em_tree = &fs_info->mapping_tree;
10656 write_lock(&em_tree->lock);
10657 remove_extent_mapping(em_tree, em);
10658 write_unlock(&em_tree->lock);
10659 /* once for the tree */
10660 free_extent_map(em);
10664 btrfs_delayed_refs_rsv_release(fs_info, 1);
10665 btrfs_free_path(path);
10669 struct btrfs_trans_handle *
10670 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10671 const u64 chunk_offset)
10673 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
10674 struct extent_map *em;
10675 struct map_lookup *map;
10676 unsigned int num_items;
10678 read_lock(&em_tree->lock);
10679 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10680 read_unlock(&em_tree->lock);
10681 ASSERT(em && em->start == chunk_offset);
10684 * We need to reserve 3 + N units from the metadata space info in order
10685 * to remove a block group (done at btrfs_remove_chunk() and at
10686 * btrfs_remove_block_group()), which are used for:
10688 * 1 unit for adding the free space inode's orphan (located in the tree of tree roots).
10690 * 1 unit for deleting the block group item (located in the extent tree).
10692 * 1 unit for deleting the free space item (located in the tree of tree roots).
10694 * N units for deleting N device extent items corresponding to each
10695 * stripe (located in the device tree).
10697 * In order to remove a block group we also need to reserve units in the
10698 * system space info in order to update the chunk tree (update one or
10699 * more device items and remove one chunk item), but this is done at
10700 * btrfs_remove_chunk() through a call to check_system_chunk().
10702 map = em->map_lookup;
10703 num_items = 3 + map->num_stripes;
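/*
 * Worked example of the accounting above: removing a chunk with two
 * stripes (e.g. a two-device RAID1 block group) reserves
 * 3 + 2 = 5 metadata units for the block group removal itself.
 */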
10704 free_extent_map(em);
10706 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10711 * Process the unused_bgs list and remove any that don't have any allocated
10712 * space inside of them.
10714 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10716 struct btrfs_block_group_cache *block_group;
10717 struct btrfs_space_info *space_info;
10718 struct btrfs_trans_handle *trans;
10721 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10724 spin_lock(&fs_info->unused_bgs_lock);
10725 while (!list_empty(&fs_info->unused_bgs)) {
10729 block_group = list_first_entry(&fs_info->unused_bgs,
10730 struct btrfs_block_group_cache,
10732 list_del_init(&block_group->bg_list);
10734 space_info = block_group->space_info;
10736 if (ret || btrfs_mixed_space_info(space_info)) {
10737 btrfs_put_block_group(block_group);
10740 spin_unlock(&fs_info->unused_bgs_lock);
10742 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10744 /* Don't want to race with allocators so take the groups_sem */
10745 down_write(&space_info->groups_sem);
10746 spin_lock(&block_group->lock);
10747 if (block_group->reserved || block_group->pinned ||
10748 btrfs_block_group_used(&block_group->item) ||
10750 list_is_singular(&block_group->list)) {
10752 * We want to bail if we made new allocations or have
10753 * outstanding allocations in this block group. We do
10754 * the ro check in case balance is currently acting on
10755 * this block group.
10757 trace_btrfs_skip_unused_block_group(block_group);
10758 spin_unlock(&block_group->lock);
10759 up_write(&space_info->groups_sem);
10762 spin_unlock(&block_group->lock);
10764 /* We don't want to force the issue, only flip if it's ok. */
10765 ret = inc_block_group_ro(block_group, 0);
10766 up_write(&space_info->groups_sem);
10773 * Want to do this before we do anything else so we can recover
10774 * properly if we fail to join the transaction.
10776 trans = btrfs_start_trans_remove_block_group(fs_info,
10777 block_group->key.objectid);
10778 if (IS_ERR(trans)) {
10779 btrfs_dec_block_group_ro(block_group);
10780 ret = PTR_ERR(trans);
10785 * We could have pending pinned extents for this block group,
10786 * just delete them, we don't care about them anymore.
10788 start = block_group->key.objectid;
10789 end = start + block_group->key.offset - 1;
10791 * Hold the unused_bg_unpin_mutex lock to avoid racing with
10792 * btrfs_finish_extent_commit(). If we are at transaction N,
10793 * another task might be running finish_extent_commit() for the
10794 * previous transaction N - 1, and have seen a range belonging
10795 * to the block group in freed_extents[] before we were able to
10796 * clear the whole block group range from freed_extents[]. This
10797 * means that task can lookup for the block group after we
10798 * unpinned it from freed_extents[] and removed it, leading to
10799 * a BUG_ON() at btrfs_unpin_extent_range().
10801 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10802 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10805 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10806 btrfs_dec_block_group_ro(block_group);
10809 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10812 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10813 btrfs_dec_block_group_ro(block_group);
10816 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10818 /* Reset pinned so btrfs_put_block_group doesn't complain */
10819 spin_lock(&space_info->lock);
10820 spin_lock(&block_group->lock);
10822 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
10823 -block_group->pinned);
10824 space_info->bytes_readonly += block_group->pinned;
10825 percpu_counter_add_batch(&space_info->total_bytes_pinned,
10826 -block_group->pinned,
10827 BTRFS_TOTAL_BYTES_PINNED_BATCH);
10828 block_group->pinned = 0;
10830 spin_unlock(&block_group->lock);
10831 spin_unlock(&space_info->lock);
10833 /* DISCARD can flip during remount */
10834 trimming = btrfs_test_opt(fs_info, DISCARD);
10836 /* Implicit trim during transaction commit. */
10838 btrfs_get_block_group_trimming(block_group);
10841 * btrfs_remove_chunk() will abort the transaction if things go horribly wrong.
10844 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
10848 btrfs_put_block_group_trimming(block_group);
10853 * If we're not mounted with -odiscard, we can just forget
10854 * about this block group. Otherwise we'll need to wait
10855 * until transaction commit to do the actual discard.
10858 spin_lock(&fs_info->unused_bgs_lock);
10860 * A concurrent scrub might have added us to the list
10861 * fs_info->unused_bgs, so use a list_move operation
10862 * to add the block group to the deleted_bgs list.
10864 list_move(&block_group->bg_list,
10865 &trans->transaction->deleted_bgs);
10866 spin_unlock(&fs_info->unused_bgs_lock);
10867 btrfs_get_block_group(block_group);
10870 btrfs_end_transaction(trans);
10872 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10873 btrfs_put_block_group(block_group);
10874 spin_lock(&fs_info->unused_bgs_lock);
10876 spin_unlock(&fs_info->unused_bgs_lock);
10879 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10880 u64 start, u64 end)
10882 return unpin_extent_range(fs_info, start, end, false);
10886 * It used to be that old block groups would be left around forever.
10887 * Iterating over them would be enough to trim unused space. Since we
10888 * now automatically remove them, we also need to iterate over unallocated space.
10891 * We don't want a transaction for this since the discard may take a
10892 * substantial amount of time. We don't require that a transaction be
10893 * running, but we do need to take a running transaction into account
10894 * to ensure that we're not discarding chunks that were released or
10895 * allocated in the current transaction.
10897 * Holding the chunks lock will prevent other threads from allocating
10898 * or releasing chunks, but it won't prevent a running transaction
10899 * from committing and releasing the memory that the pending chunks
10900 * list head uses. For that, we need to take a reference to the
10901 * transaction and hold the commit root sem. We only need to hold
10902 * it while performing the free space search since we have already
10903 * held back allocations.
10905 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
10907 u64 start = SZ_1M, len = 0, end = 0;
10912 /* Discard not supported = nothing to do. */
10913 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
10916 /* Not writable = nothing to do. */
10917 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
10920 /* No free space = nothing to do. */
10921 if (device->total_bytes <= device->bytes_used)
10927 struct btrfs_fs_info *fs_info = device->fs_info;
10930 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10934 find_first_clear_extent_bit(&device->alloc_state, start,
10936 CHUNK_TRIMMED | CHUNK_ALLOCATED);
10938 /* Ensure we skip the reserved area in the first 1M */
10939 start = max_t(u64, start, SZ_1M);
10942 * If find_first_clear_extent_bit finds a range that spans the
10943 * end of the device it will set end to -1, in this case it's up
10944 * to the caller to trim the value to the size of the device.
10946 end = min(end, device->total_bytes - 1);
10948 len = end - start + 1;
10950 /* We didn't find any extents */
10952 mutex_unlock(&fs_info->chunk_mutex);
10957 ret = btrfs_issue_discard(device->bdev, start, len,
10960 set_extent_bits(&device->alloc_state, start,
10963 mutex_unlock(&fs_info->chunk_mutex);
10971 if (fatal_signal_pending(current)) {
10972 ret = -ERESTARTSYS;
10983 * Trim the whole filesystem by:
10984 * 1) trimming the free space in each block group
10985 * 2) trimming the unallocated space on each device
10987 * This will also continue trimming even if a block group or device encounters
10988 * an error. The return value will be the last error, or 0 if nothing bad happened.
10991 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
10993 struct btrfs_block_group_cache *cache = NULL;
10994 struct btrfs_device *device;
10995 struct list_head *devices;
11001 u64 dev_failed = 0;
11006 cache = btrfs_lookup_first_block_group(fs_info, range->start);
11007 for (; cache; cache = next_block_group(cache)) {
11008 if (cache->key.objectid >= (range->start + range->len)) {
11009 btrfs_put_block_group(cache);
11013 start = max(range->start, cache->key.objectid);
11014 end = min(range->start + range->len,
11015 cache->key.objectid + cache->key.offset);
11017 if (end - start >= range->minlen) {
11018 if (!block_group_cache_done(cache)) {
11019 ret = cache_block_group(cache, 0);
11025 ret = wait_block_group_cache_done(cache);
11032 ret = btrfs_trim_block_group(cache,
11038 trimmed += group_trimmed;
11048 btrfs_warn(fs_info,
11049 "failed to trim %llu block group(s), last error %d",
11050 bg_failed, bg_ret);
11051 mutex_lock(&fs_info->fs_devices->device_list_mutex);
11052 devices = &fs_info->fs_devices->devices;
11053 list_for_each_entry(device, devices, dev_list) {
11054 ret = btrfs_trim_free_extents(device, &group_trimmed);
11061 trimmed += group_trimmed;
11063 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11066 btrfs_warn(fs_info,
11067 "failed to trim %llu device(s), last error %d",
11068 dev_failed, dev_ret);
11069 range->len = trimmed;
11076 * btrfs_{start,end}_write_no_snapshotting() are similar to
11077 * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
11078 * data into the page cache through nocow before the subvolume is snapshotted,
11079 * while only flushing that data to disk after the snapshot creation, or to
11080 * prevent operations while snapshotting is ongoing and that could cause the
11081 * snapshot to be inconsistent (writes followed by expanding truncates for example).
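*
* Minimal usage sketch (illustrative only, not copied from a caller in
* this file):
*
*	if (btrfs_start_write_no_snapshotting(root)) {
*		... do the nocow write ...
*		btrfs_end_write_no_snapshotting(root);
*	} else {
*		... a snapshot is pending: fall back or wait ...
*	}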
11083 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11085 percpu_counter_dec(&root->subv_writers->counter);
11086 cond_wake_up(&root->subv_writers->wait);
11089 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11091 if (atomic_read(&root->will_be_snapshotted))
11094 percpu_counter_inc(&root->subv_writers->counter);
11096 * Make sure counter is updated before we check for snapshot creation.
11099 if (atomic_read(&root->will_be_snapshotted)) {
11100 btrfs_end_write_no_snapshotting(root);
11106 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11111 ret = btrfs_start_write_no_snapshotting(root);
11114 wait_var_event(&root->will_be_snapshotted,
11115 !atomic_read(&root->will_be_snapshotted));
11119 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11121 struct btrfs_fs_info *fs_info = bg->fs_info;
11123 spin_lock(&fs_info->unused_bgs_lock);
11124 if (list_empty(&bg->bg_list)) {
11125 btrfs_get_block_group(bg);
11126 trace_btrfs_add_unused_block_group(bg);
11127 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11129 spin_unlock(&fs_info->unused_bgs_lock);