2 * Copyright (C) 2007 Oracle. All rights reserved.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/statfs.h>
34 #include <linux/compat.h>
35 #include <linux/bit_spinlock.h>
36 #include <linux/xattr.h>
37 #include <linux/posix_acl.h>
38 #include <linux/falloc.h>
39 #include <linux/slab.h>
40 #include <linux/ratelimit.h>
41 #include <linux/mount.h>
42 #include <linux/btrfs.h>
43 #include <linux/blkdev.h>
44 #include <linux/posix_acl_xattr.h>
45 #include <linux/uio.h>
48 #include "transaction.h"
49 #include "btrfs_inode.h"
50 #include "print-tree.h"
51 #include "ordered-data.h"
55 #include "compression.h"
57 #include "free-space-cache.h"
58 #include "inode-map.h"
64 struct btrfs_iget_args {
65 struct btrfs_key *location;
66 struct btrfs_root *root;
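/*
 * State passed along a direct IO request: the number of outstanding
 * extents reserved for it and the part of the range for which ordered
 * extents have not yet been submitted.
 */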
69 struct btrfs_dio_data {
70 u64 outstanding_extents;
72 u64 unsubmitted_oe_range_start;
73 u64 unsubmitted_oe_range_end;
76 static const struct inode_operations btrfs_dir_inode_operations;
77 static const struct inode_operations btrfs_symlink_inode_operations;
78 static const struct inode_operations btrfs_dir_ro_inode_operations;
79 static const struct inode_operations btrfs_special_inode_operations;
80 static const struct inode_operations btrfs_file_inode_operations;
81 static const struct address_space_operations btrfs_aops;
82 static const struct address_space_operations btrfs_symlink_aops;
83 static const struct file_operations btrfs_dir_file_operations;
84 static const struct extent_io_ops btrfs_extent_io_ops;
86 static struct kmem_cache *btrfs_inode_cachep;
87 struct kmem_cache *btrfs_trans_handle_cachep;
88 struct kmem_cache *btrfs_transaction_cachep;
89 struct kmem_cache *btrfs_path_cachep;
90 struct kmem_cache *btrfs_free_space_cachep;
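/*
 * Map the S_IFMT bits of an inode's i_mode to the BTRFS_FT_* value
 * stored in directory entries.
 */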
93 static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
94 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
95 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
96 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
97 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
98 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
99 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
100 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
103 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
104 static int btrfs_truncate(struct inode *inode);
105 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
106 static noinline int cow_file_range(struct inode *inode,
107 struct page *locked_page,
108 u64 start, u64 end, int *page_started,
109 unsigned long *nr_written, int unlock);
110 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
111 u64 len, u64 orig_start,
112 u64 block_start, u64 block_len,
113 u64 orig_block_len, u64 ram_bytes,
116 static int btrfs_dirty_inode(struct inode *inode);
118 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
119 void btrfs_test_inode_set_ops(struct inode *inode)
121 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
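/*
 * Set up security attributes on a newly created inode: inherit ACLs
 * from the parent directory and initialize the security xattrs.
 */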
125 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
126 struct inode *inode, struct inode *dir,
127 const struct qstr *qstr)
131 err = btrfs_init_acl(trans, inode, dir);
133 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
138 * this does all the hard work for inserting an inline extent into
139 * the btree. The caller should have done a btrfs_drop_extents so that
140 * no overlapping inline items exist in the btree
142 static int insert_inline_extent(struct btrfs_trans_handle *trans,
143 struct btrfs_path *path, int extent_inserted,
144 struct btrfs_root *root, struct inode *inode,
145 u64 start, size_t size, size_t compressed_size,
147 struct page **compressed_pages)
149 struct extent_buffer *leaf;
150 struct page *page = NULL;
153 struct btrfs_file_extent_item *ei;
156 size_t cur_size = size;
157 unsigned long offset;
159 if (compressed_size && compressed_pages)
160 cur_size = compressed_size;
162 inode_add_bytes(inode, size);
164 if (!extent_inserted) {
165 struct btrfs_key key;
168 key.objectid = btrfs_ino(inode);
170 key.type = BTRFS_EXTENT_DATA_KEY;
172 datasize = btrfs_file_extent_calc_inline_size(cur_size);
173 path->leave_spinning = 1;
174 ret = btrfs_insert_empty_item(trans, root, path, &key,
181 leaf = path->nodes[0];
182 ei = btrfs_item_ptr(leaf, path->slots[0],
183 struct btrfs_file_extent_item);
184 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
185 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
186 btrfs_set_file_extent_encryption(leaf, ei, 0);
187 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
188 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
189 ptr = btrfs_file_extent_inline_start(ei);
191 if (compress_type != BTRFS_COMPRESS_NONE) {
194 while (compressed_size > 0) {
195 cpage = compressed_pages[i];
196 cur_size = min_t(unsigned long, compressed_size,
199 kaddr = kmap_atomic(cpage);
200 write_extent_buffer(leaf, kaddr, ptr, cur_size);
201 kunmap_atomic(kaddr);
205 compressed_size -= cur_size;
207 btrfs_set_file_extent_compression(leaf, ei,
210 page = find_get_page(inode->i_mapping,
211 start >> PAGE_SHIFT);
212 btrfs_set_file_extent_compression(leaf, ei, 0);
213 kaddr = kmap_atomic(page);
214 offset = start & (PAGE_SIZE - 1);
215 write_extent_buffer(leaf, kaddr + offset, ptr, size);
216 kunmap_atomic(kaddr);
219 btrfs_mark_buffer_dirty(leaf);
220 btrfs_release_path(path);
223 * we're an inline extent, so nobody can
224 * extend the file past i_size without locking
225 * a page we already have locked.
227 * We must do any isize and inode updates
228 * before we unlock the pages. Otherwise we
229 * could end up racing with unlink.
231 BTRFS_I(inode)->disk_i_size = inode->i_size;
232 ret = btrfs_update_inode(trans, root, inode);
241 * conditionally insert an inline extent into the file. This
242 * does the checks required to make sure the data is small enough
243 * to fit as an inline extent.
245 static noinline int cow_file_range_inline(struct btrfs_root *root,
246 struct inode *inode, u64 start,
247 u64 end, size_t compressed_size,
249 struct page **compressed_pages)
251 struct btrfs_trans_handle *trans;
252 u64 isize = i_size_read(inode);
253 u64 actual_end = min(end + 1, isize);
254 u64 inline_len = actual_end - start;
255 u64 aligned_end = ALIGN(end, root->sectorsize);
256 u64 data_len = inline_len;
258 struct btrfs_path *path;
259 int extent_inserted = 0;
260 u32 extent_item_size;
263 data_len = compressed_size;
266 actual_end > root->sectorsize ||
267 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
269 (actual_end & (root->sectorsize - 1)) == 0) ||
271 data_len > root->fs_info->max_inline) {
275 path = btrfs_alloc_path();
279 trans = btrfs_join_transaction(root);
281 btrfs_free_path(path);
282 return PTR_ERR(trans);
284 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
286 if (compressed_size && compressed_pages)
287 extent_item_size = btrfs_file_extent_calc_inline_size(
290 extent_item_size = btrfs_file_extent_calc_inline_size(
293 ret = __btrfs_drop_extents(trans, root, inode, path,
294 start, aligned_end, NULL,
295 1, 1, extent_item_size, &extent_inserted);
297 btrfs_abort_transaction(trans, root, ret);
301 if (isize > actual_end)
302 inline_len = min_t(u64, isize, actual_end);
303 ret = insert_inline_extent(trans, path, extent_inserted,
305 inline_len, compressed_size,
306 compress_type, compressed_pages);
307 if (ret && ret != -ENOSPC) {
308 btrfs_abort_transaction(trans, root, ret);
310 } else if (ret == -ENOSPC) {
315 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
316 btrfs_delalloc_release_metadata(inode, end + 1 - start);
317 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
320 * Don't forget to free the reserved space; an inlined extent
321 * doesn't count as a data extent, so free it directly here.
322 * At reserve time the space is always aligned to the page size,
323 * so just free one page here.
325 btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
326 btrfs_free_path(path);
327 btrfs_end_transaction(trans, root);
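/*
 * Describes one range handed off to the async compression workers:
 * the original (ram) range plus the compressed pages, if compression
 * succeeded.
 */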
331 struct async_extent {
336 unsigned long nr_pages;
338 struct list_head list;
343 struct btrfs_root *root;
344 struct page *locked_page;
347 struct list_head extents;
348 struct btrfs_work work;
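/*
 * Record one (possibly compressed) extent on the async_cow's list of
 * extents so phase two can allocate space for it and submit the IO.
 */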
351 static noinline int add_async_extent(struct async_cow *cow,
352 u64 start, u64 ram_size,
355 unsigned long nr_pages,
358 struct async_extent *async_extent;
360 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
361 BUG_ON(!async_extent); /* -ENOMEM */
362 async_extent->start = start;
363 async_extent->ram_size = ram_size;
364 async_extent->compressed_size = compressed_size;
365 async_extent->pages = pages;
366 async_extent->nr_pages = nr_pages;
367 async_extent->compress_type = compress_type;
368 list_add_tail(&async_extent->list, &cow->extents);
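/*
 * Decide whether writes to this inode should try the compression path:
 * forced on by the compress-force mount option, disabled by the
 * NOCOMPRESS inode flag, otherwise enabled by the compress mount
 * option, the COMPRESS inode flag or a per-inode force_compress value.
 */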
372 static inline int inode_need_compress(struct inode *inode)
374 struct btrfs_root *root = BTRFS_I(inode)->root;
377 if (btrfs_test_opt(root, FORCE_COMPRESS))
379 /* bad compression ratios */
380 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
382 if (btrfs_test_opt(root, COMPRESS) ||
383 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
384 BTRFS_I(inode)->force_compress)
390 * we create compressed extents in two phases. The first
391 * phase compresses a range of pages that have already been
392 * locked (both pages and state bits are locked).
394 * This is done inside an ordered work queue, and the compression
395 * is spread across many cpus. The actual IO submission is step
396 * two, and the ordered work queue takes care of making sure that
397 * happens in the same order things were put onto the queue by
398 * writepages and friends.
400 * If this code finds it can't get good compression, it puts an
401 * entry onto the work queue to write the uncompressed bytes. This
402 * makes sure that both compressed inodes and uncompressed inodes
403 * are written in the same order that the flusher thread sent them down.
406 static noinline void compress_file_range(struct inode *inode,
407 struct page *locked_page,
409 struct async_cow *async_cow,
412 struct btrfs_root *root = BTRFS_I(inode)->root;
414 u64 blocksize = root->sectorsize;
416 u64 isize = i_size_read(inode);
418 struct page **pages = NULL;
419 unsigned long nr_pages;
420 unsigned long nr_pages_ret = 0;
421 unsigned long total_compressed = 0;
422 unsigned long total_in = 0;
423 unsigned long max_compressed = SZ_128K;
424 unsigned long max_uncompressed = SZ_128K;
427 int compress_type = root->fs_info->compress_type;
430 /* if this is a small write inside eof, kick off a defrag */
431 if ((end - start + 1) < SZ_16K &&
432 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
433 btrfs_add_inode_defrag(NULL, inode);
435 actual_end = min_t(u64, isize, end + 1);
438 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
439 nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);
442 * we don't want to send crud past the end of i_size through
443 * compression, that's just a waste of CPU time. So, if the
444 * end of the file is before the start of our current
445 * requested range of bytes, we bail out to the uncompressed
446 * cleanup code that can deal with all of this.
448 * It isn't really the fastest way to fix things, but this is a
449 * very uncommon corner.
451 if (actual_end <= start)
452 goto cleanup_and_bail_uncompressed;
454 total_compressed = actual_end - start;
457 * skip compression for a small file range (<= blocksize) that
458 * isn't an inline extent, since it doesn't save disk space at all.
460 if (total_compressed <= blocksize &&
461 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
462 goto cleanup_and_bail_uncompressed;
464 /* we want to make sure that the amount of ram required to uncompress
465 * an extent is reasonable, so we limit the total size in ram
466 * of a compressed extent to 128k. This is a crucial number
467 * because it also controls how easily we can spread reads across
468 * cpus for decompression.
470 * We also want to make sure the amount of IO required to do
471 * a random read is reasonably small, so we limit the size of
472 * a compressed extent to 128k.
474 total_compressed = min(total_compressed, max_uncompressed);
475 num_bytes = ALIGN(end - start + 1, blocksize);
476 num_bytes = max(blocksize, num_bytes);
481 * we do compression for mount -o compress and when the
482 * inode has not been flagged as nocompress. This flag can
483 * change at any time if we discover bad compression ratios.
485 if (inode_need_compress(inode)) {
487 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
489 /* just bail out to the uncompressed code */
493 if (BTRFS_I(inode)->force_compress)
494 compress_type = BTRFS_I(inode)->force_compress;
497 * we need to call clear_page_dirty_for_io on each
498 * page in the range. Otherwise applications with the file
499 * mmap'd can wander in and change the page contents while
500 * we are compressing them.
502 * If the compression fails for any reason, we set the pages
503 * dirty again later on.
505 extent_range_clear_dirty_for_io(inode, start, end);
507 ret = btrfs_compress_pages(compress_type,
508 inode->i_mapping, start,
509 total_compressed, pages,
510 nr_pages, &nr_pages_ret,
516 unsigned long offset = total_compressed &
518 struct page *page = pages[nr_pages_ret - 1];
521 /* zero the tail end of the last page, we might be
522 * sending it down to disk
525 kaddr = kmap_atomic(page);
526 memset(kaddr + offset, 0,
528 kunmap_atomic(kaddr);
535 /* let's try to make an inline extent */
536 if (ret || total_in < (actual_end - start)) {
537 /* we didn't compress the entire range, try
538 * to make an uncompressed inline extent.
540 ret = cow_file_range_inline(root, inode, start, end,
543 /* try making a compressed inline extent */
544 ret = cow_file_range_inline(root, inode, start, end,
546 compress_type, pages);
549 unsigned long clear_flags = EXTENT_DELALLOC |
551 unsigned long page_error_op;
553 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
554 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
557 * inline extent creation worked or returned error,
558 * we don't need to create any more async work items.
559 * Unlock and free up our temp pages.
561 extent_clear_unlock_delalloc(inode, start, end, NULL,
562 clear_flags, PAGE_UNLOCK |
573 * we aren't doing an inline extent, so round the compressed size
574 * up to a block size boundary so the allocator does sane things
577 total_compressed = ALIGN(total_compressed, blocksize);
580 * one last check to make sure the compression is really a
581 * win, compare the page count read with the blocks on disk
583 total_in = ALIGN(total_in, PAGE_SIZE);
584 if (total_compressed >= total_in) {
587 num_bytes = total_in;
590 if (!will_compress && pages) {
592 * the compression code ran but failed to make things smaller,
593 * free any pages it allocated and our page pointer array
595 for (i = 0; i < nr_pages_ret; i++) {
596 WARN_ON(pages[i]->mapping);
601 total_compressed = 0;
604 /* flag the file so we don't compress in the future */
605 if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
606 !(BTRFS_I(inode)->force_compress)) {
607 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
613 /* the async work queues will take care of doing actual
614 * allocation on disk for these compressed pages,
615 * and will submit them to the elevator.
617 add_async_extent(async_cow, start, num_bytes,
618 total_compressed, pages, nr_pages_ret,
621 if (start + num_bytes < end) {
628 cleanup_and_bail_uncompressed:
630 * No compression, but we still need to write the pages in
631 * the file we've been given so far. Redirty the locked
632 * page if it corresponds to our extent and set things up
633 * for the async work queue to run cow_file_range to do
634 * the normal delalloc dance.
636 if (page_offset(locked_page) >= start &&
637 page_offset(locked_page) <= end) {
638 __set_page_dirty_nobuffers(locked_page);
639 /* unlocked later on in the async handlers */
642 extent_range_redirty_for_io(inode, start, end);
643 add_async_extent(async_cow, start, end - start + 1,
644 0, NULL, 0, BTRFS_COMPRESS_NONE);
651 for (i = 0; i < nr_pages_ret; i++) {
652 WARN_ON(pages[i]->mapping);
658 static void free_async_extent_pages(struct async_extent *async_extent)
662 if (!async_extent->pages)
665 for (i = 0; i < async_extent->nr_pages; i++) {
666 WARN_ON(async_extent->pages[i]->mapping);
667 put_page(async_extent->pages[i]);
669 kfree(async_extent->pages);
670 async_extent->nr_pages = 0;
671 async_extent->pages = NULL;
675 * phase two of compressed writeback. This is the ordered portion
676 * of the code, which only gets called in the order the work was
677 * queued. We walk all the async extents created by compress_file_range
678 * and send them down to the disk.
680 static noinline void submit_compressed_extents(struct inode *inode,
681 struct async_cow *async_cow)
683 struct async_extent *async_extent;
685 struct btrfs_key ins;
686 struct extent_map *em;
687 struct btrfs_root *root = BTRFS_I(inode)->root;
688 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
689 struct extent_io_tree *io_tree;
693 while (!list_empty(&async_cow->extents)) {
694 async_extent = list_entry(async_cow->extents.next,
695 struct async_extent, list);
696 list_del(&async_extent->list);
698 io_tree = &BTRFS_I(inode)->io_tree;
701 /* did the compression code fall back to uncompressed IO? */
702 if (!async_extent->pages) {
703 int page_started = 0;
704 unsigned long nr_written = 0;
706 lock_extent(io_tree, async_extent->start,
707 async_extent->start +
708 async_extent->ram_size - 1);
710 /* allocate blocks */
711 ret = cow_file_range(inode, async_cow->locked_page,
713 async_extent->start +
714 async_extent->ram_size - 1,
715 &page_started, &nr_written, 0);
720 * if page_started, cow_file_range inserted an
721 * inline extent and took care of all the unlocking
722 * and IO for us. Otherwise, we need to submit
723 * all those pages down to the drive.
725 if (!page_started && !ret)
726 extent_write_locked_range(io_tree,
727 inode, async_extent->start,
728 async_extent->start +
729 async_extent->ram_size - 1,
733 unlock_page(async_cow->locked_page);
739 lock_extent(io_tree, async_extent->start,
740 async_extent->start + async_extent->ram_size - 1);
742 ret = btrfs_reserve_extent(root,
743 async_extent->compressed_size,
744 async_extent->compressed_size,
745 0, alloc_hint, &ins, 1, 1);
747 free_async_extent_pages(async_extent);
749 if (ret == -ENOSPC) {
750 unlock_extent(io_tree, async_extent->start,
751 async_extent->start +
752 async_extent->ram_size - 1);
755 * we need to redirty the pages if we decide to
756 * fall back to uncompressed IO, otherwise we
757 * will not submit these pages down to the lower layers.
760 extent_range_redirty_for_io(inode,
762 async_extent->start +
763 async_extent->ram_size - 1);
770 * here we're doing allocation and writeback of the compressed pages.
773 btrfs_drop_extent_cache(inode, async_extent->start,
774 async_extent->start +
775 async_extent->ram_size - 1, 0);
777 em = alloc_extent_map();
780 goto out_free_reserve;
782 em->start = async_extent->start;
783 em->len = async_extent->ram_size;
784 em->orig_start = em->start;
785 em->mod_start = em->start;
786 em->mod_len = em->len;
788 em->block_start = ins.objectid;
789 em->block_len = ins.offset;
790 em->orig_block_len = ins.offset;
791 em->ram_bytes = async_extent->ram_size;
792 em->bdev = root->fs_info->fs_devices->latest_bdev;
793 em->compress_type = async_extent->compress_type;
794 set_bit(EXTENT_FLAG_PINNED, &em->flags);
795 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
799 write_lock(&em_tree->lock);
800 ret = add_extent_mapping(em_tree, em, 1);
801 write_unlock(&em_tree->lock);
802 if (ret != -EEXIST) {
806 btrfs_drop_extent_cache(inode, async_extent->start,
807 async_extent->start +
808 async_extent->ram_size - 1, 0);
812 goto out_free_reserve;
814 ret = btrfs_add_ordered_extent_compress(inode,
817 async_extent->ram_size,
819 BTRFS_ORDERED_COMPRESSED,
820 async_extent->compress_type);
822 btrfs_drop_extent_cache(inode, async_extent->start,
823 async_extent->start +
824 async_extent->ram_size - 1, 0);
825 goto out_free_reserve;
827 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
830 * clear dirty, set writeback and unlock the pages.
832 extent_clear_unlock_delalloc(inode, async_extent->start,
833 async_extent->start +
834 async_extent->ram_size - 1,
835 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
836 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
838 ret = btrfs_submit_compressed_write(inode,
840 async_extent->ram_size,
842 ins.offset, async_extent->pages,
843 async_extent->nr_pages);
845 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
846 struct page *p = async_extent->pages[0];
847 const u64 start = async_extent->start;
848 const u64 end = start + async_extent->ram_size - 1;
850 p->mapping = inode->i_mapping;
851 tree->ops->writepage_end_io_hook(p, start, end,
854 extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
857 free_async_extent_pages(async_extent);
859 alloc_hint = ins.objectid + ins.offset;
865 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
866 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
868 extent_clear_unlock_delalloc(inode, async_extent->start,
869 async_extent->start +
870 async_extent->ram_size - 1,
871 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
872 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
873 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
874 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
876 free_async_extent_pages(async_extent);
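/*
 * Pick a disk byte offset to use as an allocation hint for new extents
 * by looking at the extent mapped at @start, falling back to the first
 * mapped block of the file if that one is bogus.
 */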
881 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
884 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
885 struct extent_map *em;
888 read_lock(&em_tree->lock);
889 em = search_extent_mapping(em_tree, start, num_bytes);
892 * if block start isn't an actual block number then find the
893 * first block in this inode and use that as a hint. If that
894 * block is also bogus then just don't worry about it.
896 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
898 em = search_extent_mapping(em_tree, 0, 0);
899 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
900 alloc_hint = em->block_start;
904 alloc_hint = em->block_start;
908 read_unlock(&em_tree->lock);
914 * when extent_io.c finds a delayed allocation range in the file,
915 * the call backs end up in this code. The basic idea is to
916 * allocate extents on disk for the range, and create ordered data structs
917 * in ram to track those extents.
919 * locked_page is the page that writepage had locked already. We use
920 * it to make sure we don't do extra locks or unlocks.
922 * *page_started is set to one if we unlock locked_page and do everything
923 * required to start IO on it. It may be clean and already done with writeback.
926 static noinline int cow_file_range(struct inode *inode,
927 struct page *locked_page,
928 u64 start, u64 end, int *page_started,
929 unsigned long *nr_written,
932 struct btrfs_root *root = BTRFS_I(inode)->root;
935 unsigned long ram_size;
938 u64 blocksize = root->sectorsize;
939 struct btrfs_key ins;
940 struct extent_map *em;
941 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
944 if (btrfs_is_free_space_inode(inode)) {
950 num_bytes = ALIGN(end - start + 1, blocksize);
951 num_bytes = max(blocksize, num_bytes);
952 disk_num_bytes = num_bytes;
954 /* if this is a small write inside eof, kick off defrag */
955 if (num_bytes < SZ_64K &&
956 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
957 btrfs_add_inode_defrag(NULL, inode);
960 /* let's try to make an inline extent */
961 ret = cow_file_range_inline(root, inode, start, end, 0, 0,
964 extent_clear_unlock_delalloc(inode, start, end, NULL,
965 EXTENT_LOCKED | EXTENT_DELALLOC |
966 EXTENT_DEFRAG, PAGE_UNLOCK |
967 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
970 *nr_written = *nr_written +
971 (end - start + PAGE_SIZE) / PAGE_SIZE;
974 } else if (ret < 0) {
979 BUG_ON(disk_num_bytes >
980 btrfs_super_total_bytes(root->fs_info->super_copy));
982 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
983 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
985 while (disk_num_bytes > 0) {
988 cur_alloc_size = disk_num_bytes;
989 ret = btrfs_reserve_extent(root, cur_alloc_size,
990 root->sectorsize, 0, alloc_hint,
995 em = alloc_extent_map();
1001 em->orig_start = em->start;
1002 ram_size = ins.offset;
1003 em->len = ins.offset;
1004 em->mod_start = em->start;
1005 em->mod_len = em->len;
1007 em->block_start = ins.objectid;
1008 em->block_len = ins.offset;
1009 em->orig_block_len = ins.offset;
1010 em->ram_bytes = ram_size;
1011 em->bdev = root->fs_info->fs_devices->latest_bdev;
1012 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1013 em->generation = -1;
1016 write_lock(&em_tree->lock);
1017 ret = add_extent_mapping(em_tree, em, 1);
1018 write_unlock(&em_tree->lock);
1019 if (ret != -EEXIST) {
1020 free_extent_map(em);
1023 btrfs_drop_extent_cache(inode, start,
1024 start + ram_size - 1, 0);
1029 cur_alloc_size = ins.offset;
1030 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1031 ram_size, cur_alloc_size, 0);
1033 goto out_drop_extent_cache;
1035 if (root->root_key.objectid ==
1036 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1037 ret = btrfs_reloc_clone_csums(inode, start,
1040 goto out_drop_extent_cache;
1043 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1045 if (disk_num_bytes < cur_alloc_size)
1048 /* we're not doing compressed IO, don't unlock the first
1049 * page (which the caller expects to stay locked), don't
1050 * clear any dirty bits and don't set any writeback bits.
1052 * Do set the Private2 bit so we know this page was properly
1053 * set up for writepage.
1055 op = unlock ? PAGE_UNLOCK : 0;
1056 op |= PAGE_SET_PRIVATE2;
1058 extent_clear_unlock_delalloc(inode, start,
1059 start + ram_size - 1, locked_page,
1060 EXTENT_LOCKED | EXTENT_DELALLOC,
1062 disk_num_bytes -= cur_alloc_size;
1063 num_bytes -= cur_alloc_size;
1064 alloc_hint = ins.objectid + ins.offset;
1065 start += cur_alloc_size;
1070 out_drop_extent_cache:
1071 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1073 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1074 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1076 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1077 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
1078 EXTENT_DELALLOC | EXTENT_DEFRAG,
1079 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1080 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
1085 * work queue callback to start compression on a file and its pages
1087 static noinline void async_cow_start(struct btrfs_work *work)
1089 struct async_cow *async_cow;
1091 async_cow = container_of(work, struct async_cow, work);
1093 compress_file_range(async_cow->inode, async_cow->locked_page,
1094 async_cow->start, async_cow->end, async_cow,
1096 if (num_added == 0) {
1097 btrfs_add_delayed_iput(async_cow->inode);
1098 async_cow->inode = NULL;
1103 * work queue callback to submit previously compressed pages
1105 static noinline void async_cow_submit(struct btrfs_work *work)
1107 struct async_cow *async_cow;
1108 struct btrfs_root *root;
1109 unsigned long nr_pages;
1111 async_cow = container_of(work, struct async_cow, work);
1113 root = async_cow->root;
1114 nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1118 * atomic_sub_return implies a barrier for waitqueue_active
1120 if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1122 waitqueue_active(&root->fs_info->async_submit_wait))
1123 wake_up(&root->fs_info->async_submit_wait);
1125 if (async_cow->inode)
1126 submit_compressed_extents(async_cow->inode, async_cow);
1129 static noinline void async_cow_free(struct btrfs_work *work)
1131 struct async_cow *async_cow;
1132 async_cow = container_of(work, struct async_cow, work);
1133 if (async_cow->inode)
1134 btrfs_add_delayed_iput(async_cow->inode);
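/*
 * Kick off delalloc for [start, end] in the background: split the range
 * into chunks (normally up to 512K each), queue one async_cow work item
 * per chunk on the delalloc workers and throttle the caller if too many
 * async pages are already in flight.
 */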
1138 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1139 u64 start, u64 end, int *page_started,
1140 unsigned long *nr_written)
1142 struct async_cow *async_cow;
1143 struct btrfs_root *root = BTRFS_I(inode)->root;
1144 unsigned long nr_pages;
1146 int limit = 10 * SZ_1M;
1148 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1149 1, 0, NULL, GFP_NOFS);
1150 while (start < end) {
1151 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1152 BUG_ON(!async_cow); /* -ENOMEM */
1153 async_cow->inode = igrab(inode);
1154 async_cow->root = root;
1155 async_cow->locked_page = locked_page;
1156 async_cow->start = start;
1158 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1159 !btrfs_test_opt(root, FORCE_COMPRESS))
1162 cur_end = min(end, start + SZ_512K - 1);
1164 async_cow->end = cur_end;
1165 INIT_LIST_HEAD(&async_cow->extents);
1167 btrfs_init_work(&async_cow->work,
1168 btrfs_delalloc_helper,
1169 async_cow_start, async_cow_submit,
1172 nr_pages = (cur_end - start + PAGE_SIZE) >>
1174 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1176 btrfs_queue_work(root->fs_info->delalloc_workers,
1179 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1180 wait_event(root->fs_info->async_submit_wait,
1181 (atomic_read(&root->fs_info->async_delalloc_pages) <
1185 while (atomic_read(&root->fs_info->async_submit_draining) &&
1186 atomic_read(&root->fs_info->async_delalloc_pages)) {
1187 wait_event(root->fs_info->async_submit_wait,
1188 (atomic_read(&root->fs_info->async_delalloc_pages) ==
1192 *nr_written += nr_pages;
1193 start = cur_end + 1;
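/*
 * Return whether any checksums are recorded in the csum tree for the
 * byte range [bytenr, bytenr + num_bytes); used to force COW so that
 * the csums for an extent stay either fully valid or absent.
 */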
1199 static noinline int csum_exist_in_range(struct btrfs_root *root,
1200 u64 bytenr, u64 num_bytes)
1203 struct btrfs_ordered_sum *sums;
1206 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1207 bytenr + num_bytes - 1, &list, 0);
1208 if (ret == 0 && list_empty(&list))
1211 while (!list_empty(&list)) {
1212 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1213 list_del(&sums->list);
1220 * the nocow writeback callback. This checks for snapshots or COW copies
1221 * of the extents that exist in the file, and COWs the file as required.
1223 * If no cow copies or snapshots exist, we write directly to the existing
1224 * blocks on disk.
1226 static noinline int run_delalloc_nocow(struct inode *inode,
1227 struct page *locked_page,
1228 u64 start, u64 end, int *page_started, int force,
1229 unsigned long *nr_written)
1231 struct btrfs_root *root = BTRFS_I(inode)->root;
1232 struct btrfs_trans_handle *trans;
1233 struct extent_buffer *leaf;
1234 struct btrfs_path *path;
1235 struct btrfs_file_extent_item *fi;
1236 struct btrfs_key found_key;
1251 u64 ino = btrfs_ino(inode);
1253 path = btrfs_alloc_path();
1255 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1256 EXTENT_LOCKED | EXTENT_DELALLOC |
1257 EXTENT_DO_ACCOUNTING |
1258 EXTENT_DEFRAG, PAGE_UNLOCK |
1260 PAGE_SET_WRITEBACK |
1261 PAGE_END_WRITEBACK);
1265 nolock = btrfs_is_free_space_inode(inode);
1268 trans = btrfs_join_transaction_nolock(root);
1270 trans = btrfs_join_transaction(root);
1272 if (IS_ERR(trans)) {
1273 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1274 EXTENT_LOCKED | EXTENT_DELALLOC |
1275 EXTENT_DO_ACCOUNTING |
1276 EXTENT_DEFRAG, PAGE_UNLOCK |
1278 PAGE_SET_WRITEBACK |
1279 PAGE_END_WRITEBACK);
1280 btrfs_free_path(path);
1281 return PTR_ERR(trans);
1284 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1286 cow_start = (u64)-1;
1289 ret = btrfs_lookup_file_extent(trans, root, path, ino,
1293 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1294 leaf = path->nodes[0];
1295 btrfs_item_key_to_cpu(leaf, &found_key,
1296 path->slots[0] - 1);
1297 if (found_key.objectid == ino &&
1298 found_key.type == BTRFS_EXTENT_DATA_KEY)
1303 leaf = path->nodes[0];
1304 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1305 ret = btrfs_next_leaf(root, path);
1310 leaf = path->nodes[0];
1316 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1318 if (found_key.objectid > ino)
1320 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1321 found_key.type < BTRFS_EXTENT_DATA_KEY) {
1325 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1326 found_key.offset > end)
1329 if (found_key.offset > cur_offset) {
1330 extent_end = found_key.offset;
1335 fi = btrfs_item_ptr(leaf, path->slots[0],
1336 struct btrfs_file_extent_item);
1337 extent_type = btrfs_file_extent_type(leaf, fi);
1339 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1340 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1341 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1342 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1343 extent_offset = btrfs_file_extent_offset(leaf, fi);
1344 extent_end = found_key.offset +
1345 btrfs_file_extent_num_bytes(leaf, fi);
1347 btrfs_file_extent_disk_num_bytes(leaf, fi);
1348 if (extent_end <= start) {
1352 if (disk_bytenr == 0)
1354 if (btrfs_file_extent_compression(leaf, fi) ||
1355 btrfs_file_extent_encryption(leaf, fi) ||
1356 btrfs_file_extent_other_encoding(leaf, fi))
1358 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1360 if (btrfs_extent_readonly(root, disk_bytenr))
1362 if (btrfs_cross_ref_exist(trans, root, ino,
1364 extent_offset, disk_bytenr))
1366 disk_bytenr += extent_offset;
1367 disk_bytenr += cur_offset - found_key.offset;
1368 num_bytes = min(end + 1, extent_end) - cur_offset;
1370 * if there are pending snapshots for this root,
1371 * we fall back to the common COW path.
1374 err = btrfs_start_write_no_snapshoting(root);
1379 * force cow if a csum exists in the range.
1380 * this ensures that csums for a given extent are
1381 * either valid or do not exist.
1383 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1385 if (!btrfs_inc_nocow_writers(root->fs_info,
1389 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1390 extent_end = found_key.offset +
1391 btrfs_file_extent_inline_len(leaf,
1392 path->slots[0], fi);
1393 extent_end = ALIGN(extent_end, root->sectorsize);
1398 if (extent_end <= start) {
1400 if (!nolock && nocow)
1401 btrfs_end_write_no_snapshoting(root);
1403 btrfs_dec_nocow_writers(root->fs_info,
1408 if (cow_start == (u64)-1)
1409 cow_start = cur_offset;
1410 cur_offset = extent_end;
1411 if (cur_offset > end)
1417 btrfs_release_path(path);
1418 if (cow_start != (u64)-1) {
1419 ret = cow_file_range(inode, locked_page,
1420 cow_start, found_key.offset - 1,
1421 page_started, nr_written, 1);
1423 if (!nolock && nocow)
1424 btrfs_end_write_no_snapshoting(root);
1426 btrfs_dec_nocow_writers(root->fs_info,
1430 cow_start = (u64)-1;
1433 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1434 struct extent_map *em;
1435 struct extent_map_tree *em_tree;
1436 em_tree = &BTRFS_I(inode)->extent_tree;
1437 em = alloc_extent_map();
1438 BUG_ON(!em); /* -ENOMEM */
1439 em->start = cur_offset;
1440 em->orig_start = found_key.offset - extent_offset;
1441 em->len = num_bytes;
1442 em->block_len = num_bytes;
1443 em->block_start = disk_bytenr;
1444 em->orig_block_len = disk_num_bytes;
1445 em->ram_bytes = ram_bytes;
1446 em->bdev = root->fs_info->fs_devices->latest_bdev;
1447 em->mod_start = em->start;
1448 em->mod_len = em->len;
1449 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1450 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1451 em->generation = -1;
1453 write_lock(&em_tree->lock);
1454 ret = add_extent_mapping(em_tree, em, 1);
1455 write_unlock(&em_tree->lock);
1456 if (ret != -EEXIST) {
1457 free_extent_map(em);
1460 btrfs_drop_extent_cache(inode, em->start,
1461 em->start + em->len - 1, 0);
1463 type = BTRFS_ORDERED_PREALLOC;
1465 type = BTRFS_ORDERED_NOCOW;
1468 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1469 num_bytes, num_bytes, type);
1471 btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
1472 BUG_ON(ret); /* -ENOMEM */
1474 if (root->root_key.objectid ==
1475 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1476 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1479 if (!nolock && nocow)
1480 btrfs_end_write_no_snapshoting(root);
1485 extent_clear_unlock_delalloc(inode, cur_offset,
1486 cur_offset + num_bytes - 1,
1487 locked_page, EXTENT_LOCKED |
1488 EXTENT_DELALLOC, PAGE_UNLOCK |
1490 if (!nolock && nocow)
1491 btrfs_end_write_no_snapshoting(root);
1492 cur_offset = extent_end;
1493 if (cur_offset > end)
1496 btrfs_release_path(path);
1498 if (cur_offset <= end && cow_start == (u64)-1) {
1499 cow_start = cur_offset;
1503 if (cow_start != (u64)-1) {
1504 ret = cow_file_range(inode, locked_page, cow_start, end,
1505 page_started, nr_written, 1);
1511 err = btrfs_end_transaction(trans, root);
1515 if (ret && cur_offset < end)
1516 extent_clear_unlock_delalloc(inode, cur_offset, end,
1517 locked_page, EXTENT_LOCKED |
1518 EXTENT_DELALLOC | EXTENT_DEFRAG |
1519 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1521 PAGE_SET_WRITEBACK |
1522 PAGE_END_WRITEBACK);
1523 btrfs_free_path(path);
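/*
 * Even NODATACOW/PREALLOC inodes must COW ranges that have been flagged
 * EXTENT_DEFRAG; return 1 when the given range needs that treatment.
 */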
1527 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1530 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1531 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1535 * @defrag_bytes is a hint value with no spinlock held here;
1536 * if it is not zero, it means the file is being defragged.
1537 * Force cow if the given extent needs to be defragged.
1539 if (BTRFS_I(inode)->defrag_bytes &&
1540 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1541 EXTENT_DEFRAG, 0, NULL))
1548 * extent_io.c callback to do delayed allocation processing
1550 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1551 u64 start, u64 end, int *page_started,
1552 unsigned long *nr_written)
1555 int force_cow = need_force_cow(inode, start, end);
1557 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1558 ret = run_delalloc_nocow(inode, locked_page, start, end,
1559 page_started, 1, nr_written);
1560 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1561 ret = run_delalloc_nocow(inode, locked_page, start, end,
1562 page_started, 0, nr_written);
1563 } else if (!inode_need_compress(inode)) {
1564 ret = cow_file_range(inode, locked_page, start, end,
1565 page_started, nr_written, 1);
1567 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1568 &BTRFS_I(inode)->runtime_flags);
1569 ret = cow_file_range_async(inode, locked_page, start, end,
1570 page_started, nr_written);
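/*
 * extent_io.c split_extent_hook: when a delalloc extent state is split,
 * bump the outstanding extent count if the two halves need more
 * reservations than the original range did (the reverse of the merge
 * hook below).
 */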
1575 static void btrfs_split_extent_hook(struct inode *inode,
1576 struct extent_state *orig, u64 split)
1580 /* not delalloc, ignore it */
1581 if (!(orig->state & EXTENT_DELALLOC))
1584 size = orig->end - orig->start + 1;
1585 if (size > BTRFS_MAX_EXTENT_SIZE) {
1590 * See the explanation in btrfs_merge_extent_hook, the same
1591 * applies here, just in reverse.
1593 new_size = orig->end - split + 1;
1594 num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1595 BTRFS_MAX_EXTENT_SIZE);
1596 new_size = split - orig->start;
1597 num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1598 BTRFS_MAX_EXTENT_SIZE);
1599 if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
1600 BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1604 spin_lock(&BTRFS_I(inode)->lock);
1605 BTRFS_I(inode)->outstanding_extents++;
1606 spin_unlock(&BTRFS_I(inode)->lock);
1610 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1611 * extents so we can keep track of new extents that are just merged onto old
1612 * extents, such as when we are doing sequential writes, so we can properly
1613 * account for the metadata space we'll need.
1615 static void btrfs_merge_extent_hook(struct inode *inode,
1616 struct extent_state *new,
1617 struct extent_state *other)
1619 u64 new_size, old_size;
1622 /* not delalloc, ignore it */
1623 if (!(other->state & EXTENT_DELALLOC))
1626 if (new->start > other->start)
1627 new_size = new->end - other->start + 1;
1629 new_size = other->end - new->start + 1;
1631 /* we're not bigger than the max, unreserve the space and go */
1632 if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1633 spin_lock(&BTRFS_I(inode)->lock);
1634 BTRFS_I(inode)->outstanding_extents--;
1635 spin_unlock(&BTRFS_I(inode)->lock);
1640 * We have to add up either side to figure out how many extents were
1641 * accounted for before we merged into one big extent. If the number of
1642 * extents we accounted for is <= the amount we need for the new range
1643 * then we can return, otherwise drop. Think of it like this:
1645 * [4k][MAX_SIZE]
1647 * So we've grown the extent by a MAX_SIZE extent, this would mean we
1648 * need 2 outstanding extents, on one side we have 1 and the other side
1649 * we have 1 so they are == and we can return. But in this case
1651 * [MAX_SIZE+4k][MAX_SIZE+4k]
1653 * Each range on their own accounts for 2 extents, but merged together
1654 * they are only 3 extents worth of accounting, so we need to drop in
1657 old_size = other->end - other->start + 1;
1658 num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1659 BTRFS_MAX_EXTENT_SIZE);
1660 old_size = new->end - new->start + 1;
1661 num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1662 BTRFS_MAX_EXTENT_SIZE);
1664 if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1665 BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1668 spin_lock(&BTRFS_I(inode)->lock);
1669 BTRFS_I(inode)->outstanding_extents--;
1670 spin_unlock(&BTRFS_I(inode)->lock);
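/*
 * Put the inode on its root's list of inodes with pending delalloc and,
 * if it is the first one, put the root itself on the fs-wide delalloc
 * root list.
 */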
1673 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1674 struct inode *inode)
1676 spin_lock(&root->delalloc_lock);
1677 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1678 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1679 &root->delalloc_inodes);
1680 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1681 &BTRFS_I(inode)->runtime_flags);
1682 root->nr_delalloc_inodes++;
1683 if (root->nr_delalloc_inodes == 1) {
1684 spin_lock(&root->fs_info->delalloc_root_lock);
1685 BUG_ON(!list_empty(&root->delalloc_root));
1686 list_add_tail(&root->delalloc_root,
1687 &root->fs_info->delalloc_roots);
1688 spin_unlock(&root->fs_info->delalloc_root_lock);
1691 spin_unlock(&root->delalloc_lock);
1694 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1695 struct inode *inode)
1697 spin_lock(&root->delalloc_lock);
1698 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1699 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1700 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1701 &BTRFS_I(inode)->runtime_flags);
1702 root->nr_delalloc_inodes--;
1703 if (!root->nr_delalloc_inodes) {
1704 spin_lock(&root->fs_info->delalloc_root_lock);
1705 BUG_ON(list_empty(&root->delalloc_root));
1706 list_del_init(&root->delalloc_root);
1707 spin_unlock(&root->fs_info->delalloc_root_lock);
1710 spin_unlock(&root->delalloc_lock);
1714 * extent_io.c set_bit_hook, used to track delayed allocation
1715 * bytes in this file, and to maintain the list of inodes that
1716 * have pending delalloc work to be done.
1718 static void btrfs_set_bit_hook(struct inode *inode,
1719 struct extent_state *state, unsigned *bits)
1722 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1725 * set_bit and clear bit hooks normally require _irqsave/restore
1726 * but in this case, we are only testing for the DELALLOC
1727 * bit, which is only set or cleared with irqs on
1729 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1730 struct btrfs_root *root = BTRFS_I(inode)->root;
1731 u64 len = state->end + 1 - state->start;
1732 bool do_list = !btrfs_is_free_space_inode(inode);
1734 if (*bits & EXTENT_FIRST_DELALLOC) {
1735 *bits &= ~EXTENT_FIRST_DELALLOC;
1737 spin_lock(&BTRFS_I(inode)->lock);
1738 BTRFS_I(inode)->outstanding_extents++;
1739 spin_unlock(&BTRFS_I(inode)->lock);
1742 /* For sanity tests */
1743 if (btrfs_test_is_dummy_root(root))
1746 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1747 root->fs_info->delalloc_batch);
1748 spin_lock(&BTRFS_I(inode)->lock);
1749 BTRFS_I(inode)->delalloc_bytes += len;
1750 if (*bits & EXTENT_DEFRAG)
1751 BTRFS_I(inode)->defrag_bytes += len;
1752 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1753 &BTRFS_I(inode)->runtime_flags))
1754 btrfs_add_delalloc_inodes(root, inode);
1755 spin_unlock(&BTRFS_I(inode)->lock);
1760 * extent_io.c clear_bit_hook, see set_bit_hook for why
1762 static void btrfs_clear_bit_hook(struct inode *inode,
1763 struct extent_state *state,
1766 u64 len = state->end + 1 - state->start;
1767 u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
1768 BTRFS_MAX_EXTENT_SIZE);
1770 spin_lock(&BTRFS_I(inode)->lock);
1771 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1772 BTRFS_I(inode)->defrag_bytes -= len;
1773 spin_unlock(&BTRFS_I(inode)->lock);
1776 * set_bit and clear bit hooks normally require _irqsave/restore
1777 * but in this case, we are only testing for the DELALLOC
1778 * bit, which is only set or cleared with irqs on
1780 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1781 struct btrfs_root *root = BTRFS_I(inode)->root;
1782 bool do_list = !btrfs_is_free_space_inode(inode);
1784 if (*bits & EXTENT_FIRST_DELALLOC) {
1785 *bits &= ~EXTENT_FIRST_DELALLOC;
1786 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1787 spin_lock(&BTRFS_I(inode)->lock);
1788 BTRFS_I(inode)->outstanding_extents -= num_extents;
1789 spin_unlock(&BTRFS_I(inode)->lock);
1793 * We don't reserve metadata space for space cache inodes so we
1794 * don't need to call delalloc_release_metadata if there is an error.
1797 if (*bits & EXTENT_DO_ACCOUNTING &&
1798 root != root->fs_info->tree_root)
1799 btrfs_delalloc_release_metadata(inode, len);
1801 /* For sanity tests. */
1802 if (btrfs_test_is_dummy_root(root))
1805 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1806 && do_list && !(state->state & EXTENT_NORESERVE))
1807 btrfs_free_reserved_data_space_noquota(inode,
1810 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1811 root->fs_info->delalloc_batch);
1812 spin_lock(&BTRFS_I(inode)->lock);
1813 BTRFS_I(inode)->delalloc_bytes -= len;
1814 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1815 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1816 &BTRFS_I(inode)->runtime_flags))
1817 btrfs_del_delalloc_inode(root, inode);
1818 spin_unlock(&BTRFS_I(inode)->lock);
1823 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1824 * we don't create bios that span stripes or chunks
1826 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1827 size_t size, struct bio *bio,
1828 unsigned long bio_flags)
1830 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1831 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1836 if (bio_flags & EXTENT_BIO_COMPRESSED)
1839 length = bio->bi_iter.bi_size;
1840 map_length = length;
1841 ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
1842 &map_length, NULL, 0);
1843 /* Will always return 0 with map_multi == NULL */
1845 if (map_length < length + size)
1851 * in order to insert checksums into the metadata in large chunks,
1852 * we wait until bio submission time. All the pages in the bio are
1853 * checksummed and sums are attached onto the ordered extent record.
1855 * At IO completion time the csums attached to the ordered extent record
1856 * are inserted into the btree
1858 static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
1859 int mirror_num, unsigned long bio_flags,
1862 struct btrfs_root *root = BTRFS_I(inode)->root;
1865 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1866 BUG_ON(ret); /* -ENOMEM */
1871 * in order to insert checksums into the metadata in large chunks,
1872 * we wait until bio submission time. All the pages in the bio are
1873 * checksummed and sums are attached onto the ordered extent record.
1875 * At IO completion time the csums attached to the ordered extent record
1876 * are inserted into the btree
1878 static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
1879 int mirror_num, unsigned long bio_flags,
1882 struct btrfs_root *root = BTRFS_I(inode)->root;
1885 ret = btrfs_map_bio(root, bio, mirror_num, 1);
1887 bio->bi_error = ret;
1894 * extent_io.c submission hook. This does the right thing for csum calculation
1895 * on write, or reading the csums from the tree before a read
1897 static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
1898 int mirror_num, unsigned long bio_flags,
1901 struct btrfs_root *root = BTRFS_I(inode)->root;
1902 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1905 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1907 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1909 if (btrfs_is_free_space_inode(inode))
1910 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1912 if (bio_op(bio) != REQ_OP_WRITE) {
1913 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1917 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1918 ret = btrfs_submit_compressed_read(inode, bio,
1922 } else if (!skip_sum) {
1923 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1928 } else if (async && !skip_sum) {
1929 /* csum items have already been cloned */
1930 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1932 /* we're doing a write, do the async checksumming */
1933 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1934 inode, bio, mirror_num,
1935 bio_flags, bio_offset,
1936 __btrfs_submit_bio_start,
1937 __btrfs_submit_bio_done);
1939 } else if (!skip_sum) {
1940 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1946 ret = btrfs_map_bio(root, bio, mirror_num, 0);
1950 bio->bi_error = ret;
1957 * given a list of ordered sums, record them in the inode. This happens
1958 * at IO completion time based on sums calculated at bio submission time.
1960 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1961 struct inode *inode, u64 file_offset,
1962 struct list_head *list)
1964 struct btrfs_ordered_sum *sum;
1966 list_for_each_entry(sum, list, list) {
1967 trans->adding_csums = 1;
1968 btrfs_csum_file_blocks(trans,
1969 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1970 trans->adding_csums = 0;
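/*
 * Mark the byte range as delalloc in the inode's io tree so that
 * writeback will later allocate space for it.
 */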
1975 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1976 struct extent_state **cached_state)
1978 WARN_ON((end & (PAGE_SIZE - 1)) == 0);
1979 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1983 /* see btrfs_writepage_start_hook for details on why this is required */
1984 struct btrfs_writepage_fixup {
1986 struct btrfs_work work;
1989 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1991 struct btrfs_writepage_fixup *fixup;
1992 struct btrfs_ordered_extent *ordered;
1993 struct extent_state *cached_state = NULL;
1995 struct inode *inode;
2000 fixup = container_of(work, struct btrfs_writepage_fixup, work);
2004 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2005 ClearPageChecked(page);
2009 inode = page->mapping->host;
2010 page_start = page_offset(page);
2011 page_end = page_offset(page) + PAGE_SIZE - 1;
2013 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2016 /* already ordered? We're done */
2017 if (PagePrivate2(page))
2020 ordered = btrfs_lookup_ordered_range(inode, page_start,
2023 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2024 page_end, &cached_state, GFP_NOFS);
2026 btrfs_start_ordered_extent(inode, ordered, 1);
2027 btrfs_put_ordered_extent(ordered);
2031 ret = btrfs_delalloc_reserve_space(inode, page_start,
2034 mapping_set_error(page->mapping, ret);
2035 end_extent_writepage(page, ret, page_start, page_end);
2036 ClearPageChecked(page);
2040 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
2041 ClearPageChecked(page);
2042 set_page_dirty(page);
2044 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2045 &cached_state, GFP_NOFS);
2053 * There are a few paths in the higher layers of the kernel that directly
2054 * set the page dirty bit without asking the filesystem if it is a
2055 * good idea. This causes problems because we want to make sure COW
2056 * properly happens and the data=ordered rules are followed.
2058 * In our case any range that doesn't have the ORDERED bit set
2059 * hasn't been properly setup for IO. We kick off an async process
2060 * to fix it up. The async helper will wait for ordered extents, set
2061 * the delalloc bit and make it safe to write the page.
2063 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2065 struct inode *inode = page->mapping->host;
2066 struct btrfs_writepage_fixup *fixup;
2067 struct btrfs_root *root = BTRFS_I(inode)->root;
2069 /* this page is properly in the ordered list */
2070 if (TestClearPagePrivate2(page))
2073 if (PageChecked(page))
2076 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2080 SetPageChecked(page);
2082 btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2083 btrfs_writepage_fixup_worker, NULL, NULL);
2085 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
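/*
 * Insert the file extent item for an already allocated (reserved)
 * extent at @file_pos, record the new extent reference and release the
 * qgroup reservation for the range.
 */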
2089 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2090 struct inode *inode, u64 file_pos,
2091 u64 disk_bytenr, u64 disk_num_bytes,
2092 u64 num_bytes, u64 ram_bytes,
2093 u8 compression, u8 encryption,
2094 u16 other_encoding, int extent_type)
2096 struct btrfs_root *root = BTRFS_I(inode)->root;
2097 struct btrfs_file_extent_item *fi;
2098 struct btrfs_path *path;
2099 struct extent_buffer *leaf;
2100 struct btrfs_key ins;
2101 int extent_inserted = 0;
2104 path = btrfs_alloc_path();
2109 * we may be replacing one extent in the tree with another.
2110 * The new extent is pinned in the extent map, and we don't want
2111 * to drop it from the cache until it is completely in the btree.
2113 * So, tell btrfs_drop_extents to leave this extent in the cache.
2114 * The caller is expected to unpin it and allow it to be merged with the others.
2117 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2118 file_pos + num_bytes, NULL, 0,
2119 1, sizeof(*fi), &extent_inserted);
2123 if (!extent_inserted) {
2124 ins.objectid = btrfs_ino(inode);
2125 ins.offset = file_pos;
2126 ins.type = BTRFS_EXTENT_DATA_KEY;
2128 path->leave_spinning = 1;
2129 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2134 leaf = path->nodes[0];
2135 fi = btrfs_item_ptr(leaf, path->slots[0],
2136 struct btrfs_file_extent_item);
2137 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2138 btrfs_set_file_extent_type(leaf, fi, extent_type);
2139 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2140 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2141 btrfs_set_file_extent_offset(leaf, fi, 0);
2142 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2143 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2144 btrfs_set_file_extent_compression(leaf, fi, compression);
2145 btrfs_set_file_extent_encryption(leaf, fi, encryption);
2146 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2148 btrfs_mark_buffer_dirty(leaf);
2149 btrfs_release_path(path);
2151 inode_add_bytes(inode, num_bytes);
2153 ins.objectid = disk_bytenr;
2154 ins.offset = disk_num_bytes;
2155 ins.type = BTRFS_EXTENT_ITEM_KEY;
2156 ret = btrfs_alloc_reserved_file_extent(trans, root,
2157 root->root_key.objectid,
2158 btrfs_ino(inode), file_pos,
2161 * Release the reserved range from the inode dirty range map, as it
2162 * has already been moved into the delayed_ref_head.
2164 btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2166 btrfs_free_path(path);
2171 /* snapshot-aware defrag */
2172 struct sa_defrag_extent_backref {
2173 struct rb_node node;
2174 struct old_sa_defrag_extent *old;
2183 struct old_sa_defrag_extent {
2184 struct list_head list;
2185 struct new_sa_defrag_extent *new;
2194 struct new_sa_defrag_extent {
2195 struct rb_root root;
2196 struct list_head head;
2197 struct btrfs_path *path;
2198 struct inode *inode;
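/*
 * Order two defrag backrefs by root id, then inode number, then file
 * position, so the rb-tree of backrefs stays sorted.
 */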
static int backref_comp(struct sa_defrag_extent_backref *b1,
			struct sa_defrag_extent_backref *b2)
{
	if (b1->root_id < b2->root_id)
		return -1;
	else if (b1->root_id > b2->root_id)
		return 1;

	if (b1->inum < b2->inum)
		return -1;
	else if (b1->inum > b2->inum)
		return 1;

	if (b1->file_pos < b2->file_pos)
		return -1;
	else if (b1->file_pos > b2->file_pos)
		return 1;

	/*
	 * [------------------------------] ===> (a range of space)
	 *    |<--->|   |<---->|	    ===> (fs/file tree A)
	 * |<---------------------------->| ===> (fs/file tree B)
	 *
	 * A range of space can refer to two file extents in one tree while
	 * referring to only one file extent in another tree.
	 *
	 * So we may process a disk offset more than one time (two extents in
	 * A) that land in the same extent (one extent in B), and then insert
	 * two identical backrefs (both referring to the extent in B).
	 */
	return 0;
}
static void backref_insert(struct rb_root *root,
			   struct sa_defrag_extent_backref *backref)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct sa_defrag_extent_backref *entry;
	int ret;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
		ret = backref_comp(backref, entry);
		if (ret < 0)
			p = &(*p)->rb_left;
		else
			p = &(*p)->rb_right;
	}

	rb_link_node(&backref->node, parent, p);
	rb_insert_color(&backref->node, root);
}
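/*
 * Sketch of how this tree is consumed: relink_file_extents() below walks it
 * with rb_first()/rb_erase(), so backrefs come out sorted by
 * (root_id, inum, file_pos).  relink_extent_backref() then compares each
 * entry with the previously processed one ("prev") to spot contiguous
 * backrefs of the same file and merge them into a single file extent item.
 */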
/*
 * Note the backref might have changed, and in this case we just return 0.
 */
static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
				       void *ctx)
{
	struct btrfs_file_extent_item *extent;
	struct btrfs_fs_info *fs_info;
	struct old_sa_defrag_extent *old = ctx;
	struct new_sa_defrag_extent *new = old->new;
	struct btrfs_path *path = new->path;
	struct btrfs_key key;
	struct btrfs_root *root;
	struct sa_defrag_extent_backref *backref;
	struct extent_buffer *leaf;
	struct inode *inode = new->inode;
	int ret;
	int slot;
	u64 extent_offset;
	u64 num_bytes;

	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
	    inum == btrfs_ino(inode))
		return 0;

	key.objectid = root_id;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = BTRFS_I(inode)->root->fs_info;
	root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(root)) {
		if (PTR_ERR(root) == -ENOENT)
			return 0;
		WARN_ON(1);
		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
			 inum, offset, root_id);
		return PTR_ERR(root);
	}

	key.objectid = inum;
	key.type = BTRFS_EXTENT_DATA_KEY;
	if (offset > (u64)-1 << 32)
		key.offset = 0;
	else
		key.offset = offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (WARN_ON(ret < 0))
		return ret;
	ret = 0;

	while (1) {
		cond_resched();

		leaf = path->nodes[0];
		slot = path->slots[0];

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				goto out;
			} else if (ret > 0) {
				ret = 0;
				goto out;
			}
			continue;
		}

		path->slots[0]++;

		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.objectid > inum)
			goto out;

		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);

		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
			continue;

		/*
		 * 'offset' refers to the exact key.offset,
		 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
		 * (key.offset - extent_offset).
		 */
		if (key.offset != offset)
			continue;

		extent_offset = btrfs_file_extent_offset(leaf, extent);
		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);

		if (extent_offset >= old->extent_offset + old->offset +
		    old->len || extent_offset + num_bytes <=
		    old->extent_offset + old->offset)
			continue;
		break;
	}

	backref = kmalloc(sizeof(*backref), GFP_NOFS);
	if (!backref) {
		ret = -ENOENT;
		goto out;
	}

	backref->root_id = root_id;
	backref->inum = inum;
	backref->file_pos = offset;
	backref->num_bytes = num_bytes;
	backref->extent_offset = extent_offset;
	backref->generation = btrfs_file_extent_generation(leaf, extent);
	backref->old = old;
	backref_insert(&new->root, backref);
	old->count++;
out:
	btrfs_release_path(path);
	return ret;
}
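/*
 * record_one_backref() is used as the callback of
 * iterate_inodes_from_logical() (see record_extent_backrefs() below): it is
 * invoked once per (root, inode, offset) reference found for the defragged
 * extent, re-checks that the reference still points at old->bytenr and still
 * overlaps the defragged range, and queues a sa_defrag_extent_backref in
 * new->root for the relink pass.
 */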
static noinline bool record_extent_backrefs(struct btrfs_path *path,
					    struct new_sa_defrag_extent *new)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
	struct old_sa_defrag_extent *old, *tmp;
	int ret;

	new->path = path;

	list_for_each_entry_safe(old, tmp, &new->head, list) {
		ret = iterate_inodes_from_logical(old->bytenr +
						  old->extent_offset, fs_info,
						  path, record_one_backref,
						  old);
		if (ret < 0 && ret != -ENOENT)
			return false;

		/* no backref to be processed for this extent */
		if (!old->count) {
			list_del(&old->list);
			kfree(old);
		}
	}

	if (list_empty(&new->head))
		return false;

	return true;
}
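/*
 * Snapshot-aware defrag relink pipeline (all in this file):
 *   1. record_old_file_extents() - when a defragged ordered extent
 *      finishes, remember the old file extents the range used to point to.
 *   2. record_extent_backrefs() - find every (root, inode, offset) that
 *      still references those old extents.
 *   3. relink_extent_backref() / relink_file_extents() - point those
 *      references at the new extent written by the defrag.
 */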
static int relink_is_mergable(struct extent_buffer *leaf,
			      struct btrfs_file_extent_item *fi,
			      struct new_sa_defrag_extent *new)
{
	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
		return 0;

	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
		return 0;

	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
		return 0;

	if (btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	return 1;
}
/*
 * Note the backref might have changed, and in this case we just return 0.
 */
static noinline int relink_extent_backref(struct btrfs_path *path,
				 struct sa_defrag_extent_backref *prev,
				 struct sa_defrag_extent_backref *backref)
{
	struct btrfs_file_extent_item *extent;
	struct btrfs_file_extent_item *item;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_trans_handle *trans;
	struct btrfs_fs_info *fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct old_sa_defrag_extent *old = backref->old;
	struct new_sa_defrag_extent *new = old->new;
	struct inode *src_inode = new->inode;
	struct inode *inode;
	struct extent_state *cached = NULL;
	int ret = 0;
	u64 start;
	u64 len;
	u64 lock_start;
	u64 lock_end;
	bool merged = false;
	int index;

	if (prev && prev->root_id == backref->root_id &&
	    prev->inum == backref->inum &&
	    prev->file_pos + prev->num_bytes == backref->file_pos)
		merged = true;

	/* step 1: get root */
	key.objectid = backref->root_id;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = BTRFS_I(src_inode)->root->fs_info;
	index = srcu_read_lock(&fs_info->subvol_srcu);

	root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, index);
		if (PTR_ERR(root) == -ENOENT)
			return 0;
		return PTR_ERR(root);
	}

	if (btrfs_root_readonly(root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, index);
		return 0;
	}

	/* step 2: get inode */
	key.objectid = backref->inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		srcu_read_unlock(&fs_info->subvol_srcu, index);
		return 0;
	}

	srcu_read_unlock(&fs_info->subvol_srcu, index);

	/* step 3: relink backref */
	lock_start = backref->file_pos;
	lock_end = backref->file_pos + backref->num_bytes - 1;
	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
			 &cached);

	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
	if (ordered) {
		btrfs_put_ordered_extent(ordered);
		goto out_unlock;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unlock;
	}

	key.objectid = backref->inum;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = backref->file_pos;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out_free_path;
	} else if (ret > 0) {
		ret = 0;
		goto out_free_path;
	}

	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
				struct btrfs_file_extent_item);

	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
	    backref->generation)
		goto out_free_path;

	btrfs_release_path(path);

	start = backref->file_pos;
	if (backref->extent_offset < old->extent_offset + old->offset)
		start += old->extent_offset + old->offset -
			 backref->extent_offset;

	len = min(backref->extent_offset + backref->num_bytes,
		  old->extent_offset + old->offset + old->len);
	len -= max(backref->extent_offset, old->extent_offset + old->offset);

	ret = btrfs_drop_extents(trans, root, inode, start,
				 start + len, 1);
	if (ret)
		goto out_free_path;
again:
	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	path->leave_spinning = 1;
	if (merged) {
		struct btrfs_file_extent_item *fi;
		u64 extent_len;
		struct btrfs_key found_key;

		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto out_free_path;

		path->slots[0]--;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);

		if (extent_len + found_key.offset == start &&
		    relink_is_mergable(leaf, fi, new)) {
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_len + len);
			btrfs_mark_buffer_dirty(leaf);
			inode_add_bytes(inode, len);

			ret = 1;
			goto out_free_path;
		} else {
			merged = false;
			btrfs_release_path(path);
			goto again;
		}
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_free_path;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
	btrfs_set_file_extent_num_bytes(leaf, item, len);
	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
	btrfs_set_file_extent_generation(leaf, item, trans->transid);
	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
	btrfs_set_file_extent_encryption(leaf, item, 0);
	btrfs_set_file_extent_other_encoding(leaf, item, 0);

	btrfs_mark_buffer_dirty(leaf);
	inode_add_bytes(inode, len);
	btrfs_release_path(path);

	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
				   new->disk_len, 0,
				   backref->root_id, backref->inum,
				   new->file_pos);	/* start - extent_offset */
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_free_path;
	}

	ret = 1;
out_free_path:
	btrfs_release_path(path);
	path->leave_spinning = 0;
	btrfs_end_transaction(trans, root);
out_unlock:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
			     &cached, GFP_NOFS);
	iput(inode);
	return ret;
}
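/*
 * Summary of the function above: step 1 looks up the subvolume root for the
 * backref, step 2 igets the referencing inode, and step 3 locks the file
 * range, drops the old extent items and either extends the file extent item
 * created for the previous contiguous backref (relink_is_mergable()) or
 * inserts a fresh one pointing at the defragged extent, then bumps the
 * extent's reference count with btrfs_inc_extent_ref().
 */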
static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
{
	struct old_sa_defrag_extent *old, *tmp;

	if (!new)
		return;

	list_for_each_entry_safe(old, tmp, &new->head, list) {
		kfree(old);
	}
	kfree(new);
}
static void relink_file_extents(struct new_sa_defrag_extent *new)
{
	struct btrfs_path *path;
	struct sa_defrag_extent_backref *backref;
	struct sa_defrag_extent_backref *prev = NULL;
	struct inode *inode;
	struct btrfs_root *root;
	struct rb_node *node;
	int ret;

	inode = new->inode;
	root = BTRFS_I(inode)->root;

	path = btrfs_alloc_path();
	if (!path)
		return;

	if (!record_extent_backrefs(path, new)) {
		btrfs_free_path(path);
		goto out;
	}
	btrfs_release_path(path);

	while (1) {
		node = rb_first(&new->root);
		if (!node)
			break;
		rb_erase(node, &new->root);

		backref = rb_entry(node, struct sa_defrag_extent_backref, node);

		ret = relink_extent_backref(path, prev, backref);
		WARN_ON(ret < 0);

		kfree(prev);

		if (ret == 1)
			prev = backref;
		else
			prev = NULL;
		cond_resched();
	}
	kfree(prev);

	btrfs_free_path(path);
out:
	free_sa_defrag_extent(new);

	atomic_dec(&root->fs_info->defrag_running);
	wake_up(&root->fs_info->transaction_wait);
}
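/*
 * The defrag_running count taken in record_old_file_extents() is dropped
 * here, once all recorded backrefs have been relinked (or the work was
 * abandoned), and anything sleeping on transaction_wait is nudged.
 */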
static struct new_sa_defrag_extent *
record_old_file_extents(struct inode *inode,
			struct btrfs_ordered_extent *ordered)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct old_sa_defrag_extent *old;
	struct new_sa_defrag_extent *new;
	int ret;

	new = kmalloc(sizeof(*new), GFP_NOFS);
	if (!new)
		return NULL;

	new->inode = inode;
	new->file_pos = ordered->file_offset;
	new->len = ordered->len;
	new->bytenr = ordered->start;
	new->disk_len = ordered->disk_len;
	new->compress_type = ordered->compress_type;
	new->root = RB_ROOT;
	INIT_LIST_HEAD(&new->head);

	path = btrfs_alloc_path();
	if (!path)
		goto out_kfree;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = new->file_pos;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free_path;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	/* find out all the old extents for the file range */
	while (1) {
		struct btrfs_file_extent_item *extent;
		struct extent_buffer *l;
		int slot;
		u64 num_bytes;
		u64 offset;
		u64 end;
		u64 disk_bytenr;
		u64 extent_offset;

		l = path->nodes[0];
		slot = path->slots[0];

		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out_free_path;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid != btrfs_ino(inode))
			break;
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			break;
		if (key.offset >= new->file_pos + new->len)
			break;

		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);

		num_bytes = btrfs_file_extent_num_bytes(l, extent);
		if (key.offset + num_bytes < new->file_pos)
			goto next;

		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
		if (!disk_bytenr)
			goto next;

		extent_offset = btrfs_file_extent_offset(l, extent);

		old = kmalloc(sizeof(*old), GFP_NOFS);
		if (!old)
			goto out_free_path;

		offset = max(new->file_pos, key.offset);
		end = min(new->file_pos + new->len, key.offset + num_bytes);

		old->bytenr = disk_bytenr;
		old->extent_offset = extent_offset;
		old->offset = offset - key.offset;
		old->len = end - offset;
		old->new = new;
		old->count = 0;
		list_add_tail(&old->list, &new->head);
next:
		path->slots[0]++;
		cond_resched();
	}

	btrfs_free_path(path);
	atomic_inc(&root->fs_info->defrag_running);

	return new;

out_free_path:
	btrfs_free_path(path);
out_kfree:
	free_sa_defrag_extent(new);
	return NULL;
}
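/*
 * record_old_file_extents() is called from btrfs_finish_ordered_io() when a
 * range marked EXTENT_DEFRAG completes; note that the call site below is
 * gated by an "if (0 && ...)" check, so as written the snapshot-aware defrag
 * path never actually runs.
 */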
static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
					 u64 start, u64 len)
{
	struct btrfs_block_group_cache *cache;

	cache = btrfs_lookup_block_group(root->fs_info, start);
	ASSERT(cache);

	spin_lock(&cache->lock);
	cache->delalloc_bytes -= len;
	spin_unlock(&cache->lock);

	btrfs_put_block_group(cache);
}
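/*
 * Called from btrfs_finish_ordered_io() below once
 * insert_reserved_file_extent() succeeds: the bytes are no longer delalloc
 * targeted at this block group, so its delalloc_bytes counter can be dropped.
 */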
/* as ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
 */
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
	struct inode *inode = ordered_extent->inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	struct new_sa_defrag_extent *new = NULL;
	int compress_type = 0;
	int ret = 0;
	u64 logical_len = ordered_extent->len;
	bool nolock;
	bool truncated = false;

	nolock = btrfs_is_free_space_inode(inode);

	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
		ret = -EIO;
		goto out;
	}

	btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
				     ordered_extent->file_offset +
				     ordered_extent->len - 1);

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
		truncated = true;
		logical_len = ordered_extent->truncated_len;
		/* Truncated the entire extent, don't bother adding */
		if (!logical_len)
			goto out;
	}

	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */

		/*
		 * For the mwrite (mmap + memset to write) case, we still
		 * reserve space for the NOCOW range.
		 * As NOCOW won't cause a new delayed ref, just free the space
		 */
		btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
				       ordered_extent->len);
		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
		if (nolock)
			trans = btrfs_join_transaction_nolock(root);
		else
			trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}
		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
		ret = btrfs_update_inode_fallback(trans, root, inode);
		if (ret) /* -ENOMEM or corruption */
			btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

	lock_extent_bits(io_tree, ordered_extent->file_offset,
			 ordered_extent->file_offset + ordered_extent->len - 1,
			 &cached_state);

	ret = test_range_bit(io_tree, ordered_extent->file_offset,
			     ordered_extent->file_offset + ordered_extent->len - 1,
			     EXTENT_DEFRAG, 1, cached_state);
	if (ret) {
		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
			/* the inode is shared */
			new = record_old_file_extents(inode, ordered_extent);

		clear_extent_bit(io_tree, ordered_extent->file_offset,
			ordered_extent->file_offset + ordered_extent->len - 1,
			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
	}

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out_unlock;
	}

	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compress_type = ordered_extent->compress_type;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compress_type);
		ret = btrfs_mark_extent_written(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						logical_len);
	} else {
		BUG_ON(root == root->fs_info->tree_root);
		ret = insert_reserved_file_extent(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->start,
						ordered_extent->disk_len,
						logical_len, logical_len,
						compress_type, 0, 0,
						BTRFS_FILE_EXTENT_REG);
		if (!ret)
			btrfs_release_delalloc_bytes(root,
						     ordered_extent->start,
						     ordered_extent->disk_len);
	}
	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
			   ordered_extent->file_offset, ordered_extent->len,
			   trans->transid);
	if (ret < 0) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_unlock;
	}

	add_pending_csums(trans, inode, ordered_extent->file_offset,
			  &ordered_extent->list);
	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
	ret = btrfs_update_inode_fallback(trans, root, inode);
	if (ret) { /* -ENOMEM or corruption */
		btrfs_abort_transaction(trans, root, ret);
		goto out_unlock;
	}
	ret = 0;
out_unlock:
	unlock_extent_cached(io_tree, ordered_extent->file_offset,
			     ordered_extent->file_offset +
			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
	if (root != root->fs_info->tree_root)
		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
	if (trans)
		btrfs_end_transaction(trans, root);

	if (ret || truncated) {
		u64 start, end;

		if (truncated)
			start = ordered_extent->file_offset + logical_len;
		else
			start = ordered_extent->file_offset;
		end = ordered_extent->file_offset + ordered_extent->len - 1;
		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);

		/* Drop the cache for the part of the extent we didn't write. */
		btrfs_drop_extent_cache(inode, start, end, 0);

		/*
		 * If the ordered extent had an IOERR or something else went
		 * wrong we need to return the space for this ordered extent
		 * back to the allocator.  We only free the extent in the
		 * truncated case if we didn't write out the extent at all.
		 */
		if ((ret || !logical_len) &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
			btrfs_free_reserved_extent(root, ordered_extent->start,
						   ordered_extent->disk_len, 1);
	}

	/*
	 * This needs to be done to make sure anybody waiting knows we are done
	 * updating everything for this ordered extent.
	 */
	btrfs_remove_ordered_extent(inode, ordered_extent);

	/* for snapshot-aware defrag */
	if (new) {
		if (ret) {
			free_sa_defrag_extent(new);
			atomic_dec(&root->fs_info->defrag_running);
		} else {
			relink_file_extents(new);
		}
	}

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	return ret;
}
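/*
 * Rough lifecycle of an ordered extent, as wired up below: the writepage
 * end_io hook (btrfs_writepage_end_io_hook) decrements the pending byte
 * count and, once the whole range is written, queues finish_ordered_fn on
 * the endio_write_workers / endio_freespace_worker workqueue, which calls
 * btrfs_finish_ordered_io() above to insert the file extent item, update
 * i_size and csums, and drop the ordered extent from the tree.
 */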
static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;
	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	btrfs_finish_ordered_io(ordered_extent);
}
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
				struct extent_state *state, int uptodate)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_extent *ordered_extent = NULL;
	struct btrfs_workqueue *wq;
	btrfs_work_func_t func;

	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);

	ClearPagePrivate2(page);
	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
					    end - start + 1, uptodate))
		return 0;

	if (btrfs_is_free_space_inode(inode)) {
		wq = root->fs_info->endio_freespace_worker;
		func = btrfs_freespace_write_helper;
	} else {
		wq = root->fs_info->endio_write_workers;
		func = btrfs_endio_write_helper;
	}

	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
			NULL);
	btrfs_queue_work(wq, &ordered_extent->work);

	return 0;
}
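/*
 * btrfs_dec_test_ordered_pending() returns true only when this call accounts
 * for the last outstanding bytes of the ordered extent, so the completion
 * work is queued once the whole range is written, and the potentially
 * blocking finish work runs from a workqueue rather than in bio end_io
 * context.
 */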
static int __readpage_endio_check(struct inode *inode,
				  struct btrfs_io_bio *io_bio,
				  int icsum, struct page *page,
				  int pgoff, u64 start, size_t len)
{
	char *kaddr;
	u32 csum_expected;
	u32 csum = ~(u32)0;

	csum_expected = *(((u32 *)io_bio->csum) + icsum);

	kaddr = kmap_atomic(page);
	csum = btrfs_csum_data(kaddr + pgoff, csum, len);
	btrfs_csum_final(csum, (char *)&csum);
	if (csum != csum_expected)
		goto zeroit;

	kunmap_atomic(kaddr);
	return 0;
zeroit:
	btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
		"csum failed ino %llu off %llu csum %u expected csum %u",
			   btrfs_ino(inode), start, csum, csum_expected);