2 * Copyright (C) 2007 Oracle. All rights reserved.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/compat.h>
34 #include <linux/bit_spinlock.h>
35 #include <linux/xattr.h>
36 #include <linux/posix_acl.h>
37 #include <linux/falloc.h>
38 #include <linux/slab.h>
39 #include <linux/ratelimit.h>
40 #include <linux/mount.h>
41 #include <linux/btrfs.h>
42 #include <linux/blkdev.h>
43 #include <linux/posix_acl_xattr.h>
44 #include <linux/uio.h>
45 #include <linux/magic.h>
46 #include <linux/iversion.h>
49 #include "transaction.h"
50 #include "btrfs_inode.h"
51 #include "print-tree.h"
52 #include "ordered-data.h"
56 #include "compression.h"
58 #include "free-space-cache.h"
59 #include "inode-map.h"
65 struct btrfs_iget_args {
66 struct btrfs_key *location;
67 struct btrfs_root *root;
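/*
 * Per-call state for direct IO.  unsubmitted_oe_range_start/end track the
 * part of the ordered extent range created for the write that has not yet
 * been submitted, so it can be cleaned up if the direct IO falls short.
 */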
70 struct btrfs_dio_data {
72 u64 unsubmitted_oe_range_start;
73 u64 unsubmitted_oe_range_end;
77 static const struct inode_operations btrfs_dir_inode_operations;
78 static const struct inode_operations btrfs_symlink_inode_operations;
79 static const struct inode_operations btrfs_dir_ro_inode_operations;
80 static const struct inode_operations btrfs_special_inode_operations;
81 static const struct inode_operations btrfs_file_inode_operations;
82 static const struct address_space_operations btrfs_aops;
83 static const struct address_space_operations btrfs_symlink_aops;
84 static const struct file_operations btrfs_dir_file_operations;
85 static const struct extent_io_ops btrfs_extent_io_ops;
87 static struct kmem_cache *btrfs_inode_cachep;
88 struct kmem_cache *btrfs_trans_handle_cachep;
89 struct kmem_cache *btrfs_path_cachep;
90 struct kmem_cache *btrfs_free_space_cachep;
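/*
 * Map the S_IFMT bits of an inode mode to the BTRFS_FT_* type stored in
 * directory items.
 */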
93 static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
94 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
95 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
96 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
97 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
98 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
99 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
100 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
103 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
104 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
105 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
106 static noinline int cow_file_range(struct inode *inode,
107 struct page *locked_page,
108 u64 start, u64 end, u64 delalloc_end,
109 int *page_started, unsigned long *nr_written,
110 int unlock, struct btrfs_dedupe_hash *hash);
111 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
112 u64 orig_start, u64 block_start,
113 u64 block_len, u64 orig_block_len,
114 u64 ram_bytes, int compress_type,
117 static void __endio_write_update_ordered(struct inode *inode,
118 const u64 offset, const u64 bytes,
119 const bool uptodate);
122 * Clean up all submitted ordered extents in the specified range to handle
123 * errors from the fill_delalloc() callback.
125 * NOTE: caller must ensure that when an error happens, it can not call
126 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
127 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
128 * to be released, which we want to happen only when finishing the ordered
129 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
130 * fill_delalloc() callback already does proper cleanup for the first page of
131 * the range, that is, it invokes the callback writepage_end_io_hook() for the
132 * range of the first page.
134 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
138 unsigned long index = offset >> PAGE_SHIFT;
139 unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
142 while (index <= end_index) {
143 page = find_get_page(inode->i_mapping, index);
147 ClearPagePrivate2(page);
150 return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
151 bytes - PAGE_SIZE, false);
154 static int btrfs_dirty_inode(struct inode *inode);
156 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
157 void btrfs_test_inode_set_ops(struct inode *inode)
159 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
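/*
 * Initialize the security xattrs and ACLs of a newly created inode,
 * inherited from the parent directory.
 */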
163 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
164 struct inode *inode, struct inode *dir,
165 const struct qstr *qstr)
169 err = btrfs_init_acl(trans, inode, dir);
171 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
176 * this does all the hard work for inserting an inline extent into
177 * the btree. The caller should have done a btrfs_drop_extents so that
178 * no overlapping inline items exist in the btree
180 static int insert_inline_extent(struct btrfs_trans_handle *trans,
181 struct btrfs_path *path, int extent_inserted,
182 struct btrfs_root *root, struct inode *inode,
183 u64 start, size_t size, size_t compressed_size,
185 struct page **compressed_pages)
187 struct extent_buffer *leaf;
188 struct page *page = NULL;
191 struct btrfs_file_extent_item *ei;
193 size_t cur_size = size;
194 unsigned long offset;
196 if (compressed_size && compressed_pages)
197 cur_size = compressed_size;
199 inode_add_bytes(inode, size);
201 if (!extent_inserted) {
202 struct btrfs_key key;
205 key.objectid = btrfs_ino(BTRFS_I(inode));
207 key.type = BTRFS_EXTENT_DATA_KEY;
209 datasize = btrfs_file_extent_calc_inline_size(cur_size);
210 path->leave_spinning = 1;
211 ret = btrfs_insert_empty_item(trans, root, path, &key,
216 leaf = path->nodes[0];
217 ei = btrfs_item_ptr(leaf, path->slots[0],
218 struct btrfs_file_extent_item);
219 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
220 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
221 btrfs_set_file_extent_encryption(leaf, ei, 0);
222 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
223 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
224 ptr = btrfs_file_extent_inline_start(ei);
226 if (compress_type != BTRFS_COMPRESS_NONE) {
229 while (compressed_size > 0) {
230 cpage = compressed_pages[i];
231 cur_size = min_t(unsigned long, compressed_size,
234 kaddr = kmap_atomic(cpage);
235 write_extent_buffer(leaf, kaddr, ptr, cur_size);
236 kunmap_atomic(kaddr);
240 compressed_size -= cur_size;
242 btrfs_set_file_extent_compression(leaf, ei,
245 page = find_get_page(inode->i_mapping,
246 start >> PAGE_SHIFT);
247 btrfs_set_file_extent_compression(leaf, ei, 0);
248 kaddr = kmap_atomic(page);
249 offset = start & (PAGE_SIZE - 1);
250 write_extent_buffer(leaf, kaddr + offset, ptr, size);
251 kunmap_atomic(kaddr);
254 btrfs_mark_buffer_dirty(leaf);
255 btrfs_release_path(path);
258 * we're an inline extent, so nobody can
259 * extend the file past i_size without locking
260 * a page we already have locked.
262 * We must do any isize and inode updates
263 * before we unlock the pages. Otherwise we
264 * could end up racing with unlink.
266 BTRFS_I(inode)->disk_i_size = inode->i_size;
267 ret = btrfs_update_inode(trans, root, inode);
275 * conditionally insert an inline extent into the file. This
276 * does the checks required to make sure the data is small enough
277 * to fit as an inline extent.
279 static noinline int cow_file_range_inline(struct btrfs_root *root,
280 struct inode *inode, u64 start,
281 u64 end, size_t compressed_size,
283 struct page **compressed_pages)
285 struct btrfs_fs_info *fs_info = root->fs_info;
286 struct btrfs_trans_handle *trans;
287 u64 isize = i_size_read(inode);
288 u64 actual_end = min(end + 1, isize);
289 u64 inline_len = actual_end - start;
290 u64 aligned_end = ALIGN(end, fs_info->sectorsize);
291 u64 data_len = inline_len;
293 struct btrfs_path *path;
294 int extent_inserted = 0;
295 u32 extent_item_size;
298 data_len = compressed_size;
301 actual_end > fs_info->sectorsize ||
302 data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
304 (actual_end & (fs_info->sectorsize - 1)) == 0) ||
306 data_len > fs_info->max_inline) {
310 path = btrfs_alloc_path();
314 trans = btrfs_join_transaction(root);
316 btrfs_free_path(path);
317 return PTR_ERR(trans);
319 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
321 if (compressed_size && compressed_pages)
322 extent_item_size = btrfs_file_extent_calc_inline_size(
325 extent_item_size = btrfs_file_extent_calc_inline_size(
328 ret = __btrfs_drop_extents(trans, root, inode, path,
329 start, aligned_end, NULL,
330 1, 1, extent_item_size, &extent_inserted);
332 btrfs_abort_transaction(trans, ret);
336 if (isize > actual_end)
337 inline_len = min_t(u64, isize, actual_end);
338 ret = insert_inline_extent(trans, path, extent_inserted,
340 inline_len, compressed_size,
341 compress_type, compressed_pages);
342 if (ret && ret != -ENOSPC) {
343 btrfs_abort_transaction(trans, ret);
345 } else if (ret == -ENOSPC) {
350 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
351 btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
354 * Don't forget to free the reserved space; an inlined extent
355 * doesn't count as a data extent, so free the space directly here.
356 * At reserve time it is always aligned to page size, so
357 * just free one page here.
359 btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
360 btrfs_free_path(path);
361 btrfs_end_transaction(trans);
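/*
 * One queued range from compress_file_range(): either a set of compressed
 * pages or, when pages is NULL, a range to be written uncompressed.  These
 * are consumed by submit_compressed_extents() in phase two.
 */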
365 struct async_extent {
370 unsigned long nr_pages;
372 struct list_head list;
377 struct btrfs_root *root;
378 struct page *locked_page;
381 unsigned int write_flags;
382 struct list_head extents;
383 struct btrfs_work work;
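/*
 * Record a range (compressed or not) on the async_cow work item so that
 * the ordered work queue can submit it during phase two.
 */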
386 static noinline int add_async_extent(struct async_cow *cow,
387 u64 start, u64 ram_size,
390 unsigned long nr_pages,
393 struct async_extent *async_extent;
395 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
396 BUG_ON(!async_extent); /* -ENOMEM */
397 async_extent->start = start;
398 async_extent->ram_size = ram_size;
399 async_extent->compressed_size = compressed_size;
400 async_extent->pages = pages;
401 async_extent->nr_pages = nr_pages;
402 async_extent->compress_type = compress_type;
403 list_add_tail(&async_extent->list, &cow->extents);
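/*
 * Decide whether a range should be compressed, based on the force-compress
 * mount option, the per-inode compression/nocompress flags and the
 * compression heuristic.
 */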
407 static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
409 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
412 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
415 if (BTRFS_I(inode)->defrag_compress)
417 /* bad compression ratios */
418 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
420 if (btrfs_test_opt(fs_info, COMPRESS) ||
421 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
422 BTRFS_I(inode)->prop_compress)
423 return btrfs_compress_heuristic(inode, start, end);
427 static inline void inode_should_defrag(struct btrfs_inode *inode,
428 u64 start, u64 end, u64 num_bytes, u64 small_write)
430 /* If this is a small write inside eof, kick off a defrag */
431 if (num_bytes < small_write &&
432 (start > 0 || end + 1 < inode->disk_i_size))
433 btrfs_add_inode_defrag(NULL, inode);
437 * we create compressed extents in two phases. The first
438 * phase compresses a range of pages that have already been
439 * locked (both pages and state bits are locked).
441 * This is done inside an ordered work queue, and the compression
442 * is spread across many cpus. The actual IO submission is step
443 * two, and the ordered work queue takes care of making sure that
444 * happens in the same order things were put onto the queue by
445 * writepages and friends.
447 * If this code finds it can't get good compression, it puts an
448 * entry onto the work queue to write the uncompressed bytes. This
449 * makes sure that both compressed inodes and uncompressed inodes
450 * are written in the same order that the flusher thread sent them
453 static noinline void compress_file_range(struct inode *inode,
454 struct page *locked_page,
456 struct async_cow *async_cow,
459 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
460 struct btrfs_root *root = BTRFS_I(inode)->root;
461 u64 blocksize = fs_info->sectorsize;
463 u64 isize = i_size_read(inode);
465 struct page **pages = NULL;
466 unsigned long nr_pages;
467 unsigned long total_compressed = 0;
468 unsigned long total_in = 0;
471 int compress_type = fs_info->compress_type;
474 inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
477 actual_end = min_t(u64, isize, end + 1);
480 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
481 BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
482 nr_pages = min_t(unsigned long, nr_pages,
483 BTRFS_MAX_COMPRESSED / PAGE_SIZE);
486 * we don't want to send crud past the end of i_size through
487 * compression, that's just a waste of CPU time. So, if the
488 * end of the file is before the start of our current
489 * requested range of bytes, we bail out to the uncompressed
490 * cleanup code that can deal with all of this.
492 * It isn't really the fastest way to fix things, but this is a
493 * very uncommon corner.
495 if (actual_end <= start)
496 goto cleanup_and_bail_uncompressed;
498 total_compressed = actual_end - start;
501 * skip compression for a small file range (<= blocksize) that
502 * isn't an inline extent, since it doesn't save disk space at all.
504 if (total_compressed <= blocksize &&
505 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
506 goto cleanup_and_bail_uncompressed;
508 total_compressed = min_t(unsigned long, total_compressed,
509 BTRFS_MAX_UNCOMPRESSED);
514 * we do compression for mount -o compress and when the
515 * inode has not been flagged as nocompress. This flag can
516 * change at any time if we discover bad compression ratios.
518 if (inode_need_compress(inode, start, end)) {
520 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
522 /* just bail out to the uncompressed code */
526 if (BTRFS_I(inode)->defrag_compress)
527 compress_type = BTRFS_I(inode)->defrag_compress;
528 else if (BTRFS_I(inode)->prop_compress)
529 compress_type = BTRFS_I(inode)->prop_compress;
532 * we need to call clear_page_dirty_for_io on each
533 * page in the range. Otherwise applications with the file
534 * mmap'd can wander in and change the page contents while
535 * we are compressing them.
537 * If the compression fails for any reason, we set the pages
538 * dirty again later on.
540 * Note that the remaining part is redirtied, the start pointer
541 * has moved, the end is the original one.
544 extent_range_clear_dirty_for_io(inode, start, end);
548 /* Compression level is applied here and only here */
549 ret = btrfs_compress_pages(
550 compress_type | (fs_info->compress_level << 4),
551 inode->i_mapping, start,
558 unsigned long offset = total_compressed &
560 struct page *page = pages[nr_pages - 1];
563 /* zero the tail end of the last page, we might be
564 * sending it down to disk
567 kaddr = kmap_atomic(page);
568 memset(kaddr + offset, 0,
570 kunmap_atomic(kaddr);
577 /* let's try to make an inline extent */
578 if (ret || total_in < actual_end) {
579 /* we didn't compress the entire range, try
580 * to make an uncompressed inline extent.
582 ret = cow_file_range_inline(root, inode, start, end,
583 0, BTRFS_COMPRESS_NONE, NULL);
585 /* try making a compressed inline extent */
586 ret = cow_file_range_inline(root, inode, start, end,
588 compress_type, pages);
591 unsigned long clear_flags = EXTENT_DELALLOC |
592 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
593 EXTENT_DO_ACCOUNTING;
594 unsigned long page_error_op;
596 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
599 * inline extent creation worked or returned error,
600 * we don't need to create any more async work items.
601 * Unlock and free up our temp pages.
603 * We use DO_ACCOUNTING here because we need the
604 * delalloc_release_metadata to be done _after_ we drop
605 * our outstanding extent for clearing delalloc for this
608 extent_clear_unlock_delalloc(inode, start, end, end,
621 * we aren't doing an inline extent, so round the compressed size
622 * up to a block size boundary so the allocator does sane things
625 total_compressed = ALIGN(total_compressed, blocksize);
628 * one last check to make sure the compression is really a
629 * win, compare the page count read with the blocks on disk,
630 * compression must free at least one sector size
632 total_in = ALIGN(total_in, PAGE_SIZE);
633 if (total_compressed + blocksize <= total_in) {
637 * The async work queues will take care of doing actual
638 * allocation on disk for these compressed pages, and
639 * will submit them to the elevator.
641 add_async_extent(async_cow, start, total_in,
642 total_compressed, pages, nr_pages,
645 if (start + total_in < end) {
656 * the compression code ran but failed to make things smaller,
657 * free any pages it allocated and our page pointer array
659 for (i = 0; i < nr_pages; i++) {
660 WARN_ON(pages[i]->mapping);
665 total_compressed = 0;
668 /* flag the file so we don't compress in the future */
669 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
670 !(BTRFS_I(inode)->prop_compress)) {
671 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
674 cleanup_and_bail_uncompressed:
676 * No compression, but we still need to write the pages in the file
677 * we've been given so far. redirty the locked page if it corresponds
678 * to our extent and set things up for the async work queue to run
679 * cow_file_range to do the normal delalloc dance.
681 if (page_offset(locked_page) >= start &&
682 page_offset(locked_page) <= end)
683 __set_page_dirty_nobuffers(locked_page);
684 /* unlocked later on in the async handlers */
687 extent_range_redirty_for_io(inode, start, end);
688 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
689 BTRFS_COMPRESS_NONE);
695 for (i = 0; i < nr_pages; i++) {
696 WARN_ON(pages[i]->mapping);
702 static void free_async_extent_pages(struct async_extent *async_extent)
706 if (!async_extent->pages)
709 for (i = 0; i < async_extent->nr_pages; i++) {
710 WARN_ON(async_extent->pages[i]->mapping);
711 put_page(async_extent->pages[i]);
713 kfree(async_extent->pages);
714 async_extent->nr_pages = 0;
715 async_extent->pages = NULL;
719 * phase two of compressed writeback. This is the ordered portion
720 * of the code, which only gets called in the order the work was
721 * queued. We walk all the async extents created by compress_file_range
722 * and send them down to the disk.
724 static noinline void submit_compressed_extents(struct inode *inode,
725 struct async_cow *async_cow)
727 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
728 struct async_extent *async_extent;
730 struct btrfs_key ins;
731 struct extent_map *em;
732 struct btrfs_root *root = BTRFS_I(inode)->root;
733 struct extent_io_tree *io_tree;
737 while (!list_empty(&async_cow->extents)) {
738 async_extent = list_entry(async_cow->extents.next,
739 struct async_extent, list);
740 list_del(&async_extent->list);
742 io_tree = &BTRFS_I(inode)->io_tree;
745 /* did the compression code fall back to uncompressed IO? */
746 if (!async_extent->pages) {
747 int page_started = 0;
748 unsigned long nr_written = 0;
750 lock_extent(io_tree, async_extent->start,
751 async_extent->start +
752 async_extent->ram_size - 1);
754 /* allocate blocks */
755 ret = cow_file_range(inode, async_cow->locked_page,
757 async_extent->start +
758 async_extent->ram_size - 1,
759 async_extent->start +
760 async_extent->ram_size - 1,
761 &page_started, &nr_written, 0,
767 * if page_started, cow_file_range inserted an
768 * inline extent and took care of all the unlocking
769 * and IO for us. Otherwise, we need to submit
770 * all those pages down to the drive.
772 if (!page_started && !ret)
773 extent_write_locked_range(inode,
775 async_extent->start +
776 async_extent->ram_size - 1,
779 unlock_page(async_cow->locked_page);
785 lock_extent(io_tree, async_extent->start,
786 async_extent->start + async_extent->ram_size - 1);
788 ret = btrfs_reserve_extent(root, async_extent->ram_size,
789 async_extent->compressed_size,
790 async_extent->compressed_size,
791 0, alloc_hint, &ins, 1, 1);
793 free_async_extent_pages(async_extent);
795 if (ret == -ENOSPC) {
796 unlock_extent(io_tree, async_extent->start,
797 async_extent->start +
798 async_extent->ram_size - 1);
801 * we need to redirty the pages if we decide to
802 * fall back to uncompressed IO, otherwise we
803 * will not submit these pages down to lower
806 extent_range_redirty_for_io(inode,
808 async_extent->start +
809 async_extent->ram_size - 1);
816 * here we're doing allocation and writeback of the
819 em = create_io_em(inode, async_extent->start,
820 async_extent->ram_size, /* len */
821 async_extent->start, /* orig_start */
822 ins.objectid, /* block_start */
823 ins.offset, /* block_len */
824 ins.offset, /* orig_block_len */
825 async_extent->ram_size, /* ram_bytes */
826 async_extent->compress_type,
827 BTRFS_ORDERED_COMPRESSED);
829 /* ret value is not necessary due to void function */
830 goto out_free_reserve;
833 ret = btrfs_add_ordered_extent_compress(inode,
836 async_extent->ram_size,
838 BTRFS_ORDERED_COMPRESSED,
839 async_extent->compress_type);
841 btrfs_drop_extent_cache(BTRFS_I(inode),
843 async_extent->start +
844 async_extent->ram_size - 1, 0);
845 goto out_free_reserve;
847 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
850 * clear dirty, set writeback and unlock the pages.
852 extent_clear_unlock_delalloc(inode, async_extent->start,
853 async_extent->start +
854 async_extent->ram_size - 1,
855 async_extent->start +
856 async_extent->ram_size - 1,
857 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
858 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
860 if (btrfs_submit_compressed_write(inode,
862 async_extent->ram_size,
864 ins.offset, async_extent->pages,
865 async_extent->nr_pages,
866 async_cow->write_flags)) {
867 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
868 struct page *p = async_extent->pages[0];
869 const u64 start = async_extent->start;
870 const u64 end = start + async_extent->ram_size - 1;
872 p->mapping = inode->i_mapping;
873 tree->ops->writepage_end_io_hook(p, start, end,
876 extent_clear_unlock_delalloc(inode, start, end, end,
880 free_async_extent_pages(async_extent);
882 alloc_hint = ins.objectid + ins.offset;
888 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
889 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
891 extent_clear_unlock_delalloc(inode, async_extent->start,
892 async_extent->start +
893 async_extent->ram_size - 1,
894 async_extent->start +
895 async_extent->ram_size - 1,
896 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
897 EXTENT_DELALLOC_NEW |
898 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
899 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
900 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
902 free_async_extent_pages(async_extent);
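/*
 * Look at the extent mapping covering (or near) this range and use its
 * block start as an allocator hint, so new extents are placed close to
 * existing ones.
 */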
907 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
910 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
911 struct extent_map *em;
914 read_lock(&em_tree->lock);
915 em = search_extent_mapping(em_tree, start, num_bytes);
918 * if block start isn't an actual block number then find the
919 * first block in this inode and use that as a hint. If that
920 * block is also bogus then just don't worry about it.
922 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
924 em = search_extent_mapping(em_tree, 0, 0);
925 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
926 alloc_hint = em->block_start;
930 alloc_hint = em->block_start;
934 read_unlock(&em_tree->lock);
940 * when extent_io.c finds a delayed allocation range in the file,
941 * the callbacks end up in this code. The basic idea is to
942 * allocate extents on disk for the range, and create ordered data structs
943 * in ram to track those extents.
945 * locked_page is the page that writepage had locked already. We use
946 * it to make sure we don't do extra locks or unlocks.
948 * *page_started is set to one if we unlock locked_page and do everything
949 * required to start IO on it. It may be clean and already done with
952 static noinline int cow_file_range(struct inode *inode,
953 struct page *locked_page,
954 u64 start, u64 end, u64 delalloc_end,
955 int *page_started, unsigned long *nr_written,
956 int unlock, struct btrfs_dedupe_hash *hash)
958 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
959 struct btrfs_root *root = BTRFS_I(inode)->root;
962 unsigned long ram_size;
963 u64 cur_alloc_size = 0;
964 u64 blocksize = fs_info->sectorsize;
965 struct btrfs_key ins;
966 struct extent_map *em;
968 unsigned long page_ops;
969 bool extent_reserved = false;
972 if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
978 num_bytes = ALIGN(end - start + 1, blocksize);
979 num_bytes = max(blocksize, num_bytes);
980 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
982 inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
985 /* let's try to make an inline extent */
986 ret = cow_file_range_inline(root, inode, start, end, 0,
987 BTRFS_COMPRESS_NONE, NULL);
990 * We use DO_ACCOUNTING here because we need the
991 * delalloc_release_metadata to be run _after_ we drop
992 * our outstanding extent for clearing delalloc for this
995 extent_clear_unlock_delalloc(inode, start, end,
997 EXTENT_LOCKED | EXTENT_DELALLOC |
998 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
999 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1000 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
1001 PAGE_END_WRITEBACK);
1002 *nr_written = *nr_written +
1003 (end - start + PAGE_SIZE) / PAGE_SIZE;
1006 } else if (ret < 0) {
1011 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1012 btrfs_drop_extent_cache(BTRFS_I(inode), start,
1013 start + num_bytes - 1, 0);
1015 while (num_bytes > 0) {
1016 cur_alloc_size = num_bytes;
1017 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1018 fs_info->sectorsize, 0, alloc_hint,
1022 cur_alloc_size = ins.offset;
1023 extent_reserved = true;
1025 ram_size = ins.offset;
1026 em = create_io_em(inode, start, ins.offset, /* len */
1027 start, /* orig_start */
1028 ins.objectid, /* block_start */
1029 ins.offset, /* block_len */
1030 ins.offset, /* orig_block_len */
1031 ram_size, /* ram_bytes */
1032 BTRFS_COMPRESS_NONE, /* compress_type */
1033 BTRFS_ORDERED_REGULAR /* type */);
1036 free_extent_map(em);
1038 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1039 ram_size, cur_alloc_size, 0);
1041 goto out_drop_extent_cache;
1043 if (root->root_key.objectid ==
1044 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1045 ret = btrfs_reloc_clone_csums(inode, start,
1048 * Only drop cache here, and process as normal.
1050 * We must not allow extent_clear_unlock_delalloc()
1051 * at out_unlock label to free meta of this ordered
1052 * extent, as its meta should be freed by
1053 * btrfs_finish_ordered_io().
1055 * So we must continue until @start is increased to
1056 * skip current ordered extent.
1059 btrfs_drop_extent_cache(BTRFS_I(inode), start,
1060 start + ram_size - 1, 0);
1063 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1065 /* we're not doing compressed IO, don't unlock the first
1066 * page (which the caller expects to stay locked), don't
1067 * clear any dirty bits and don't set any writeback bits
1069 * Do set the Private2 bit so we know this page was properly
1070 * set up for writepage
1072 page_ops = unlock ? PAGE_UNLOCK : 0;
1073 page_ops |= PAGE_SET_PRIVATE2;
1075 extent_clear_unlock_delalloc(inode, start,
1076 start + ram_size - 1,
1077 delalloc_end, locked_page,
1078 EXTENT_LOCKED | EXTENT_DELALLOC,
1080 if (num_bytes < cur_alloc_size)
1083 num_bytes -= cur_alloc_size;
1084 alloc_hint = ins.objectid + ins.offset;
1085 start += cur_alloc_size;
1086 extent_reserved = false;
1089 * btrfs_reloc_clone_csums() error: since start has been increased,
1090 * extent_clear_unlock_delalloc() at the out_unlock label won't
1091 * free the metadata of the current ordered extent, so we're OK to exit.
1099 out_drop_extent_cache:
1100 btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
1102 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1103 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1105 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1106 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1107 page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
1110 * If we reserved an extent for our delalloc range (or a subrange) and
1111 * failed to create the respective ordered extent, then it means that
1112 * when we reserved the extent we decremented the extent's size from
1113 * the data space_info's bytes_may_use counter and incremented the
1114 * space_info's bytes_reserved counter by the same amount. We must make
1115 * sure extent_clear_unlock_delalloc() does not try to decrement again
1116 * the data space_info's bytes_may_use counter, therefore we do not pass
1117 * it the flag EXTENT_CLEAR_DATA_RESV.
1119 if (extent_reserved) {
1120 extent_clear_unlock_delalloc(inode, start,
1121 start + cur_alloc_size,
1122 start + cur_alloc_size,
1126 start += cur_alloc_size;
1130 extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1132 clear_bits | EXTENT_CLEAR_DATA_RESV,
1138 * work queue callback to start compression on a file and its pages
1140 static noinline void async_cow_start(struct btrfs_work *work)
1142 struct async_cow *async_cow;
1144 async_cow = container_of(work, struct async_cow, work);
1146 compress_file_range(async_cow->inode, async_cow->locked_page,
1147 async_cow->start, async_cow->end, async_cow,
1149 if (num_added == 0) {
1150 btrfs_add_delayed_iput(async_cow->inode);
1151 async_cow->inode = NULL;
1156 * work queue callback to submit previously compressed pages
1158 static noinline void async_cow_submit(struct btrfs_work *work)
1160 struct btrfs_fs_info *fs_info;
1161 struct async_cow *async_cow;
1162 struct btrfs_root *root;
1163 unsigned long nr_pages;
1165 async_cow = container_of(work, struct async_cow, work);
1167 root = async_cow->root;
1168 fs_info = root->fs_info;
1169 nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1173 * atomic_sub_return implies a barrier for waitqueue_active
1175 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1177 waitqueue_active(&fs_info->async_submit_wait))
1178 wake_up(&fs_info->async_submit_wait);
1180 if (async_cow->inode)
1181 submit_compressed_extents(async_cow->inode, async_cow);
1184 static noinline void async_cow_free(struct btrfs_work *work)
1186 struct async_cow *async_cow;
1187 async_cow = container_of(work, struct async_cow, work);
1188 if (async_cow->inode)
1189 btrfs_add_delayed_iput(async_cow->inode);
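/*
 * Split the delalloc range into chunks of at most 512K, queue one
 * async_cow work item per chunk and let the delalloc workers run
 * async_cow_start / async_cow_submit / async_cow_free on each of them.
 */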
1193 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1194 u64 start, u64 end, int *page_started,
1195 unsigned long *nr_written,
1196 unsigned int write_flags)
1198 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1199 struct async_cow *async_cow;
1200 struct btrfs_root *root = BTRFS_I(inode)->root;
1201 unsigned long nr_pages;
1204 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1206 while (start < end) {
1207 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1208 BUG_ON(!async_cow); /* -ENOMEM */
1209 async_cow->inode = igrab(inode);
1210 async_cow->root = root;
1211 async_cow->locked_page = locked_page;
1212 async_cow->start = start;
1213 async_cow->write_flags = write_flags;
1215 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1216 !btrfs_test_opt(fs_info, FORCE_COMPRESS))
1219 cur_end = min(end, start + SZ_512K - 1);
1221 async_cow->end = cur_end;
1222 INIT_LIST_HEAD(&async_cow->extents);
1224 btrfs_init_work(&async_cow->work,
1225 btrfs_delalloc_helper,
1226 async_cow_start, async_cow_submit,
1229 nr_pages = (cur_end - start + PAGE_SIZE) >>
1231 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1233 btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1235 *nr_written += nr_pages;
1236 start = cur_end + 1;
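/*
 * Check whether any checksum items exist for the given disk byte range.
 * The nocow path uses this to force COW when csums are present, so the
 * csums of a given extent are either all valid or all absent.
 */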
1242 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1243 u64 bytenr, u64 num_bytes)
1246 struct btrfs_ordered_sum *sums;
1249 ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1250 bytenr + num_bytes - 1, &list, 0);
1251 if (ret == 0 && list_empty(&list))
1254 while (!list_empty(&list)) {
1255 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1256 list_del(&sums->list);
1265 * when the nocow writeback callback runs. This checks for snapshots or COW copies
1266 * of the extents that exist in the file, and COWs the file as required.
1268 * If no cow copies or snapshots exist, we write directly to the existing
1271 static noinline int run_delalloc_nocow(struct inode *inode,
1272 struct page *locked_page,
1273 u64 start, u64 end, int *page_started, int force,
1274 unsigned long *nr_written)
1276 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1277 struct btrfs_root *root = BTRFS_I(inode)->root;
1278 struct extent_buffer *leaf;
1279 struct btrfs_path *path;
1280 struct btrfs_file_extent_item *fi;
1281 struct btrfs_key found_key;
1282 struct extent_map *em;
1297 u64 ino = btrfs_ino(BTRFS_I(inode));
1299 path = btrfs_alloc_path();
1301 extent_clear_unlock_delalloc(inode, start, end, end,
1303 EXTENT_LOCKED | EXTENT_DELALLOC |
1304 EXTENT_DO_ACCOUNTING |
1305 EXTENT_DEFRAG, PAGE_UNLOCK |
1307 PAGE_SET_WRITEBACK |
1308 PAGE_END_WRITEBACK);
1312 nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1314 cow_start = (u64)-1;
1317 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1321 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1322 leaf = path->nodes[0];
1323 btrfs_item_key_to_cpu(leaf, &found_key,
1324 path->slots[0] - 1);
1325 if (found_key.objectid == ino &&
1326 found_key.type == BTRFS_EXTENT_DATA_KEY)
1331 leaf = path->nodes[0];
1332 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1333 ret = btrfs_next_leaf(root, path);
1335 if (cow_start != (u64)-1)
1336 cur_offset = cow_start;
1341 leaf = path->nodes[0];
1347 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1349 if (found_key.objectid > ino)
1351 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1352 found_key.type < BTRFS_EXTENT_DATA_KEY) {
1356 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1357 found_key.offset > end)
1360 if (found_key.offset > cur_offset) {
1361 extent_end = found_key.offset;
1366 fi = btrfs_item_ptr(leaf, path->slots[0],
1367 struct btrfs_file_extent_item);
1368 extent_type = btrfs_file_extent_type(leaf, fi);
1370 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1371 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1372 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1373 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1374 extent_offset = btrfs_file_extent_offset(leaf, fi);
1375 extent_end = found_key.offset +
1376 btrfs_file_extent_num_bytes(leaf, fi);
1378 btrfs_file_extent_disk_num_bytes(leaf, fi);
1379 if (extent_end <= start) {
1383 if (disk_bytenr == 0)
1385 if (btrfs_file_extent_compression(leaf, fi) ||
1386 btrfs_file_extent_encryption(leaf, fi) ||
1387 btrfs_file_extent_other_encoding(leaf, fi))
1389 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1391 if (btrfs_extent_readonly(fs_info, disk_bytenr))
1393 ret = btrfs_cross_ref_exist(root, ino,
1395 extent_offset, disk_bytenr);
1398 * ret could be -EIO if the above fails to read
1402 if (cow_start != (u64)-1)
1403 cur_offset = cow_start;
1407 WARN_ON_ONCE(nolock);
1410 disk_bytenr += extent_offset;
1411 disk_bytenr += cur_offset - found_key.offset;
1412 num_bytes = min(end + 1, extent_end) - cur_offset;
1414 * if there are pending snapshots for this root,
1415 * we fall back to the common COW path.
1418 err = btrfs_start_write_no_snapshotting(root);
1423 * force cow if csum exists in the range.
1424 * this ensures that csums for a given extent are
1425 * either valid or do not exist.
1427 ret = csum_exist_in_range(fs_info, disk_bytenr,
1431 btrfs_end_write_no_snapshotting(root);
1434 * ret could be -EIO if the above fails to read
1438 if (cow_start != (u64)-1)
1439 cur_offset = cow_start;
1442 WARN_ON_ONCE(nolock);
1445 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1447 btrfs_end_write_no_snapshotting(root);
1451 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1452 extent_end = found_key.offset +
1453 btrfs_file_extent_inline_len(leaf,
1454 path->slots[0], fi);
1455 extent_end = ALIGN(extent_end,
1456 fs_info->sectorsize);
1461 if (extent_end <= start) {
1463 if (!nolock && nocow)
1464 btrfs_end_write_no_snapshotting(root);
1466 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1470 if (cow_start == (u64)-1)
1471 cow_start = cur_offset;
1472 cur_offset = extent_end;
1473 if (cur_offset > end)
1479 btrfs_release_path(path);
1480 if (cow_start != (u64)-1) {
1481 ret = cow_file_range(inode, locked_page,
1482 cow_start, found_key.offset - 1,
1483 end, page_started, nr_written, 1,
1486 if (!nolock && nocow)
1487 btrfs_end_write_no_snapshotting(root);
1489 btrfs_dec_nocow_writers(fs_info,
1493 cow_start = (u64)-1;
1496 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1497 u64 orig_start = found_key.offset - extent_offset;
1499 em = create_io_em(inode, cur_offset, num_bytes,
1501 disk_bytenr, /* block_start */
1502 num_bytes, /* block_len */
1503 disk_num_bytes, /* orig_block_len */
1504 ram_bytes, BTRFS_COMPRESS_NONE,
1505 BTRFS_ORDERED_PREALLOC);
1507 if (!nolock && nocow)
1508 btrfs_end_write_no_snapshotting(root);
1510 btrfs_dec_nocow_writers(fs_info,
1515 free_extent_map(em);
1518 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1519 type = BTRFS_ORDERED_PREALLOC;
1521 type = BTRFS_ORDERED_NOCOW;
1524 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1525 num_bytes, num_bytes, type);
1527 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1528 BUG_ON(ret); /* -ENOMEM */
1530 if (root->root_key.objectid ==
1531 BTRFS_DATA_RELOC_TREE_OBJECTID)
1533 * Error handled later, as we must prevent
1534 * extent_clear_unlock_delalloc() in error handler
1535 * from freeing metadata of created ordered extent.
1537 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1540 extent_clear_unlock_delalloc(inode, cur_offset,
1541 cur_offset + num_bytes - 1, end,
1542 locked_page, EXTENT_LOCKED |
1544 EXTENT_CLEAR_DATA_RESV,
1545 PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1547 if (!nolock && nocow)
1548 btrfs_end_write_no_snapshotting(root);
1549 cur_offset = extent_end;
1552 * btrfs_reloc_clone_csums() error: now we're OK to call the error
1553 * handler, as metadata for the created ordered extent will only
1554 * be freed by btrfs_finish_ordered_io().
1558 if (cur_offset > end)
1561 btrfs_release_path(path);
1563 if (cur_offset <= end && cow_start == (u64)-1) {
1564 cow_start = cur_offset;
1568 if (cow_start != (u64)-1) {
1569 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1570 page_started, nr_written, 1, NULL);
1576 if (ret && cur_offset < end)
1577 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1578 locked_page, EXTENT_LOCKED |
1579 EXTENT_DELALLOC | EXTENT_DEFRAG |
1580 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1582 PAGE_SET_WRITEBACK |
1583 PAGE_END_WRITEBACK);
1584 btrfs_free_path(path);
1588 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1591 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1592 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1596 * @defrag_bytes is a hint value, no spinlock held here,
1597 * if it is not zero, it means the file is being defragged.
1598 * Force cow if given extent needs to be defragged.
1600 if (BTRFS_I(inode)->defrag_bytes &&
1601 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1602 EXTENT_DEFRAG, 0, NULL))
1609 * extent_io.c callback to do delayed allocation processing
1611 static int run_delalloc_range(void *private_data, struct page *locked_page,
1612 u64 start, u64 end, int *page_started,
1613 unsigned long *nr_written,
1614 struct writeback_control *wbc)
1616 struct inode *inode = private_data;
1618 int force_cow = need_force_cow(inode, start, end);
1619 unsigned int write_flags = wbc_to_write_flags(wbc);
1621 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1622 ret = run_delalloc_nocow(inode, locked_page, start, end,
1623 page_started, 1, nr_written);
1624 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1625 ret = run_delalloc_nocow(inode, locked_page, start, end,
1626 page_started, 0, nr_written);
1627 } else if (!inode_need_compress(inode, start, end)) {
1628 ret = cow_file_range(inode, locked_page, start, end, end,
1629 page_started, nr_written, 1, NULL);
1631 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1632 &BTRFS_I(inode)->runtime_flags);
1633 ret = cow_file_range_async(inode, locked_page, start, end,
1634 page_started, nr_written,
1638 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1642 static void btrfs_split_extent_hook(void *private_data,
1643 struct extent_state *orig, u64 split)
1645 struct inode *inode = private_data;
1648 /* not delalloc, ignore it */
1649 if (!(orig->state & EXTENT_DELALLOC))
1652 size = orig->end - orig->start + 1;
1653 if (size > BTRFS_MAX_EXTENT_SIZE) {
1658 * See the explanation in btrfs_merge_extent_hook, the same
1659 * applies here, just in reverse.
1661 new_size = orig->end - split + 1;
1662 num_extents = count_max_extents(new_size);
1663 new_size = split - orig->start;
1664 num_extents += count_max_extents(new_size);
1665 if (count_max_extents(size) >= num_extents)
1669 spin_lock(&BTRFS_I(inode)->lock);
1670 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1671 spin_unlock(&BTRFS_I(inode)->lock);
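/*
 * Worked example of the split accounting above, assuming
 * BTRFS_MAX_EXTENT_SIZE is 128M: splitting a 256M delalloc extent at the
 * 128M boundary gives pieces needing 1 + 1 outstanding extents, the same
 * 2 the original already accounted for, so nothing is added.  Splitting
 * it at 100M gives pieces needing 1 + 2 = 3, one more than before, so one
 * extra outstanding extent is added.
 */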
1675 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1676 * extents so we can keep track of new extents that are just merged onto old
1677 * extents, such as when we are doing sequential writes, so we can properly
1678 * account for the metadata space we'll need.
1680 static void btrfs_merge_extent_hook(void *private_data,
1681 struct extent_state *new,
1682 struct extent_state *other)
1684 struct inode *inode = private_data;
1685 u64 new_size, old_size;
1688 /* not delalloc, ignore it */
1689 if (!(other->state & EXTENT_DELALLOC))
1692 if (new->start > other->start)
1693 new_size = new->end - other->start + 1;
1695 new_size = other->end - new->start + 1;
1697 /* we're not bigger than the max, unreserve the space and go */
1698 if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1699 spin_lock(&BTRFS_I(inode)->lock);
1700 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1701 spin_unlock(&BTRFS_I(inode)->lock);
1706 * We have to add up either side to figure out how many extents were
1707 * accounted for before we merged into one big extent. If the number of
1708 * extents we accounted for is <= the amount we need for the new range
1709 * then we can return, otherwise drop. Think of it like this
1713 * So we've grown the extent by a MAX_SIZE extent, this would mean we
1714 * need 2 outstanding extents, on one side we have 1 and the other side
1715 * we have 1 so they are == and we can return. But in this case
1717 * [MAX_SIZE+4k][MAX_SIZE+4k]
1719 * Each range on their own accounts for 2 extents, but merged together
1720 * they are only 3 extents worth of accounting, so we need to drop in
1723 old_size = other->end - other->start + 1;
1724 num_extents = count_max_extents(old_size);
1725 old_size = new->end - new->start + 1;
1726 num_extents += count_max_extents(old_size);
1727 if (count_max_extents(new_size) >= num_extents)
1730 spin_lock(&BTRFS_I(inode)->lock);
1731 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1732 spin_unlock(&BTRFS_I(inode)->lock);
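/*
 * Put the inode on the per-root delalloc list; if it is the first entry,
 * also put the root on the fs-wide delalloc_roots list so that delalloc
 * flushing can find it.
 */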
1735 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1736 struct inode *inode)
1738 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1740 spin_lock(&root->delalloc_lock);
1741 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1742 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1743 &root->delalloc_inodes);
1744 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1745 &BTRFS_I(inode)->runtime_flags);
1746 root->nr_delalloc_inodes++;
1747 if (root->nr_delalloc_inodes == 1) {
1748 spin_lock(&fs_info->delalloc_root_lock);
1749 BUG_ON(!list_empty(&root->delalloc_root));
1750 list_add_tail(&root->delalloc_root,
1751 &fs_info->delalloc_roots);
1752 spin_unlock(&fs_info->delalloc_root_lock);
1755 spin_unlock(&root->delalloc_lock);
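/*
 * Counterpart of btrfs_add_delalloc_inodes(): take the inode off the
 * per-root delalloc list and, if it was the last entry, take the root off
 * the fs-wide delalloc_roots list.
 */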
1758 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1759 struct btrfs_inode *inode)
1761 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1763 spin_lock(&root->delalloc_lock);
1764 if (!list_empty(&inode->delalloc_inodes)) {
1765 list_del_init(&inode->delalloc_inodes);
1766 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1767 &inode->runtime_flags);
1768 root->nr_delalloc_inodes--;
1769 if (!root->nr_delalloc_inodes) {
1770 spin_lock(&fs_info->delalloc_root_lock);
1771 BUG_ON(list_empty(&root->delalloc_root));
1772 list_del_init(&root->delalloc_root);
1773 spin_unlock(&fs_info->delalloc_root_lock);
1776 spin_unlock(&root->delalloc_lock);
1780 * extent_io.c set_bit_hook, used to track delayed allocation
1781 * bytes in this file, and to maintain the list of inodes that
1782 * have pending delalloc work to be done.
1784 static void btrfs_set_bit_hook(void *private_data,
1785 struct extent_state *state, unsigned *bits)
1787 struct inode *inode = private_data;
1789 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1791 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1794 * set_bit and clear bit hooks normally require _irqsave/restore
1795 * but in this case, we are only testing for the DELALLOC
1796 * bit, which is only set or cleared with irqs on
1798 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1799 struct btrfs_root *root = BTRFS_I(inode)->root;
1800 u64 len = state->end + 1 - state->start;
1801 u32 num_extents = count_max_extents(len);
1802 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1804 spin_lock(&BTRFS_I(inode)->lock);
1805 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1806 spin_unlock(&BTRFS_I(inode)->lock);
1808 /* For sanity tests */
1809 if (btrfs_is_testing(fs_info))
1812 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1813 fs_info->delalloc_batch);
1814 spin_lock(&BTRFS_I(inode)->lock);
1815 BTRFS_I(inode)->delalloc_bytes += len;
1816 if (*bits & EXTENT_DEFRAG)
1817 BTRFS_I(inode)->defrag_bytes += len;
1818 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1819 &BTRFS_I(inode)->runtime_flags))
1820 btrfs_add_delalloc_inodes(root, inode);
1821 spin_unlock(&BTRFS_I(inode)->lock);
1824 if (!(state->state & EXTENT_DELALLOC_NEW) &&
1825 (*bits & EXTENT_DELALLOC_NEW)) {
1826 spin_lock(&BTRFS_I(inode)->lock);
1827 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1829 spin_unlock(&BTRFS_I(inode)->lock);
1834 * extent_io.c clear_bit_hook, see set_bit_hook for why
1836 static void btrfs_clear_bit_hook(void *private_data,
1837 struct extent_state *state,
1840 struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1841 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1842 u64 len = state->end + 1 - state->start;
1843 u32 num_extents = count_max_extents(len);
1845 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1846 spin_lock(&inode->lock);
1847 inode->defrag_bytes -= len;
1848 spin_unlock(&inode->lock);
1852 * set_bit and clear bit hooks normally require _irqsave/restore
1853 * but in this case, we are only testing for the DELALLOC
1854 * bit, which is only set or cleared with irqs on
1856 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1857 struct btrfs_root *root = inode->root;
1858 bool do_list = !btrfs_is_free_space_inode(inode);
1860 spin_lock(&inode->lock);
1861 btrfs_mod_outstanding_extents(inode, -num_extents);
1862 spin_unlock(&inode->lock);
1865 * We don't reserve metadata space for space cache inodes so we
1866 * don't need to call delalloc_release_metadata if there is an
1869 if (*bits & EXTENT_CLEAR_META_RESV &&
1870 root != fs_info->tree_root)
1871 btrfs_delalloc_release_metadata(inode, len);
1873 /* For sanity tests. */
1874 if (btrfs_is_testing(fs_info))
1877 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1878 do_list && !(state->state & EXTENT_NORESERVE) &&
1879 (*bits & EXTENT_CLEAR_DATA_RESV))
1880 btrfs_free_reserved_data_space_noquota(
1884 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1885 fs_info->delalloc_batch);
1886 spin_lock(&inode->lock);
1887 inode->delalloc_bytes -= len;
1888 if (do_list && inode->delalloc_bytes == 0 &&
1889 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1890 &inode->runtime_flags))
1891 btrfs_del_delalloc_inode(root, inode);
1892 spin_unlock(&inode->lock);
1895 if ((state->state & EXTENT_DELALLOC_NEW) &&
1896 (*bits & EXTENT_DELALLOC_NEW)) {
1897 spin_lock(&inode->lock);
1898 ASSERT(inode->new_delalloc_bytes >= len);
1899 inode->new_delalloc_bytes -= len;
1900 spin_unlock(&inode->lock);
1905 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1906 * we don't create bios that span stripes or chunks
1908 * return 1 if page cannot be merged to bio
1909 * return 0 if page can be merged to bio
1910 * return error otherwise
1912 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1913 size_t size, struct bio *bio,
1914 unsigned long bio_flags)
1916 struct inode *inode = page->mapping->host;
1917 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1918 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1923 if (bio_flags & EXTENT_BIO_COMPRESSED)
1926 length = bio->bi_iter.bi_size;
1927 map_length = length;
1928 ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1932 if (map_length < length + size)
1938 * in order to insert checksums into the metadata in large chunks,
1939 * we wait until bio submission time. All the pages in the bio are
1940 * checksummed and sums are attached onto the ordered extent record.
1942 * At IO completion time the csums attached on the ordered extent record
1943 * are inserted into the btree
1945 static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1946 int mirror_num, unsigned long bio_flags,
1949 struct inode *inode = private_data;
1950 blk_status_t ret = 0;
1952 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1953 BUG_ON(ret); /* -ENOMEM */
1958 * in order to insert checksums into the metadata in large chunks,
1959 * we wait until bio submission time. All the pages in the bio are
1960 * checksummed and sums are attached onto the ordered extent record.
1962 * At IO completion time the csums attached on the ordered extent record
1963 * are inserted into the btree
1965 static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1966 int mirror_num, unsigned long bio_flags,
1969 struct inode *inode = private_data;
1970 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1973 ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1975 bio->bi_status = ret;
1982 * extent_io.c submission hook. This does the right thing for csum calculation
1983 * on write, or reading the csums from the tree before a read.
1985 * Rules about async/sync submit,
1986 * a) read: sync submit
1988 * b) write without checksum: sync submit
1990 * c) write with checksum:
1991 * c-1) if bio is issued by fsync: sync submit
1992 * (sync_writers != 0)
1994 * c-2) if root is reloc root: sync submit
1995 * (only in case of buffered IO)
1997 * c-3) otherwise: async submit
1999 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
2000 int mirror_num, unsigned long bio_flags,
2003 struct inode *inode = private_data;
2004 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2005 struct btrfs_root *root = BTRFS_I(inode)->root;
2006 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
2007 blk_status_t ret = 0;
2009 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
2011 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2013 if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2014 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2016 if (bio_op(bio) != REQ_OP_WRITE) {
2017 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2021 if (bio_flags & EXTENT_BIO_COMPRESSED) {
2022 ret = btrfs_submit_compressed_read(inode, bio,
2026 } else if (!skip_sum) {
2027 ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2032 } else if (async && !skip_sum) {
2033 /* csum items have already been cloned */
2034 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2036 /* we're doing a write, do the async checksumming */
2037 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2039 __btrfs_submit_bio_start,
2040 __btrfs_submit_bio_done);
2042 } else if (!skip_sum) {
2043 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2049 ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2053 bio->bi_status = ret;
2060 * given a list of ordered sums, record them in the inode. This happens
2061 * at IO completion time based on sums calculated at bio submission time.
2063 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2064 struct inode *inode, struct list_head *list)
2066 struct btrfs_ordered_sum *sum;
2069 list_for_each_entry(sum, list, list) {
2070 trans->adding_csums = true;
2071 ret = btrfs_csum_file_blocks(trans,
2072 BTRFS_I(inode)->root->fs_info->csum_root, sum);
2073 trans->adding_csums = false;
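/*
 * Mark the given range as delalloc in the inode's io_tree; real extents
 * are allocated for it later at writeback time.
 */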
2080 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2081 unsigned int extra_bits,
2082 struct extent_state **cached_state, int dedupe)
2084 WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2085 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2086 extra_bits, cached_state);
2089 /* see btrfs_writepage_start_hook for details on why this is required */
2090 struct btrfs_writepage_fixup {
2092 struct btrfs_work work;
2095 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2097 struct btrfs_writepage_fixup *fixup;
2098 struct btrfs_ordered_extent *ordered;
2099 struct extent_state *cached_state = NULL;
2100 struct extent_changeset *data_reserved = NULL;
2102 struct inode *inode;
2107 fixup = container_of(work, struct btrfs_writepage_fixup, work);
2111 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2112 ClearPageChecked(page);
2116 inode = page->mapping->host;
2117 page_start = page_offset(page);
2118 page_end = page_offset(page) + PAGE_SIZE - 1;
2120 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2123 /* already ordered? We're done */
2124 if (PagePrivate2(page))
2127 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2130 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2131 page_end, &cached_state);
2133 btrfs_start_ordered_extent(inode, ordered, 1);
2134 btrfs_put_ordered_extent(ordered);
2138 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2141 mapping_set_error(page->mapping, ret);
2142 end_extent_writepage(page, ret, page_start, page_end);
2143 ClearPageChecked(page);
2147 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2150 mapping_set_error(page->mapping, ret);
2151 end_extent_writepage(page, ret, page_start, page_end);
2152 ClearPageChecked(page);
2156 ClearPageChecked(page);
2157 set_page_dirty(page);
2158 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2160 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2166 extent_changeset_free(data_reserved);
2170 * There are a few paths in the higher layers of the kernel that directly
2171 * set the page dirty bit without asking the filesystem if it is a
2172 * good idea. This causes problems because we want to make sure COW
2173 * properly happens and the data=ordered rules are followed.
2175 * In our case any range that doesn't have the ORDERED bit set
2177 * hasn't been properly set up for IO. We kick off an async process
2177 * to fix it up. The async helper will wait for ordered extents, set
2178 * the delalloc bit and make it safe to write the page.
2180 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2182 struct inode *inode = page->mapping->host;
2183 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2184 struct btrfs_writepage_fixup *fixup;
2186 /* this page is properly in the ordered list */
2187 if (TestClearPagePrivate2(page))
2190 if (PageChecked(page))
2193 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2197 SetPageChecked(page);
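/*
 * Setting Checked marks that a fixup is already queued for this page, so the
 * PageChecked test above avoids queueing a second one; the worker clears the
 * flag when it is done.
 */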
2199 btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2200 btrfs_writepage_fixup_worker, NULL, NULL);
2202 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
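/*
 * insert_reserved_file_extent() below records the file extent item for data
 * written into a reserved extent, releases the qgroup reservation for the
 * range and adds the corresponding extent reference.
 */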
2206 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2207 struct inode *inode, u64 file_pos,
2208 u64 disk_bytenr, u64 disk_num_bytes,
2209 u64 num_bytes, u64 ram_bytes,
2210 u8 compression, u8 encryption,
2211 u16 other_encoding, int extent_type)
2213 struct btrfs_root *root = BTRFS_I(inode)->root;
2214 struct btrfs_file_extent_item *fi;
2215 struct btrfs_path *path;
2216 struct extent_buffer *leaf;
2217 struct btrfs_key ins;
2219 int extent_inserted = 0;
2222 path = btrfs_alloc_path();
2227 * we may be replacing one extent in the tree with another.
2228 * The new extent is pinned in the extent map, and we don't want
2229 * to drop it from the cache until it is completely in the btree.
2231 * So, tell btrfs_drop_extents to leave this extent in the cache.
2232 * The caller is expected to unpin it and allow it to be merged
2235 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2236 file_pos + num_bytes, NULL, 0,
2237 1, sizeof(*fi), &extent_inserted);
2241 if (!extent_inserted) {
2242 ins.objectid = btrfs_ino(BTRFS_I(inode));
2243 ins.offset = file_pos;
2244 ins.type = BTRFS_EXTENT_DATA_KEY;
2246 path->leave_spinning = 1;
2247 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2252 leaf = path->nodes[0];
2253 fi = btrfs_item_ptr(leaf, path->slots[0],
2254 struct btrfs_file_extent_item);
2255 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2256 btrfs_set_file_extent_type(leaf, fi, extent_type);
2257 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2258 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2259 btrfs_set_file_extent_offset(leaf, fi, 0);
2260 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2261 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2262 btrfs_set_file_extent_compression(leaf, fi, compression);
2263 btrfs_set_file_extent_encryption(leaf, fi, encryption);
2264 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2266 btrfs_mark_buffer_dirty(leaf);
2267 btrfs_release_path(path);
2269 inode_add_bytes(inode, num_bytes);
2271 ins.objectid = disk_bytenr;
2272 ins.offset = disk_num_bytes;
2273 ins.type = BTRFS_EXTENT_ITEM_KEY;
2276 * Release the reserved range from the inode's dirty range map, as it has
2277 * already been moved into the delayed_ref_head
2279 ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
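/*
 * On success btrfs_qgroup_release_data() returns the number of bytes that
 * were actually released; that count (qg_released) is what gets charged to
 * the reserved file extent reference below.
 */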
2283 ret = btrfs_alloc_reserved_file_extent(trans, root,
2284 btrfs_ino(BTRFS_I(inode)),
2285 file_pos, qg_released, &ins);
2287 btrfs_free_path(path);
2292 /* snapshot-aware defrag */
2293 struct sa_defrag_extent_backref {
2294 struct rb_node node;
2295 struct old_sa_defrag_extent *old;
2304 struct old_sa_defrag_extent {
2305 struct list_head list;
2306 struct new_sa_defrag_extent *new;
2315 struct new_sa_defrag_extent {
2316 struct rb_root root;
2317 struct list_head head;
2318 struct btrfs_path *path;
2319 struct inode *inode;
2327 static int backref_comp(struct sa_defrag_extent_backref *b1,
2328 struct sa_defrag_extent_backref *b2)
2330 if (b1->root_id < b2->root_id)
2332 else if (b1->root_id > b2->root_id)
2335 if (b1->inum < b2->inum)
2337 else if (b1->inum > b2->inum)
2340 if (b1->file_pos < b2->file_pos)
2342 else if (b1->file_pos > b2->file_pos)
2346 * [------------------------------] ===> (a range of space)
2347 * |<--->| |<---->| =============> (fs/file tree A)
2348 * |<---------------------------->| ===> (fs/file tree B)
2350 * A range of space can refer to two file extents in one tree while
2351 * referring to only one file extent in another tree.
2353 * So we may process a disk offset more than once (two extents in A),
2354 * land on the same extent (one extent in B), and then insert two identical
2355 * backrefs (both referring to the extent in B).
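/*
 * backref_insert() below therefore tolerates duplicates: entries that compare
 * equal are simply linked to the right instead of being rejected.
 */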
2360 static void backref_insert(struct rb_root *root,
2361 struct sa_defrag_extent_backref *backref)
2363 struct rb_node **p = &root->rb_node;
2364 struct rb_node *parent = NULL;
2365 struct sa_defrag_extent_backref *entry;
2370 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2372 ret = backref_comp(backref, entry);
2376 p = &(*p)->rb_right;
2379 rb_link_node(&backref->node, parent, p);
2380 rb_insert_color(&backref->node, root);
2384 * Note the backref might have changed, and in this case we just return 0.
2386 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2389 struct btrfs_file_extent_item *extent;
2390 struct old_sa_defrag_extent *old = ctx;
2391 struct new_sa_defrag_extent *new = old->new;
2392 struct btrfs_path *path = new->path;
2393 struct btrfs_key key;
2394 struct btrfs_root *root;
2395 struct sa_defrag_extent_backref *backref;
2396 struct extent_buffer *leaf;
2397 struct inode *inode = new->inode;
2398 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
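/*
 * References from the inode being defragged itself need no relinking; only
 * backrefs from other roots/inodes are collected here.
 */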
2404 if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2405 inum == btrfs_ino(BTRFS_I(inode)))
2408 key.objectid = root_id;
2409 key.type = BTRFS_ROOT_ITEM_KEY;
2410 key.offset = (u64)-1;
2412 root = btrfs_read_fs_root_no_name(fs_info, &key);
2414 if (PTR_ERR(root) == -ENOENT)
2417 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2418 inum, offset, root_id);
2419 return PTR_ERR(root);
2422 key.objectid = inum;
2423 key.type = BTRFS_EXTENT_DATA_KEY;
2424 if (offset > (u64)-1 << 32)
2427 key.offset = offset;
2429 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2430 if (WARN_ON(ret < 0))
2437 leaf = path->nodes[0];
2438 slot = path->slots[0];
2440 if (slot >= btrfs_header_nritems(leaf)) {
2441 ret = btrfs_next_leaf(root, path);
2444 } else if (ret > 0) {
2453 btrfs_item_key_to_cpu(leaf, &key, slot);
2455 if (key.objectid > inum)
2458 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2461 extent = btrfs_item_ptr(leaf, slot,
2462 struct btrfs_file_extent_item);
2464 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2468 * 'offset' refers to the exact key.offset,
2469 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2470 * (key.offset - extent_offset).
2472 if (key.offset != offset)
2475 extent_offset = btrfs_file_extent_offset(leaf, extent);
2476 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
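/*
 * Skip extents whose slice of the shared disk extent does not overlap the
 * slice covered by the old extent we are relinking.
 */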
2478 if (extent_offset >= old->extent_offset + old->offset +
2479 old->len || extent_offset + num_bytes <=
2480 old->extent_offset + old->offset)
2485 backref = kmalloc(sizeof(*backref), GFP_NOFS);
2491 backref->root_id = root_id;
2492 backref->inum = inum;
2493 backref->file_pos = offset;
2494 backref->num_bytes = num_bytes;
2495 backref->extent_offset = extent_offset;
2496 backref->generation = btrfs_file_extent_generation(leaf, extent);
2498 backref_insert(&new->root, backref);
2501 btrfs_release_path(path);
2506 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2507 struct new_sa_defrag_extent *new)
2509 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2510 struct old_sa_defrag_extent *old, *tmp;
2515 list_for_each_entry_safe(old, tmp, &new->head, list) {
2516 ret = iterate_inodes_from_logical(old->bytenr +
2517 old->extent_offset, fs_info,
2518 path, record_one_backref,
2520 if (ret < 0 && ret != -ENOENT)
2523 /* no backref to be processed for this extent */
2525 list_del(&old->list);
2530 if (list_empty(&new->head))
2536 static int relink_is_mergable(struct extent_buffer *leaf,
2537 struct btrfs_file_extent_item *fi,
2538 struct new_sa_defrag_extent *new)
2540 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2543 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2546 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2549 if (btrfs_file_extent_encryption(leaf, fi) ||
2550 btrfs_file_extent_other_encoding(leaf, fi))
2557 * Note the backref might have changed, and in this case we just return 0.
2559 static noinline int relink_extent_backref(struct btrfs_path *path,
2560 struct sa_defrag_extent_backref *prev,
2561 struct sa_defrag_extent_backref *backref)
2563 struct btrfs_file_extent_item *extent;
2564 struct btrfs_file_extent_item *item;
2565 struct btrfs_ordered_extent *ordered;
2566 struct btrfs_trans_handle *trans;
2567 struct btrfs_root *root;
2568 struct btrfs_key key;
2569 struct extent_buffer *leaf;
2570 struct old_sa_defrag_extent *old = backref->old;
2571 struct new_sa_defrag_extent *new = old->new;
2572 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2573 struct inode *inode;
2574 struct extent_state *cached = NULL;
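/*
 * A backref that is contiguous with the previous one from the same file lets
 * the new file extent items be merged into a single item.
 */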
2583 if (prev && prev->root_id == backref->root_id &&
2584 prev->inum == backref->inum &&
2585 prev->file_pos + prev->num_bytes == backref->file_pos)
2588 /* step 1: get root */
2589 key.objectid = backref->root_id;
2590 key.type = BTRFS_ROOT_ITEM_KEY;
2591 key.offset = (u64)-1;
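/*
 * subvol_srcu keeps the subvolume root from being freed while we resolve it
 * and grab the inode.
 */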
2593 index = srcu_read_lock(&fs_info->subvol_srcu);
2595 root = btrfs_read_fs_root_no_name(fs_info, &key);
2597 srcu_read_unlock(&fs_info->subvol_srcu, index);
2598 if (PTR_ERR(root) == -ENOENT)
2600 return PTR_ERR(root);
2603 if (btrfs_root_readonly(root)) {
2604 srcu_read_unlock(&fs_info->subvol_srcu, index);
2608 /* step 2: get inode */
2609 key.objectid = backref->inum;
2610 key.type = BTRFS_INODE_ITEM_KEY;
2613 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2614 if (IS_ERR(inode)) {
2615 srcu_read_unlock(&fs_info->subvol_srcu, index);
2619 srcu_read_unlock(&fs_info->subvol_srcu, index);
2621 /* step 3: relink backref */
2622 lock_start = backref->file_pos;
2623 lock_end = backref->file_pos + backref->num_bytes - 1;
2624 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2627 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
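/*
 * An ordered extent here means the range is under writeback right now; back
 * off rather than relinking underneath it.
 */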
2629 btrfs_put_ordered_extent(ordered);
2633 trans = btrfs_join_transaction(root);
2634 if (IS_ERR(trans)) {
2635 ret = PTR_ERR(trans);
2639 key.objectid = backref->inum;
2640 key.type = BTRFS_EXTENT_DATA_KEY;
2641 key.offset = backref->file_pos;
2643 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2646 } else if (ret > 0) {
2651 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2652 struct btrfs_file_extent_item);
2654 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2655 backref->generation)
2658 btrfs_release_path(path);
2660 start = backref->file_pos;
2661 if (backref->extent_offset < old->extent_offset + old->offset)
2662 start += old->extent_offset + old->offset -
2663 backref->extent_offset;
2665 len = min(backref->extent_offset + backref->num_bytes,
2666 old->extent_offset + old->offset + old->len);
2667 len -= max(backref->extent_offset, old->extent_offset + old->offset);
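/*
 * start/len now describe the overlap between this backref's file range and
 * the part of the old extent that was actually defragged.
 */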
2669 ret = btrfs_drop_extents(trans, root, inode, start,
2674 key.objectid = btrfs_ino(BTRFS_I(inode));
2675 key.type = BTRFS_EXTENT_DATA_KEY;
2678 path->leave_spinning = 1;
2680 struct btrfs_file_extent_item *fi;
2682 struct btrfs_key found_key;
2684 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2689 leaf = path->nodes[0];
2690 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2692 fi = btrfs_item_ptr(leaf, path->slots[0],
2693 struct btrfs_file_extent_item);
2694 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2696 if (extent_len + found_key.offset == start &&
2697 relink_is_mergable(leaf, fi, new)) {
2698 btrfs_set_file_extent_num_bytes(leaf, fi,
2700 btrfs_mark_buffer_dirty(leaf);
2701 inode_add_bytes(inode, len);
2707 btrfs_release_path(path);
2712 ret = btrfs_insert_empty_item(trans, root, path, &key,
2715 btrfs_abort_transaction(trans, ret);
2719 leaf = path->nodes[0];
2720 item = btrfs_item_ptr(leaf, path->slots[0],
2721 struct btrfs_file_extent_item);
2722 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2723 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2724 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2725 btrfs_set_file_extent_num_bytes(leaf, item, len);
2726 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2727 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2728 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2729 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2730 btrfs_set_file_extent_encryption(leaf, item, 0);
2731 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2733 btrfs_mark_buffer_dirty(leaf);
2734 inode_add_bytes(inode, len);
2735 btrfs_release_path(path);
2737 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2739 backref->root_id, backref->inum,
2740 new->file_pos); /* start - extent_offset */
2742 btrfs_abort_transaction(trans, ret);
2748 btrfs_release_path(path);
2749 path->leave_spinning = 0;
2750 btrfs_end_transaction(trans);
2752 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2758 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2760 struct old_sa_defrag_extent *old, *tmp;
2765 list_for_each_entry_safe(old, tmp, &new->head, list) {
2771 static void relink_file_extents(struct new_sa_defrag_extent *new)
2773 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2774 struct btrfs_path *path;
2775 struct sa_defrag_extent_backref *backref;
2776 struct sa_defrag_extent_backref *prev = NULL;
2777 struct inode *inode;
2778 struct btrfs_root *root;
2779 struct rb_node *node;
2783 root = BTRFS_I(inode)->root;
2785 path = btrfs_alloc_path();
2789 if (!record_extent_backrefs(path, new)) {
2790 btrfs_free_path(path);
2793 btrfs_release_path(path);
2796 node = rb_first(&new->root);
2799 rb_erase(node, &new->root);
2801 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2803 ret = relink_extent_backref(path, prev, backref);
2816 btrfs_free_path(path);
2818 free_sa_defrag_extent(new);
2820 atomic_dec(&fs_info->defrag_running);
2821 wake_up(&fs_info->transaction_wait);
2824 static struct new_sa_defrag_extent *
2825 record_old_file_extents(struct inode *inode,
2826 struct btrfs_ordered_extent *ordered)
2828 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2829 struct btrfs_root *root = BTRFS_I(inode)->root;
2830 struct btrfs_path *path;
2831 struct btrfs_key key;
2832 struct old_sa_defrag_extent *old;
2833 struct new_sa_defrag_extent *new;
2836 new = kmalloc(sizeof(*new), GFP_NOFS);
2841 new->file_pos = ordered->file_offset;
2842 new->len = ordered->len;
2843 new->bytenr = ordered->start;
2844 new->disk_len = ordered->disk_len;
2845 new->compress_type = ordered->compress_type;
2846 new->root = RB_ROOT;
2847 INIT_LIST_HEAD(&new->head);
2849 path = btrfs_alloc_path();
2853 key.objectid = btrfs_ino(BTRFS_I(inode));
2854 key.type = BTRFS_EXTENT_DATA_KEY;
2855 key.offset = new->file_pos;
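/*
 * Look up the file extent item covering file_pos; an inexact match lands one
 * slot past it, which the slot check right after the search corrects.
 */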
2857 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2860 if (ret > 0 && path->slots[0] > 0)
2863 /* find out all the old extents for the file range */
2865 struct btrfs_file_extent_item *extent;
2866 struct extent_buffer *l;
2875 slot = path->slots[0];
2877 if (slot >= btrfs_header_nritems(l)) {
2878 ret = btrfs_next_leaf(root, path);
2886 btrfs_item_key_to_cpu(l, &key, slot);
2888 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2890 if (key.type != BTRFS_EXTENT_DATA_KEY)
2892 if (key.offset >= new->file_pos + new->len)
2895 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2897 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2898 if (key.offset + num_bytes < new->file_pos)
2901 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2905 extent_offset = btrfs_file_extent_offset(l, extent);
2907 old = kmalloc(sizeof(*old), GFP_NOFS);
2911 offset = max(new->file_pos, key.offset);
2912 end = min(new->file_pos + new->len, key.offset + num_bytes);
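/* Only remember the part of the old extent that overlaps the written range. */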
2914 old->bytenr = disk_bytenr;
2915 old->extent_offset = extent_offset;
2916 old->offset = offset - key.offset;
2917 old->len = end - offset;
2920 list_add_tail(&old->list, &new->head);
2926 btrfs_free_path(path);
2927 atomic_inc(&fs_info->defrag_running);
2932 btrfs_free_path(path);
2934 free_sa_defrag_extent(new);
2938 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2941 struct btrfs_block_group_cache *cache;
2943 cache = btrfs_lookup_block_group(fs_info, start);
2946 spin_lock(&cache->lock);
2947 cache->delalloc_bytes -= len;
2948 spin_unlock(&cache->lock);
2950 btrfs_put_block_group(cache);
2953 /* as ordered data IO finishes, this gets called so we can finish
2954 * an ordered extent if the range of bytes in the file it covers is fully written.
2957 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2959 struct inode *inode = ordered_extent->inode;
2960 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2961 struct btrfs_root *root = BTRFS_I(inode)->root;
2962 struct btrfs_trans_handle *trans = NULL;
2963 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2964 struct extent_state *cached_state = NULL;
2965 struct new_sa_defrag_extent *new = NULL;
2966 int compress_type = 0;
2968 u64 logical_len = ordered_extent->len;
2970 bool truncated = false;
2971 bool range_locked = false;
2972 bool clear_new_delalloc_bytes = false;
2974 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2975 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2976 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2977 clear_new_delalloc_bytes = true;
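/*
 * The EXTENT_DELALLOC_NEW bit (and the new_delalloc_bytes accounting behind
 * it) is only used for ordinary buffered COW writes, so it only needs to be
 * cleared when the ordered extent is neither NOCOW, prealloc nor direct IO.
 */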
2979 nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2981 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2986 btrfs_free_io_failure_record(BTRFS_I(inode),
2987 ordered_extent->file_offset,
2988 ordered_extent->file_offset +
2989 ordered_extent->len - 1);
2991 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2993 logical_len = ordered_extent->truncated_len;
2994 /* Truncated the entire extent, don't bother adding */
2999 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3000 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3003 * For the mwrite (mmap + memset to write) case, we still reserve
3004 * space for the NOCOW range.
3005 * As NOCOW won't cause a new delayed ref, just free the space
3007 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3008 ordered_extent->len);
3009 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3011 trans = btrfs_join_transaction_nolock(root);
3013 trans = btrfs_join_transaction(root);
3014 if (IS_ERR(trans)) {
3015 ret = PTR_ERR(trans);
3019 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3020 ret = btrfs_update_inode_fallback(trans, root, inode);
3021 if (ret) /* -ENOMEM or corruption */
3022 btrfs_abort_transaction(trans, ret);
3026 range_locked = true;
3027 lock_extent_bits(io_tree, ordered_extent->file_offset,
3028 ordered_extent->file_offset + ordered_extent->len - 1,
3031 ret = test_range_bit(io_tree, ordered_extent->file_offset,
3032 ordered_extent->file_offset + ordered_extent->len - 1,
3033 EXTENT_DEFRAG, 0, cached_state);
3035 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
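/*
 * Snapshot-aware defrag is currently disabled, hence the hard-coded 0 in the
 * condition below.
 */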
3036 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3037 /* the inode is shared */
3038 new = record_old_file_extents(inode, ordered_extent);
3040 clear_extent_bit(io_tree, ordered_extent->file_offset,
3041 ordered_extent->file_offset + ordered_extent->len - 1,
3042 EXTENT_DEFRAG, 0, 0, &cached_state);
3046 trans = btrfs_join_transaction_nolock(root);
3048 trans = btrfs_join_transaction(root);
3049 if (IS_ERR(trans)) {
3050 ret = PTR_ERR(trans);
3055 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3057 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3058 compress_type = ordered_extent->compress_type;
3059 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3060 BUG_ON(compress_type);
3061 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3062 ordered_extent->len);
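/*
 * A preallocated extent already has a file extent item on disk; finishing the
 * write just flips the covered range from prealloc to a regular written
 * extent.
 */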
3063 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3064 ordered_extent->file_offset,
3065 ordered_extent->file_offset +
3068 BUG_ON(root == fs_info->tree_root);
3069 ret = insert_reserved_file_extent(trans, inode,