// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"
#include "delalloc-space.h"

struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

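/*
 * State tracked across the direct IO callbacks: the space still reserved
 * for the write and the ordered extent range not yet submitted.
 */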
struct btrfs_dio_data {
        u64 reserve;
        u64 unsubmitted_oe_range_start;
        u64 unsubmitted_oe_range_end;
        int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static void __endio_write_update_ordered(struct inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
                                                 struct page *locked_page,
                                                 u64 offset, u64 bytes)
{
        unsigned long index = offset >> PAGE_SHIFT;
        unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
        u64 page_start = page_offset(locked_page);
        u64 page_end = page_start + PAGE_SIZE - 1;

        struct page *page;

        while (index <= end_index) {
                page = find_get_page(inode->i_mapping, index);
                index++;
                if (!page)
                        continue;
                ClearPagePrivate2(page);
                put_page(page);
        }

        /*
         * In case this page belongs to the delalloc range being instantiated
         * then skip it, since the first page of a range is going to be
         * properly cleaned up by the caller of run_delalloc_range
         */
        if (page_start >= offset && page_end <= (offset + bytes - 1)) {
                offset += PAGE_SIZE;
                bytes -= PAGE_SIZE;
        }

        return __endio_write_update_ordered(inode, offset, bytes, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

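/*
 * Set up the security attributes of a newly created inode: inherit ACLs
 * from the parent directory and initialize the security xattrs.
 */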
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = offset_in_page(start);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

fail:
        return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &BTRFS_I(inode)->block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
        /*
         * Don't forget to free the reserved space: an inline extent doesn't
         * count as a data extent, so its reservation must be freed directly
         * here.  At reserve time the space is always aligned to page size,
         * so just free one page.
         */
        btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}

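/*
 * An async_extent is one contiguous range produced by the compression
 * phase, queued on its async_chunk until the submit phase writes it out.
 */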
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

struct async_chunk {
        struct inode *inode;
        struct page *locked_page;
        u64 start;
        u64 end;
        unsigned int write_flags;
        struct list_head extents;
        struct btrfs_work work;
        atomic_t *pending;
};

struct async_cow {
        /* Number of chunks in flight; must be first in the structure */
        atomic_t num_chunks;
        struct async_chunk chunks[];
};

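/*
 * Queue a (possibly compressed) extent produced by the first phase of
 * compression for the second phase to allocate and submit.
 */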
static noinline int add_async_extent(struct async_chunk *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* defrag ioctl */
        if (BTRFS_I(inode)->defrag_compress)
                return 1;
        /* bad compression ratios */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
            BTRFS_I(inode)->prop_compress)
                return btrfs_compress_heuristic(inode, start, end);
        return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct async_chunk *async_chunk,
                                         int *num_added)
{
        struct inode *inode = async_chunk->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 blocksize = fs_info->sectorsize;
        u64 start = async_chunk->start;
        u64 end = async_chunk->end;
        u64 actual_end;
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        actual_end = min_t(u64, i_size_read(inode), end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode, start, end)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        nr_pages = 0;
                        goto cont;
                }

                if (BTRFS_I(inode)->defrag_compress)
                        compress_type = BTRFS_I(inode)->defrag_compress;
                else if (BTRFS_I(inode)->prop_compress)
                        compress_type = BTRFS_I(inode)->prop_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 *
                 * Note that the remaining part is redirtied, the start pointer
                 * has moved, the end is the original one.
                 */
                if (!redirty) {
                        extent_range_clear_dirty_for_io(inode, start, end);
                        redirty = 1;
                }

                /* Compression level is applied here and only here */
                ret = btrfs_compress_pages(
                        compress_type | (fs_info->compress_level << 4),
                                           inode->i_mapping, start,
                                           pages,
                                           &nr_pages,
                                           &total_in,
                                           &total_compressed);

                if (!ret) {
                        unsigned long offset = offset_in_page(total_compressed);
                        struct page *page = pages[nr_pages - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* lets try to make an inline extent */
                if (ret || total_in < actual_end) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(inode, start, end, 0,
                                                    BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                EXTENT_DO_ACCOUNTING;
                        unsigned long page_error_op;

                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         *
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be done _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, clear_flags,
                                                     PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk,
                 * compression must free at least one sector size
                 */
                total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed + blocksize <= total_in) {
                        *num_added += 1;

                        /*
                         * The async work queues will take care of doing actual
                         * allocation on disk for these compressed pages, and
                         * will submit them to the elevator.
                         */
                        add_async_extent(async_chunk, start, total_in,
                                        total_compressed, pages, nr_pages,
                                        compress_type);

                        if (start + total_in < end) {
                                start += total_in;
                                pages = NULL;
                                cond_resched();
                                goto again;
                        }
                        return;
                }
        }
        if (pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages; i++) {
                        WARN_ON(pages[i]->mapping);
                        put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->prop_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
cleanup_and_bail_uncompressed:
        /*
         * No compression, but we still need to write the pages in the file
         * we've been given so far.  redirty the locked page if it corresponds
         * to our extent and set things up for the async work queue to run
         * cow_file_range to do the normal delalloc dance.
         */
        if (page_offset(async_chunk->locked_page) >= start &&
            page_offset(async_chunk->locked_page) <= end)
                __set_page_dirty_nobuffers(async_chunk->locked_page);
                /* unlocked later on in the async handlers */

        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
                         BTRFS_COMPRESS_NONE);
        *num_added += 1;

        return;

free_pages_out:
        for (i = 0; i < nr_pages; i++) {
                WARN_ON(pages[i]->mapping);
                put_page(pages[i]);
        }
        kfree(pages);
}

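/* Drop our references on the compressed pages and free the page array. */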
static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->pages)
                return;

        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
                put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
        async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
        struct inode *inode = async_chunk->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        int ret = 0;

again:
        while (!list_empty(&async_chunk->extents)) {
                async_extent = list_entry(async_chunk->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

retry:
                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_chunk->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
                                             NULL);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(inode,
                                                  async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_chunk->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        free_async_extent_pages(async_extent);

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fall back to uncompressed IO, otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                em = create_io_em(inode, async_extent->start,
                                  async_extent->ram_size, /* len */
                                  async_extent->start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  async_extent->ram_size, /* ram_bytes */
                                  async_extent->compress_type,
                                  BTRFS_ORDERED_COMPRESSED);
                if (IS_ERR(em))
                        /* no ret value needed, this function returns void */
                        goto out_free_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(BTRFS_I(inode),
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                if (btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages,
                                    async_chunk->write_flags)) {
                        struct page *p = async_extent->pages[0];
                        const u64 start = async_extent->start;
                        const u64 end = start + async_extent->ram_size - 1;

                        p->mapping = inode->i_mapping;
                        btrfs_writepage_endio_finish_ordered(p, start, end, 0);

                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, 0,
                                                     PAGE_END_WRITEBACK |
                                                     PAGE_SET_ERROR);
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        return;
out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
}

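/*
 * Find an allocation hint for this range: prefer the block start of an
 * existing mapping at @start, falling back to the first mapped extent in
 * the file.  Returns 0 when no useful hint exists.
 */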
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

        if (start == 0) {
                /* lets try to make an inline extent */
                ret = cow_file_range_inline(inode, start, end, 0,
                                            BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        /*
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be run _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);

        while (num_bytes > 0) {
                cur_alloc_size = num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                extent_reserved = true;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out_reserve;
                }
                free_extent_map(em);

                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
                         * Only drop cache here, and process as normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at out_unlock label to free meta of this ordered
                         * extent, as its meta should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_cache(BTRFS_I(inode), start,
                                                start + ram_size - 1, 0);
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                page_ops = unlock ? PAGE_UNLOCK : 0;
                page_ops |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                if (num_bytes < cur_alloc_size)
                        num_bytes = 0;
                else
                        num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * btrfs_reloc_clone_csums() error, since start is increased
                 * extent_clear_unlock_delalloc() at out_unlock label won't
                 * free metadata of current ordered extent, we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                PAGE_END_WRITEBACK;
        /*
         * If we reserved an extent for our delalloc range (or a subrange) and
         * failed to create the respective ordered extent, then it means that
         * when we reserved the extent we decremented the extent's size from
         * the data space_info's bytes_may_use counter and incremented the
         * space_info's bytes_reserved counter by the same amount. We must make
         * sure extent_clear_unlock_delalloc() does not try to decrement again
         * the data space_info's bytes_may_use counter, therefore we do not pass
         * it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size,
                                             start + cur_alloc_size,
                                             locked_page,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
                if (start >= end)
                        goto out;
        }
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
        goto out;
}

/*
 * work queue callback to start compression on a file and its pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_chunk *async_chunk;
        int num_added = 0;

        async_chunk = container_of(work, struct async_chunk, work);

        compress_file_range(async_chunk, &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_chunk->inode);
                async_chunk->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct async_chunk *async_chunk = container_of(work, struct async_chunk,
                                                     work);
        struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
        unsigned long nr_pages;

        nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /* atomic_sub_return implies a barrier */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M)
                cond_wake_up_nomb(&fs_info->async_submit_wait);

        /*
         * ->inode could be NULL if async_cow_start has failed to compress,
         * in which case we don't have anything to submit, yet we need to
         * always adjust ->async_delalloc_pages as it's paired with the init
         * happening in cow_file_range_async
         */
        if (async_chunk->inode)
                submit_compressed_extents(async_chunk);
}

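/*
 * Final work queue callback: drop the inode reference and, once the last
 * chunk completes, free the whole async_cow context.
 */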
static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_chunk *async_chunk;

        async_chunk = container_of(work, struct async_chunk, work);
        if (async_chunk->inode)
                btrfs_add_delayed_iput(async_chunk->inode);
        /*
         * Since the pointer to 'pending' is at the beginning of the array of
         * async_chunk's, freeing it ensures the whole array has been freed.
         */
        if (atomic_dec_and_test(async_chunk->pending))
                kvfree(async_chunk->pending);
}

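/*
 * Kick off the delalloc range as a series of async chunks (512K each when
 * compressing, a single chunk otherwise) on the delalloc workqueue; the
 * pages are considered started once the chunks are queued.
 */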
1180 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1181                                 u64 start, u64 end, int *page_started,
1182                                 unsigned long *nr_written,
1183                                 unsigned int write_flags)
1184 {
1185         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1186         struct async_cow *ctx;
1187         struct async_chunk *async_chunk;
1188         unsigned long nr_pages;
1189         u64 cur_end;
1190         u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1191         int i;
1192         bool should_compress;
1193         unsigned nofs_flag;
1194
1195         unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
1196
1197         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1198             !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
1199                 num_chunks = 1;
1200                 should_compress = false;
1201         } else {
1202                 should_compress = true;
1203         }
1204
1205         nofs_flag = memalloc_nofs_save();
1206         ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1207         memalloc_nofs_restore(nofs_flag);
1208
1209         if (!ctx) {
1210                 unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
1211                         EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1212                         EXTENT_DO_ACCOUNTING;
1213                 unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1214                         PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
1215                         PAGE_SET_ERROR;
1216
1217                 extent_clear_unlock_delalloc(inode, start, end, 0, locked_page,
1218                                              clear_bits, page_ops);
1219                 return -ENOMEM;
1220         }
1221
1222         async_chunk = ctx->chunks;
1223         atomic_set(&ctx->num_chunks, num_chunks);
1224
1225         for (i = 0; i < num_chunks; i++) {
1226                 if (should_compress)
1227                         cur_end = min(end, start + SZ_512K - 1);
1228                 else
1229                         cur_end = end;
1230
1231                 /*
1232                  * igrab is called higher up in the call chain, take only the
1233                  * lightweight reference for the callback lifetime
1234                  */
1235                 ihold(inode);
1236                 async_chunk[i].pending = &ctx->num_chunks;
1237                 async_chunk[i].inode = inode;
1238                 async_chunk[i].start = start;
1239                 async_chunk[i].end = cur_end;
1240                 async_chunk[i].locked_page = locked_page;
1241                 async_chunk[i].write_flags = write_flags;
1242                 INIT_LIST_HEAD(&async_chunk[i].extents);
1243
1244                 btrfs_init_work(&async_chunk[i].work,
1245                                 btrfs_delalloc_helper,
1246                                 async_cow_start, async_cow_submit,
1247                                 async_cow_free);
1248
1249                 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1250                 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1251
1252                 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1253
1254                 *nr_written += nr_pages;
1255                 start = cur_end + 1;
1256         }
1257         *page_started = 1;
1258         return 0;
1259 }
1260
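/*
 * Check whether any csums exist for the disk range [bytenr, bytenr +
 * num_bytes).  Returns 1 if at least one csum item covers the range, 0 if
 * none do, and a negative errno on lookup failure; any sums found during
 * the lookup are freed before returning.
 */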
1261 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1262                                         u64 bytenr, u64 num_bytes)
1263 {
1264         int ret;
1265         struct btrfs_ordered_sum *sums;
1266         LIST_HEAD(list);
1267
1268         ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1269                                        bytenr + num_bytes - 1, &list, 0);
1270         if (ret == 0 && list_empty(&list))
1271                 return 0;
1272
1273         while (!list_empty(&list)) {
1274                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1275                 list_del(&sums->list);
1276                 kfree(sums);
1277         }
1278         if (ret < 0)
1279                 return ret;
1280         return 1;
1281 }
1282
1283 /*
1284  * Called for the nocow writeback path.  This checks for snapshots or COW
1285  * copies of the extents that exist in the file, and COWs the file as required.
1286  *
1287  * If no COW copies or snapshots exist, we write directly to the existing
1288  * blocks on disk.
1289  */
1290 static noinline int run_delalloc_nocow(struct inode *inode,
1291                                        struct page *locked_page,
1292                                        u64 start, u64 end, int *page_started,
1293                                        int force, unsigned long *nr_written)
1294 {
1295         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1296         struct btrfs_root *root = BTRFS_I(inode)->root;
1297         struct extent_buffer *leaf;
1298         struct btrfs_path *path;
1299         struct btrfs_file_extent_item *fi;
1300         struct btrfs_key found_key;
1301         struct extent_map *em;
1302         u64 cow_start;
1303         u64 cur_offset;
1304         u64 extent_end;
1305         u64 extent_offset;
1306         u64 disk_bytenr;
1307         u64 num_bytes;
1308         u64 disk_num_bytes;
1309         u64 ram_bytes;
1310         int extent_type;
1311         int ret;
1312         int type;
1313         int nocow;
1314         int check_prev = 1;
1315         bool nolock;
1316         u64 ino = btrfs_ino(BTRFS_I(inode));
1317
1318         path = btrfs_alloc_path();
1319         if (!path) {
1320                 extent_clear_unlock_delalloc(inode, start, end, end,
1321                                              locked_page,
1322                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1323                                              EXTENT_DO_ACCOUNTING |
1324                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1325                                              PAGE_CLEAR_DIRTY |
1326                                              PAGE_SET_WRITEBACK |
1327                                              PAGE_END_WRITEBACK);
1328                 return -ENOMEM;
1329         }
1330
1331         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1332
1333         cow_start = (u64)-1;
1334         cur_offset = start;
1335         while (1) {
1336                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1337                                                cur_offset, 0);
1338                 if (ret < 0)
1339                         goto error;
1340                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1341                         leaf = path->nodes[0];
1342                         btrfs_item_key_to_cpu(leaf, &found_key,
1343                                               path->slots[0] - 1);
1344                         if (found_key.objectid == ino &&
1345                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1346                                 path->slots[0]--;
1347                 }
1348                 check_prev = 0;
1349 next_slot:
1350                 leaf = path->nodes[0];
1351                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1352                         ret = btrfs_next_leaf(root, path);
1353                         if (ret < 0) {
1354                                 if (cow_start != (u64)-1)
1355                                         cur_offset = cow_start;
1356                                 goto error;
1357                         }
1358                         if (ret > 0)
1359                                 break;
1360                         leaf = path->nodes[0];
1361                 }
1362
1363                 nocow = 0;
1364                 disk_bytenr = 0;
1365                 num_bytes = 0;
1366                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1367
1368                 if (found_key.objectid > ino)
1369                         break;
1370                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1371                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1372                         path->slots[0]++;
1373                         goto next_slot;
1374                 }
1375                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1376                     found_key.offset > end)
1377                         break;
1378
1379                 if (found_key.offset > cur_offset) {
1380                         extent_end = found_key.offset;
1381                         extent_type = 0;
1382                         goto out_check;
1383                 }
1384
1385                 fi = btrfs_item_ptr(leaf, path->slots[0],
1386                                     struct btrfs_file_extent_item);
1387                 extent_type = btrfs_file_extent_type(leaf, fi);
1388
1389                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1390                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1391                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1392                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1393                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1394                         extent_end = found_key.offset +
1395                                 btrfs_file_extent_num_bytes(leaf, fi);
1396                         disk_num_bytes =
1397                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1398                         if (extent_end <= start) {
1399                                 path->slots[0]++;
1400                                 goto next_slot;
1401                         }
1402                         if (disk_bytenr == 0)
1403                                 goto out_check;
1404                         if (btrfs_file_extent_compression(leaf, fi) ||
1405                             btrfs_file_extent_encryption(leaf, fi) ||
1406                             btrfs_file_extent_other_encoding(leaf, fi))
1407                                 goto out_check;
1408                         /*
1409                          * Do the same check as in btrfs_cross_ref_exist but
1410                          * without the unnecessary search.
1411                          */
1412                         if (!nolock &&
1413                             btrfs_file_extent_generation(leaf, fi) <=
1414                             btrfs_root_last_snapshot(&root->root_item))
1415                                 goto out_check;
1416                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1417                                 goto out_check;
1418                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1419                                 goto out_check;
1420                         ret = btrfs_cross_ref_exist(root, ino,
1421                                                     found_key.offset -
1422                                                     extent_offset, disk_bytenr);
1423                         if (ret) {
1424                                 /*
1425                                  * ret could be -EIO if the above fails to read
1426                                  * metadata.
1427                                  */
1428                                 if (ret < 0) {
1429                                         if (cow_start != (u64)-1)
1430                                                 cur_offset = cow_start;
1431                                         goto error;
1432                                 }
1433
1434                                 WARN_ON_ONCE(nolock);
1435                                 goto out_check;
1436                         }
1437                         disk_bytenr += extent_offset;
1438                         disk_bytenr += cur_offset - found_key.offset;
1439                         num_bytes = min(end + 1, extent_end) - cur_offset;
1440                         /*
1441                          * If there are pending snapshots for this root,
1442                          * we fall back to the common COW path.
1443                          */
1444                         if (!nolock && atomic_read(&root->snapshot_force_cow))
1445                                 goto out_check;
1446                         /*
1447                          * Force COW if csums exist in the range.  This
1448                          * ensures that the csums for a given extent are
1449                          * either all valid or do not exist.
1450                          */
1451                         ret = csum_exist_in_range(fs_info, disk_bytenr,
1452                                                   num_bytes);
1453                         if (ret) {
1454                                 /*
1455                                  * ret could be -EIO if the above fails to read
1456                                  * metadata.
1457                                  */
1458                                 if (ret < 0) {
1459                                         if (cow_start != (u64)-1)
1460                                                 cur_offset = cow_start;
1461                                         goto error;
1462                                 }
1463                                 WARN_ON_ONCE(nolock);
1464                                 goto out_check;
1465                         }
1466                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1467                                 goto out_check;
1468                         nocow = 1;
1469                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1470                         extent_end = found_key.offset +
1471                                 btrfs_file_extent_ram_bytes(leaf, fi);
1472                         extent_end = ALIGN(extent_end,
1473                                            fs_info->sectorsize);
1474                 } else {
1475                         BUG();
1476                 }
1477 out_check:
1478                 if (extent_end <= start) {
1479                         path->slots[0]++;
1480                         if (nocow)
1481                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1482                         goto next_slot;
1483                 }
1484                 if (!nocow) {
1485                         if (cow_start == (u64)-1)
1486                                 cow_start = cur_offset;
1487                         cur_offset = extent_end;
1488                         if (cur_offset > end)
1489                                 break;
1490                         path->slots[0]++;
1491                         goto next_slot;
1492                 }
1493
1494                 btrfs_release_path(path);
1495                 if (cow_start != (u64)-1) {
1496                         ret = cow_file_range(inode, locked_page,
1497                                              cow_start, found_key.offset - 1,
1498                                              end, page_started, nr_written, 1,
1499                                              NULL);
1500                         if (ret) {
1501                                 if (nocow)
1502                                         btrfs_dec_nocow_writers(fs_info,
1503                                                                 disk_bytenr);
1504                                 goto error;
1505                         }
1506                         cow_start = (u64)-1;
1507                 }
1508
1509                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1510                         u64 orig_start = found_key.offset - extent_offset;
1511
1512                         em = create_io_em(inode, cur_offset, num_bytes,
1513                                           orig_start,
1514                                           disk_bytenr, /* block_start */
1515                                           num_bytes, /* block_len */
1516                                           disk_num_bytes, /* orig_block_len */
1517                                           ram_bytes, BTRFS_COMPRESS_NONE,
1518                                           BTRFS_ORDERED_PREALLOC);
1519                         if (IS_ERR(em)) {
1520                                 if (nocow)
1521                                         btrfs_dec_nocow_writers(fs_info,
1522                                                                 disk_bytenr);
1523                                 ret = PTR_ERR(em);
1524                                 goto error;
1525                         }
1526                         free_extent_map(em);
1527                 }
1528
1529                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1530                         type = BTRFS_ORDERED_PREALLOC;
1531                 } else {
1532                         type = BTRFS_ORDERED_NOCOW;
1533                 }
1534
1535                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1536                                                num_bytes, num_bytes, type);
1537                 if (nocow)
1538                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1539                 BUG_ON(ret); /* -ENOMEM */
1540
1541                 if (root->root_key.objectid ==
1542                     BTRFS_DATA_RELOC_TREE_OBJECTID)
1543                         /*
1544                          * Errors are handled later, as we must prevent
1545                          * extent_clear_unlock_delalloc() in the error handler
1546                          * from freeing metadata of the created ordered extent.
1547                          */
1548                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1549                                                       num_bytes);
1550
1551                 extent_clear_unlock_delalloc(inode, cur_offset,
1552                                              cur_offset + num_bytes - 1, end,
1553                                              locked_page, EXTENT_LOCKED |
1554                                              EXTENT_DELALLOC |
1555                                              EXTENT_CLEAR_DATA_RESV,
1556                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1557
1558                 cur_offset = extent_end;
1559
1560                 /*
1561                  * If btrfs_reloc_clone_csums() failed, it is now safe to call
1562                  * the error handler, as metadata for the created ordered
1563                  * extent will only be freed by btrfs_finish_ordered_io().
1564                  */
1565                 if (ret)
1566                         goto error;
1567                 if (cur_offset > end)
1568                         break;
1569         }
1570         btrfs_release_path(path);
1571
1572         if (cur_offset <= end && cow_start == (u64)-1)
1573                 cow_start = cur_offset;
1574
1575         if (cow_start != (u64)-1) {
1576                 cur_offset = end;
1577                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1578                                      page_started, nr_written, 1, NULL);
1579                 if (ret)
1580                         goto error;
1581         }
1582
1583 error:
1584         if (ret && cur_offset < end)
1585                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1586                                              locked_page, EXTENT_LOCKED |
1587                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1588                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1589                                              PAGE_CLEAR_DIRTY |
1590                                              PAGE_SET_WRITEBACK |
1591                                              PAGE_END_WRITEBACK);
1592         btrfs_free_path(path);
1593         return ret;
1594 }
1595
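/*
 * Decide whether a nominally NOCOW/prealloc range must be COWed anyway: a
 * range flagged EXTENT_DEFRAG on an inode with pending defrag bytes has to
 * go through the regular COW path.
 */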
1596 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1597 {
1598
1599         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1600             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1601                 return 0;
1602
1603         /*
1604          * @defrag_bytes is a hint value; no spinlock is held here.
1605          * If it is not zero, it means the file is being defragged.
1606          * Force COW if the given extent needs to be defragged.
1607          */
1608         if (BTRFS_I(inode)->defrag_bytes &&
1609             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1610                            EXTENT_DEFRAG, 0, NULL))
1611                 return 1;
1612
1613         return 0;
1614 }
1615
1616 /*
1617  * Function to process delayed allocation (create CoW) for ranges which are
1618  * being touched for the first time.
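 *
 * The range is dispatched to one of four paths: NOCOW writeback when the
 * inode is NODATACOW, prealloc-aware writeback when it is PREALLOC (unless
 * a pending defrag forces COW for either), plain synchronous COW when
 * compression is not wanted, or async COW so the data can be compressed.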
1619  */
1620 int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
1621                 u64 start, u64 end, int *page_started, unsigned long *nr_written,
1622                 struct writeback_control *wbc)
1623 {
1624         int ret;
1625         int force_cow = need_force_cow(inode, start, end);
1626         unsigned int write_flags = wbc_to_write_flags(wbc);
1627
1628         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1629                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1630                                          page_started, 1, nr_written);
1631         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1632                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1633                                          page_started, 0, nr_written);
1634         } else if (!inode_need_compress(inode, start, end)) {
1635                 ret = cow_file_range(inode, locked_page, start, end, end,
1636                                       page_started, nr_written, 1, NULL);
1637         } else {
1638                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1639                         &BTRFS_I(inode)->runtime_flags);
1640                 ret = cow_file_range_async(inode, locked_page, start, end,
1641                                            page_started, nr_written,
1642                                            write_flags);
1643         }
1644         if (ret)
1645                 btrfs_cleanup_ordered_extents(inode, locked_page, start,
1646                                               end - start + 1);
1647         return ret;
1648 }
1649
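/*
 * Called when an extent state is split.  Splitting a delalloc range may
 * require one more outstanding extent reservation, unless the two halves
 * still pack into the same number of BTRFS_MAX_EXTENT_SIZE extents as the
 * original range did.
 */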
1650 void btrfs_split_delalloc_extent(struct inode *inode,
1651                                  struct extent_state *orig, u64 split)
1652 {
1653         u64 size;
1654
1655         /* not delalloc, ignore it */
1656         if (!(orig->state & EXTENT_DELALLOC))
1657                 return;
1658
1659         size = orig->end - orig->start + 1;
1660         if (size > BTRFS_MAX_EXTENT_SIZE) {
1661                 u32 num_extents;
1662                 u64 new_size;
1663
1664                 /*
1665                  * See the explanation in btrfs_merge_delalloc_extent, the same
1666                  * applies here, just in reverse.
1667                  */
1668                 new_size = orig->end - split + 1;
1669                 num_extents = count_max_extents(new_size);
1670                 new_size = split - orig->start;
1671                 num_extents += count_max_extents(new_size);
1672                 if (count_max_extents(size) >= num_extents)
1673                         return;
1674         }
1675
1676         spin_lock(&BTRFS_I(inode)->lock);
1677         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1678         spin_unlock(&BTRFS_I(inode)->lock);
1679 }
1680
1681 /*
1682  * Handle merged delayed allocation extents so we can keep track of new extents
1683  * that are just merged onto old extents, such as when we are doing sequential
1684  * writes, so we can properly account for the metadata space we'll need.
1685  */
1686 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
1687                                  struct extent_state *other)
1688 {
1689         u64 new_size, old_size;
1690         u32 num_extents;
1691
1692         /* not delalloc, ignore it */
1693         if (!(other->state & EXTENT_DELALLOC))
1694                 return;
1695
1696         if (new->start > other->start)
1697                 new_size = new->end - other->start + 1;
1698         else
1699                 new_size = other->end - new->start + 1;
1700
1701         /* we're not bigger than the max, unreserve the space and go */
1702         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1703                 spin_lock(&BTRFS_I(inode)->lock);
1704                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1705                 spin_unlock(&BTRFS_I(inode)->lock);
1706                 return;
1707         }
1708
1709         /*
1710          * We have to add up either side to figure out how many extents were
1711          * accounted for before we merged into one big extent.  If the number of
1712          * extents we accounted for is <= the amount we need for the new range
1713          * then we can return, otherwise drop.  Think of it like this
1714          *
1715          * [ 4k][MAX_SIZE]
1716          *
1717          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1718          * need 2 outstanding extents, on one side we have 1 and the other side
1719          * we have 1 so they are == and we can return.  But in this case
1720          *
1721          * [MAX_SIZE+4k][MAX_SIZE+4k]
1722          *
1723          * Each range on their own accounts for 2 extents, but merged together
1724          * they are only 3 extents worth of accounting, so we need to drop in
1725          * this case.
1726          */
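	/*
	 * Worked example (count_max_extents() rounds up by
	 * BTRFS_MAX_EXTENT_SIZE, 128M): merging [4k][128M] has 1 + 1 = 2
	 * extents accounted and the merged 128M+4k range still needs 2, so
	 * we return.  Merging [128M+4k][128M+4k] has 2 + 2 = 4 accounted
	 * while the merged 256M+8k range needs only 3, so we drop one below.
	 */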
1727         old_size = other->end - other->start + 1;
1728         num_extents = count_max_extents(old_size);
1729         old_size = new->end - new->start + 1;
1730         num_extents += count_max_extents(old_size);
1731         if (count_max_extents(new_size) >= num_extents)
1732                 return;
1733
1734         spin_lock(&BTRFS_I(inode)->lock);
1735         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1736         spin_unlock(&BTRFS_I(inode)->lock);
1737 }
1738
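/*
 * Put the inode on its root's list of inodes with pending delalloc; if it
 * is the root's first such inode, also put the root on the fs-wide
 * delalloc roots list.
 */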
1739 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1740                                       struct inode *inode)
1741 {
1742         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1743
1744         spin_lock(&root->delalloc_lock);
1745         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1746                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1747                               &root->delalloc_inodes);
1748                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1749                         &BTRFS_I(inode)->runtime_flags);
1750                 root->nr_delalloc_inodes++;
1751                 if (root->nr_delalloc_inodes == 1) {
1752                         spin_lock(&fs_info->delalloc_root_lock);
1753                         BUG_ON(!list_empty(&root->delalloc_root));
1754                         list_add_tail(&root->delalloc_root,
1755                                       &fs_info->delalloc_roots);
1756                         spin_unlock(&fs_info->delalloc_root_lock);
1757                 }
1758         }
1759         spin_unlock(&root->delalloc_lock);
1760 }
1761
1762
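/*
 * Remove the inode from its root's delalloc-inode list; if it was the
 * root's last such inode, also drop the root from the fs-wide delalloc
 * roots list.  Callers must hold root->delalloc_lock.
 */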
1763 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
1764                                 struct btrfs_inode *inode)
1765 {
1766         struct btrfs_fs_info *fs_info = root->fs_info;
1767
1768         if (!list_empty(&inode->delalloc_inodes)) {
1769                 list_del_init(&inode->delalloc_inodes);
1770                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1771                           &inode->runtime_flags);
1772                 root->nr_delalloc_inodes--;
1773                 if (!root->nr_delalloc_inodes) {
1774                         ASSERT(list_empty(&root->delalloc_inodes));
1775                         spin_lock(&fs_info->delalloc_root_lock);
1776                         BUG_ON(list_empty(&root->delalloc_root));
1777                         list_del_init(&root->delalloc_root);
1778                         spin_unlock(&fs_info->delalloc_root_lock);
1779                 }
1780         }
1781 }
1782
1783 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1784                                      struct btrfs_inode *inode)
1785 {
1786         spin_lock(&root->delalloc_lock);
1787         __btrfs_del_delalloc_inode(root, inode);
1788         spin_unlock(&root->delalloc_lock);
1789 }
1790
1791 /*
1792  * Properly track delayed allocation bytes in the inode and to maintain the
1793  * list of inodes that have pending delalloc work to be done.
1794  */
1795 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
1796                                unsigned *bits)
1797 {
1798         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1799
1800         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1801                 WARN_ON(1);
1802         /*
1803          * The set_bit and clear_bit hooks normally require _irqsave/restore,
1804          * but in this case we are only testing for the DELALLOC
1805          * bit, which is only set or cleared with irqs on.
1806          */
1807         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1808                 struct btrfs_root *root = BTRFS_I(inode)->root;
1809                 u64 len = state->end + 1 - state->start;
1810                 u32 num_extents = count_max_extents(len);
1811                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1812
1813                 spin_lock(&BTRFS_I(inode)->lock);
1814                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1815                 spin_unlock(&BTRFS_I(inode)->lock);
1816
1817                 /* For sanity tests */
1818                 if (btrfs_is_testing(fs_info))
1819                         return;
1820
1821                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1822                                          fs_info->delalloc_batch);
1823                 spin_lock(&BTRFS_I(inode)->lock);
1824                 BTRFS_I(inode)->delalloc_bytes += len;
1825                 if (*bits & EXTENT_DEFRAG)
1826                         BTRFS_I(inode)->defrag_bytes += len;
1827                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1828                                          &BTRFS_I(inode)->runtime_flags))
1829                         btrfs_add_delalloc_inodes(root, inode);
1830                 spin_unlock(&BTRFS_I(inode)->lock);
1831         }
1832
1833         if (!(state->state & EXTENT_DELALLOC_NEW) &&
1834             (*bits & EXTENT_DELALLOC_NEW)) {
1835                 spin_lock(&BTRFS_I(inode)->lock);
1836                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1837                         state->start;
1838                 spin_unlock(&BTRFS_I(inode)->lock);
1839         }
1840 }
1841
1842 /*
1843  * Once a range is no longer delalloc this function ensures that proper
1844  * accounting happens.
1845  */
1846 void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
1847                                  struct extent_state *state, unsigned *bits)
1848 {
1849         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
1850         struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
1851         u64 len = state->end + 1 - state->start;
1852         u32 num_extents = count_max_extents(len);
1853
1854         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1855                 spin_lock(&inode->lock);
1856                 inode->defrag_bytes -= len;
1857                 spin_unlock(&inode->lock);
1858         }
1859
1860         /*
1861          * The set_bit and clear_bit hooks normally require _irqsave/restore,
1862          * but in this case we are only testing for the DELALLOC
1863          * bit, which is only set or cleared with irqs on.
1864          */
1865         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1866                 struct btrfs_root *root = inode->root;
1867                 bool do_list = !btrfs_is_free_space_inode(inode);
1868
1869                 spin_lock(&inode->lock);
1870                 btrfs_mod_outstanding_extents(inode, -num_extents);
1871                 spin_unlock(&inode->lock);
1872
1873                 /*
1874                  * We don't reserve metadata space for space cache inodes so we
1875                  * don't need to call delalloc_release_metadata if there is an
1876                  * error.
1877                  */
1878                 if (*bits & EXTENT_CLEAR_META_RESV &&
1879                     root != fs_info->tree_root)
1880                         btrfs_delalloc_release_metadata(inode, len, false);
1881
1882                 /* For sanity tests. */
1883                 if (btrfs_is_testing(fs_info))
1884                         return;
1885
1886                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1887                     do_list && !(state->state & EXTENT_NORESERVE) &&
1888                     (*bits & EXTENT_CLEAR_DATA_RESV))
1889                         btrfs_free_reserved_data_space_noquota(
1890                                         &inode->vfs_inode,
1891                                         state->start, len);
1892
1893                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1894                                          fs_info->delalloc_batch);
1895                 spin_lock(&inode->lock);
1896                 inode->delalloc_bytes -= len;
1897                 if (do_list && inode->delalloc_bytes == 0 &&
1898                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1899                                         &inode->runtime_flags))
1900                         btrfs_del_delalloc_inode(root, inode);
1901                 spin_unlock(&inode->lock);
1902         }
1903
1904         if ((state->state & EXTENT_DELALLOC_NEW) &&
1905             (*bits & EXTENT_DELALLOC_NEW)) {
1906                 spin_lock(&inode->lock);
1907                 ASSERT(inode->new_delalloc_bytes >= len);
1908                 inode->new_delalloc_bytes -= len;
1909                 spin_unlock(&inode->lock);
1910         }
1911 }
1912
1913 /*
1914  * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
1915  * in a chunk's stripe. This function ensures that bios do not span a
1916  * stripe/chunk
1917  *
1918  * @page - The page we are about to add to the bio
1919  * @size - size we want to add to the bio
1920  * @bio - bio we want to ensure is smaller than a stripe
1921  * @bio_flags - flags of the bio
1922  *
1923  * return 1 if page cannot be added to the bio
1924  * return 0 if page can be added to the bio
1925  * return error otherwise
1926  */
1927 int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
1928                              unsigned long bio_flags)
1929 {
1930         struct inode *inode = page->mapping->host;
1931         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1932         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1933         u64 length = 0;
1934         u64 map_length;
1935         int ret;
1936         struct btrfs_io_geometry geom;
1937
1938         if (bio_flags & EXTENT_BIO_COMPRESSED)
1939                 return 0;
1940
1941         length = bio->bi_iter.bi_size;
1942         map_length = length;
1943         ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
1944                                     &geom);
1945         if (ret < 0)
1946                 return ret;
1947
1948         if (geom.len < length + size)
1949                 return 1;
1950         return 0;
1951 }
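
/*
 * A minimal caller sketch (hypothetical, for illustration only; the real
 * callers live in the extent_io submission path):
 *
 *	if (btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, bio_flags)) {
 *		// Adding the page would span a stripe: submit this bio
 *		// first and place the page into a fresh one.
 *	}
 */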
1952
1953 /*
1954  * In order to insert checksums into the metadata in large chunks,
1955  * we wait until bio submission time.  All the pages in the bio are
1956  * checksummed and sums are attached onto the ordered extent record.
1957  *
1958  * At IO completion time the csums attached to the ordered extent record
1959  * are inserted into the btree.
1960  */
1961 static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
1962                                     u64 bio_offset)
1963 {
1964         struct inode *inode = private_data;
1965         blk_status_t ret = 0;
1966
1967         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1968         BUG_ON(ret); /* -ENOMEM */
1969         return 0;
1970 }
1971
1972 /*
1973  * extent_io.c submission hook. This does the right thing for csum calculation
1974  * on write, or reading the csums from the tree before a read.
1975  *
1976  * Rules about async/sync submit,
1977  * a) read:                             sync submit
1978  *
1979  * b) write without checksum:           sync submit
1980  *
1981  * c) write with checksum:
1982  *    c-1) if bio is issued by fsync:   sync submit
1983  *         (sync_writers != 0)
1984  *
1985  *    c-2) if root is reloc root:       sync submit
1986  *         (only in case of buffered IO)
1987  *
1988  *    c-3) otherwise:                   async submit
1989  */
1990 static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
1991                                           int mirror_num,
1992                                           unsigned long bio_flags)
1993
1994 {
1995         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1996         struct btrfs_root *root = BTRFS_I(inode)->root;
1997         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1998         blk_status_t ret = 0;
1999         int skip_sum;
2000         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
2001
2002         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2003
2004         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2005                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2006
2007         if (bio_op(bio) != REQ_OP_WRITE) {
2008                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2009                 if (ret)
2010                         goto out;
2011
2012                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
2013                         ret = btrfs_submit_compressed_read(inode, bio,
2014                                                            mirror_num,
2015                                                            bio_flags);
2016                         goto out;
2017                 } else if (!skip_sum) {
2018                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2019                         if (ret)
2020                                 goto out;
2021                 }
2022                 goto mapit;
2023         } else if (async && !skip_sum) {
2024                 /* csum items have already been cloned */
2025                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2026                         goto mapit;
2027                 /* we're doing a write, do the async checksumming */
2028                 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2029                                           0, inode, btrfs_submit_bio_start);
2030                 goto out;
2031         } else if (!skip_sum) {
2032                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2033                 if (ret)
2034                         goto out;
2035         }
2036
2037 mapit:
2038         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2039
2040 out:
2041         if (ret) {
2042                 bio->bi_status = ret;
2043                 bio_endio(bio);
2044         }
2045         return ret;
2046 }
2047
2048 /*
2049  * Given a list of ordered sums, record them in the inode.  This happens
2050  * at IO completion time based on sums calculated at bio submission time.
2051  */
2052 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2053                              struct inode *inode, struct list_head *list)
2054 {
2055         struct btrfs_ordered_sum *sum;
2056         int ret;
2057
2058         list_for_each_entry(sum, list, list) {
2059                 trans->adding_csums = true;
2060                 ret = btrfs_csum_file_blocks(trans,
2061                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2062                 trans->adding_csums = false;
2063                 if (ret)
2064                         return ret;
2065         }
2066         return 0;
2067 }
2068
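/*
 * Mark the given range delalloc in the inode's io tree; @extra_bits lets
 * callers set additional EXTENT_* bits in the same operation.  The @dedupe
 * argument is not used here.
 */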
2069 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2070                               unsigned int extra_bits,
2071                               struct extent_state **cached_state, int dedupe)
2072 {
2073         WARN_ON(PAGE_ALIGNED(end));
2074         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2075                                    extra_bits, cached_state);
2076 }
2077
2078 /* see btrfs_writepage_cow_fixup for details on why this is required */
2079 struct btrfs_writepage_fixup {
2080         struct page *page;
2081         struct btrfs_work work;
2082 };
2083
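/*
 * Worker for the fixup queued by btrfs_writepage_cow_fixup() below: relock
 * the page, wait out any ordered extent covering it, then reserve delalloc
 * space and mark the range delalloc again so writeback redoes it with
 * proper COW and ordered-extent accounting.
 */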
2084 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2085 {
2086         struct btrfs_writepage_fixup *fixup;
2087         struct btrfs_ordered_extent *ordered;
2088         struct extent_state *cached_state = NULL;
2089         struct extent_changeset *data_reserved = NULL;
2090         struct page *page;
2091         struct inode *inode;
2092         u64 page_start;
2093         u64 page_end;
2094         int ret;
2095
2096         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2097         page = fixup->page;
2098 again:
2099         lock_page(page);
2100         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2101                 ClearPageChecked(page);
2102                 goto out_page;
2103         }
2104
2105         inode = page->mapping->host;
2106         page_start = page_offset(page);
2107         page_end = page_offset(page) + PAGE_SIZE - 1;
2108
2109         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2110                          &cached_state);
2111
2112         /* already ordered? We're done */
2113         if (PagePrivate2(page))
2114                 goto out;
2115
2116         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2117                                         PAGE_SIZE);
2118         if (ordered) {
2119                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2120                                      page_end, &cached_state);
2121                 unlock_page(page);
2122                 btrfs_start_ordered_extent(inode, ordered, 1);
2123                 btrfs_put_ordered_extent(ordered);
2124                 goto again;
2125         }
2126
2127         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2128                                            PAGE_SIZE);
2129         if (ret) {
2130                 mapping_set_error(page->mapping, ret);
2131                 end_extent_writepage(page, ret, page_start, page_end);
2132                 ClearPageChecked(page);
2133                 goto out;
2134         }
2135
2136         ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2137                                         &cached_state, 0);
2138         if (ret) {
2139                 mapping_set_error(page->mapping, ret);
2140                 end_extent_writepage(page, ret, page_start, page_end);
2141                 ClearPageChecked(page);
2142                 goto out;
2143         }
2144
2145         ClearPageChecked(page);
2146         set_page_dirty(page);
2147         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
2148 out:
2149         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2150                              &cached_state);
2151 out_page:
2152         unlock_page(page);
2153         put_page(page);
2154         kfree(fixup);
2155         extent_changeset_free(data_reserved);
2156 }
2157
2158 /*
2159  * There are a few paths in the higher layers of the kernel that directly
2160  * set the page dirty bit without asking the filesystem if it is a
2161  * good idea.  This causes problems because we want to make sure COW
2162  * properly happens and the data=ordered rules are followed.
2163  *
2164  * In our case any range that doesn't have the ORDERED bit set
2165  * hasn't been properly setup for IO.  We kick off an async process
2166  * to fix it up.  The async helper will wait for ordered extents, set
2167  * the delalloc bit and make it safe to write the page.
2168  */
2169 int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
2170 {
2171         struct inode *inode = page->mapping->host;
2172         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2173         struct btrfs_writepage_fixup *fixup;
2174
2175         /* this page is properly in the ordered list */
2176         if (TestClearPagePrivate2(page))
2177                 return 0;
2178
2179         if (PageChecked(page))
2180                 return -EAGAIN;
2181
2182         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2183         if (!fixup)
2184                 return -EAGAIN;
2185
2186         SetPageChecked(page);
2187         get_page(page);
2188         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2189                         btrfs_writepage_fixup_worker, NULL, NULL);
2190         fixup->page = page;
2191         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2192         return -EBUSY;
2193 }
2194
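/*
 * Insert a file extent item for an already-reserved extent at @file_pos,
 * dropping any extents that overlap the range first, then account for the
 * new extent: release the qgroup data reservation and add the delayed ref
 * via btrfs_alloc_reserved_file_extent().
 */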
2195 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2196                                        struct inode *inode, u64 file_pos,
2197                                        u64 disk_bytenr, u64 disk_num_bytes,
2198                                        u64 num_bytes, u64 ram_bytes,
2199                                        u8 compression, u8 encryption,
2200                                        u16 other_encoding, int extent_type)
2201 {
2202         struct btrfs_root *root = BTRFS_I(inode)->root;
2203         struct btrfs_file_extent_item *fi;
2204         struct btrfs_path *path;
2205         struct extent_buffer *leaf;
2206         struct btrfs_key ins;
2207         u64 qg_released;
2208         int extent_inserted = 0;
2209         int ret;
2210
2211         path = btrfs_alloc_path();
2212         if (!path)
2213                 return -ENOMEM;
2214
2215         /*
2216          * We may be replacing one extent in the tree with another.
2217          * The new extent is pinned in the extent map, and we don't want
2218          * to drop it from the cache until it is completely in the btree.
2219          *
2220          * So, tell btrfs_drop_extents to leave this extent in the cache.
2221          * The caller is expected to unpin it and allow it to be merged
2222          * with the others.
2223          */
2224         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2225                                    file_pos + num_bytes, NULL, 0,
2226                                    1, sizeof(*fi), &extent_inserted);
2227         if (ret)
2228                 goto out;
2229
2230         if (!extent_inserted) {
2231                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2232                 ins.offset = file_pos;
2233                 ins.type = BTRFS_EXTENT_DATA_KEY;
2234
2235                 path->leave_spinning = 1;
2236                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2237                                               sizeof(*fi));
2238                 if (ret)
2239                         goto out;
2240         }
2241         leaf = path->nodes[0];
2242         fi = btrfs_item_ptr(leaf, path->slots[0],
2243                             struct btrfs_file_extent_item);
2244         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2245         btrfs_set_file_extent_type(leaf, fi, extent_type);
2246         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2247         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2248         btrfs_set_file_extent_offset(leaf, fi, 0);
2249         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2250         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2251         btrfs_set_file_extent_compression(leaf, fi, compression);
2252         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2253         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2254
2255         btrfs_mark_buffer_dirty(leaf);
2256         btrfs_release_path(path);
2257
2258         inode_add_bytes(inode, num_bytes);
2259
2260         ins.objectid = disk_bytenr;
2261         ins.offset = disk_num_bytes;
2262         ins.type = BTRFS_EXTENT_ITEM_KEY;
2263
2264         /*
2265          * Release the reserved range from inode dirty range map, as it is
2266          * already moved into delayed_ref_head
2267          */
2268         ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2269         if (ret < 0)
2270                 goto out;
2271         qg_released = ret;
2272         ret = btrfs_alloc_reserved_file_extent(trans, root,
2273                                                btrfs_ino(BTRFS_I(inode)),
2274                                                file_pos, qg_released, &ins);
2275 out:
2276         btrfs_free_path(path);
2277
2278         return ret;
2279 }
2280
2281 /* snapshot-aware defrag */
2282 struct sa_defrag_extent_backref {
2283         struct rb_node node;
2284         struct old_sa_defrag_extent *old;
2285         u64 root_id;
2286         u64 inum;
2287         u64 file_pos;
2288         u64 extent_offset;
2289         u64 num_bytes;
2290         u64 generation;
2291 };
2292
2293 struct old_sa_defrag_extent {
2294         struct list_head list;
2295         struct new_sa_defrag_extent *new;
2296
2297         u64 extent_offset;
2298         u64 bytenr;
2299         u64 offset;
2300         u64 len;
2301         int count;
2302 };
2303
2304 struct new_sa_defrag_extent {
2305         struct rb_root root;
2306         struct list_head head;
2307         struct btrfs_path *path;
2308         struct inode *inode;
2309         u64 file_pos;
2310         u64 len;
2311         u64 bytenr;
2312         u64 disk_len;
2313         u8 compress_type;
2314 };
2315
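/*
 * Order two backrefs by (root_id, inum, file_pos).  Returns <0, 0 or >0 in
 * the usual comparator convention; 0 means a duplicate, which can
 * legitimately happen as the diagram below explains.
 */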
2316 static int backref_comp(struct sa_defrag_extent_backref *b1,
2317                         struct sa_defrag_extent_backref *b2)
2318 {
2319         if (b1->root_id < b2->root_id)
2320                 return -1;
2321         else if (b1->root_id > b2->root_id)
2322                 return 1;
2323
2324         if (b1->inum < b2->inum)
2325                 return -1;
2326         else if (b1->inum > b2->inum)
2327                 return 1;
2328
2329         if (b1->file_pos < b2->file_pos)
2330                 return -1;
2331         else if (b1->file_pos > b2->file_pos)
2332                 return 1;
2333
2334         /*
2335          * [------------------------------] ===> (a range of space)
2336          *     |<--->|   |<---->| =============> (fs/file tree A)
2337          * |<---------------------------->| ===> (fs/file tree B)
2338          *
2339          * A range of space can be covered by two file extents in one tree
2340          * while being covered by only one file extent in another tree.
2341          *
2342          * So we may process a disk offset more than once (two extents in A)
2343          * and land on the same extent (one extent in B), then insert two
2344          * identical backrefs (both referring to the extent in B).
2345          */
2346         return 0;
2347 }
2348
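/*
 * Insert a backref into the rb tree, ordered by backref_comp().  Equal
 * entries (comparator result 0) are linked to the right, so duplicates are
 * kept rather than rejected.
 */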
2349 static void backref_insert(struct rb_root *root,
2350                            struct sa_defrag_extent_backref *backref)
2351 {
2352         struct rb_node **p = &root->rb_node;
2353         struct rb_node *parent = NULL;
2354         struct sa_defrag_extent_backref *entry;
2355         int ret;
2356
2357         while (*p) {
2358                 parent = *p;
2359                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2360
2361                 ret = backref_comp(backref, entry);
2362                 if (ret < 0)
2363                         p = &(*p)->rb_left;
2364                 else
2365                         p = &(*p)->rb_right;
2366         }
2367
2368         rb_link_node(&backref->node, parent, p);
2369         rb_insert_color(&backref->node, root);
2370 }
2371
2372 /*
2373  * Note the backref might have changed; in that case we just return 0.
2374  */
2375 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2376                                        void *ctx)
2377 {
2378         struct btrfs_file_extent_item *extent;
2379         struct old_sa_defrag_extent *old = ctx;
2380         struct new_sa_defrag_extent *new = old->new;
2381         struct btrfs_path *path = new->path;
2382         struct btrfs_key key;
2383         struct btrfs_root *root;
2384         struct sa_defrag_extent_backref *backref;
2385         struct extent_buffer *leaf;
2386         struct inode *inode = new->inode;
2387         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2388         int slot;
2389         int ret;
2390         u64 extent_offset;
2391         u64 num_bytes;
2392
2393         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2394             inum == btrfs_ino(BTRFS_I(inode)))
2395                 return 0;
2396
2397         key.objectid = root_id;
2398         key.type = BTRFS_ROOT_ITEM_KEY;
2399         key.offset = (u64)-1;
2400
2401         root = btrfs_read_fs_root_no_name(fs_info, &key);
2402         if (IS_ERR(root)) {
2403                 if (PTR_ERR(root) == -ENOENT)
2404                         return 0;
2405                 WARN_ON(1);
2406                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2407                          inum, offset, root_id);
2408                 return PTR_ERR(root);
2409         }
2410
2411         key.objectid = inum;
2412         key.type = BTRFS_EXTENT_DATA_KEY;
2413         if (offset > (u64)-1 << 32)
2414                 key.offset = 0;
2415         else
2416                 key.offset = offset;
2417
2418         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2419         if (WARN_ON(ret < 0))
2420                 return ret;
2421         ret = 0;
2422
2423         while (1) {
2424                 cond_resched();
2425
2426                 leaf = path->nodes[0];
2427                 slot = path->slots[0];
2428
2429                 if (slot >= btrfs_header_nritems(leaf)) {
2430                         ret = btrfs_next_leaf(root, path);
2431                         if (ret < 0) {
2432                                 goto out;
2433                         } else if (ret > 0) {
2434                                 ret = 0;
2435                                 goto out;
2436                         }
2437                         continue;
2438                 }
2439
2440                 path->slots[0]++;
2441
2442                 btrfs_item_key_to_cpu(leaf, &key, slot);
2443
2444                 if (key.objectid > inum)
2445                         goto out;
2446
2447                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2448                         continue;
2449
2450                 extent = btrfs_item_ptr(leaf, slot,
2451                                         struct btrfs_file_extent_item);
2452
2453                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2454                         continue;
2455
2456                 /*
2457                  * 'offset' refers to the exact key.offset,
2458                  * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2459                  * (key.offset - extent_offset).
2460                  */
2461                 if (key.offset != offset)
2462                         continue;
2463
2464                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2465                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2466
2467                 if (extent_offset >= old->extent_offset + old->offset +
2468                     old->len || extent_offset + num_bytes <=
2469                     old->extent_offset + old->offset)
2470                         continue;
2471                 break;
2472         }
2473
2474         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2475         if (!backref) {
2476                 ret = -ENOMEM;
2477                 goto out;
2478         }
2479
2480         backref->root_id = root_id;
2481         backref->inum = inum;
2482         backref->file_pos = offset;
2483         backref->num_bytes = num_bytes;
2484         backref->extent_offset = extent_offset;
2485         backref->generation = btrfs_file_extent_generation(leaf, extent);
2486         backref->old = old;
2487         backref_insert(&new->root, backref);
2488         old->count++;
2489 out:
2490         btrfs_release_path(path);
2491         WARN_ON(ret);
2492         return ret;
2493 }
2494
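/*
 * Collect the backrefs of all old extents via iterate_inodes_from_logical()
 * (which invokes record_one_backref() above for each of them).  Old extents
 * that gained no backrefs are dropped.  Returns true if anything is left to
 * relink.
 */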
2495 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2496                                    struct new_sa_defrag_extent *new)
2497 {
2498         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2499         struct old_sa_defrag_extent *old, *tmp;
2500         int ret;
2501
2502         new->path = path;
2503
2504         list_for_each_entry_safe(old, tmp, &new->head, list) {
2505                 ret = iterate_inodes_from_logical(old->bytenr +
2506                                                   old->extent_offset, fs_info,
2507                                                   path, record_one_backref,
2508                                                   old, false);
2509                 if (ret < 0 && ret != -ENOENT)
2510                         return false;
2511
2512                 /* no backref to be processed for this extent */
2513                 if (!old->count) {
2514                         list_del(&old->list);
2515                         kfree(old);
2516                 }
2517         }
2518
2519         if (list_empty(&new->head))
2520                 return false;
2521
2522         return true;
2523 }
2524
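/*
 * Whether the file extent @fi can be merged with the newly defragged
 * extent: it must be a regular extent at the same disk bytenr, with the
 * same compression type and no encryption or other encoding.
 */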
2525 static int relink_is_mergable(struct extent_buffer *leaf,
2526                               struct btrfs_file_extent_item *fi,
2527                               struct new_sa_defrag_extent *new)
2528 {
2529         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2530                 return 0;
2531
2532         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2533                 return 0;
2534
2535         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2536                 return 0;
2537
2538         if (btrfs_file_extent_encryption(leaf, fi) ||
2539             btrfs_file_extent_other_encoding(leaf, fi))
2540                 return 0;
2541
2542         return 1;
2543 }
2544
2545 /*
2546  * Note the backref might have changed; in that case we just return 0.
2547  */
2548 static noinline int relink_extent_backref(struct btrfs_path *path,
2549                                  struct sa_defrag_extent_backref *prev,
2550                                  struct sa_defrag_extent_backref *backref)
2551 {
2552         struct btrfs_file_extent_item *extent;
2553         struct btrfs_file_extent_item *item;
2554         struct btrfs_ordered_extent *ordered;
2555         struct btrfs_trans_handle *trans;
2556         struct btrfs_ref ref = { 0 };
2557         struct btrfs_root *root;
2558         struct btrfs_key key;
2559         struct extent_buffer *leaf;
2560         struct old_sa_defrag_extent *old = backref->old;
2561         struct new_sa_defrag_extent *new = old->new;
2562         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2563         struct inode *inode;
2564         struct extent_state *cached = NULL;
2565         int ret = 0;
2566         u64 start;
2567         u64 len;
2568         u64 lock_start;
2569         u64 lock_end;
2570         bool merge = false;
2571         int index;
2572
2573         if (prev && prev->root_id == backref->root_id &&
2574             prev->inum == backref->inum &&
2575             prev->file_pos + prev->num_bytes == backref->file_pos)
2576                 merge = true;
2577
2578         /* step 1: get root */
2579         key.objectid = backref->root_id;
2580         key.type = BTRFS_ROOT_ITEM_KEY;
2581         key.offset = (u64)-1;
2582
2583         index = srcu_read_lock(&fs_info->subvol_srcu);
2584
2585         root = btrfs_read_fs_root_no_name(fs_info, &key);
2586         if (IS_ERR(root)) {
2587                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2588                 if (PTR_ERR(root) == -ENOENT)
2589                         return 0;
2590                 return PTR_ERR(root);
2591         }
2592
2593         if (btrfs_root_readonly(root)) {
2594                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2595                 return 0;
2596         }
2597
2598         /* step 2: get inode */
2599         key.objectid = backref->inum;
2600         key.type = BTRFS_INODE_ITEM_KEY;
2601         key.offset = 0;
2602
2603         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2604         if (IS_ERR(inode)) {
2605                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2606                 return 0;
2607         }
2608
2609         srcu_read_unlock(&fs_info->subvol_srcu, index);
2610
2611         /* step 3: relink backref */
2612         lock_start = backref->file_pos;
2613         lock_end = backref->file_pos + backref->num_bytes - 1;
2614         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2615                          &cached);
2616
2617         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2618         if (ordered) {
2619                 btrfs_put_ordered_extent(ordered);
2620                 goto out_unlock;
2621         }
2622
2623         trans = btrfs_join_transaction(root);
2624         if (IS_ERR(trans)) {
2625                 ret = PTR_ERR(trans);
2626                 goto out_unlock;
2627         }
2628
2629         key.objectid = backref->inum;
2630         key.type = BTRFS_EXTENT_DATA_KEY;
2631         key.offset = backref->file_pos;
2632
2633         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2634         if (ret < 0) {
2635                 goto out_free_path;
2636         } else if (ret > 0) {
2637                 ret = 0;
2638                 goto out_free_path;
2639         }
2640
2641         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2642                                 struct btrfs_file_extent_item);
2643
2644         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2645             backref->generation)
2646                 goto out_free_path;
2647
2648         btrfs_release_path(path);
2649
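             /*
              * Clamp the relinked range to the intersection of the range
              * this backref covers within the old extent and the part of
              * the old extent that the defragged range actually touched.
              */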
2650         start = backref->file_pos;
2651         if (backref->extent_offset < old->extent_offset + old->offset)
2652                 start += old->extent_offset + old->offset -
2653                          backref->extent_offset;
2654
2655         len = min(backref->extent_offset + backref->num_bytes,
2656                   old->extent_offset + old->offset + old->len);
2657         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2658
2659         ret = btrfs_drop_extents(trans, root, inode, start,
2660                                  start + len, 1);
2661         if (ret)
2662                 goto out_free_path;
2663 again:
2664         key.objectid = btrfs_ino(BTRFS_I(inode));
2665         key.type = BTRFS_EXTENT_DATA_KEY;
2666         key.offset = start;
2667
2668         path->leave_spinning = 1;
2669         if (merge) {
2670                 struct btrfs_file_extent_item *fi;
2671                 u64 extent_len;
2672                 struct btrfs_key found_key;
2673
2674                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2675                 if (ret < 0)
2676                         goto out_free_path;
2677
2678                 path->slots[0]--;
2679                 leaf = path->nodes[0];
2680                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2681
2682                 fi = btrfs_item_ptr(leaf, path->slots[0],
2683                                     struct btrfs_file_extent_item);
2684                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2685
2686                 if (extent_len + found_key.offset == start &&
2687                     relink_is_mergable(leaf, fi, new)) {
2688                         btrfs_set_file_extent_num_bytes(leaf, fi,
2689                                                         extent_len + len);
2690                         btrfs_mark_buffer_dirty(leaf);
2691                         inode_add_bytes(inode, len);
2692
2693                         ret = 1;
2694                         goto out_free_path;
2695                 } else {
2696                         merge = false;
2697                         btrfs_release_path(path);
2698                         goto again;
2699                 }
2700         }
2701
2702         ret = btrfs_insert_empty_item(trans, root, path, &key,
2703                                         sizeof(*extent));
2704         if (ret) {
2705                 btrfs_abort_transaction(trans, ret);
2706                 goto out_free_path;
2707         }
2708
2709         leaf = path->nodes[0];
2710         item = btrfs_item_ptr(leaf, path->slots[0],
2711                                 struct btrfs_file_extent_item);
2712         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2713         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2714         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2715         btrfs_set_file_extent_num_bytes(leaf, item, len);
2716         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2717         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2718         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2719         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2720         btrfs_set_file_extent_encryption(leaf, item, 0);
2721         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2722
2723         btrfs_mark_buffer_dirty(leaf);
2724         inode_add_bytes(inode, len);
2725         btrfs_release_path(path);
2726
2727         btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr,
2728                                new->disk_len, 0);
2729         btrfs_init_data_ref(&ref, backref->root_id, backref->inum,
2730                             new->file_pos);  /* start - extent_offset */
2731         ret = btrfs_inc_extent_ref(trans, &ref);
2732         if (ret) {
2733                 btrfs_abort_transaction(trans, ret);
2734                 goto out_free_path;
2735         }
2736
2737         ret = 1;
2738 out_free_path:
2739         btrfs_release_path(path);
2740         path->leave_spinning = 0;
2741         btrfs_end_transaction(trans);
2742 out_unlock:
2743         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2744                              &cached);
2745         iput(inode);
2746         return ret;
2747 }
2748
2749 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2750 {
2751         struct old_sa_defrag_extent *old, *tmp;
2752
2753         if (!new)
2754                 return;
2755
2756         list_for_each_entry_safe(old, tmp, &new->head, list) {
2757                 kfree(old);
2758         }
2759         kfree(new);
2760 }
2761
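     /*
      * Second half of snapshot-aware defrag: with the new (defragged)
      * extent on disk, rewrite every recorded backref so that the old file
      * extent items point at the new extent.  Backrefs are processed in
      * rb-tree order, so adjacent backrefs from the same inode can be
      * merged by relink_extent_backref() via the prev pointer.
      */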
2762 static void relink_file_extents(struct new_sa_defrag_extent *new)
2763 {
2764         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2765         struct btrfs_path *path;
2766         struct sa_defrag_extent_backref *backref;
2767         struct sa_defrag_extent_backref *prev = NULL;
2768         struct rb_node *node;
2769         int ret;
2770
2771         path = btrfs_alloc_path();
2772         if (!path)
2773                 return;
2774
2775         if (!record_extent_backrefs(path, new)) {
2776                 btrfs_free_path(path);
2777                 goto out;
2778         }
2779         btrfs_release_path(path);
2780
2781         while (1) {
2782                 node = rb_first(&new->root);
2783                 if (!node)
2784                         break;
2785                 rb_erase(node, &new->root);
2786
2787                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2788
2789                 ret = relink_extent_backref(path, prev, backref);
2790                 WARN_ON(ret < 0);
2791
2792                 kfree(prev);
2793
2794         if (ret == 1) {
2795                 prev = backref;
2796         } else {
                     /* this backref was not relinked; free it to avoid a leak */
                     kfree(backref);
2797                 prev = NULL;
             }
2798                 cond_resched();
2799         }
2800         kfree(prev);
2801
2802         btrfs_free_path(path);
2803 out:
2804         free_sa_defrag_extent(new);
2805
2806         atomic_dec(&fs_info->defrag_running);
2807         wake_up(&fs_info->transaction_wait);
2808 }
2809
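     /*
      * First half of snapshot-aware defrag: record all pre-existing file
      * extents that overlap the range covered by this ordered extent, so
      * that their backrefs can later be pointed at the new extent.  On
      * success fs_info->defrag_running is bumped; relink_file_extents()
      * drops it again.  Returns NULL on failure, in which case relinking
      * is simply skipped.
      */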
2810 static struct new_sa_defrag_extent *
2811 record_old_file_extents(struct inode *inode,
2812                         struct btrfs_ordered_extent *ordered)
2813 {
2814         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2815         struct btrfs_root *root = BTRFS_I(inode)->root;
2816         struct btrfs_path *path;
2817         struct btrfs_key key;
2818         struct old_sa_defrag_extent *old;
2819         struct new_sa_defrag_extent *new;
2820         int ret;
2821
2822         new = kmalloc(sizeof(*new), GFP_NOFS);
2823         if (!new)
2824                 return NULL;
2825
2826         new->inode = inode;
2827         new->file_pos = ordered->file_offset;
2828         new->len = ordered->len;
2829         new->bytenr = ordered->start;
2830         new->disk_len = ordered->disk_len;
2831         new->compress_type = ordered->compress_type;
2832         new->root = RB_ROOT;
2833         INIT_LIST_HEAD(&new->head);
2834
2835         path = btrfs_alloc_path();
2836         if (!path)
2837                 goto out_kfree;
2838
2839         key.objectid = btrfs_ino(BTRFS_I(inode));
2840         key.type = BTRFS_EXTENT_DATA_KEY;
2841         key.offset = new->file_pos;
2842
2843         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2844         if (ret < 0)
2845                 goto out_free_path;
2846         if (ret > 0 && path->slots[0] > 0)
2847                 path->slots[0]--;
2848
2849         /* find out all the old extents for the file range */
2850         while (1) {
2851                 struct btrfs_file_extent_item *extent;
2852                 struct extent_buffer *l;
2853                 int slot;
2854                 u64 num_bytes;
2855                 u64 offset;
2856                 u64 end;
2857                 u64 disk_bytenr;
2858                 u64 extent_offset;
2859
2860                 l = path->nodes[0];
2861                 slot = path->slots[0];
2862
2863                 if (slot >= btrfs_header_nritems(l)) {
2864                         ret = btrfs_next_leaf(root, path);
2865                         if (ret < 0)
2866                                 goto out_free_path;
2867                         else if (ret > 0)
2868                                 break;
2869                         continue;
2870                 }
2871
2872                 btrfs_item_key_to_cpu(l, &key, slot);
2873
2874                 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2875                         break;
2876                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2877                         break;
2878                 if (key.offset >= new->file_pos + new->len)
2879                         break;
2880
2881                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2882
2883                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2884                 if (key.offset + num_bytes < new->file_pos)
2885                         goto next;
2886
2887                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2888                 if (!disk_bytenr)
2889                         goto next;
2890
2891                 extent_offset = btrfs_file_extent_offset(l, extent);
2892
2893                 old = kmalloc(sizeof(*old), GFP_NOFS);
2894                 if (!old)
2895                         goto out_free_path;
2896
2897                 offset = max(new->file_pos, key.offset);
2898                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2899
2900                 old->bytenr = disk_bytenr;
2901                 old->extent_offset = extent_offset;
2902                 old->offset = offset - key.offset;
2903                 old->len = end - offset;
2904                 old->new = new;
2905                 old->count = 0;
2906                 list_add_tail(&old->list, &new->head);
2907 next:
2908                 path->slots[0]++;
2909                 cond_resched();
2910         }
2911
2912         btrfs_free_path(path);
2913         atomic_inc(&fs_info->defrag_running);
2914
2915         return new;
2916
2917 out_free_path:
2918         btrfs_free_path(path);
2919 out_kfree:
2920         free_sa_defrag_extent(new);
2921         return NULL;
2922 }
2923
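     /*
      * The ordered extent's reserved range has become a real file extent;
      * drop its byte count from the owning block group's delalloc_bytes so
      * the delalloc accounting stays correct.
      */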
2924 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2925                                          u64 start, u64 len)
2926 {
2927         struct btrfs_block_group_cache *cache;
2928
2929         cache = btrfs_lookup_block_group(fs_info, start);
2930         ASSERT(cache);
2931
2932         spin_lock(&cache->lock);
2933         cache->delalloc_bytes -= len;
2934         spin_unlock(&cache->lock);
2935
2936         btrfs_put_block_group(cache);
2937 }
2938
2939 /*
2940  * As ordered data IO finishes, this gets called so we can finish an ordered
2941  * extent once the range of bytes it covers in the file is fully written.
2942  */
2943 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2944 {
2945         struct inode *inode = ordered_extent->inode;
2946         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2947         struct btrfs_root *root = BTRFS_I(inode)->root;
2948         struct btrfs_trans_handle *trans = NULL;
2949         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2950         struct extent_state *cached_state = NULL;
2951         struct new_sa_defrag_extent *new = NULL;
2952         int compress_type = 0;
2953         int ret = 0;
2954         u64 logical_len = ordered_extent->len;
2955         bool nolock;
2956         bool truncated = false;
2957         bool range_locked = false;
2958         bool clear_new_delalloc_bytes = false;
2959         bool clear_reserved_extent = true;
2960
2961         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2962             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2963             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2964                 clear_new_delalloc_bytes = true;
2965
2966         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2967
2968         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2969                 ret = -EIO;
2970                 goto out;
2971         }
2972
2973         btrfs_free_io_failure_record(BTRFS_I(inode),
2974                         ordered_extent->file_offset,
2975                         ordered_extent->file_offset +
2976                         ordered_extent->len - 1);
2977
2978         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2979                 truncated = true;
2980                 logical_len = ordered_extent->truncated_len;
2981                 /* Truncated the entire extent, don't bother adding */
2982                 if (!logical_len)
2983                         goto out;
2984         }
2985
2986         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2987                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2988
2989                 /*
2990                  * For the mwrite (mmap + memset to write) case, we still
2991                  * reserve space for the NOCOW range.  As NOCOW won't cause
2992                  * a new delayed ref, just free the reserved space.
2993                  */
2994                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2995                                        ordered_extent->len);
2996                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2997                 if (nolock)
2998                         trans = btrfs_join_transaction_nolock(root);
2999                 else
3000                         trans = btrfs_join_transaction(root);
3001                 if (IS_ERR(trans)) {
3002                         ret = PTR_ERR(trans);
3003                         trans = NULL;
3004                         goto out;
3005                 }
3006                 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3007                 ret = btrfs_update_inode_fallback(trans, root, inode);
3008                 if (ret) /* -ENOMEM or corruption */
3009                         btrfs_abort_transaction(trans, ret);
3010                 goto out;
3011         }
3012
3013         range_locked = true;
3014         lock_extent_bits(io_tree, ordered_extent->file_offset,
3015                          ordered_extent->file_offset + ordered_extent->len - 1,
3016                          &cached_state);
3017
3018         ret = test_range_bit(io_tree, ordered_extent->file_offset,
3019                         ordered_extent->file_offset + ordered_extent->len - 1,
3020                         EXTENT_DEFRAG, 0, cached_state);
3021         if (ret) {
3022                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
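                     /*
                      * Snapshot-aware defrag is disabled here: the "0 &&"
                      * short-circuits the check, so record_old_file_extents()
                      * is never called and the EXTENT_DEFRAG bit is simply
                      * cleared below.
                      */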
3023                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3024                         /* the inode is shared */
3025                         new = record_old_file_extents(inode, ordered_extent);
3026
3027                 clear_extent_bit(io_tree, ordered_extent->file_offset,
3028                         ordered_extent->file_offset + ordered_extent->len - 1,
3029                         EXTENT_DEFRAG, 0, 0, &cached_state);
3030         }
3031
3032         if (nolock)
3033                 trans = btrfs_join_transaction_nolock(root);
3034         else
3035                 trans = btrfs_join_transaction(root);
3036         if (IS_ERR(trans)) {
3037                 ret = PTR_ERR(trans);
3038                 trans = NULL;
3039                 goto out;
3040         }
3041
3042         trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3043
3044         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3045                 compress_type = ordered_extent->compress_type;
3046         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3047                 BUG_ON(compress_type);
3048                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3049                                        ordered_extent->len);
3050                 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3051                                                 ordered_extent->file_offset,
3052                                                 ordered_extent->file_offset +
3053                                                 logical_len);
3054         } else {
3055                 BUG_ON(root == fs_info->tree_root);
3056                 ret = insert_reserved_file_extent(trans, inode,
3057                                                 ordered_extent->file_offset,
3058                                                 ordered_extent->start,
3059                                                 ordered_extent->disk_len,
3060                                                 logical_len, logical_len,
3061                                                 compress_type, 0, 0,
3062                                                 BTRFS_FILE_EXTENT_REG);
3063                 if (!ret) {
3064                         clear_reserved_extent = false;
3065                         btrfs_release_delalloc_bytes(fs_info,
3066                                                      ordered_extent->start,
3067                                                      ordered_extent->disk_len);
3068                 }
3069         }
3070         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3071                            ordered_extent->file_offset, ordered_extent->len,
3072                            trans->transid);
3073         if (ret < 0) {
3074                 btrfs_abort_transaction(trans, ret);
3075                 goto out;
3076         }
3077
3078         ret = add_pending_csums(trans, inode, &ordered_extent->list);
3079         if (ret) {
3080                 btrfs_abort_transaction(trans, ret);
3081                 goto out;
3082         }
3083
3084         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3085         ret = btrfs_update_inode_fallback(trans, root, inode);
3086         if (ret) { /* -ENOMEM or corruption */
3087                 btrfs_abort_transaction(trans, ret);
3088                 goto out;
3089         }
3090         ret = 0;
3091 out:
3092         if (range_locked || clear_new_delalloc_bytes) {
3093                 unsigned int clear_bits = 0;
3094
3095                 if (range_locked)
3096                         clear_bits |= EXTENT_LOCKED;
3097                 if (clear_new_delalloc_bytes)
3098                         clear_bits |= EXTENT_DELALLOC_NEW;
3099                 clear_extent_bit(&BTRFS_I(inode)->io_tree,
3100                                  ordered_extent->file_offset,
3101                                  ordered_extent->file_offset +
3102                                  ordered_extent->len - 1,
3103                                  clear_bits,
3104                                  (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3105                                  0, &cached_state);
3106         }
3107
3108         if (trans)
3109                 btrfs_end_transaction(trans);
3110
3111         if (ret || truncated) {
3112                 u64 start, end;
3113
3114                 if (truncated)
3115                         start = ordered_extent->file_offset + logical_len;
3116                 else
3117                         start = ordered_extent->file_offset;
3118                 end = ordered_extent->file_offset + ordered_extent->len - 1;
3119                 clear_extent_uptodate(io_tree, start, end, NULL);
3120
3121                 /* Drop the cache for the part of the extent we didn't write. */
3122                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3123
3124                 /*
3125                  * If the ordered extent had an IOERR or something else went
3126                  * wrong we need to return the space for this ordered extent
3127                  * back to the allocator.  We only free the extent in the
3128                  * truncated case if we didn't write out the extent at all.
3129                  *
3130                  * If we made it past insert_reserved_file_extent before we
3131                  * errored out then we don't need to do this as the accounting
3132                  * has already been done.
3133                  */
3134                 if ((ret || !logical_len) &&
3135                     clear_reserved_extent &&
3136                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3137                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3138                         btrfs_free_reserved_extent(fs_info,
3139                                                    ordered_extent->start,
3140                                                    ordered_extent->disk_len, 1);
3141         }
3142
3144         /*
3145          * This needs to be done to make sure anybody waiting knows we are done
3146          * updating everything for this ordered extent.
3147          */
3148         btrfs_remove_ordered_extent(inode, ordered_extent);
3149
3150         /* for snapshot-aware defrag */
3151         if (new) {
3152                 if (ret) {
3153                         free_sa_defrag_extent(new);
3154                         atomic_dec(&fs_info->defrag_running);
3155                 } else {
3156                         relink_file_extents(new);
3157                 }
3158         }
3159
3160         /* once for us */
3161         btrfs_put_ordered_extent(ordered_extent);
3162         /* once for the tree */
3163         btrfs_put_ordered_extent(ordered_extent);
3164
3165         return ret;
3166 }
3167
3168 static void finish_ordered_fn(struct btrfs_work *work)
3169 {
3170         struct btrfs_ordered_extent *ordered_extent;
3171         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3172         btrfs_finish_ordered_io(ordered_extent);
3173 }
3174
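     /*
      * Writeback endio hook: clear the Private2 bit that marks pending
      * ordered bytes on the page and, once the whole ordered extent has
      * been written, queue btrfs_finish_ordered_io() on the appropriate
      * endio workqueue.  Completion runs in worker context since it joins
      * a transaction and can block.
      */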
3175 void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
3176                                           u64 end, int uptodate)
3177 {
3178         struct inode *inode = page->mapping->host;
3179         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3180         struct btrfs_ordered_extent *ordered_extent = NULL;
3181         struct btrfs_workqueue *wq;
3182         btrfs_work_func_t func;
3183
3184         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3185
3186         ClearPagePrivate2(page);
3187         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3188                                             end - start + 1, uptodate))
3189                 return;
3190
3191         if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3192                 wq = fs_info->endio_freespace_worker;
3193                 func = btrfs_freespace_write_helper;
3194         } else {
3195                 wq = fs_info->endio_write_workers;
3196                 func = btrfs_endio_write_helper;
3197         }
3198
3199         btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3200                         NULL);
3201         btrfs_queue_work(wq, &ordered_extent->work);
3202 }
3203
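     /*
      * Verify one block's data checksum: icsum indexes into the array of
      * expected csums carried in the btrfs_io_bio; the block's contents
      * are hashed with the filesystem's csum algorithm and compared
      * against the expected value.
      */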
3204 static int __readpage_endio_check(struct inode *inode,
3205                                   struct btrfs_io_bio *io_bio,
3206                                   int icsum, struct page *page,
3207                                   int pgoff, u64 start, size_t len)
3208 {
3209         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3210         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3211         char *kaddr;
3212         u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
3213         u8 *csum_expected;
3214         u8 csum[BTRFS_CSUM_SIZE];
3215
3216         csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
3217
3218         kaddr = kmap_atomic(page);
3219         shash->tfm = fs_info->csum_shash;
3220
3221         crypto_shash_init(shash);
3222         crypto_shash_update(shash, kaddr + pgoff, len);
3223         crypto_shash_final(shash, csum);
3224
3225         if (memcmp(csum, csum_expected, csum_size))
3226                 goto zeroit;
3227
3228         kunmap_atomic(kaddr);
3229         return 0;
3230 zeroit:
3231         btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3232                                     io_bio->mirror_num);
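             /* Poison the range so possibly corrupt data is never exposed */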
3233         memset(kaddr + pgoff, 1, len);
3234         flush_dcache_page(page);
3235         kunmap_atomic(kaddr);
3236         return -EIO;
3237 }
3238
3239 /*
3240  * When reads complete, we check csums to verify the data is correct.
3241  * If there's a match, we allow the bio to finish.  If not, the code in
3242  * extent_io.c will try to find good copies for us.
3243  */
3244 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3245                                       u64 phy_offset, struct page *page,
3246                                       u64 start, u64 end, int mirror)
3247 {
3248         size_t offset = start - page_offset(page);
3249         struct inode *inode = page->mapping->host;
3250         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3251         struct btrfs_root *root = BTRFS_I(inode)->root;
3252
3253         if (PageChecked(page)) {
3254                 ClearPageChecked(page);
3255                 return 0;
3256         }
3257
3258         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3259                 return 0;
3260
3261         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3262             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3263                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3264                 return 0;
3265         }
3266
3267         phy_offset >>= inode->i_sb->s_blocksize_bits;
3268         return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3269                                       start, (size_t)(end - start + 1));
3270 }
3271
3272 /*
3273  * btrfs_add_delayed_iput - perform a delayed iput on @inode
3274  *
3275  * @inode: The inode we want to perform iput on
3276  *
3277  * This function uses the generic vfs_inode::i_count to track whether we should
3278  * just decrement it (in case it's > 1) or if this is the last iput then link
3279  * the inode to the delayed iput machinery. Delayed iputs are processed at
3280  * transaction commit time/superblock commit/cleaner kthread.
3281  */
3282 void btrfs_add_delayed_iput(struct inode *inode)
3283 {
3284         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3285         struct btrfs_inode *binode = BTRFS_I(inode);
3286
3287         if (atomic_add_unless(&inode->i_count, -1, 1))
3288                 return;
3289
3290         atomic_inc(&fs_info->nr_delayed_iputs);
3291         spin_lock(&fs_info->delayed_iput_lock);
3292         ASSERT(list_empty(&binode->delayed_iput));
3293         list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3294         spin_unlock(&fs_info->delayed_iput_lock);
3295         if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3296                 wake_up_process(fs_info->cleaner_kthread);
3297 }
3298
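     /*
      * Run a single delayed iput.  Called with delayed_iput_lock held; the
      * lock is dropped around the iput() itself, since a final iput can
      * sleep, and re-taken before returning.
      */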
3299 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3300                                     struct btrfs_inode *inode)
3301 {
3302         list_del_init(&inode->delayed_iput);
3303         spin_unlock(&fs_info->delayed_iput_lock);
3304         iput(&inode->vfs_inode);
3305         if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3306                 wake_up(&fs_info->delayed_iputs_wait);
3307         spin_lock(&fs_info->delayed_iput_lock);
3308 }
3309
3310 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3311                                    struct btrfs_inode *inode)
3312 {
3313         if (!list_empty(&inode->delayed_iput)) {
3314                 spin_lock(&fs_info->delayed_iput_lock);
3315                 if (!list_empty(&inode->delayed_iput))
3316                         run_delayed_iput_locked(fs_info, inode);
3317                 spin_unlock(&fs_info->delayed_iput_lock);
3318         }
3319 }
3320
3321 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3322 {
3324         spin_lock(&fs_info->delayed_iput_lock);
3325         while (!list_empty(&fs_info->delayed_iputs)) {
3326                 struct btrfs_inode *inode;
3327
3328                 inode = list_first_entry(&fs_info->delayed_iputs,
3329                                 struct btrfs_inode, delayed_iput);
3330                 run_delayed_iput_locked(fs_info, inode);
3331         }
3332         spin_unlock(&fs_info->delayed_iput_lock);
3333 }
3334
3335 /**
3336  * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
3337  * @fs_info: the fs_info for this fs
3338  *
3339  * This does a killable wait for any outstanding delayed iputs to finish.
3340  * It helps user operations like fallocate that might otherwise get
3341  * blocked on the iputs.
3342  *
3343  * Return: -EINTR if we were killed, 0 if nothing is pending.
3344  */
3345 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3346 {
3347         int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3348                         atomic_read(&fs_info->nr_delayed_iputs) == 0);
3349         if (ret)
3350                 return -EINTR;
3351         return 0;
3352 }
3353
3354 /*
3355  * This creates an orphan entry for the given inode in case something goes wrong
3356  * in the middle of an unlink.
3357  */
3358 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3359                      struct btrfs_inode *inode)
3360 {
3361         int ret;
3362
3363         ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3364         if (ret && ret != -EEXIST) {
3365                 btrfs_abort_transaction(trans, ret);
3366                 return ret;
3367         }
3368
3369         return 0;
3370 }
3371
3372 /*
3373  * We have done the delete so we can go ahead and remove the orphan item for
3374  * this particular inode.
3375  */
3376 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3377                             struct btrfs_inode *inode)
3378 {
3379         return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3380 }
3381
3382 /*
3383  * this cleans up any orphans that may be left on the list from the last use
3384  * of this root.
3385  */
3386 int btrfs_orphan_cleanup(struct btrfs_root *root)
3387 {
3388         struct btrfs_fs_info *fs_info = root->fs_info;
3389         struct btrfs_path *path;
3390         struct extent_buffer *leaf;
3391         struct btrfs_key key, found_key;
3392         struct btrfs_trans_handle *trans;
3393         struct inode *inode;
3394         u64 last_objectid = 0;
3395         int ret = 0, nr_unlink = 0;
3396
3397         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3398                 return 0;
3399
3400         path = btrfs_alloc_path();
3401         if (!path) {
3402                 ret = -ENOMEM;
3403                 goto out;
3404         }
3405         path->reada = READA_BACK;
3406
3407         key.objectid = BTRFS_ORPHAN_OBJECTID;
3408         key.type = BTRFS_ORPHAN_ITEM_KEY;
3409         key.offset = (u64)-1;
3410
3411         while (1) {
3412                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3413                 if (ret < 0)
3414                         goto out;
3415
3416                 /*
3417                  * ret == 0 means we found exactly what we were searching
3418                  * for, which is unexpected but possible.  Only adjust the
3419                  * path if the key wasn't found, then check for a match.
3420                  */
3421                 if (ret > 0) {
3422                         ret = 0;
3423                         if (path->slots[0] == 0)
3424                                 break;
3425                         path->slots[0]--;
3426                 }
3427
3428                 /* pull out the item */
3429                 leaf = path->nodes[0];
3430                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3431
3432                 /* make sure the item matches what we want */
3433                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3434                         break;
3435                 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3436                         break;
3437
3438                 /* release the path since we're done with it */
3439                 btrfs_release_path(path);
3440
3441                 /*
3442                  * This is essentially btrfs_lookup() without the root
3443                  * crossing; the inode number is stored in the offset
3444                  * field of the orphan item.
3445                  */
3446
3447                 if (found_key.offset == last_objectid) {
3448                         btrfs_err(fs_info,
3449                                   "Error removing orphan entry, stopping orphan cleanup");
3450                         ret = -EINVAL;
3451                         goto out;
3452                 }
3453
3454                 last_objectid = found_key.offset;
3455
3456                 found_key.objectid = found_key.offset;
3457                 found_key.type = BTRFS_INODE_ITEM_KEY;
3458                 found_key.offset = 0;
3459                 inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
3460                 ret = PTR_ERR_OR_ZERO(inode);
3461                 if (ret && ret != -ENOENT)
3462                         goto out;
3463
3464                 if (ret == -ENOENT && root == fs_info->tree_root) {
3465                         struct btrfs_root *dead_root;
3467                         int is_dead_root = 0;
3468
3469                         /*
3470                          * This is an orphan in the tree root.  Currently
3471                          * these can come from two sources:
3472                          *  a) a snapshot deletion in progress
3473                          *  b) a free space cache inode
3474                          * We need to distinguish those two, as the snapshot
3475                          * orphan must not get deleted.
3476                          * find_dead_roots already ran before us, so if this
3477                          * is a snapshot deletion we should find the root
3478                          * in the dead_roots list.
3479                          */
3480                         spin_lock(&fs_info->trans_lock);
3481                         list_for_each_entry(dead_root, &fs_info->dead_roots,
3482                                             root_list) {
3483                                 if (dead_root->root_key.objectid ==
3484                                     found_key.objectid) {
3485                                         is_dead_root = 1;
3486                                         break;
3487                                 }
3488                         }
3489                         spin_unlock(&fs_info->trans_lock);
3490                         if (is_dead_root) {
3491                                 /* prevent this orphan from being found again */
3492                                 key.offset = found_key.objectid - 1;
3493                                 continue;
3494                         }
3495
3496                 }
3497
3498                 /*
3499                  * If we have an inode with links, there are a couple of
3500                  * possibilities. Old kernels (before v3.12) used to create an
3501                  * orphan item for truncate indicating that there were possibly
3502                  * extent items past i_size that needed to be deleted. In v3.12,
3503                  * truncate was changed to update i_size in sync with the extent
3504                  * items, but the (useless) orphan item was still created. Since
3505                  * v4.18, we don't create the orphan item for truncate at all.
3506                  *
3507                  * So, this item could mean that we need to do a truncate, but
3508                  * only if this filesystem was last used on a pre-v3.12 kernel
3509                  * and was not cleanly unmounted. The odds of that are quite
3510                  * slim, and it's a pain to do the truncate now, so just delete
3511                  * the orphan item.
3512                  *
3513                  * It's also possible that this orphan item was supposed to be
3514                  * deleted but wasn't. The inode number may have been reused,
3515                  * but either way, we can delete the orphan item.
3516                  */
3517                 if (ret == -ENOENT || inode->i_nlink) {
3518                         if (!ret)
3519                                 iput(inode);
3520                         trans = btrfs_start_transaction(root, 1);
3521                         if (IS_ERR(trans)) {
3522                                 ret = PTR_ERR(trans);
3523                                 goto out;
3524                         }
3525                         btrfs_debug(fs_info, "auto deleting %llu",
3526                                     found_key.objectid);
3527                         ret = btrfs_del_orphan_item(trans, root,
3528                                                     found_key.objectid);
3529                         btrfs_end_transaction(trans);
3530                         if (ret)
3531                                 goto out;
3532                         continue;
3533                 }
3534
3535                 nr_unlink++;
3536
3537                 /* this will do delete_inode and everything for us */
3538                 iput(inode);
3539         }
3540         /* release the path since we're done with it */
3541         btrfs_release_path(path);
3542
3543         root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3544
3545