btrfs: inode: Don't compress if NODATASUM or NODATACOW set
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/kernel.h>
7 #include <linux/bio.h>
8 #include <linux/buffer_head.h>
9 #include <linux/file.h>
10 #include <linux/fs.h>
11 #include <linux/pagemap.h>
12 #include <linux/highmem.h>
13 #include <linux/time.h>
14 #include <linux/init.h>
15 #include <linux/string.h>
16 #include <linux/backing-dev.h>
17 #include <linux/writeback.h>
18 #include <linux/compat.h>
19 #include <linux/xattr.h>
20 #include <linux/posix_acl.h>
21 #include <linux/falloc.h>
22 #include <linux/slab.h>
23 #include <linux/ratelimit.h>
24 #include <linux/btrfs.h>
25 #include <linux/blkdev.h>
26 #include <linux/posix_acl_xattr.h>
27 #include <linux/uio.h>
28 #include <linux/magic.h>
29 #include <linux/iversion.h>
30 #include <linux/swap.h>
31 #include <linux/sched/mm.h>
32 #include <asm/unaligned.h>
33 #include "ctree.h"
34 #include "disk-io.h"
35 #include "transaction.h"
36 #include "btrfs_inode.h"
37 #include "print-tree.h"
38 #include "ordered-data.h"
39 #include "xattr.h"
40 #include "tree-log.h"
41 #include "volumes.h"
42 #include "compression.h"
43 #include "locking.h"
44 #include "free-space-cache.h"
45 #include "inode-map.h"
46 #include "backref.h"
47 #include "props.h"
48 #include "qgroup.h"
49 #include "dedupe.h"
50 #include "delalloc-space.h"
51
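/*
 * Arguments passed to iget5_locked() when looking up or allocating a btrfs
 * inode from its location key and root.
 */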
52 struct btrfs_iget_args {
53         struct btrfs_key *location;
54         struct btrfs_root *root;
55 };
56
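/*
 * Per-call state for btrfs direct IO: how much data space is still reserved
 * for the write, the range of ordered extents created but not yet submitted
 * (tracked so it can be cleaned up on error), and whether the write lands
 * entirely inside i_size (overwrite).
 */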
57 struct btrfs_dio_data {
58         u64 reserve;
59         u64 unsubmitted_oe_range_start;
60         u64 unsubmitted_oe_range_end;
61         int overwrite;
62 };
63
64 static const struct inode_operations btrfs_dir_inode_operations;
65 static const struct inode_operations btrfs_symlink_inode_operations;
66 static const struct inode_operations btrfs_dir_ro_inode_operations;
67 static const struct inode_operations btrfs_special_inode_operations;
68 static const struct inode_operations btrfs_file_inode_operations;
69 static const struct address_space_operations btrfs_aops;
70 static const struct file_operations btrfs_dir_file_operations;
71 static const struct extent_io_ops btrfs_extent_io_ops;
72
73 static struct kmem_cache *btrfs_inode_cachep;
74 struct kmem_cache *btrfs_trans_handle_cachep;
75 struct kmem_cache *btrfs_path_cachep;
76 struct kmem_cache *btrfs_free_space_cachep;
77
78 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
79 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
80 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
81 static noinline int cow_file_range(struct inode *inode,
82                                    struct page *locked_page,
83                                    u64 start, u64 end, u64 delalloc_end,
84                                    int *page_started, unsigned long *nr_written,
85                                    int unlock, struct btrfs_dedupe_hash *hash);
86 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
87                                        u64 orig_start, u64 block_start,
88                                        u64 block_len, u64 orig_block_len,
89                                        u64 ram_bytes, int compress_type,
90                                        int type);
91
92 static void __endio_write_update_ordered(struct inode *inode,
93                                          const u64 offset, const u64 bytes,
94                                          const bool uptodate);
95
96 /*
97  * Cleanup all submitted ordered extents in specified range to handle errors
98  * from the btrfs_run_delalloc_range() callback.
99  *
100  * NOTE: caller must ensure that when an error happens, it can not call
101  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
102  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
103  * to be released, which we want to happen only when finishing the ordered
104  * extent (btrfs_finish_ordered_io()).
105  */
106 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
107                                                  struct page *locked_page,
108                                                  u64 offset, u64 bytes)
109 {
110         unsigned long index = offset >> PAGE_SHIFT;
111         unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
112         u64 page_start = page_offset(locked_page);
113         u64 page_end = page_start + PAGE_SIZE - 1;
114
115         struct page *page;
116
117         while (index <= end_index) {
118                 page = find_get_page(inode->i_mapping, index);
119                 index++;
120                 if (!page)
121                         continue;
122                 ClearPagePrivate2(page);
123                 put_page(page);
124         }
125
126         /*
127          * If this page belongs to the delalloc range being instantiated,
128          * skip it, since the first page of a range is going to be
129          * properly cleaned up by the caller of run_delalloc_range.
130          */
131         if (page_start >= offset && page_end <= (offset + bytes - 1)) {
132                 offset += PAGE_SIZE;
133                 bytes -= PAGE_SIZE;
134         }
135
136         return __endio_write_update_ordered(inode, offset, bytes, false);
137 }
138
139 static int btrfs_dirty_inode(struct inode *inode);
140
141 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
142 void btrfs_test_inode_set_ops(struct inode *inode)
143 {
144         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
145 }
146 #endif
147
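/*
 * Set up the ACLs and the security xattrs for a newly created inode.
 */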
148 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
149                                      struct inode *inode,  struct inode *dir,
150                                      const struct qstr *qstr)
151 {
152         int err;
153
154         err = btrfs_init_acl(trans, inode, dir);
155         if (!err)
156                 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
157         return err;
158 }
159
160 /*
161  * this does all the hard work for inserting an inline extent into
162  * the btree.  The caller should have done a btrfs_drop_extents so that
163  * no overlapping inline items exist in the btree
164  */
165 static int insert_inline_extent(struct btrfs_trans_handle *trans,
166                                 struct btrfs_path *path, int extent_inserted,
167                                 struct btrfs_root *root, struct inode *inode,
168                                 u64 start, size_t size, size_t compressed_size,
169                                 int compress_type,
170                                 struct page **compressed_pages)
171 {
172         struct extent_buffer *leaf;
173         struct page *page = NULL;
174         char *kaddr;
175         unsigned long ptr;
176         struct btrfs_file_extent_item *ei;
177         int ret;
178         size_t cur_size = size;
179         unsigned long offset;
180
181         if (compressed_size && compressed_pages)
182                 cur_size = compressed_size;
183
184         inode_add_bytes(inode, size);
185
186         if (!extent_inserted) {
187                 struct btrfs_key key;
188                 size_t datasize;
189
190                 key.objectid = btrfs_ino(BTRFS_I(inode));
191                 key.offset = start;
192                 key.type = BTRFS_EXTENT_DATA_KEY;
193
194                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
195                 path->leave_spinning = 1;
196                 ret = btrfs_insert_empty_item(trans, root, path, &key,
197                                               datasize);
198                 if (ret)
199                         goto fail;
200         }
201         leaf = path->nodes[0];
202         ei = btrfs_item_ptr(leaf, path->slots[0],
203                             struct btrfs_file_extent_item);
204         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
205         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
206         btrfs_set_file_extent_encryption(leaf, ei, 0);
207         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
208         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
209         ptr = btrfs_file_extent_inline_start(ei);
210
211         if (compress_type != BTRFS_COMPRESS_NONE) {
212                 struct page *cpage;
213                 int i = 0;
214                 while (compressed_size > 0) {
215                         cpage = compressed_pages[i];
216                         cur_size = min_t(unsigned long, compressed_size,
217                                        PAGE_SIZE);
218
219                         kaddr = kmap_atomic(cpage);
220                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
221                         kunmap_atomic(kaddr);
222
223                         i++;
224                         ptr += cur_size;
225                         compressed_size -= cur_size;
226                 }
227                 btrfs_set_file_extent_compression(leaf, ei,
228                                                   compress_type);
229         } else {
230                 page = find_get_page(inode->i_mapping,
231                                      start >> PAGE_SHIFT);
232                 btrfs_set_file_extent_compression(leaf, ei, 0);
233                 kaddr = kmap_atomic(page);
234                 offset = offset_in_page(start);
235                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
236                 kunmap_atomic(kaddr);
237                 put_page(page);
238         }
239         btrfs_mark_buffer_dirty(leaf);
240         btrfs_release_path(path);
241
242         /*
243          * we're an inline extent, so nobody can
244          * extend the file past i_size without locking
245          * a page we already have locked.
246          *
247          * We must do any isize and inode updates
248          * before we unlock the pages.  Otherwise we
249          * could end up racing with unlink.
250          */
251         BTRFS_I(inode)->disk_i_size = inode->i_size;
252         ret = btrfs_update_inode(trans, root, inode);
253
254 fail:
255         return ret;
256 }
257
258
259 /*
260  * conditionally insert an inline extent into the file.  This
261  * does the checks required to make sure the data is small enough
262  * to fit as an inline extent.
263  */
264 static noinline int cow_file_range_inline(struct inode *inode, u64 start,
265                                           u64 end, size_t compressed_size,
266                                           int compress_type,
267                                           struct page **compressed_pages)
268 {
269         struct btrfs_root *root = BTRFS_I(inode)->root;
270         struct btrfs_fs_info *fs_info = root->fs_info;
271         struct btrfs_trans_handle *trans;
272         u64 isize = i_size_read(inode);
273         u64 actual_end = min(end + 1, isize);
274         u64 inline_len = actual_end - start;
275         u64 aligned_end = ALIGN(end, fs_info->sectorsize);
276         u64 data_len = inline_len;
277         int ret;
278         struct btrfs_path *path;
279         int extent_inserted = 0;
280         u32 extent_item_size;
281
282         if (compressed_size)
283                 data_len = compressed_size;
284
285         if (start > 0 ||
286             actual_end > fs_info->sectorsize ||
287             data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
288             (!compressed_size &&
289             (actual_end & (fs_info->sectorsize - 1)) == 0) ||
290             end + 1 < isize ||
291             data_len > fs_info->max_inline) {
292                 return 1;
293         }
294
295         path = btrfs_alloc_path();
296         if (!path)
297                 return -ENOMEM;
298
299         trans = btrfs_join_transaction(root);
300         if (IS_ERR(trans)) {
301                 btrfs_free_path(path);
302                 return PTR_ERR(trans);
303         }
304         trans->block_rsv = &BTRFS_I(inode)->block_rsv;
305
306         if (compressed_size && compressed_pages)
307                 extent_item_size = btrfs_file_extent_calc_inline_size(
308                    compressed_size);
309         else
310                 extent_item_size = btrfs_file_extent_calc_inline_size(
311                     inline_len);
312
313         ret = __btrfs_drop_extents(trans, root, inode, path,
314                                    start, aligned_end, NULL,
315                                    1, 1, extent_item_size, &extent_inserted);
316         if (ret) {
317                 btrfs_abort_transaction(trans, ret);
318                 goto out;
319         }
320
321         if (isize > actual_end)
322                 inline_len = min_t(u64, isize, actual_end);
323         ret = insert_inline_extent(trans, path, extent_inserted,
324                                    root, inode, start,
325                                    inline_len, compressed_size,
326                                    compress_type, compressed_pages);
327         if (ret && ret != -ENOSPC) {
328                 btrfs_abort_transaction(trans, ret);
329                 goto out;
330         } else if (ret == -ENOSPC) {
331                 ret = 1;
332                 goto out;
333         }
334
335         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
336         btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
337 out:
338         /*
339          * Don't forget to free the reserved space: an inline extent does
340          * not count as a data extent, so free the reservation directly
341          * here.  At reserve time the space is always aligned to the page
342          * size, so just free one page here.
343          */
344         btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
345         btrfs_free_path(path);
346         btrfs_end_transaction(trans);
347         return ret;
348 }
349
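/*
 * One extent produced by compress_file_range(): either a set of compressed
 * pages or, when pages is NULL, an uncompressed fallback range.  Queued on
 * async_chunk->extents and written out later by submit_compressed_extents().
 */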
350 struct async_extent {
351         u64 start;
352         u64 ram_size;
353         u64 compressed_size;
354         struct page **pages;
355         unsigned long nr_pages;
356         int compress_type;
357         struct list_head list;
358 };
359
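/*
 * One chunk of a delalloc range handed to the async workqueue: the range to
 * process, the page writepage had locked and the async_extents produced for
 * the chunk.
 */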
360 struct async_chunk {
361         struct inode *inode;
362         struct page *locked_page;
363         u64 start;
364         u64 end;
365         unsigned int write_flags;
366         struct list_head extents;
367         struct btrfs_work work;
368         atomic_t *pending;
369 };
370
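/*
 * Context for the whole delalloc range, carved into async_chunks (at most
 * 512K each when compression is attempted).
 */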
371 struct async_cow {
372         /* Number of chunks in flight; must be first in the structure */
373         atomic_t num_chunks;
374         struct async_chunk chunks[];
375 };
376
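/*
 * Record a [start, start + ram_size) extent (compressed into @pages or not)
 * on the chunk's extent list for the second, ordered phase of writeback.
 */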
377 static noinline int add_async_extent(struct async_chunk *cow,
378                                      u64 start, u64 ram_size,
379                                      u64 compressed_size,
380                                      struct page **pages,
381                                      unsigned long nr_pages,
382                                      int compress_type)
383 {
384         struct async_extent *async_extent;
385
386         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
387         BUG_ON(!async_extent); /* -ENOMEM */
388         async_extent->start = start;
389         async_extent->ram_size = ram_size;
390         async_extent->compressed_size = compressed_size;
391         async_extent->pages = pages;
392         async_extent->nr_pages = nr_pages;
393         async_extent->compress_type = compress_type;
394         list_add_tail(&async_extent->list, &cow->extents);
395         return 0;
396 }
397
398 /*
399  * Check if the inode has flags compatible with compression
400  */
401 static inline bool inode_can_compress(struct inode *inode)
402 {
403         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
404             BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
405                 return false;
406         return true;
407 }
408
409 /*
410  * Check if the inode needs to be submitted to compression, based on mount
411  * options, defragmentation, properties or heuristics.
412  */
413 static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
414 {
415         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
416
417         if (!inode_can_compress(inode)) {
418                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
419                         KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
420                         btrfs_ino(BTRFS_I(inode)));
421                 return 0;
422         }
423         /* force compress */
424         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
425                 return 1;
426         /* defrag ioctl */
427         if (BTRFS_I(inode)->defrag_compress)
428                 return 1;
429         /* bad compression ratios */
430         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
431                 return 0;
432         if (btrfs_test_opt(fs_info, COMPRESS) ||
433             BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
434             BTRFS_I(inode)->prop_compress)
435                 return btrfs_compress_heuristic(inode, start, end);
436         return 0;
437 }
438
439 static inline void inode_should_defrag(struct btrfs_inode *inode,
440                 u64 start, u64 end, u64 num_bytes, u64 small_write)
441 {
442         /* If this is a small write inside eof, kick off a defrag */
443         if (num_bytes < small_write &&
444             (start > 0 || end + 1 < inode->disk_i_size))
445                 btrfs_add_inode_defrag(NULL, inode);
446 }
447
448 /*
449  * we create compressed extents in two phases.  The first
450  * phase compresses a range of pages that have already been
451  * locked (both pages and state bits are locked).
452  *
453  * This is done inside an ordered work queue, and the compression
454  * is spread across many cpus.  The actual IO submission is step
455  * two, and the ordered work queue takes care of making sure that
456  * happens in the same order things were put onto the queue by
457  * writepages and friends.
458  *
459  * If this code finds it can't get good compression, it puts an
460  * entry onto the work queue to write the uncompressed bytes.  This
461  * makes sure that both compressed inodes and uncompressed inodes
462  * are written in the same order that the flusher thread sent them
463  * down.
464  */
465 static noinline void compress_file_range(struct async_chunk *async_chunk,
466                                          int *num_added)
467 {
468         struct inode *inode = async_chunk->inode;
469         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
470         u64 blocksize = fs_info->sectorsize;
471         u64 start = async_chunk->start;
472         u64 end = async_chunk->end;
473         u64 actual_end;
474         int ret = 0;
475         struct page **pages = NULL;
476         unsigned long nr_pages;
477         unsigned long total_compressed = 0;
478         unsigned long total_in = 0;
479         int i;
480         int will_compress;
481         int compress_type = fs_info->compress_type;
482         int redirty = 0;
483
484         inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
485                         SZ_16K);
486
487         actual_end = min_t(u64, i_size_read(inode), end + 1);
488 again:
489         will_compress = 0;
490         nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
491         BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
492         nr_pages = min_t(unsigned long, nr_pages,
493                         BTRFS_MAX_COMPRESSED / PAGE_SIZE);
494
495         /*
496          * we don't want to send crud past the end of i_size through
497          * compression, that's just a waste of CPU time.  So, if the
498          * end of the file is before the start of our current
499          * requested range of bytes, we bail out to the uncompressed
500          * cleanup code that can deal with all of this.
501          *
502          * It isn't really the fastest way to fix things, but this is a
503          * very uncommon corner.
504          */
505         if (actual_end <= start)
506                 goto cleanup_and_bail_uncompressed;
507
508         total_compressed = actual_end - start;
509
510         /*
511          * skip compression for a small file range (<= blocksize) that
512          * isn't an inline extent, since it doesn't save disk space at all.
513          */
514         if (total_compressed <= blocksize &&
515            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
516                 goto cleanup_and_bail_uncompressed;
517
518         total_compressed = min_t(unsigned long, total_compressed,
519                         BTRFS_MAX_UNCOMPRESSED);
520         total_in = 0;
521         ret = 0;
522
523         /*
524          * we do compression for mount -o compress and when the
525          * inode has not been flagged as nocompress.  This flag can
526          * change at any time if we discover bad compression ratios.
527          */
528         if (inode_need_compress(inode, start, end)) {
529                 WARN_ON(pages);
530                 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
531                 if (!pages) {
532                         /* just bail out to the uncompressed code */
533                         nr_pages = 0;
534                         goto cont;
535                 }
536
537                 if (BTRFS_I(inode)->defrag_compress)
538                         compress_type = BTRFS_I(inode)->defrag_compress;
539                 else if (BTRFS_I(inode)->prop_compress)
540                         compress_type = BTRFS_I(inode)->prop_compress;
541
542                 /*
543                  * we need to call clear_page_dirty_for_io on each
544                  * page in the range.  Otherwise applications with the file
545                  * mmap'd can wander in and change the page contents while
546                  * we are compressing them.
547                  *
548                  * If the compression fails for any reason, we set the pages
549                  * dirty again later on.
550                  *
551                  * Note that the remaining part is redirtied, the start pointer
552                  * has moved, the end is the original one.
553                  */
554                 if (!redirty) {
555                         extent_range_clear_dirty_for_io(inode, start, end);
556                         redirty = 1;
557                 }
558
559                 /* Compression level is applied here and only here */
560                 ret = btrfs_compress_pages(
561                         compress_type | (fs_info->compress_level << 4),
562                                            inode->i_mapping, start,
563                                            pages,
564                                            &nr_pages,
565                                            &total_in,
566                                            &total_compressed);
567
568                 if (!ret) {
569                         unsigned long offset = offset_in_page(total_compressed);
570                         struct page *page = pages[nr_pages - 1];
571                         char *kaddr;
572
573                         /* zero the tail end of the last page, we might be
574                          * sending it down to disk
575                          */
576                         if (offset) {
577                                 kaddr = kmap_atomic(page);
578                                 memset(kaddr + offset, 0,
579                                        PAGE_SIZE - offset);
580                                 kunmap_atomic(kaddr);
581                         }
582                         will_compress = 1;
583                 }
584         }
585 cont:
586         if (start == 0) {
587                 /* let's try to make an inline extent */
588                 if (ret || total_in < actual_end) {
589                         /* we didn't compress the entire range, try
590                          * to make an uncompressed inline extent.
591                          */
592                         ret = cow_file_range_inline(inode, start, end, 0,
593                                                     BTRFS_COMPRESS_NONE, NULL);
594                 } else {
595                         /* try making a compressed inline extent */
596                         ret = cow_file_range_inline(inode, start, end,
597                                                     total_compressed,
598                                                     compress_type, pages);
599                 }
600                 if (ret <= 0) {
601                         unsigned long clear_flags = EXTENT_DELALLOC |
602                                 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
603                                 EXTENT_DO_ACCOUNTING;
604                         unsigned long page_error_op;
605
606                         page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
607
608                         /*
609                          * inline extent creation worked or returned error,
610                          * we don't need to create any more async work items.
611                          * Unlock and free up our temp pages.
612                          *
613                          * We use DO_ACCOUNTING here because we need the
614                          * delalloc_release_metadata to be done _after_ we drop
615                          * our outstanding extent for clearing delalloc for this
616                          * range.
617                          */
618                         extent_clear_unlock_delalloc(inode, start, end, end,
619                                                      NULL, clear_flags,
620                                                      PAGE_UNLOCK |
621                                                      PAGE_CLEAR_DIRTY |
622                                                      PAGE_SET_WRITEBACK |
623                                                      page_error_op |
624                                                      PAGE_END_WRITEBACK);
625                         goto free_pages_out;
626                 }
627         }
628
629         if (will_compress) {
630                 /*
631                  * we aren't doing an inline extent, so round the compressed
632                  * size up to a block size boundary so that the allocator does
633                  * sane things
634                  */
635                 total_compressed = ALIGN(total_compressed, blocksize);
636
637                 /*
638                  * One last check to make sure the compression is really a
639                  * win: compare the page count read with the blocks on disk;
640                  * compression must free at least one sector size.
641                  */
642                 total_in = ALIGN(total_in, PAGE_SIZE);
643                 if (total_compressed + blocksize <= total_in) {
644                         *num_added += 1;
645
646                         /*
647                          * The async work queues will take care of doing actual
648                          * allocation on disk for these compressed pages, and
649                          * will submit them to the elevator.
650                          */
651                         add_async_extent(async_chunk, start, total_in,
652                                         total_compressed, pages, nr_pages,
653                                         compress_type);
654
655                         if (start + total_in < end) {
656                                 start += total_in;
657                                 pages = NULL;
658                                 cond_resched();
659                                 goto again;
660                         }
661                         return;
662                 }
663         }
664         if (pages) {
665                 /*
666                  * the compression code ran but failed to make things smaller,
667                  * free any pages it allocated and our page pointer array
668                  */
669                 for (i = 0; i < nr_pages; i++) {
670                         WARN_ON(pages[i]->mapping);
671                         put_page(pages[i]);
672                 }
673                 kfree(pages);
674                 pages = NULL;
675                 total_compressed = 0;
676                 nr_pages = 0;
677
678                 /* flag the file so we don't compress in the future */
679                 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
680                     !(BTRFS_I(inode)->prop_compress)) {
681                         BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
682                 }
683         }
684 cleanup_and_bail_uncompressed:
685         /*
686          * No compression, but we still need to write the pages in the file
687          * we've been given so far.  redirty the locked page if it corresponds
688          * to our extent and set things up for the async work queue to run
689          * cow_file_range to do the normal delalloc dance.
690          */
691         if (page_offset(async_chunk->locked_page) >= start &&
692             page_offset(async_chunk->locked_page) <= end)
693                 __set_page_dirty_nobuffers(async_chunk->locked_page);
694                 /* unlocked later on in the async handlers */
695
696         if (redirty)
697                 extent_range_redirty_for_io(inode, start, end);
698         add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
699                          BTRFS_COMPRESS_NONE);
700         *num_added += 1;
701
702         return;
703
704 free_pages_out:
705         for (i = 0; i < nr_pages; i++) {
706                 WARN_ON(pages[i]->mapping);
707                 put_page(pages[i]);
708         }
709         kfree(pages);
710 }
711
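/*
 * Drop the references on the compressed pages attached to an async_extent
 * and free the page array itself.
 */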
712 static void free_async_extent_pages(struct async_extent *async_extent)
713 {
714         int i;
715
716         if (!async_extent->pages)
717                 return;
718
719         for (i = 0; i < async_extent->nr_pages; i++) {
720                 WARN_ON(async_extent->pages[i]->mapping);
721                 put_page(async_extent->pages[i]);
722         }
723         kfree(async_extent->pages);
724         async_extent->nr_pages = 0;
725         async_extent->pages = NULL;
726 }
727
728 /*
729  * phase two of compressed writeback.  This is the ordered portion
730  * of the code, which only gets called in the order the work was
731  * queued.  We walk all the async extents created by compress_file_range
732  * and send them down to the disk.
733  */
734 static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
735 {
736         struct inode *inode = async_chunk->inode;
737         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
738         struct async_extent *async_extent;
739         u64 alloc_hint = 0;
740         struct btrfs_key ins;
741         struct extent_map *em;
742         struct btrfs_root *root = BTRFS_I(inode)->root;
743         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
744         int ret = 0;
745
746 again:
747         while (!list_empty(&async_chunk->extents)) {
748                 async_extent = list_entry(async_chunk->extents.next,
749                                           struct async_extent, list);
750                 list_del(&async_extent->list);
751
752 retry:
753                 lock_extent(io_tree, async_extent->start,
754                             async_extent->start + async_extent->ram_size - 1);
755                 /* did the compression code fall back to uncompressed IO? */
756                 if (!async_extent->pages) {
757                         int page_started = 0;
758                         unsigned long nr_written = 0;
759
760                         /* allocate blocks */
761                         ret = cow_file_range(inode, async_chunk->locked_page,
762                                              async_extent->start,
763                                              async_extent->start +
764                                              async_extent->ram_size - 1,
765                                              async_extent->start +
766                                              async_extent->ram_size - 1,
767                                              &page_started, &nr_written, 0,
768                                              NULL);
769
770                         /* JDM XXX */
771
772                         /*
773                          * if page_started, cow_file_range inserted an
774                          * inline extent and took care of all the unlocking
775                          * and IO for us.  Otherwise, we need to submit
776                          * all those pages down to the drive.
777                          */
778                         if (!page_started && !ret)
779                                 extent_write_locked_range(inode,
780                                                   async_extent->start,
781                                                   async_extent->start +
782                                                   async_extent->ram_size - 1,
783                                                   WB_SYNC_ALL);
784                         else if (ret)
785                                 unlock_page(async_chunk->locked_page);
786                         kfree(async_extent);
787                         cond_resched();
788                         continue;
789                 }
790
791                 ret = btrfs_reserve_extent(root, async_extent->ram_size,
792                                            async_extent->compressed_size,
793                                            async_extent->compressed_size,
794                                            0, alloc_hint, &ins, 1, 1);
795                 if (ret) {
796                         free_async_extent_pages(async_extent);
797
798                         if (ret == -ENOSPC) {
799                                 unlock_extent(io_tree, async_extent->start,
800                                               async_extent->start +
801                                               async_extent->ram_size - 1);
802
803                                 /*
804                                  * we need to redirty the pages if we decide to
805                                  * fallback to uncompressed IO, otherwise we
806                                  * will not submit these pages down to lower
807                                  * layers.
808                                  */
809                                 extent_range_redirty_for_io(inode,
810                                                 async_extent->start,
811                                                 async_extent->start +
812                                                 async_extent->ram_size - 1);
813
814                                 goto retry;
815                         }
816                         goto out_free;
817                 }
818                 /*
819                  * here we're doing allocation and writeback of the
820                  * compressed pages
821                  */
822                 em = create_io_em(inode, async_extent->start,
823                                   async_extent->ram_size, /* len */
824                                   async_extent->start, /* orig_start */
825                                   ins.objectid, /* block_start */
826                                   ins.offset, /* block_len */
827                                   ins.offset, /* orig_block_len */
828                                   async_extent->ram_size, /* ram_bytes */
829                                   async_extent->compress_type,
830                                   BTRFS_ORDERED_COMPRESSED);
831                 if (IS_ERR(em))
832                         /* ret value is not necessary due to void function */
833                         goto out_free_reserve;
834                 free_extent_map(em);
835
836                 ret = btrfs_add_ordered_extent_compress(inode,
837                                                 async_extent->start,
838                                                 ins.objectid,
839                                                 async_extent->ram_size,
840                                                 ins.offset,
841                                                 BTRFS_ORDERED_COMPRESSED,
842                                                 async_extent->compress_type);
843                 if (ret) {
844                         btrfs_drop_extent_cache(BTRFS_I(inode),
845                                                 async_extent->start,
846                                                 async_extent->start +
847                                                 async_extent->ram_size - 1, 0);
848                         goto out_free_reserve;
849                 }
850                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
851
852                 /*
853                  * clear dirty, set writeback and unlock the pages.
854                  */
855                 extent_clear_unlock_delalloc(inode, async_extent->start,
856                                 async_extent->start +
857                                 async_extent->ram_size - 1,
858                                 async_extent->start +
859                                 async_extent->ram_size - 1,
860                                 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
861                                 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
862                                 PAGE_SET_WRITEBACK);
863                 if (btrfs_submit_compressed_write(inode,
864                                     async_extent->start,
865                                     async_extent->ram_size,
866                                     ins.objectid,
867                                     ins.offset, async_extent->pages,
868                                     async_extent->nr_pages,
869                                     async_chunk->write_flags)) {
870                         struct page *p = async_extent->pages[0];
871                         const u64 start = async_extent->start;
872                         const u64 end = start + async_extent->ram_size - 1;
873
874                         p->mapping = inode->i_mapping;
875                         btrfs_writepage_endio_finish_ordered(p, start, end, 0);
876
877                         p->mapping = NULL;
878                         extent_clear_unlock_delalloc(inode, start, end, end,
879                                                      NULL, 0,
880                                                      PAGE_END_WRITEBACK |
881                                                      PAGE_SET_ERROR);
882                         free_async_extent_pages(async_extent);
883                 }
884                 alloc_hint = ins.objectid + ins.offset;
885                 kfree(async_extent);
886                 cond_resched();
887         }
888         return;
889 out_free_reserve:
890         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
891         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
892 out_free:
893         extent_clear_unlock_delalloc(inode, async_extent->start,
894                                      async_extent->start +
895                                      async_extent->ram_size - 1,
896                                      async_extent->start +
897                                      async_extent->ram_size - 1,
898                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
899                                      EXTENT_DELALLOC_NEW |
900                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
901                                      PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
902                                      PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
903                                      PAGE_SET_ERROR);
904         free_async_extent_pages(async_extent);
905         kfree(async_extent);
906         goto again;
907 }
908
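/*
 * Look at the extent maps around @start and return a disk block number to
 * use as an allocation hint, so new extents land close to existing data.
 */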
909 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
910                                       u64 num_bytes)
911 {
912         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
913         struct extent_map *em;
914         u64 alloc_hint = 0;
915
916         read_lock(&em_tree->lock);
917         em = search_extent_mapping(em_tree, start, num_bytes);
918         if (em) {
919                 /*
920                  * if block start isn't an actual block number then find the
921                  * first block in this inode and use that as a hint.  If that
922                  * block is also bogus then just don't worry about it.
923                  */
924                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
925                         free_extent_map(em);
926                         em = search_extent_mapping(em_tree, 0, 0);
927                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
928                                 alloc_hint = em->block_start;
929                         if (em)
930                                 free_extent_map(em);
931                 } else {
932                         alloc_hint = em->block_start;
933                         free_extent_map(em);
934                 }
935         }
936         read_unlock(&em_tree->lock);
937
938         return alloc_hint;
939 }
940
941 /*
942  * when extent_io.c finds a delayed allocation range in the file,
943  * the callbacks end up in this code.  The basic idea is to
944  * allocate extents on disk for the range, and create ordered data structs
945  * in RAM to track those extents.
946  *
947  * locked_page is the page that writepage had locked already.  We use
948  * it to make sure we don't do extra locks or unlocks.
949  *
950  * *page_started is set to one if we unlock locked_page and do everything
951  * required to start IO on it.  It may be clean and already done with
952  * IO when we return.
953  */
954 static noinline int cow_file_range(struct inode *inode,
955                                    struct page *locked_page,
956                                    u64 start, u64 end, u64 delalloc_end,
957                                    int *page_started, unsigned long *nr_written,
958                                    int unlock, struct btrfs_dedupe_hash *hash)
959 {
960         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
961         struct btrfs_root *root = BTRFS_I(inode)->root;
962         u64 alloc_hint = 0;
963         u64 num_bytes;
964         unsigned long ram_size;
965         u64 cur_alloc_size = 0;
966         u64 blocksize = fs_info->sectorsize;
967         struct btrfs_key ins;
968         struct extent_map *em;
969         unsigned clear_bits;
970         unsigned long page_ops;
971         bool extent_reserved = false;
972         int ret = 0;
973
974         if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
975                 WARN_ON_ONCE(1);
976                 ret = -EINVAL;
977                 goto out_unlock;
978         }
979
980         num_bytes = ALIGN(end - start + 1, blocksize);
981         num_bytes = max(blocksize,  num_bytes);
982         ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
983
984         inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
985
986         if (start == 0) {
987                 /* let's try to make an inline extent */
988                 ret = cow_file_range_inline(inode, start, end, 0,
989                                             BTRFS_COMPRESS_NONE, NULL);
990                 if (ret == 0) {
991                         /*
992                          * We use DO_ACCOUNTING here because we need the
993                          * delalloc_release_metadata to be run _after_ we drop
994                          * our outstanding extent for clearing delalloc for this
995                          * range.
996                          */
997                         extent_clear_unlock_delalloc(inode, start, end,
998                                      delalloc_end, NULL,
999                                      EXTENT_LOCKED | EXTENT_DELALLOC |
1000                                      EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1001                                      EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1002                                      PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
1003                                      PAGE_END_WRITEBACK);
1004                         *nr_written = *nr_written +
1005                              (end - start + PAGE_SIZE) / PAGE_SIZE;
1006                         *page_started = 1;
1007                         goto out;
1008                 } else if (ret < 0) {
1009                         goto out_unlock;
1010                 }
1011         }
1012
1013         alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1014         btrfs_drop_extent_cache(BTRFS_I(inode), start,
1015                         start + num_bytes - 1, 0);
1016
1017         while (num_bytes > 0) {
1018                 cur_alloc_size = num_bytes;
1019                 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1020                                            fs_info->sectorsize, 0, alloc_hint,
1021                                            &ins, 1, 1);
1022                 if (ret < 0)
1023                         goto out_unlock;
1024                 cur_alloc_size = ins.offset;
1025                 extent_reserved = true;
1026
1027                 ram_size = ins.offset;
1028                 em = create_io_em(inode, start, ins.offset, /* len */
1029                                   start, /* orig_start */
1030                                   ins.objectid, /* block_start */
1031                                   ins.offset, /* block_len */
1032                                   ins.offset, /* orig_block_len */
1033                                   ram_size, /* ram_bytes */
1034                                   BTRFS_COMPRESS_NONE, /* compress_type */
1035                                   BTRFS_ORDERED_REGULAR /* type */);
1036                 if (IS_ERR(em)) {
1037                         ret = PTR_ERR(em);
1038                         goto out_reserve;
1039                 }
1040                 free_extent_map(em);
1041
1042                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1043                                                ram_size, cur_alloc_size, 0);
1044                 if (ret)
1045                         goto out_drop_extent_cache;
1046
1047                 if (root->root_key.objectid ==
1048                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1049                         ret = btrfs_reloc_clone_csums(inode, start,
1050                                                       cur_alloc_size);
1051                         /*
1052                          * Only drop cache here, and process as normal.
1053                          *
1054                          * We must not allow extent_clear_unlock_delalloc()
1055                          * at out_unlock label to free meta of this ordered
1056                          * extent, as its meta should be freed by
1057                          * btrfs_finish_ordered_io().
1058                          *
1059                          * So we must continue until @start is increased to
1060                          * skip current ordered extent.
1061                          */
1062                         if (ret)
1063                                 btrfs_drop_extent_cache(BTRFS_I(inode), start,
1064                                                 start + ram_size - 1, 0);
1065                 }
1066
1067                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1068
1069                 /* we're not doing compressed IO, don't unlock the first
1070                  * page (which the caller expects to stay locked), don't
1071                  * clear any dirty bits and don't set any writeback bits
1072                  *
1073                  * Do set the Private2 bit so we know this page was properly
1074                  * setup for writepage
1075                  */
1076                 page_ops = unlock ? PAGE_UNLOCK : 0;
1077                 page_ops |= PAGE_SET_PRIVATE2;
1078
1079                 extent_clear_unlock_delalloc(inode, start,
1080                                              start + ram_size - 1,
1081                                              delalloc_end, locked_page,
1082                                              EXTENT_LOCKED | EXTENT_DELALLOC,
1083                                              page_ops);
1084                 if (num_bytes < cur_alloc_size)
1085                         num_bytes = 0;
1086                 else
1087                         num_bytes -= cur_alloc_size;
1088                 alloc_hint = ins.objectid + ins.offset;
1089                 start += cur_alloc_size;
1090                 extent_reserved = false;
1091
1092                 /*
1093                  * btrfs_reloc_clone_csums() error: since start has been increased,
1094                  * extent_clear_unlock_delalloc() at the out_unlock label won't free
1095                  * metadata of the current ordered extent, so we're OK to exit.
1096                  */
1097                 if (ret)
1098                         goto out_unlock;
1099         }
1100 out:
1101         return ret;
1102
1103 out_drop_extent_cache:
1104         btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
1105 out_reserve:
1106         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1107         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1108 out_unlock:
1109         clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1110                 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1111         page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
1112                 PAGE_END_WRITEBACK;
1113         /*
1114          * If we reserved an extent for our delalloc range (or a subrange) and
1115          * failed to create the respective ordered extent, then it means that
1116          * when we reserved the extent we decremented the extent's size from
1117          * the data space_info's bytes_may_use counter and incremented the
1118          * space_info's bytes_reserved counter by the same amount. We must make
1119          * sure extent_clear_unlock_delalloc() does not try to decrement again
1120          * the data space_info's bytes_may_use counter, therefore we do not pass
1121          * it the flag EXTENT_CLEAR_DATA_RESV.
1122          */
1123         if (extent_reserved) {
1124                 extent_clear_unlock_delalloc(inode, start,
1125                                              start + cur_alloc_size,
1126                                              start + cur_alloc_size,
1127                                              locked_page,
1128                                              clear_bits,
1129                                              page_ops);
1130                 start += cur_alloc_size;
1131                 if (start >= end)
1132                         goto out;
1133         }
1134         extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1135                                      locked_page,
1136                                      clear_bits | EXTENT_CLEAR_DATA_RESV,
1137                                      page_ops);
1138         goto out;
1139 }
1140
1141 /*
1142  * work queue callback to start compression on a file and pages
1143  */
1144 static noinline void async_cow_start(struct btrfs_work *work)
1145 {
1146         struct async_chunk *async_chunk;
1147         int num_added = 0;
1148
1149         async_chunk = container_of(work, struct async_chunk, work);
1150
1151         compress_file_range(async_chunk, &num_added);
1152         if (num_added == 0) {
1153                 btrfs_add_delayed_iput(async_chunk->inode);
1154                 async_chunk->inode = NULL;
1155         }
1156 }
1157
1158 /*
1159  * work queue callback to submit previously compressed pages
1160  */
1161 static noinline void async_cow_submit(struct btrfs_work *work)
1162 {
1163         struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1164                                                      work);
1165         struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1166         unsigned long nr_pages;
1167
1168         nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1169                 PAGE_SHIFT;
1170
1171         /* atomic_sub_return implies a barrier */
1172         if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1173             5 * SZ_1M)
1174                 cond_wake_up_nomb(&fs_info->async_submit_wait);
1175
1176         /*
1177          * ->inode could be NULL if async_cow_start() failed to compress,
1178          * in which case we don't have anything to submit, yet we need to
1179          * always adjust ->async_delalloc_pages as it's paired with the init
1180          * happening in cow_file_range_async
1181          */
1182         if (async_chunk->inode)
1183                 submit_compressed_extents(async_chunk);
1184 }
1185
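/*
 * Work queue callback to free an async chunk: drop the delayed inode
 * reference and, once the last chunk has finished, free the whole async_cow.
 */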
1186 static noinline void async_cow_free(struct btrfs_work *work)
1187 {
1188         struct async_chunk *async_chunk;
1189
1190         async_chunk = container_of(work, struct async_chunk, work);
1191         if (async_chunk->inode)
1192                 btrfs_add_delayed_iput(async_chunk->inode);
1193         /*
1194          * Since the pointer to 'pending' is at the beginning of the array of
1195          * async_chunk's, freeing it ensures the whole array has been freed.
1196          */
1197         if (atomic_dec_and_test(async_chunk->pending))
1198                 kvfree(async_chunk->pending);
1199 }
1200
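/*
 * Split the delalloc range into chunks (512K each when compression will be
 * attempted, otherwise one chunk for the whole range), take an inode
 * reference per chunk and queue them on the delalloc workqueue.  Compression
 * and submission then happen asynchronously via async_cow_start() and
 * async_cow_submit().
 */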
1201 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1202                                 u64 start, u64 end, int *page_started,
1203                                 unsigned long *nr_written,
1204                                 unsigned int write_flags)
1205 {
1206         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1207         struct async_cow *ctx;
1208         struct async_chunk *async_chunk;
1209         unsigned long nr_pages;
1210         u64 cur_end;
1211         u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1212         int i;
1213         bool should_compress;
1214         unsigned nofs_flag;
1215
1216         unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
1217
1218         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1219             !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
1220                 num_chunks = 1;
1221                 should_compress = false;
1222         } else {
1223                 should_compress = true;
1224         }
1225
1226         nofs_flag = memalloc_nofs_save();
1227         ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1228         memalloc_nofs_restore(nofs_flag);
1229
1230         if (!ctx) {
1231                 unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
1232                         EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1233                         EXTENT_DO_ACCOUNTING;
1234                 unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1235                         PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
1236                         PAGE_SET_ERROR;
1237
1238                 extent_clear_unlock_delalloc(inode, start, end, 0, locked_page,
1239                                              clear_bits, page_ops);
1240                 return -ENOMEM;
1241         }
1242
1243         async_chunk = ctx->chunks;
1244         atomic_set(&ctx->num_chunks, num_chunks);
1245
1246         for (i = 0; i < num_chunks; i++) {
1247                 if (should_compress)
1248                         cur_end = min(end, start + SZ_512K - 1);
1249                 else
1250                         cur_end = end;
1251
1252                 /*
1253                  * igrab is called higher up in the call chain, take only the
1254                  * lightweight reference for the callback lifetime
1255                  */
1256                 ihold(inode);
1257                 async_chunk[i].pending = &ctx->num_chunks;
1258                 async_chunk[i].inode = inode;
1259                 async_chunk[i].start = start;
1260                 async_chunk[i].end = cur_end;
1261                 async_chunk[i].locked_page = locked_page;
1262                 async_chunk[i].write_flags = write_flags;
1263                 INIT_LIST_HEAD(&async_chunk[i].extents);
1264
1265                 btrfs_init_work(&async_chunk[i].work,
1266                                 btrfs_delalloc_helper,
1267                                 async_cow_start, async_cow_submit,
1268                                 async_cow_free);
1269
1270                 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1271                 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1272
1273                 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1274
1275                 *nr_written += nr_pages;
1276                 start = cur_end + 1;
1277         }
1278         *page_started = 1;
1279         return 0;
1280 }
1281
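/*
 * Check whether any checksums exist for the range [bytenr, bytenr + num_bytes).
 * Returns 1 if at least one csum is found, 0 if none exist, or a negative
 * errno if the csum tree lookup fails.  Any csums returned by the lookup are
 * freed before returning.
 */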
1282 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1283                                         u64 bytenr, u64 num_bytes)
1284 {
1285         int ret;
1286         struct btrfs_ordered_sum *sums;
1287         LIST_HEAD(list);
1288
1289         ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1290                                        bytenr + num_bytes - 1, &list, 0);
1291         if (ret == 0 && list_empty(&list))
1292                 return 0;
1293
1294         while (!list_empty(&list)) {
1295                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1296                 list_del(&sums->list);
1297                 kfree(sums);
1298         }
1299         if (ret < 0)
1300                 return ret;
1301         return 1;
1302 }
1303
1304 /*
1305  * Run the NOCOW writeback path.  This checks for snapshots or COW copies
1306  * of the extents that exist in the file, and COWs the file as required.
1307  *
1308  * If no COW copies or snapshots exist, we write directly to the existing
1309  * blocks on disk.
1310  */
1311 static noinline int run_delalloc_nocow(struct inode *inode,
1312                                        struct page *locked_page,
1313                               u64 start, u64 end, int *page_started, int force,
1314                               unsigned long *nr_written)
1315 {
1316         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1317         struct btrfs_root *root = BTRFS_I(inode)->root;
1318         struct extent_buffer *leaf;
1319         struct btrfs_path *path;
1320         struct btrfs_file_extent_item *fi;
1321         struct btrfs_key found_key;
1322         struct extent_map *em;
1323         u64 cow_start;
1324         u64 cur_offset;
1325         u64 extent_end;
1326         u64 extent_offset;
1327         u64 disk_bytenr;
1328         u64 num_bytes;
1329         u64 disk_num_bytes;
1330         u64 ram_bytes;
1331         int extent_type;
1332         int ret;
1333         int type;
1334         int nocow;
1335         int check_prev = 1;
1336         bool nolock;
1337         u64 ino = btrfs_ino(BTRFS_I(inode));
1338
1339         path = btrfs_alloc_path();
1340         if (!path) {
1341                 extent_clear_unlock_delalloc(inode, start, end, end,
1342                                              locked_page,
1343                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1344                                              EXTENT_DO_ACCOUNTING |
1345                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1346                                              PAGE_CLEAR_DIRTY |
1347                                              PAGE_SET_WRITEBACK |
1348                                              PAGE_END_WRITEBACK);
1349                 return -ENOMEM;
1350         }
1351
1352         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1353
1354         cow_start = (u64)-1;
1355         cur_offset = start;
1356         while (1) {
1357                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1358                                                cur_offset, 0);
1359                 if (ret < 0)
1360                         goto error;
1361                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1362                         leaf = path->nodes[0];
1363                         btrfs_item_key_to_cpu(leaf, &found_key,
1364                                               path->slots[0] - 1);
1365                         if (found_key.objectid == ino &&
1366                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1367                                 path->slots[0]--;
1368                 }
1369                 check_prev = 0;
1370 next_slot:
1371                 leaf = path->nodes[0];
1372                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1373                         ret = btrfs_next_leaf(root, path);
1374                         if (ret < 0) {
1375                                 if (cow_start != (u64)-1)
1376                                         cur_offset = cow_start;
1377                                 goto error;
1378                         }
1379                         if (ret > 0)
1380                                 break;
1381                         leaf = path->nodes[0];
1382                 }
1383
1384                 nocow = 0;
1385                 disk_bytenr = 0;
1386                 num_bytes = 0;
1387                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1388
1389                 if (found_key.objectid > ino)
1390                         break;
1391                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1392                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1393                         path->slots[0]++;
1394                         goto next_slot;
1395                 }
1396                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1397                     found_key.offset > end)
1398                         break;
1399
1400                 if (found_key.offset > cur_offset) {
1401                         extent_end = found_key.offset;
1402                         extent_type = 0;
1403                         goto out_check;
1404                 }
1405
1406                 fi = btrfs_item_ptr(leaf, path->slots[0],
1407                                     struct btrfs_file_extent_item);
1408                 extent_type = btrfs_file_extent_type(leaf, fi);
1409
1410                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1411                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1412                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1413                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1414                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1415                         extent_end = found_key.offset +
1416                                 btrfs_file_extent_num_bytes(leaf, fi);
1417                         disk_num_bytes =
1418                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1419                         if (extent_end <= start) {
1420                                 path->slots[0]++;
1421                                 goto next_slot;
1422                         }
1423                         if (disk_bytenr == 0)
1424                                 goto out_check;
1425                         if (btrfs_file_extent_compression(leaf, fi) ||
1426                             btrfs_file_extent_encryption(leaf, fi) ||
1427                             btrfs_file_extent_other_encoding(leaf, fi))
1428                                 goto out_check;
1429                         /*
1430                          * Do the same check as in btrfs_cross_ref_exist but
1431                          * without the unnecessary search.
1432                          */
1433                         if (!nolock &&
1434                             btrfs_file_extent_generation(leaf, fi) <=
1435                             btrfs_root_last_snapshot(&root->root_item))
1436                                 goto out_check;
1437                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1438                                 goto out_check;
1439                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1440                                 goto out_check;
1441                         ret = btrfs_cross_ref_exist(root, ino,
1442                                                     found_key.offset -
1443                                                     extent_offset, disk_bytenr);
1444                         if (ret) {
1445                                 /*
1446                                  * ret could be -EIO if the above fails to read
1447                                  * metadata.
1448                                  */
1449                                 if (ret < 0) {
1450                                         if (cow_start != (u64)-1)
1451                                                 cur_offset = cow_start;
1452                                         goto error;
1453                                 }
1454
1455                                 WARN_ON_ONCE(nolock);
1456                                 goto out_check;
1457                         }
1458                         disk_bytenr += extent_offset;
1459                         disk_bytenr += cur_offset - found_key.offset;
1460                         num_bytes = min(end + 1, extent_end) - cur_offset;
1461                         /*
1462                          * If there are pending snapshots for this root,
1463                          * fall back to the common COW path.
1464                          */
1465                         if (!nolock && atomic_read(&root->snapshot_force_cow))
1466                                 goto out_check;
1467                         /*
1468                          * Force COW if csums exist in the range.
1469                          * This ensures that the csums for a given extent
1470                          * are either valid or do not exist at all.
1471                          */
1472                         ret = csum_exist_in_range(fs_info, disk_bytenr,
1473                                                   num_bytes);
1474                         if (ret) {
1475                                 /*
1476                                  * ret could be -EIO if the above fails to read
1477                                  * metadata.
1478                                  */
1479                                 if (ret < 0) {
1480                                         if (cow_start != (u64)-1)
1481                                                 cur_offset = cow_start;
1482                                         goto error;
1483                                 }
1484                                 WARN_ON_ONCE(nolock);
1485                                 goto out_check;
1486                         }
1487                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1488                                 goto out_check;
1489                         nocow = 1;
1490                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1491                         extent_end = found_key.offset +
1492                                 btrfs_file_extent_ram_bytes(leaf, fi);
1493                         extent_end = ALIGN(extent_end,
1494                                            fs_info->sectorsize);
1495                 } else {
1496                         BUG();
1497                 }
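                /*
                 * Decide here whether the extent starting at cur_offset can be
                 * written NOCOW or must fall back to COW.  Ranges that need
                 * COW accumulate from cow_start and are submitted with a
                 * single cow_file_range() call once a NOCOW-able extent (or
                 * the end of the delalloc range) is reached.
                 */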
1498 out_check:
1499                 if (extent_end <= start) {
1500                         path->slots[0]++;
1501                         if (nocow)
1502                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1503                         goto next_slot;
1504                 }
1505                 if (!nocow) {
1506                         if (cow_start == (u64)-1)
1507                                 cow_start = cur_offset;
1508                         cur_offset = extent_end;
1509                         if (cur_offset > end)
1510                                 break;
1511                         path->slots[0]++;
1512                         goto next_slot;
1513                 }
1514
1515                 btrfs_release_path(path);
1516                 if (cow_start != (u64)-1) {
1517                         ret = cow_file_range(inode, locked_page,
1518                                              cow_start, found_key.offset - 1,
1519                                              end, page_started, nr_written, 1,
1520                                              NULL);
1521                         if (ret) {
1522                                 if (nocow)
1523                                         btrfs_dec_nocow_writers(fs_info,
1524                                                                 disk_bytenr);
1525                                 goto error;
1526                         }
1527                         cow_start = (u64)-1;
1528                 }
1529
1530                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1531                         u64 orig_start = found_key.offset - extent_offset;
1532
1533                         em = create_io_em(inode, cur_offset, num_bytes,
1534                                           orig_start,
1535                                           disk_bytenr, /* block_start */
1536                                           num_bytes, /* block_len */
1537                                           disk_num_bytes, /* orig_block_len */
1538                                           ram_bytes, BTRFS_COMPRESS_NONE,
1539                                           BTRFS_ORDERED_PREALLOC);
1540                         if (IS_ERR(em)) {
1541                                 if (nocow)
1542                                         btrfs_dec_nocow_writers(fs_info,
1543                                                                 disk_bytenr);
1544                                 ret = PTR_ERR(em);
1545                                 goto error;
1546                         }
1547                         free_extent_map(em);
1548                 }
1549
1550                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1551                         type = BTRFS_ORDERED_PREALLOC;
1552                 } else {
1553                         type = BTRFS_ORDERED_NOCOW;
1554                 }
1555
1556                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1557                                                num_bytes, num_bytes, type);
1558                 if (nocow)
1559                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1560                 BUG_ON(ret); /* -ENOMEM */
1561
1562                 if (root->root_key.objectid ==
1563                     BTRFS_DATA_RELOC_TREE_OBJECTID)
1564                         /*
1565                          * The error is handled later, as we must prevent
1566                          * extent_clear_unlock_delalloc() in the error handler
1567                          * from freeing metadata of the created ordered extent.
1568                          */
1569                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1570                                                       num_bytes);
1571
1572                 extent_clear_unlock_delalloc(inode, cur_offset,
1573                                              cur_offset + num_bytes - 1, end,
1574                                              locked_page, EXTENT_LOCKED |
1575                                              EXTENT_DELALLOC |
1576                                              EXTENT_CLEAR_DATA_RESV,
1577                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1578
1579                 cur_offset = extent_end;
1580
1581                 /*
1582                  * On btrfs_reloc_clone_csums() error it is now safe to call the
1583                  * error handler, as metadata for the created ordered extent will
1584                  * only be freed by btrfs_finish_ordered_io().
1585                  */
1586                 if (ret)
1587                         goto error;
1588                 if (cur_offset > end)
1589                         break;
1590         }
1591         btrfs_release_path(path);
1592
1593         if (cur_offset <= end && cow_start == (u64)-1)
1594                 cow_start = cur_offset;
1595
1596         if (cow_start != (u64)-1) {
1597                 cur_offset = end;
1598                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1599                                      page_started, nr_written, 1, NULL);
1600                 if (ret)
1601                         goto error;
1602         }
1603
1604 error:
1605         if (ret && cur_offset < end)
1606                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1607                                              locked_page, EXTENT_LOCKED |
1608                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1609                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1610                                              PAGE_CLEAR_DIRTY |
1611                                              PAGE_SET_WRITEBACK |
1612                                              PAGE_END_WRITEBACK);
1613         btrfs_free_path(path);
1614         return ret;
1615 }
1616
1617 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1618 {
1619
1620         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1621             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1622                 return 0;
1623
1624         /*
1625          * @defrag_bytes is a hint value, read without any spinlock held;
1626          * if it is not zero, it means the file is being defragged.
1627          * Force COW if the given extent needs to be defragged.
1628          */
1629         if (BTRFS_I(inode)->defrag_bytes &&
1630             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1631                            EXTENT_DEFRAG, 0, NULL))
1632                 return 1;
1633
1634         return 0;
1635 }
1636
1637 /*
1638  * Function to process delayed allocation (create CoW) for ranges which are
1639  * being touched for the first time.
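 *
 * NODATACOW and prealloc inodes go through run_delalloc_nocow() (unless COW
 * is forced), ranges that will not be compressed go through plain
 * cow_file_range(), and everything else is handed to cow_file_range_async()
 * so that compression can happen in background work items.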
1640  */
1641 int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
1642                 u64 start, u64 end, int *page_started, unsigned long *nr_written,
1643                 struct writeback_control *wbc)
1644 {
1645         int ret;
1646         int force_cow = need_force_cow(inode, start, end);
1647         unsigned int write_flags = wbc_to_write_flags(wbc);
1648
1649         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1650                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1651                                          page_started, 1, nr_written);
1652         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1653                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1654                                          page_started, 0, nr_written);
1655         } else if (!inode_can_compress(inode) ||
1656                    !inode_need_compress(inode, start, end)) {
1657                 ret = cow_file_range(inode, locked_page, start, end, end,
1658                                       page_started, nr_written, 1, NULL);
1659         } else {
1660                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1661                         &BTRFS_I(inode)->runtime_flags);
1662                 ret = cow_file_range_async(inode, locked_page, start, end,
1663                                            page_started, nr_written,
1664                                            write_flags);
1665         }
1666         if (ret)
1667                 btrfs_cleanup_ordered_extents(inode, locked_page, start,
1668                                               end - start + 1);
1669         return ret;
1670 }
1671
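/*
 * Called when an extent state in the io tree is split in two.  Non-delalloc
 * states are ignored.  If the split means more outstanding extents are needed
 * for metadata reservation, account for the extra extent here.
 */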
1672 void btrfs_split_delalloc_extent(struct inode *inode,
1673                                  struct extent_state *orig, u64 split)
1674 {
1675         u64 size;
1676
1677         /* not delalloc, ignore it */
1678         if (!(orig->state & EXTENT_DELALLOC))
1679                 return;
1680
1681         size = orig->end - orig->start + 1;
1682         if (size > BTRFS_MAX_EXTENT_SIZE) {
1683                 u32 num_extents;
1684                 u64 new_size;
1685
1686                 /*
1687                  * See the explanation in btrfs_merge_delalloc_extent, the same
1688                  * applies here, just in reverse.
1689                  */
1690                 new_size = orig->end - split + 1;
1691                 num_extents = count_max_extents(new_size);
1692                 new_size = split - orig->start;
1693                 num_extents += count_max_extents(new_size);
1694                 if (count_max_extents(size) >= num_extents)
1695                         return;
1696         }
1697
1698         spin_lock(&BTRFS_I(inode)->lock);
1699         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1700         spin_unlock(&BTRFS_I(inode)->lock);
1701 }
1702
1703 /*
1704  * Handle merged delayed allocation extents so we can keep track of new extents
1705  * that are just merged onto old extents, such as when we are doing sequential
1706  * writes, so we can properly account for the metadata space we'll need.
1707  */
1708 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
1709                                  struct extent_state *other)
1710 {
1711         u64 new_size, old_size;
1712         u32 num_extents;
1713
1714         /* not delalloc, ignore it */
1715         if (!(other->state & EXTENT_DELALLOC))
1716                 return;
1717
1718         if (new->start > other->start)
1719                 new_size = new->end - other->start + 1;
1720         else
1721                 new_size = other->end - new->start + 1;
1722
1723         /* we're not bigger than the max, unreserve the space and go */
1724         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1725                 spin_lock(&BTRFS_I(inode)->lock);
1726                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1727                 spin_unlock(&BTRFS_I(inode)->lock);
1728                 return;
1729         }
1730
1731         /*
1732          * We have to add up either side to figure out how many extents were
1733          * accounted for before we merged into one big extent.  If the number of
1734          * extents we accounted for is <= the amount we need for the new range
1735          * then we can return, otherwise drop.  Think of it like this
1736          *
1737          * [ 4k][MAX_SIZE]
1738          *
1739          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1740          * need 2 outstanding extents, on one side we have 1 and the other side
1741          * we have 1 so they are == and we can return.  But in this case
1742          *
1743          * [MAX_SIZE+4k][MAX_SIZE+4k]
1744          *
1745          * Each range on its own accounts for 2 extents, but merged together
1746          * they are only 3 extents worth of accounting, so we need to drop in
1747          * this case.
1748          */
1749         old_size = other->end - other->start + 1;
1750         num_extents = count_max_extents(old_size);
1751         old_size = new->end - new->start + 1;
1752         num_extents += count_max_extents(old_size);
1753         if (count_max_extents(new_size) >= num_extents)
1754                 return;
1755
1756         spin_lock(&BTRFS_I(inode)->lock);
1757         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1758         spin_unlock(&BTRFS_I(inode)->lock);
1759 }
1760
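/*
 * Add the inode to the root's list of inodes with pending delalloc, and add
 * the root to the fs-wide list of delalloc roots if this is its first such
 * inode, so that delalloc flushing can find them later.
 */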
1761 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1762                                       struct inode *inode)
1763 {
1764         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1765
1766         spin_lock(&root->delalloc_lock);
1767         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1768                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1769                               &root->delalloc_inodes);
1770                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1771                         &BTRFS_I(inode)->runtime_flags);
1772                 root->nr_delalloc_inodes++;
1773                 if (root->nr_delalloc_inodes == 1) {
1774                         spin_lock(&fs_info->delalloc_root_lock);
1775                         BUG_ON(!list_empty(&root->delalloc_root));
1776                         list_add_tail(&root->delalloc_root,
1777                                       &fs_info->delalloc_roots);
1778                         spin_unlock(&fs_info->delalloc_root_lock);
1779                 }
1780         }
1781         spin_unlock(&root->delalloc_lock);
1782 }
1783
1784
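/*
 * Remove the inode from the root's delalloc list; the caller must hold
 * root->delalloc_lock.  If this was the root's last delalloc inode, the root
 * is also dropped from the fs-wide delalloc root list.
 */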
1785 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
1786                                 struct btrfs_inode *inode)
1787 {
1788         struct btrfs_fs_info *fs_info = root->fs_info;
1789
1790         if (!list_empty(&inode->delalloc_inodes)) {
1791                 list_del_init(&inode->delalloc_inodes);
1792                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1793                           &inode->runtime_flags);
1794                 root->nr_delalloc_inodes--;
1795                 if (!root->nr_delalloc_inodes) {
1796                         ASSERT(list_empty(&root->delalloc_inodes));
1797                         spin_lock(&fs_info->delalloc_root_lock);
1798                         BUG_ON(list_empty(&root->delalloc_root));
1799                         list_del_init(&root->delalloc_root);
1800                         spin_unlock(&fs_info->delalloc_root_lock);
1801                 }
1802         }
1803 }
1804
1805 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1806                                      struct btrfs_inode *inode)
1807 {
1808         spin_lock(&root->delalloc_lock);
1809         __btrfs_del_delalloc_inode(root, inode);
1810         spin_unlock(&root->delalloc_lock);
1811 }
1812
1813 /*
1814  * Properly track delayed allocation bytes in the inode and maintain the
1815  * list of inodes that have pending delalloc work to be done.
1816  */
1817 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
1818                                unsigned *bits)
1819 {
1820         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1821
1822         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1823                 WARN_ON(1);
1824         /*
1825          * set_bit and clear_bit hooks normally require _irqsave/restore,
1826          * but in this case we are only testing for the DELALLOC
1827          * bit, which is only set or cleared with irqs on.
1828          */
1829         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1830                 struct btrfs_root *root = BTRFS_I(inode)->root;
1831                 u64 len = state->end + 1 - state->start;
1832                 u32 num_extents = count_max_extents(len);
1833                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1834
1835                 spin_lock(&BTRFS_I(inode)->lock);
1836                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1837                 spin_unlock(&BTRFS_I(inode)->lock);
1838
1839                 /* For sanity tests */
1840                 if (btrfs_is_testing(fs_info))
1841                         return;
1842
1843                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1844                                          fs_info->delalloc_batch);
1845                 spin_lock(&BTRFS_I(inode)->lock);
1846                 BTRFS_I(inode)->delalloc_bytes += len;
1847                 if (*bits & EXTENT_DEFRAG)
1848                         BTRFS_I(inode)->defrag_bytes += len;
1849                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1850                                          &BTRFS_I(inode)->runtime_flags))
1851                         btrfs_add_delalloc_inodes(root, inode);
1852                 spin_unlock(&BTRFS_I(inode)->lock);
1853         }
1854
1855         if (!(state->state & EXTENT_DELALLOC_NEW) &&
1856             (*bits & EXTENT_DELALLOC_NEW)) {
1857                 spin_lock(&BTRFS_I(inode)->lock);
1858                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1859                         state->start;
1860                 spin_unlock(&BTRFS_I(inode)->lock);
1861         }
1862 }
1863
1864 /*
1865  * Once a range is no longer delalloc this function ensures that proper
1866  * accounting happens.
1867  */
1868 void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
1869                                  struct extent_state *state, unsigned *bits)
1870 {
1871         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
1872         struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
1873         u64 len = state->end + 1 - state->start;
1874         u32 num_extents = count_max_extents(len);
1875
1876         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1877                 spin_lock(&inode->lock);
1878                 inode->defrag_bytes -= len;
1879                 spin_unlock(&inode->lock);
1880         }
1881
1882         /*
1883          * set_bit and clear_bit hooks normally require _irqsave/restore,
1884          * but in this case we are only testing for the DELALLOC
1885          * bit, which is only set or cleared with irqs on.
1886          */
1887         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1888                 struct btrfs_root *root = inode->root;
1889                 bool do_list = !btrfs_is_free_space_inode(inode);
1890
1891                 spin_lock(&inode->lock);
1892                 btrfs_mod_outstanding_extents(inode, -num_extents);
1893                 spin_unlock(&inode->lock);
1894
1895                 /*
1896                  * We don't reserve metadata space for space cache inodes so we
1897                  * don't need to call delalloc_release_metadata if there is an
1898                  * error.
1899                  */
1900                 if (*bits & EXTENT_CLEAR_META_RESV &&
1901                     root != fs_info->tree_root)
1902                         btrfs_delalloc_release_metadata(inode, len, false);
1903
1904                 /* For sanity tests. */
1905                 if (btrfs_is_testing(fs_info))
1906                         return;
1907
1908                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1909                     do_list && !(state->state & EXTENT_NORESERVE) &&
1910                     (*bits & EXTENT_CLEAR_DATA_RESV))
1911                         btrfs_free_reserved_data_space_noquota(
1912                                         &inode->vfs_inode,
1913                                         state->start, len);
1914
1915                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1916                                          fs_info->delalloc_batch);
1917                 spin_lock(&inode->lock);
1918                 inode->delalloc_bytes -= len;
1919                 if (do_list && inode->delalloc_bytes == 0 &&
1920                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1921                                         &inode->runtime_flags))
1922                         btrfs_del_delalloc_inode(root, inode);
1923                 spin_unlock(&inode->lock);
1924         }
1925
1926         if ((state->state & EXTENT_DELALLOC_NEW) &&
1927             (*bits & EXTENT_DELALLOC_NEW)) {
1928                 spin_lock(&inode->lock);
1929                 ASSERT(inode->new_delalloc_bytes >= len);
1930                 inode->new_delalloc_bytes -= len;
1931                 spin_unlock(&inode->lock);
1932         }
1933 }
1934
1935 /*
1936  * btrfs_bio_fits_in_stripe - check whether the size of the given bio will fit
1937  * in a chunk's stripe.  This function ensures that bios do not span a
1938  * stripe/chunk.
1939  *
1940  * @page:      the page we are about to add to the bio
1941  * @size:      size we want to add to the bio
1942  * @bio:       bio we want to ensure is smaller than a stripe
1943  * @bio_flags: flags of the bio
1944  *
1945  * Return 1 if the page cannot be added to the bio,
1946  * 0 if the page can be added to the bio,
1947  * or a negative errno otherwise.
1948  */
1949 int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
1950                              unsigned long bio_flags)
1951 {
1952         struct inode *inode = page->mapping->host;
1953         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1954         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1955         u64 length = 0;
1956         u64 map_length;
1957         int ret;
1958         struct btrfs_io_geometry geom;
1959
1960         if (bio_flags & EXTENT_BIO_COMPRESSED)
1961                 return 0;
1962
1963         length = bio->bi_iter.bi_size;
1964         map_length = length;
1965         ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
1966                                     &geom);
1967         if (ret < 0)
1968                 return ret;
1969
1970         if (geom.len < length + size)
1971                 return 1;
1972         return 0;
1973 }
1974
1975 /*
1976  * In order to insert checksums into the metadata in large chunks,
1977  * we wait until bio submission time.  All the pages in the bio are
1978  * checksummed and sums are attached onto the ordered extent record.
1979  *
1980  * At IO completion time the csums attached to the ordered extent record
1981  * are inserted into the btree.
1982  */
1983 static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
1984                                     u64 bio_offset)
1985 {
1986         struct inode *inode = private_data;
1987         blk_status_t ret = 0;
1988
1989         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1990         BUG_ON(ret); /* -ENOMEM */
1991         return 0;
1992 }
1993
1994 /*
1995  * extent_io.c submission hook. This does the right thing for csum calculation
1996  * on write, or reading the csums from the tree before a read.
1997  *
1998  * Rules about async/sync submit,
1999  * a) read:                             sync submit
2000  *
2001  * b) write without checksum:           sync submit
2002  *
2003  * c) write with checksum:
2004  *    c-1) if bio is issued by fsync:   sync submit
2005  *         (sync_writers != 0)
2006  *
2007  *    c-2) if root is reloc root:       sync submit
2008  *         (only in case of buffered IO)
2009  *
2010  *    c-3) otherwise:                   async submit
2011  */
2012 static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
2013                                           int mirror_num,
2014                                           unsigned long bio_flags)
2015
2016 {
2017         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2018         struct btrfs_root *root = BTRFS_I(inode)->root;
2019         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
2020         blk_status_t ret = 0;
2021         int skip_sum;
2022         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
2023
2024         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2025
2026         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2027                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2028
2029         if (bio_op(bio) != REQ_OP_WRITE) {
2030                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2031                 if (ret)
2032                         goto out;
2033
2034                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
2035                         ret = btrfs_submit_compressed_read(inode, bio,
2036                                                            mirror_num,
2037                                                            bio_flags);
2038                         goto out;
2039                 } else if (!skip_sum) {
2040                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2041                         if (ret)
2042                                 goto out;
2043                 }
2044                 goto mapit;
2045         } else if (async && !skip_sum) {
2046                 /* csum items have already been cloned */
2047                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2048                         goto mapit;
2049                 /* we're doing a write, do the async checksumming */
2050                 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2051                                           0, inode, btrfs_submit_bio_start);
2052                 goto out;
2053         } else if (!skip_sum) {
2054                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2055                 if (ret)
2056                         goto out;
2057         }
2058
2059 mapit:
2060         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2061
2062 out:
2063         if (ret) {
2064                 bio->bi_status = ret;
2065                 bio_endio(bio);
2066         }
2067         return ret;
2068 }
2069
2070 /*
2071  * Given a list of ordered sums, record them in the inode.  This happens
2072  * at IO completion time based on sums calculated at bio submission time.
2073  */
2074 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2075                              struct inode *inode, struct list_head *list)
2076 {
2077         struct btrfs_ordered_sum *sum;
2078         int ret;
2079
2080         list_for_each_entry(sum, list, list) {
2081                 trans->adding_csums = true;
2082                 ret = btrfs_csum_file_blocks(trans,
2083                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2084                 trans->adding_csums = false;
2085                 if (ret)
2086                         return ret;
2087         }
2088         return 0;
2089 }
2090
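/*
 * Mark the byte range [start, end] as delalloc in the inode's io tree.  @end
 * is an inclusive offset, so a page-aligned value would indicate an
 * off-by-one; hence the WARN_ON below.
 */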
2091 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2092                               unsigned int extra_bits,
2093                               struct extent_state **cached_state, int dedupe)
2094 {
2095         WARN_ON(PAGE_ALIGNED(end));
2096         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2097                                    extra_bits, cached_state);
2098 }
2099
2100 /* see btrfs_writepage_cow_fixup for details on why this is required */
2101 struct btrfs_writepage_fixup {
2102         struct page *page;
2103         struct btrfs_work work;
2104 };
2105
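/*
 * Worker that redoes the delalloc setup for a page dirtied behind the
 * filesystem's back: wait out any ordered extent covering the page, reserve
 * data and metadata space, and re-mark the range delalloc so that writeback
 * can handle the page normally.
 */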
2106 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2107 {
2108         struct btrfs_writepage_fixup *fixup;
2109         struct btrfs_ordered_extent *ordered;
2110         struct extent_state *cached_state = NULL;
2111         struct extent_changeset *data_reserved = NULL;
2112         struct page *page;
2113         struct inode *inode;
2114         u64 page_start;
2115         u64 page_end;
2116         int ret;
2117
2118         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2119         page = fixup->page;
2120 again:
2121         lock_page(page);
2122         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2123                 ClearPageChecked(page);
2124                 goto out_page;
2125         }
2126
2127         inode = page->mapping->host;
2128         page_start = page_offset(page);
2129         page_end = page_offset(page) + PAGE_SIZE - 1;
2130
2131         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2132                          &cached_state);
2133
2134         /* already ordered? We're done */
2135         if (PagePrivate2(page))
2136                 goto out;
2137
2138         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2139                                         PAGE_SIZE);
2140         if (ordered) {
2141                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2142                                      page_end, &cached_state);
2143                 unlock_page(page);
2144                 btrfs_start_ordered_extent(inode, ordered, 1);
2145                 btrfs_put_ordered_extent(ordered);
2146                 goto again;
2147         }
2148
2149         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2150                                            PAGE_SIZE);
2151         if (ret) {
2152                 mapping_set_error(page->mapping, ret);
2153                 end_extent_writepage(page, ret, page_start, page_end);
2154                 ClearPageChecked(page);
2155                 goto out;
2156          }
2157
2158         ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2159                                         &cached_state, 0);
2160         if (ret) {
2161                 mapping_set_error(page->mapping, ret);
2162                 end_extent_writepage(page, ret, page_start, page_end);
2163                 ClearPageChecked(page);
2164                 goto out;
2165         }
2166
2167         ClearPageChecked(page);
2168         set_page_dirty(page);
2169         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
2170 out:
2171         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2172                              &cached_state);
2173 out_page:
2174         unlock_page(page);
2175         put_page(page);
2176         kfree(fixup);
2177         extent_changeset_free(data_reserved);
2178 }
2179
2180 /*
2181  * There are a few paths in the higher layers of the kernel that directly
2182  * set the page dirty bit without asking the filesystem if it is a
2183  * good idea.  This causes problems because we want to make sure COW
2184  * properly happens and the data=ordered rules are followed.
2185  *
2186  * In our case any range that doesn't have the ORDERED bit set
2187  * hasn't been properly setup for IO.  We kick off an async process
2188  * to fix it up.  The async helper will wait for ordered extents, set
2189  * the delalloc bit and make it safe to write the page.
2190  */
2191 int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
2192 {
2193         struct inode *inode = page->mapping->host;
2194         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2195         struct btrfs_writepage_fixup *fixup;
2196
2197         /* this page is properly in the ordered list */
2198         if (TestClearPagePrivate2(page))
2199                 return 0;
2200
2201         if (PageChecked(page))
2202                 return -EAGAIN;
2203
2204         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2205         if (!fixup)
2206                 return -EAGAIN;
2207
2208         SetPageChecked(page);
2209         get_page(page);
2210         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2211                         btrfs_writepage_fixup_worker, NULL, NULL);
2212         fixup->page = page;
2213         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2214         return -EBUSY;
2215 }
2216
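/*
 * Insert a file extent item for an extent that has already been reserved and
 * written: drop any file extents overlapping the range, write the new
 * btrfs_file_extent_item, update the inode's byte accounting, release the
 * qgroup reservation and add the reference for the new data extent.
 */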
2217 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2218                                        struct inode *inode, u64 file_pos,
2219                                        u64 disk_bytenr, u64 disk_num_bytes,
2220                                        u64 num_bytes, u64 ram_bytes,
2221                                        u8 compression, u8 encryption,
2222                                        u16 other_encoding, int extent_type)
2223 {
2224         struct btrfs_root *root = BTRFS_I(inode)->root;
2225         struct btrfs_file_extent_item *fi;
2226         struct btrfs_path *path;
2227         struct extent_buffer *leaf;
2228         struct btrfs_key ins;
2229         u64 qg_released;
2230         int extent_inserted = 0;
2231         int ret;
2232
2233         path = btrfs_alloc_path();
2234         if (!path)
2235                 return -ENOMEM;
2236
2237         /*
2238          * We may be replacing one extent in the tree with another.
2239          * The new extent is pinned in the extent map, and we don't want
2240          * to drop it from the cache until it is completely in the btree.
2241          *
2242          * So, tell btrfs_drop_extents to leave this extent in the cache.
2243          * The caller is expected to unpin it and allow it to be merged
2244          * with the others.
2245          */
2246         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2247                                    file_pos + num_bytes, NULL, 0,
2248                                    1, sizeof(*fi), &extent_inserted);
2249         if (ret)
2250                 goto out;
2251
2252         if (!extent_inserted) {
2253                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2254                 ins.offset = file_pos;
2255                 ins.type = BTRFS_EXTENT_DATA_KEY;
2256
2257                 path->leave_spinning = 1;
2258                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2259                                               sizeof(*fi));
2260                 if (ret)
2261                         goto out;
2262         }
2263         leaf = path->nodes[0];
2264         fi = btrfs_item_ptr(leaf, path->slots[0],
2265                             struct btrfs_file_extent_item);
2266         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2267         btrfs_set_file_extent_type(leaf, fi, extent_type);
2268         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2269         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2270         btrfs_set_file_extent_offset(leaf, fi, 0);
2271         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2272         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2273         btrfs_set_file_extent_compression(leaf, fi, compression);
2274         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2275         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2276
2277         btrfs_mark_buffer_dirty(leaf);
2278         btrfs_release_path(path);
2279
2280         inode_add_bytes(inode, num_bytes);
2281
2282         ins.objectid = disk_bytenr;
2283         ins.offset = disk_num_bytes;
2284         ins.type = BTRFS_EXTENT_ITEM_KEY;
2285
2286         /*
2287          * Release the reserved range from the inode dirty range map, as it
2288          * has already been moved into the delayed_ref_head.
2289          */
2290         ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2291         if (ret < 0)
2292                 goto out;
2293         qg_released = ret;
2294         ret = btrfs_alloc_reserved_file_extent(trans, root,
2295                                                btrfs_ino(BTRFS_I(inode)),
2296                                                file_pos, qg_released, &ins);
2297 out:
2298         btrfs_free_path(path);
2299
2300         return ret;
2301 }
2302
2303 /* snapshot-aware defrag */
2304 struct sa_defrag_extent_backref {
2305         struct rb_node node;
2306         struct old_sa_defrag_extent *old;
2307         u64 root_id;
2308         u64 inum;
2309         u64 file_pos;
2310         u64 extent_offset;
2311         u64 num_bytes;
2312         u64 generation;
2313 };
2314
2315 struct old_sa_defrag_extent {
2316         struct list_head list;
2317         struct new_sa_defrag_extent *new;
2318
2319         u64 extent_offset;
2320         u64 bytenr;
2321         u64 offset;
2322         u64 len;
2323         int count;
2324 };
2325
2326 struct new_sa_defrag_extent {
2327         struct rb_root root;
2328         struct list_head head;
2329         struct btrfs_path *path;
2330         struct inode *inode;
2331         u64 file_pos;
2332         u64 len;
2333         u64 bytenr;
2334         u64 disk_len;
2335         u8 compress_type;
2336 };
2337
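/*
 * Ordering helper for the backref rbtree used by snapshot-aware defrag:
 * backrefs are sorted by root id, then inode number, then file position.
 * Equal keys can legitimately occur (see the diagram in the function) and
 * are simply inserted as separate nodes by backref_insert().
 */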
2338 static int backref_comp(struct sa_defrag_extent_backref *b1,
2339                         struct sa_defrag_extent_backref *b2)
2340 {
2341         if (b1->root_id < b2->root_id)
2342                 return -1;
2343         else if (b1->root_id > b2->root_id)
2344                 return 1;
2345
2346         if (b1->inum < b2->inum)
2347                 return -1;
2348         else if (b1->inum > b2->inum)
2349                 return 1;
2350
2351         if (b1->file_pos < b2->file_pos)
2352                 return -1;
2353         else if (b1->file_pos > b2->file_pos)
2354                 return 1;
2355
2356         /*
2357          * [------------------------------] ===> (a range of space)
2358          *     |<--->|   |<---->| =============> (fs/file tree A)
2359          * |<---------------------------->| ===> (fs/file tree B)
2360          *
2361          * A range of space can be covered by two file extents in one tree
2362          * while being covered by only one file extent in another tree.
2363          *
2364          * So we may process a disk offset more than once (two extents in A)
2365          * that lands in the same extent (one extent in B), and then insert
2366          * two identical backrefs (both referring to the extent in B).
2367          */
2368         return 0;
2369 }
2370
2371 static void backref_insert(struct rb_root *root,
2372                            struct sa_defrag_extent_backref *backref)
2373 {
2374         struct rb_node **p = &root->rb_node;
2375         struct rb_node *parent = NULL;
2376         struct sa_defrag_extent_backref *entry;
2377         int ret;
2378
2379         while (*p) {
2380                 parent = *p;
2381                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2382
2383                 ret = backref_comp(backref, entry);
2384                 if (ret < 0)
2385                         p = &(*p)->rb_left;
2386                 else
2387                         p = &(*p)->rb_right;
2388         }
2389
2390         rb_link_node(&backref->node, parent, p);
2391         rb_insert_color(&backref->node, root);
2392 }
2393
2394 /*
2395  * Note the backref might have changed, and in this case we just return 0.
2396  */
2397 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2398                                        void *ctx)
2399 {
2400         struct btrfs_file_extent_item *extent;
2401         struct old_sa_defrag_extent *old = ctx;
2402         struct new_sa_defrag_extent *new = old->new;
2403         struct btrfs_path *path = new->path;
2404         struct btrfs_key key;
2405         struct btrfs_root *root;
2406         struct sa_defrag_extent_backref *backref;
2407         struct extent_buffer *leaf;
2408         struct inode *inode = new->inode;
2409         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2410         int slot;
2411         int ret;
2412         u64 extent_offset;
2413         u64 num_bytes;
2414
2415         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2416             inum == btrfs_ino(BTRFS_I(inode)))
2417                 return 0;
2418
2419         key.objectid = root_id;
2420         key.type = BTRFS_ROOT_ITEM_KEY;
2421         key.offset = (u64)-1;
2422
2423         root = btrfs_read_fs_root_no_name(fs_info, &key);
2424         if (IS_ERR(root)) {
2425                 if (PTR_ERR(root) == -ENOENT)
2426                         return 0;
2427                 WARN_ON(1);
2428                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2429                          inum, offset, root_id);
2430                 return PTR_ERR(root);
2431         }
2432
2433         key.objectid = inum;
2434         key.type = BTRFS_EXTENT_DATA_KEY;
2435         if (offset > (u64)-1 << 32)
2436                 key.offset = 0;
2437         else
2438                 key.offset = offset;
2439
2440         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2441         if (WARN_ON(ret < 0))
2442                 return ret;
2443         ret = 0;
2444
2445         while (1) {
2446                 cond_resched();
2447
2448                 leaf = path->nodes[0];
2449                 slot = path->slots[0];
2450
2451                 if (slot >= btrfs_header_nritems(leaf)) {
2452                         ret = btrfs_next_leaf(root, path);
2453                         if (ret < 0) {
2454                                 goto out;
2455                         } else if (ret > 0) {
2456                                 ret = 0;
2457                                 goto out;
2458                         }
2459                         continue;
2460                 }
2461
2462                 path->slots[0]++;
2463
2464                 btrfs_item_key_to_cpu(leaf, &key, slot);
2465
2466                 if (key.objectid > inum)
2467                         goto out;
2468
2469                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2470                         continue;
2471
2472                 extent = btrfs_item_ptr(leaf, slot,
2473                                         struct btrfs_file_extent_item);
2474
2475                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2476                         continue;
2477
2478                 /*
2479                  * 'offset' refers to the exact key.offset,
2480                  * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2481                  * (key.offset - extent_offset).
2482                  */
2483                 if (key.offset != offset)
2484                         continue;
2485
2486                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2487                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2488
2489                 if (extent_offset >= old->extent_offset + old->offset +
2490                     old->len || extent_offset + num_bytes <=
2491                     old->extent_offset + old->offset)
2492                         continue;
2493                 break;
2494         }
2495
2496         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2497         if (!backref) {
2498                 ret = -ENOENT;
2499                 goto out;
2500         }
2501
2502         backref->root_id = root_id;
2503         backref->inum = inum;
2504         backref->file_pos = offset;
2505         backref->num_bytes = num_bytes;
2506         backref->extent_offset = extent_offset;
2507         backref->generation = btrfs_file_extent_generation(leaf, extent);
2508         backref->old = old;
2509         backref_insert(&new->root, backref);
2510         old->count++;
2511 out:
2512         btrfs_release_path(path);
2513         WARN_ON(ret);
2514         return ret;
2515 }
2516
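/*
 * Collect the backrefs of every old extent into new->root.  Old extents that
 * end up with no backrefs are removed from the list.  Returns true if there
 * is at least one old extent left to relink, false otherwise (or on error).
 */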
2517 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2518                                    struct new_sa_defrag_extent *new)
2519 {
2520         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2521         struct old_sa_defrag_extent *old, *tmp;
2522         int ret;
2523
2524         new->path = path;
2525
2526         list_for_each_entry_safe(old, tmp, &new->head, list) {
2527                 ret = iterate_inodes_from_logical(old->bytenr +
2528                                                   old->extent_offset, fs_info,
2529                                                   path, record_one_backref,
2530                                                   old, false);
2531                 if (ret < 0 && ret != -ENOENT)
2532                         return false;
2533
2534                 /* no backref to be processed for this extent */
2535                 if (!old->count) {
2536                         list_del(&old->list);
2537                         kfree(old);
2538                 }
2539         }
2540
2541         if (list_empty(&new->head))
2542                 return false;
2543
2544         return true;
2545 }
2546
2547 static int relink_is_mergable(struct extent_buffer *leaf,
2548                               struct btrfs_file_extent_item *fi,
2549                               struct new_sa_defrag_extent *new)
2550 {
2551         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2552                 return 0;
2553
2554         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2555                 return 0;
2556
2557         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2558                 return 0;
2559
2560         if (btrfs_file_extent_encryption(leaf, fi) ||
2561             btrfs_file_extent_other_encoding(leaf, fi))
2562                 return 0;
2563
2564         return 1;
2565 }
2566
2567 /*
2568  * Note that the backref might have changed, in which case we just return 0.
2569  */
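/*
 * Relink one recorded backref to the new (defragged) extent: verify the file
 * extent item still matches the recorded generation, drop the old extents in
 * the affected range, then insert (or merge) a file extent item pointing at
 * the new extent and add a delayed ref for it.  Returns 1 on success, 0 if
 * the backref went away, and a negative errno on error.
 */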
2570 static noinline int relink_extent_backref(struct btrfs_path *path,
2571                                  struct sa_defrag_extent_backref *prev,
2572                                  struct sa_defrag_extent_backref *backref)
2573 {
2574         struct btrfs_file_extent_item *extent;
2575         struct btrfs_file_extent_item *item;
2576         struct btrfs_ordered_extent *ordered;
2577         struct btrfs_trans_handle *trans;
2578         struct btrfs_ref ref = { 0 };
2579         struct btrfs_root *root;
2580         struct btrfs_key key;
2581         struct extent_buffer *leaf;
2582         struct old_sa_defrag_extent *old = backref->old;
2583         struct new_sa_defrag_extent *new = old->new;
2584         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2585         struct inode *inode;
2586         struct extent_state *cached = NULL;
2587         int ret = 0;
2588         u64 start;
2589         u64 len;
2590         u64 lock_start;
2591         u64 lock_end;
2592         bool merge = false;
2593         int index;
2594
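        /*
         * A backref from the same root/inode whose file range is contiguous
         * with the one relinked last time can be merged into a single file
         * extent item instead of inserting a new one.
         */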
2595         if (prev && prev->root_id == backref->root_id &&
2596             prev->inum == backref->inum &&
2597             prev->file_pos + prev->num_bytes == backref->file_pos)
2598                 merge = true;
2599
2600         /* step 1: get root */
2601         key.objectid = backref->root_id;
2602         key.type = BTRFS_ROOT_ITEM_KEY;
2603         key.offset = (u64)-1;
2604
2605         index = srcu_read_lock(&fs_info->subvol_srcu);
2606
2607         root = btrfs_read_fs_root_no_name(fs_info, &key);
2608         if (IS_ERR(root)) {
2609                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2610                 if (PTR_ERR(root) == -ENOENT)
2611                         return 0;
2612                 return PTR_ERR(root);
2613         }
2614
2615         if (btrfs_root_readonly(root)) {
2616                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2617                 return 0;
2618         }
2619
2620         /* step 2: get inode */
2621         key.objectid = backref->inum;
2622         key.type = BTRFS_INODE_ITEM_KEY;
2623         key.offset = 0;
2624
2625         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2626         if (IS_ERR(inode)) {
2627                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2628                 return 0;
2629         }
2630
2631         srcu_read_unlock(&fs_info->subvol_srcu, index);
2632
2633         /* step 3: relink backref */
2634         lock_start = backref->file_pos;
2635         lock_end = backref->file_pos + backref->num_bytes - 1;
2636         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2637                          &cached);
2638
2639         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2640         if (ordered) {
2641                 btrfs_put_ordered_extent(ordered);
2642                 goto out_unlock;
2643         }
2644
2645         trans = btrfs_join_transaction(root);
2646         if (IS_ERR(trans)) {
2647                 ret = PTR_ERR(trans);
2648                 goto out_unlock;
2649         }
2650
2651         key.objectid = backref->inum;
2652         key.type = BTRFS_EXTENT_DATA_KEY;
2653         key.offset = backref->file_pos;
2654
2655         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2656         if (ret < 0) {
2657                 goto out_free_path;
2658         } else if (ret > 0) {
2659                 ret = 0;
2660                 goto out_free_path;
2661         }
2662
2663         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2664                                 struct btrfs_file_extent_item);
2665
2666         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2667             backref->generation)
2668                 goto out_free_path;
2669
2670         btrfs_release_path(path);
2671
2672         start = backref->file_pos;
2673         if (backref->extent_offset < old->extent_offset + old->offset)
2674                 start += old->extent_offset + old->offset -
2675                          backref->extent_offset;
2676
2677         len = min(backref->extent_offset + backref->num_bytes,
2678                   old->extent_offset + old->offset + old->len);
2679         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2680
2681         ret = btrfs_drop_extents(trans, root, inode, start,
2682                                  start + len, 1);
2683         if (ret)
2684                 goto out_free_path;
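        /*
         * The old extents in the target range have been dropped; now either
         * extend the previous file extent item (when 'merge' is set) or
         * insert a fresh one pointing into the new extent.
         */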
2685 again:
2686         key.objectid = btrfs_ino(BTRFS_I(inode));
2687         key.type = BTRFS_EXTENT_DATA_KEY;
2688         key.offset = start;
2689
2690         path->leave_spinning = 1;
2691         if (merge) {
2692                 struct btrfs_file_extent_item *fi;
2693                 u64 extent_len;
2694                 struct btrfs_key found_key;
2695
2696                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2697                 if (ret < 0)
2698                         goto out_free_path;
2699
2700                 path->slots[0]--;
2701                 leaf = path->nodes[0];
2702                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2703
2704                 fi = btrfs_item_ptr(leaf, path->slots[0],
2705                                     struct btrfs_file_extent_item);
2706                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2707
2708                 if (extent_len + found_key.offset == start &&
2709                     relink_is_mergable(leaf, fi, new)) {
2710                         btrfs_set_file_extent_num_bytes(leaf, fi,
2711                                                         extent_len + len);
2712                         btrfs_mark_buffer_dirty(leaf);
2713                         inode_add_bytes(inode, len);
2714
2715                         ret = 1;
2716                         goto out_free_path;
2717                 } else {
2718                         merge = false;
2719                         btrfs_release_path(path);
2720                         goto again;
2721                 }
2722         }
2723
2724         ret = btrfs_insert_empty_item(trans, root, path, &key,
2725                                         sizeof(*extent));
2726         if (ret) {
2727                 btrfs_abort_transaction(trans, ret);
2728                 goto out_free_path;
2729         }
2730
2731         leaf = path->nodes[0];
2732         item = btrfs_item_ptr(leaf, path->slots[0],
2733                                 struct btrfs_file_extent_item);
2734         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2735         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2736         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2737         btrfs_set_file_extent_num_bytes(leaf, item, len);
2738         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2739         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2740         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2741         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2742         btrfs_set_file_extent_encryption(leaf, item, 0);
2743         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2744
2745         btrfs_mark_buffer_dirty(leaf);
2746         inode_add_bytes(inode, len);
2747         btrfs_release_path(path);
2748
2749         btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr,
2750                                new->disk_len, 0);
2751         btrfs_init_data_ref(&ref, backref->root_id, backref->inum,
2752                             new->file_pos);  /* equals start - extent_offset */
2753         ret = btrfs_inc_extent_ref(trans, &ref);
2754         if (ret) {
2755                 btrfs_abort_transaction(trans, ret);
2756                 goto out_free_path;
2757         }
2758
2759         ret = 1;
2760 out_free_path:
2761         btrfs_release_path(path);
2762         path->leave_spinning = 0;
2763         btrfs_end_transaction(trans);
2764 out_unlock:
2765         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2766                              &cached);
2767         iput(inode);
2768         return ret;
2769 }
2770
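/*
 * Free every old extent still queued on the list, then the new extent
 * record itself.
 */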
2771 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2772 {
2773         struct old_sa_defrag_extent *old, *tmp;
2774
2775         if (!new)
2776                 return;
2777
2778         list_for_each_entry_safe(old, tmp, &new->head, list) {
2779                 kfree(old);
2780         }
2781         kfree(new);
2782 }
2783
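/*
 * Relink phase of snapshot-aware defrag: record the remaining backrefs for
 * the old extents, walk them in sorted order and relink each one to the new
 * extent, then drop the defrag_running count taken when the old extents were
 * recorded and wake up waiters.
 */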
2784 static void relink_file_extents(struct new_sa_defrag_extent *new)
2785 {
2786         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2787         struct btrfs_path *path;
2788         struct sa_defrag_extent_backref *backref;
2789         struct sa_defrag_extent_backref *prev = NULL;
2790         struct rb_node *node;
2791         int ret;
2792
2793         path = btrfs_alloc_path();
2794         if (!path)
2795                 return;
2796
2797         if (!record_extent_backrefs(path, new)) {
2798                 btrfs_free_path(path);
2799                 goto out;
2800         }
2801         btrfs_release_path(path);
2802
2803         while (1) {
2804                 node = rb_first(&new->root);
2805                 if (!node)
2806                         break;
2807                 rb_erase(node, &new->root);
2808
2809                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2810
2811                 ret = relink_extent_backref(path, prev, backref);
2812                 WARN_ON(ret < 0);
2813
2814                 kfree(prev);
2815
2816                 if (ret == 1)
2817                         prev = backref;
2818                 else
2819                         prev = NULL;
2820                 cond_resched();
2821         }
2822         kfree(prev);
2823
2824         btrfs_free_path(path);
2825 out:
2826         free_sa_defrag_extent(new);
2827
2828         atomic_dec(&fs_info->defrag_running);
2829         wake_up(&fs_info->transaction_wait);
2830 }
2831
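/*
 * First phase of snapshot-aware defrag: scan the file extent items that
 * overlap the just-written ordered extent and remember them as "old" extents
 * so they can later be relinked to the new extent.  Bumps
 * fs_info->defrag_running, which relink_file_extents() drops again.
 */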
2832 static struct new_sa_defrag_extent *
2833 record_old_file_extents(struct inode *inode,
2834                         struct btrfs_ordered_extent *ordered)
2835 {
2836         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2837         struct btrfs_root *root = BTRFS_I(inode)->root;
2838         struct btrfs_path *path;
2839         struct btrfs_key key;
2840         struct old_sa_defrag_extent *old;
2841         struct new_sa_defrag_extent *new;
2842         int ret;
2843
2844         new = kmalloc(sizeof(*new), GFP_NOFS);
2845         if (!new)
2846                 return NULL;
2847
2848         new->inode = inode;
2849         new->file_pos = ordered->file_offset;
2850         new->len = ordered->len;
2851         new->bytenr = ordered->start;
2852         new->disk_len = ordered->disk_len;
2853         new->compress_type = ordered->compress_type;
2854         new->root = RB_ROOT;
2855         INIT_LIST_HEAD(&new->head);
2856
2857         path = btrfs_alloc_path();
2858         if (!path)
2859                 goto out_kfree;
2860
2861         key.objectid = btrfs_ino(BTRFS_I(inode));
2862         key.type = BTRFS_EXTENT_DATA_KEY;
2863         key.offset = new->file_pos;
2864
2865         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2866         if (ret < 0)
2867                 goto out_free_path;
2868         if (ret > 0 && path->slots[0] > 0)
2869                 path->slots[0]--;
2870
2871         /* find all the old extents that overlap the range of the new extent */
2872         while (1) {
2873                 struct btrfs_file_extent_item *extent;
2874                 struct extent_buffer *l;
2875                 int slot;
2876                 u64 num_bytes;
2877                 u64 offset;
2878                 u64 end;
2879                 u64 disk_bytenr;
2880                 u64 extent_offset;
2881
2882                 l = path->nodes[0];
2883                 slot = path->slots[0];
2884
2885                 if (slot >= btrfs_header_nritems(l)) {
2886                         ret = btrfs_next_leaf(root, path);
2887                         if (ret < 0)
2888                                 goto out_free_path;
2889                         else if (ret > 0)
2890                                 break;
2891                         continue;
2892                 }
2893
2894                 btrfs_item_key_to_cpu(l, &key, slot);
2895
2896                 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2897                         break;
2898                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2899                         break;
2900                 if (key.offset >= new->file_pos + new->len)
2901                         break;
2902
2903                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2904
2905                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2906                 if (key.offset + num_bytes < new->file_pos)
2907                         goto next;
2908
2909                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2910                 if (!disk_bytenr)
2911                         goto next;
2912
2913                 extent_offset = btrfs_file_extent_offset(l, extent);
2914
2915                 old = kmalloc(sizeof(*old), GFP_NOFS);
2916                 if (!old)
2917                         goto out_free_path;
2918
2919                 offset = max(new->file_pos, key.offset);
2920                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2921
2922                 old->bytenr = disk_bytenr;
2923                 old->extent_offset = extent_offset;
2924                 old->offset = offset - key.offset;
2925                 old->len = end - offset;
2926                 old->new = new;
2927                 old->count = 0;
2928                 list_add_tail(&old->list, &new->head);
2929 next:
2930                 path->slots[0]++;
2931                 cond_resched();
2932         }
2933
2934         btrfs_free_path(path);
2935         atomic_inc(&fs_info->defrag_running);
2936
2937         return new;
2938
2939 out_free_path:
2940         btrfs_free_path(path);
2941 out_kfree:
2942         free_sa_defrag_extent(new);
2943         return NULL;
2944 }
2945
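/*
 * Subtract @len from the delalloc byte counter of the block group that
 * contains @start.
 */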
2946 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2947                                          u64 start, u64 len)
2948 {
2949         struct btrfs_block_group_cache *cache;
2950
2951         cache = btrfs_lookup_block_group(fs_info, start);
2952         ASSERT(cache);
2953
2954         spin_lock(&cache->lock);
2955         cache->delalloc_bytes -= len;
2956         spin_unlock(&cache->lock);
2957
2958         btrfs_put_block_group(cache);
2959 }
2960
2961 /* As ordered data IO finishes, this gets called so we can finish
2962  * an ordered extent once the range of bytes in the file it covers is
2963  * fully written.
2964  */
2965 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2966 {
2967         struct inode *inode = ordered_extent->inode;
2968         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2969         struct btrfs_root *root = BTRFS_I(inode)->root;
2970         struct btrfs_trans_handle *trans = NULL;
2971         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2972         struct extent_state *cached_state = NULL;
2973         struct new_sa_defrag_extent *new = NULL;
2974         int compress_type = 0;
2975         int ret = 0;
2976         u64 logical_len = ordered_extent->len;
2977         bool nolock;
2978         bool truncated = false;
2979         bool range_locked = false;
2980         bool clear_new_delalloc_bytes = false;
2981         bool clear_reserved_extent = true;
2982
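        /*
         * New-delalloc accounting only needs to be cleared for regular CoW
         * writes, not for NOCOW, preallocated or direct I/O ordered extents.
         */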
2983         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2984             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2985             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2986                 clear_new_delalloc_bytes = true;
2987
2988         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2989
2990         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2991                 ret = -EIO;
2992                 goto out;
2993         }
2994
2995         btrfs_free_io_failure_record(BTRFS_I(inode),
2996                         ordered_extent->file_offset,
2997                         ordered_extent->file_offset +
2998                         ordered_extent->len - 1);
2999
3000         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3001                 truncated = true;
3002                 logical_len = ordered_extent->truncated_len;
3003                 /* The entire extent was truncated, don't bother adding a new one */
3004                 if (!logical_len)
3005                         goto out;
3006         }
3007
3008         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3009                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3010
3011                 /*
3012                  * For the mwrite (mmap + memset to write) case, we still
3013                  * reserve space for the NOCOW range.
3014                  * As NOCOW won't cause a new delayed ref, just free the space.
3015                  */
3016                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3017                                        ordered_extent->len);
3018                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3019                 if (nolock)
3020                         trans = btrfs_join_transaction_nolock(root);
3021                 else
3022                         trans = btrfs_join_transaction(root);
3023                 if (IS_ERR(trans)) {
3024                         ret = PTR_ERR(trans);
3025                         trans = NULL;
3026                         goto out;
3027                 }
3028                 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3029                 ret = btrfs_update_inode_fallback(trans, root, inode);
3030                 if (ret) /* -ENOMEM or corruption */
3031                         btrfs_abort_transaction(trans, ret);
3032                 goto out;
3033         }
3034
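        /*
         * Lock the whole range of the ordered extent so the file extent
         * insertion below cannot race with other extent state changes.
         */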
3035         range_locked = true;
3036         lock_extent_bits(io_tree, ordered_extent->file_offset,
3037                          ordered_extent->file_offset + ordered_extent->len - 1,
3038                          &cached_state);
3039
3040         ret = test_range_bit(io_tree, ordered_extent->file_offset,
3041                         ordered_extent->file_offset + ordered_extent->len - 1,
3042                         EXTENT_DEFRAG, 0, cached_state);
3043         if (ret) {
3044                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
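                /*
                 * Snapshot-aware defrag is currently disabled, hence the
                 * "0 &&" below, so no old extents get recorded here.
                 */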
3045                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3046                         /* the inode is shared */
3047                         new = record_old_file_extents(inode, ordered_extent);
3048
3049                 clear_extent_bit(io_tree, ordered_extent->file_offset,
3050                         ordered_extent->file_offset + ordered_extent->len - 1,
3051                         EXTENT_DEFRAG, 0, 0, &cached_state);
3052         }
3053
3054         if (nolock)
3055                 trans = btrfs_join_transaction_nolock(root);
3056         else
3057                 trans = btrfs_join_transaction(root);
3058         if (IS_ERR(trans)) {
3059                 ret = PTR_ERR(trans);
3060                 trans = NULL;
3061                 goto out;
3062         }
3063
3064         trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3065
3066         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3067                 compress_type = ordered_extent->compress_type;
3068         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3069                 BUG_ON(compress_type);
3070                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3071                                        ordered_extent->len);
3072                 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3073                                                 ordered_extent->file_offset,
3074                                                 ordered_extent->file_offset +
3075                                                 logical_len);
3076         } else {
3077                 BUG_ON(root == fs_info->tree_root);
3078                 ret = insert_reserved_file_extent(trans, inode,
3079                                                 ordered_extent->file_offset,
3080                                                 ordered_extent->start,