btrfs: Remove btrfs_inode::delayed_iput_count
fs/btrfs/inode.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

struct btrfs_dio_data {
        u64 reserve;
        u64 unsubmitted_oe_range_start;
        u64 unsubmitted_oe_range_end;
        int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};
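
/*
 * Illustrative sketch (not in the original source): directory items
 * record one of the BTRFS_FT_* values above, and this table is indexed
 * by shifting the S_IFMT bits of the VFS mode down by S_SHIFT, the same
 * lookup btrfs_inode_type() performs further down in this file.  The
 * helper name below is hypothetical.
 */
static inline unsigned char btrfs_type_by_mode_example(umode_t mode)
{
        /* e.g. (S_IFDIR & S_IFMT) >> 12 == 4, which maps to BTRFS_FT_DIR */
        return btrfs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}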

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static void __endio_write_update_ordered(struct inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the fill_delalloc() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
                                                 const u64 offset,
                                                 const u64 bytes)
{
        unsigned long index = offset >> PAGE_SHIFT;
        unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
        struct page *page;

        while (index <= end_index) {
                page = find_get_page(inode->i_mapping, index);
                index++;
                if (!page)
                        continue;
                ClearPagePrivate2(page);
                put_page(page);
        }
        return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
                                            bytes - PAGE_SIZE, false);
}
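
/*
 * Worked example for the range math above (illustrative, assuming 4K
 * pages): for offset == 8192 and bytes == 8192, index == 2 and
 * end_index == (8192 + 8192 - 1) >> 12 == 3, so pages 2 and 3 get
 * Private2 cleared.  The ordered extent is then finalized starting at
 * offset + PAGE_SIZE == 12288 for bytes - PAGE_SIZE == 4096 bytes,
 * since the caller already ran the end-io hook for the first page.
 */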

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

fail:
        return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
                                          struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &BTRFS_I(inode)->block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
        /*
         * Don't forget to free the reserved space.  An inlined extent
         * doesn't count as a data extent, so free it directly here.
         * And at reserve time, it's always aligned to page size, so
         * just free one page here.
         */
        btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}
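
/*
 * Worked example for the eligibility checks above (illustrative,
 * assuming a 4K sectorsize and a large enough max_inline): writing
 * bytes 0-499 of a 500 byte file passes every test (start == 0,
 * actual_end == 500 <= sectorsize, and end + 1 == 500 is not below
 * isize), so the data may be stored inline in the btree.  The same 500
 * bytes written at offset 8192 fail the start > 0 check and take the
 * regular extent path.
 */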

struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        unsigned int write_flags;
        struct list_head extents;
        struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* defrag ioctl */
        if (BTRFS_I(inode)->defrag_compress)
                return 1;
        /* bad compression ratios */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
            BTRFS_I(inode)->prop_compress)
                return btrfs_compress_heuristic(inode, start, end);
        return 0;
}
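
/*
 * Descriptive note (not in the original source): the checks above are
 * ordered so that -o compress-force always wins, a defrag request with
 * a compression type comes next, then the per-inode NOCOMPRESS flag
 * vetoes the rest; only the plain compress mount option, the inode's
 * COMPRESS flag and the compression property consult the heuristic.
 * So e.g. mounting with compress-force still compresses writes to an
 * inode previously flagged NOCOMPRESS.
 */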

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode, start, end)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->defrag_compress)
                        compress_type = BTRFS_I(inode)->defrag_compress;
                else if (BTRFS_I(inode)->prop_compress)
                        compress_type = BTRFS_I(inode)->prop_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 *
                 * Note that the remaining part is redirtied, the start pointer
                 * has moved, the end is the original one.
                 */
                if (!redirty) {
                        extent_range_clear_dirty_for_io(inode, start, end);
                        redirty = 1;
                }

                /* Compression level is applied here and only here */
                ret = btrfs_compress_pages(
                        compress_type | (fs_info->compress_level << 4),
                                           inode->i_mapping, start,
                                           pages,
                                           &nr_pages,
                                           &total_in,
                                           &total_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_SIZE - 1);
                        struct page *page = pages[nr_pages - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < actual_end) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(root, inode, start, end,
                                            0, BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                EXTENT_DO_ACCOUNTING;
                        unsigned long page_error_op;

                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         *
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be done _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, clear_flags,
                                                     PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win; compare the page count read with the blocks on disk,
                 * compression must free at least one sector size
                 */
                total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed + blocksize <= total_in) {
                        *num_added += 1;

                        /*
                         * The async work queues will take care of doing actual
                         * allocation on disk for these compressed pages, and
                         * will submit them to the elevator.
                         */
                        add_async_extent(async_cow, start, total_in,
                                        total_compressed, pages, nr_pages,
                                        compress_type);

                        if (start + total_in < end) {
                                start += total_in;
                                pages = NULL;
                                cond_resched();
                                goto again;
                        }
                        return;
                }
        }
        if (pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages; i++) {
                        WARN_ON(pages[i]->mapping);
                        put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->prop_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
cleanup_and_bail_uncompressed:
        /*
         * No compression, but we still need to write the pages in the file
         * we've been given so far.  redirty the locked page if it corresponds
         * to our extent and set things up for the async work queue to run
         * cow_file_range to do the normal delalloc dance.
         */
        if (page_offset(locked_page) >= start &&
            page_offset(locked_page) <= end)
                __set_page_dirty_nobuffers(locked_page);
                /* unlocked later on in the async handlers */

        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
                         BTRFS_COMPRESS_NONE);
        *num_added += 1;

        return;

free_pages_out:
        for (i = 0; i < nr_pages; i++) {
                WARN_ON(pages[i]->mapping);
                put_page(pages[i]);
        }
        kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->pages)
                return;

        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
                put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
        async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree;
        int ret = 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
                                             NULL);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(inode,
                                                  async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        free_async_extent_pages(async_extent);

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fall back to uncompressed IO, otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                em = create_io_em(inode, async_extent->start,
                                  async_extent->ram_size, /* len */
                                  async_extent->start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  async_extent->ram_size, /* ram_bytes */
                                  async_extent->compress_type,
                                  BTRFS_ORDERED_COMPRESSED);
                if (IS_ERR(em))
                        /* ret value is not necessary due to void function */
                        goto out_free_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(BTRFS_I(inode),
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                if (btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages,
                                    async_cow->write_flags)) {
                        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
                        struct page *p = async_extent->pages[0];
                        const u64 start = async_extent->start;
                        const u64 end = start + async_extent->ram_size - 1;

                        p->mapping = inode->i_mapping;
                        tree->ops->writepage_end_io_hook(p, start, end,
                                                         NULL, 0);
                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, 0,
                                                     PAGE_END_WRITEBACK |
                                                     PAGE_SET_ERROR);
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        return;
out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}
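
/*
 * Descriptive note (not in the original source): block_start values at
 * or above EXTENT_MAP_LAST_BYTE are sentinels (hole, inline, delalloc)
 * rather than real disk bytenrs, which is why the code above retries
 * with the inode's first mapped extent before giving up on a hint.
 */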

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in RAM to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
        ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(root, inode, start, end, 0,
                                        BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        /*
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be run _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);

        while (num_bytes > 0) {
                cur_alloc_size = num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                extent_reserved = true;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em))
                        goto out_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
                         * Only drop cache here, and process as normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at out_unlock label to free meta of this ordered
                         * extent, as its meta should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_cache(BTRFS_I(inode), start,
                                                start + ram_size - 1, 0);
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                page_ops = unlock ? PAGE_UNLOCK : 0;
                page_ops |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                if (num_bytes < cur_alloc_size)
                        num_bytes = 0;
                else
                        num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * btrfs_reloc_clone_csums() error, since start is increased
                 * extent_clear_unlock_delalloc() at out_unlock label won't
                 * free metadata of current ordered extent, we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                PAGE_END_WRITEBACK;
        /*
         * If we reserved an extent for our delalloc range (or a subrange) and
         * failed to create the respective ordered extent, then it means that
         * when we reserved the extent we decremented the extent's size from
         * the data space_info's bytes_may_use counter and incremented the
         * space_info's bytes_reserved counter by the same amount. We must make
         * sure extent_clear_unlock_delalloc() does not try to decrement again
         * the data space_info's bytes_may_use counter, therefore we do not pass
         * it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size,
                                             start + cur_alloc_size,
                                             locked_page,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
                if (start >= end)
                        goto out;
        }
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
        goto out;
}

/*
 * work queue callback to start compression on a file's pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}
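
/*
 * Descriptive note (not in the original source): when compression added
 * no async extents, the inode reference taken for this work item is
 * dropped above and ->inode is cleared, so async_cow_submit() below
 * skips the submit phase for this work item entirely.
 */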

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info;
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        fs_info = root->fs_info;
        nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /*
         * atomic_sub_return implies a barrier for waitqueue_active
         */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written,
                                unsigned int write_flags)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;
                async_cow->write_flags = write_flags;

                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
                    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
                        cur_end = end;
                else
                        cur_end = min(end, start + SZ_512K - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                btrfs_init_work(&async_cow->work,
                                btrfs_delalloc_helper,
                                async_cow_start, async_cow_submit,
                                async_cow_free);

                nr_pages = (cur_end - start + PAGE_SIZE) >>
                        PAGE_SHIFT;
                atomic_add(nr_pages, &fs_info->async_delalloc_pages);

                btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}
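
/*
 * Worked example for the chunking above (illustrative, assuming 4K
 * pages): compressible ranges are carved into SZ_512K chunks, so for
 * start == 0 and cur_end == SZ_512K - 1 the math gives
 * nr_pages == (524287 - 0 + 4096) >> 12 == 128, i.e. one async_cow
 * work item covering pages 0-127 is queued to the delalloc workers.
 */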

static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        return 1;
}
1262
1263 /*
1264  * Run when the nocow writeback path is taken.  This checks for snapshots or
1265  * COW copies of the extents that exist in the file, and COWs the file as required.
1266  *
1267  * If no cow copies or snapshots exist, we write directly to the existing
1268  * blocks on disk
1269  */
1270 static noinline int run_delalloc_nocow(struct inode *inode,
1271                                        struct page *locked_page,
1272                               u64 start, u64 end, int *page_started, int force,
1273                               unsigned long *nr_written)
1274 {
1275         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1276         struct btrfs_root *root = BTRFS_I(inode)->root;
1277         struct extent_buffer *leaf;
1278         struct btrfs_path *path;
1279         struct btrfs_file_extent_item *fi;
1280         struct btrfs_key found_key;
1281         struct extent_map *em;
1282         u64 cow_start;
1283         u64 cur_offset;
1284         u64 extent_end;
1285         u64 extent_offset;
1286         u64 disk_bytenr;
1287         u64 num_bytes;
1288         u64 disk_num_bytes;
1289         u64 ram_bytes;
1290         int extent_type;
1291         int ret, err;
1292         int type;
1293         int nocow;
1294         int check_prev = 1;
1295         bool nolock;
1296         u64 ino = btrfs_ino(BTRFS_I(inode));
1297
1298         path = btrfs_alloc_path();
1299         if (!path) {
1300                 extent_clear_unlock_delalloc(inode, start, end, end,
1301                                              locked_page,
1302                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1303                                              EXTENT_DO_ACCOUNTING |
1304                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1305                                              PAGE_CLEAR_DIRTY |
1306                                              PAGE_SET_WRITEBACK |
1307                                              PAGE_END_WRITEBACK);
1308                 return -ENOMEM;
1309         }
1310
1311         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1312
1313         cow_start = (u64)-1;
1314         cur_offset = start;
1315         while (1) {
1316                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1317                                                cur_offset, 0);
1318                 if (ret < 0)
1319                         goto error;
1320                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1321                         leaf = path->nodes[0];
1322                         btrfs_item_key_to_cpu(leaf, &found_key,
1323                                               path->slots[0] - 1);
1324                         if (found_key.objectid == ino &&
1325                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1326                                 path->slots[0]--;
1327                 }
1328                 check_prev = 0;
1329 next_slot:
1330                 leaf = path->nodes[0];
1331                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1332                         ret = btrfs_next_leaf(root, path);
1333                         if (ret < 0) {
1334                                 if (cow_start != (u64)-1)
1335                                         cur_offset = cow_start;
1336                                 goto error;
1337                         }
1338                         if (ret > 0)
1339                                 break;
1340                         leaf = path->nodes[0];
1341                 }
1342
1343                 nocow = 0;
1344                 disk_bytenr = 0;
1345                 num_bytes = 0;
1346                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1347
1348                 if (found_key.objectid > ino)
1349                         break;
1350                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1351                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1352                         path->slots[0]++;
1353                         goto next_slot;
1354                 }
1355                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1356                     found_key.offset > end)
1357                         break;
1358
1359                 if (found_key.offset > cur_offset) {
1360                         extent_end = found_key.offset;
1361                         extent_type = 0;
1362                         goto out_check;
1363                 }
1364
1365                 fi = btrfs_item_ptr(leaf, path->slots[0],
1366                                     struct btrfs_file_extent_item);
1367                 extent_type = btrfs_file_extent_type(leaf, fi);
1368
1369                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1370                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1371                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1372                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1373                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1374                         extent_end = found_key.offset +
1375                                 btrfs_file_extent_num_bytes(leaf, fi);
1376                         disk_num_bytes =
1377                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1378                         if (extent_end <= start) {
1379                                 path->slots[0]++;
1380                                 goto next_slot;
1381                         }
1382                         if (disk_bytenr == 0)
1383                                 goto out_check;
1384                         if (btrfs_file_extent_compression(leaf, fi) ||
1385                             btrfs_file_extent_encryption(leaf, fi) ||
1386                             btrfs_file_extent_other_encoding(leaf, fi))
1387                                 goto out_check;
1388                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1389                                 goto out_check;
1390                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1391                                 goto out_check;
1392                         if (btrfs_cross_ref_exist(root, ino,
1393                                                   found_key.offset -
1394                                                   extent_offset, disk_bytenr))
1395                                 goto out_check;
1396                         disk_bytenr += extent_offset;
1397                         disk_bytenr += cur_offset - found_key.offset;
1398                         num_bytes = min(end + 1, extent_end) - cur_offset;
1399                         /*
1400                          * If there are pending snapshots for this root,
1401                          * we fall back to the common COW path.
1402                          */
1403                         if (!nolock) {
1404                                 err = btrfs_start_write_no_snapshotting(root);
1405                                 if (!err)
1406                                         goto out_check;
1407                         }
1408                         /*
1409                          * Force cow if csums exist in the range.
1410                          * This ensures that the csums for a given extent
1411                          * are either valid or do not exist.
1412                          */
1413                         if (csum_exist_in_range(fs_info, disk_bytenr,
1414                                                 num_bytes)) {
1415                                 if (!nolock)
1416                                         btrfs_end_write_no_snapshotting(root);
1417                                 goto out_check;
1418                         }
1419                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1420                                 if (!nolock)
1421                                         btrfs_end_write_no_snapshotting(root);
1422                                 goto out_check;
1423                         }
1424                         nocow = 1;
1425                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1426                         extent_end = found_key.offset +
1427                                 btrfs_file_extent_inline_len(leaf,
1428                                                      path->slots[0], fi);
1429                         extent_end = ALIGN(extent_end,
1430                                            fs_info->sectorsize);
1431                 } else {
1432                         BUG_ON(1);
1433                 }
1434 out_check:
1435                 if (extent_end <= start) {
1436                         path->slots[0]++;
1437                         if (!nolock && nocow)
1438                                 btrfs_end_write_no_snapshotting(root);
1439                         if (nocow)
1440                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1441                         goto next_slot;
1442                 }
1443                 if (!nocow) {
1444                         if (cow_start == (u64)-1)
1445                                 cow_start = cur_offset;
1446                         cur_offset = extent_end;
1447                         if (cur_offset > end)
1448                                 break;
1449                         path->slots[0]++;
1450                         goto next_slot;
1451                 }
1452
1453                 btrfs_release_path(path);
1454                 if (cow_start != (u64)-1) {
1455                         ret = cow_file_range(inode, locked_page,
1456                                              cow_start, found_key.offset - 1,
1457                                              end, page_started, nr_written, 1,
1458                                              NULL);
1459                         if (ret) {
1460                                 if (!nolock && nocow)
1461                                         btrfs_end_write_no_snapshotting(root);
1462                                 if (nocow)
1463                                         btrfs_dec_nocow_writers(fs_info,
1464                                                                 disk_bytenr);
1465                                 goto error;
1466                         }
1467                         cow_start = (u64)-1;
1468                 }
1469
1470                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1471                         u64 orig_start = found_key.offset - extent_offset;
1472
1473                         em = create_io_em(inode, cur_offset, num_bytes,
1474                                           orig_start,
1475                                           disk_bytenr, /* block_start */
1476                                           num_bytes, /* block_len */
1477                                           disk_num_bytes, /* orig_block_len */
1478                                           ram_bytes, BTRFS_COMPRESS_NONE,
1479                                           BTRFS_ORDERED_PREALLOC);
1480                         if (IS_ERR(em)) {
1481                                 if (!nolock && nocow)
1482                                         btrfs_end_write_no_snapshotting(root);
1483                                 if (nocow)
1484                                         btrfs_dec_nocow_writers(fs_info,
1485                                                                 disk_bytenr);
1486                                 ret = PTR_ERR(em);
1487                                 goto error;
1488                         }
1489                         free_extent_map(em);
1490                 }
1491
1492                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1493                         type = BTRFS_ORDERED_PREALLOC;
1494                 } else {
1495                         type = BTRFS_ORDERED_NOCOW;
1496                 }
1497
1498                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1499                                                num_bytes, num_bytes, type);
1500                 if (nocow)
1501                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1502                 BUG_ON(ret); /* -ENOMEM */
1503
1504                 if (root->root_key.objectid ==
1505                     BTRFS_DATA_RELOC_TREE_OBJECTID)
1506                         /*
1507                          * Error handled later, as we must prevent
1508                          * extent_clear_unlock_delalloc() in error handler
1509                          * from freeing metadata of created ordered extent.
1510                          */
1511                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1512                                                       num_bytes);
1513
1514                 extent_clear_unlock_delalloc(inode, cur_offset,
1515                                              cur_offset + num_bytes - 1, end,
1516                                              locked_page, EXTENT_LOCKED |
1517                                              EXTENT_DELALLOC |
1518                                              EXTENT_CLEAR_DATA_RESV,
1519                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1520
1521                 if (!nolock && nocow)
1522                         btrfs_end_write_no_snapshotting(root);
1523                 cur_offset = extent_end;
1524
1525                 /*
1526                  * If btrfs_reloc_clone_csums() failed, it is now safe to run
1527                  * the error handler, as the metadata for the created ordered
1528                  * extent will only be freed by btrfs_finish_ordered_io().
1529                  */
1530                 if (ret)
1531                         goto error;
1532                 if (cur_offset > end)
1533                         break;
1534         }
1535         btrfs_release_path(path);
1536
1537         if (cur_offset <= end && cow_start == (u64)-1) {
1538                 cow_start = cur_offset;
1539                 cur_offset = end;
1540         }
1541
1542         if (cow_start != (u64)-1) {
1543                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1544                                      page_started, nr_written, 1, NULL);
1545                 if (ret)
1546                         goto error;
1547         }
1548
1549 error:
1550         if (ret && cur_offset < end)
1551                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1552                                              locked_page, EXTENT_LOCKED |
1553                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1554                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1555                                              PAGE_CLEAR_DIRTY |
1556                                              PAGE_SET_WRITEBACK |
1557                                              PAGE_END_WRITEBACK);
1558         btrfs_free_path(path);
1559         return ret;
1560 }
1561
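/*
 * For NODATACOW or PREALLOC inodes, decide whether we must COW anyway:
 * return 1 when the range carries the EXTENT_DEFRAG bit while a defrag
 * is in flight (defrag_bytes != 0), 0 otherwise.
 */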
1562 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1563 {
1564
1565         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1566             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1567                 return 0;
1568
1569         /*
1570          * @defrag_bytes is a hint value, no spinlock is held here:
1571          * if it is not zero, it means the file is being defragged.
1572          * Force cow if the given extent needs to be defragged.
1573          */
1574         if (BTRFS_I(inode)->defrag_bytes &&
1575             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1576                            EXTENT_DEFRAG, 0, NULL))
1577                 return 1;
1578
1579         return 0;
1580 }
1581
1582 /*
1583  * extent_io.c callback to do delayed allocation processing
1584  */
1585 static int run_delalloc_range(void *private_data, struct page *locked_page,
1586                               u64 start, u64 end, int *page_started,
1587                               unsigned long *nr_written,
1588                               struct writeback_control *wbc)
1589 {
1590         struct inode *inode = private_data;
1591         int ret;
1592         int force_cow = need_force_cow(inode, start, end);
1593         unsigned int write_flags = wbc_to_write_flags(wbc);
1594
1595         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1596                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1597                                          page_started, 1, nr_written);
1598         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1599                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1600                                          page_started, 0, nr_written);
1601         } else if (!inode_need_compress(inode, start, end)) {
1602                 ret = cow_file_range(inode, locked_page, start, end, end,
1603                                       page_started, nr_written, 1, NULL);
1604         } else {
1605                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1606                         &BTRFS_I(inode)->runtime_flags);
1607                 ret = cow_file_range_async(inode, locked_page, start, end,
1608                                            page_started, nr_written,
1609                                            write_flags);
1610         }
1611         if (ret)
1612                 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1613         return ret;
1614 }
1615
1616 static void btrfs_split_extent_hook(void *private_data,
1617                                     struct extent_state *orig, u64 split)
1618 {
1619         struct inode *inode = private_data;
1620         u64 size;
1621
1622         /* not delalloc, ignore it */
1623         if (!(orig->state & EXTENT_DELALLOC))
1624                 return;
1625
1626         size = orig->end - orig->start + 1;
1627         if (size > BTRFS_MAX_EXTENT_SIZE) {
1628                 u32 num_extents;
1629                 u64 new_size;
1630
1631                 /*
1632                  * See the explanation in btrfs_merge_extent_hook, the same
1633                  * applies here, just in reverse.
1634                  */
1635                 new_size = orig->end - split + 1;
1636                 num_extents = count_max_extents(new_size);
1637                 new_size = split - orig->start;
1638                 num_extents += count_max_extents(new_size);
1639                 if (count_max_extents(size) >= num_extents)
1640                         return;
1641         }
1642
1643         spin_lock(&BTRFS_I(inode)->lock);
1644         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1645         spin_unlock(&BTRFS_I(inode)->lock);
1646 }
1647
1648 /*
1649  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1650  * extents so we can keep track of new extents that are just merged onto old
1651  * extents, such as when we are doing sequential writes, so we can properly
1652  * account for the metadata space we'll need.
1653  */
1654 static void btrfs_merge_extent_hook(void *private_data,
1655                                     struct extent_state *new,
1656                                     struct extent_state *other)
1657 {
1658         struct inode *inode = private_data;
1659         u64 new_size, old_size;
1660         u32 num_extents;
1661
1662         /* not delalloc, ignore it */
1663         if (!(other->state & EXTENT_DELALLOC))
1664                 return;
1665
1666         if (new->start > other->start)
1667                 new_size = new->end - other->start + 1;
1668         else
1669                 new_size = other->end - new->start + 1;
1670
1671         /* we're not bigger than the max, unreserve the space and go */
1672         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1673                 spin_lock(&BTRFS_I(inode)->lock);
1674                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1675                 spin_unlock(&BTRFS_I(inode)->lock);
1676                 return;
1677         }
1678
1679         /*
1680          * We have to add up either side to figure out how many extents were
1681          * accounted for before we merged into one big extent.  If the number of
1682          * extents we accounted for is <= the amount we need for the new range
1683          * then we can return, otherwise drop.  Think of it like this
1684          *
1685          * [ 4k][MAX_SIZE]
1686          *
1687          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1688          * need 2 outstanding extents, on one side we have 1 and the other side
1689          * we have 1 so they are == and we can return.  But in this case
1690          *
1691          * [MAX_SIZE+4k][MAX_SIZE+4k]
1692          *
1693          * Each range on their own accounts for 2 extents, but merged together
1694          * they are only 3 extents worth of accounting, so we need to drop in
1695          * this case.
1696          */
1697         old_size = other->end - other->start + 1;
1698         num_extents = count_max_extents(old_size);
1699         old_size = new->end - new->start + 1;
1700         num_extents += count_max_extents(old_size);
1701         if (count_max_extents(new_size) >= num_extents)
1702                 return;
1703
1704         spin_lock(&BTRFS_I(inode)->lock);
1705         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1706         spin_unlock(&BTRFS_I(inode)->lock);
1707 }
1708
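/*
 * Put the inode on its root's delalloc list.  If it is the root's first
 * delalloc inode, the root itself is added to the fs-wide list of roots
 * with pending delalloc.
 */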
1709 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1710                                       struct inode *inode)
1711 {
1712         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1713
1714         spin_lock(&root->delalloc_lock);
1715         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1716                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1717                               &root->delalloc_inodes);
1718                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1719                         &BTRFS_I(inode)->runtime_flags);
1720                 root->nr_delalloc_inodes++;
1721                 if (root->nr_delalloc_inodes == 1) {
1722                         spin_lock(&fs_info->delalloc_root_lock);
1723                         BUG_ON(!list_empty(&root->delalloc_root));
1724                         list_add_tail(&root->delalloc_root,
1725                                       &fs_info->delalloc_roots);
1726                         spin_unlock(&fs_info->delalloc_root_lock);
1727                 }
1728         }
1729         spin_unlock(&root->delalloc_lock);
1730 }
1731
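/*
 * Mirror of btrfs_add_delalloc_inodes(): take the inode off the root's
 * delalloc list and, when it was the last one, drop the root from the
 * fs-wide delalloc roots list.
 */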
1732 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1733                                      struct btrfs_inode *inode)
1734 {
1735         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1736
1737         spin_lock(&root->delalloc_lock);
1738         if (!list_empty(&inode->delalloc_inodes)) {
1739                 list_del_init(&inode->delalloc_inodes);
1740                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1741                           &inode->runtime_flags);
1742                 root->nr_delalloc_inodes--;
1743                 if (!root->nr_delalloc_inodes) {
1744                         spin_lock(&fs_info->delalloc_root_lock);
1745                         BUG_ON(list_empty(&root->delalloc_root));
1746                         list_del_init(&root->delalloc_root);
1747                         spin_unlock(&fs_info->delalloc_root_lock);
1748                 }
1749         }
1750         spin_unlock(&root->delalloc_lock);
1751 }
1752
1753 /*
1754  * extent_io.c set_bit_hook, used to track delayed allocation
1755  * bytes in this file, and to maintain the list of inodes that
1756  * have pending delalloc work to be done.
1757  */
1758 static void btrfs_set_bit_hook(void *private_data,
1759                                struct extent_state *state, unsigned *bits)
1760 {
1761         struct inode *inode = private_data;
1762
1763         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1764
1765         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1766                 WARN_ON(1);
1767         /*
1768          * set_bit and clear_bit hooks normally require _irqsave/restore
1769          * but in this case, we are only testing for the DELALLOC
1770          * bit, which is only set or cleared with irqs on
1771          */
1772         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1773                 struct btrfs_root *root = BTRFS_I(inode)->root;
1774                 u64 len = state->end + 1 - state->start;
1775                 u32 num_extents = count_max_extents(len);
1776                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1777
1778                 spin_lock(&BTRFS_I(inode)->lock);
1779                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1780                 spin_unlock(&BTRFS_I(inode)->lock);
1781
1782                 /* For sanity tests */
1783                 if (btrfs_is_testing(fs_info))
1784                         return;
1785
1786                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1787                                          fs_info->delalloc_batch);
1788                 spin_lock(&BTRFS_I(inode)->lock);
1789                 BTRFS_I(inode)->delalloc_bytes += len;
1790                 if (*bits & EXTENT_DEFRAG)
1791                         BTRFS_I(inode)->defrag_bytes += len;
1792                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1793                                          &BTRFS_I(inode)->runtime_flags))
1794                         btrfs_add_delalloc_inodes(root, inode);
1795                 spin_unlock(&BTRFS_I(inode)->lock);
1796         }
1797
1798         if (!(state->state & EXTENT_DELALLOC_NEW) &&
1799             (*bits & EXTENT_DELALLOC_NEW)) {
1800                 spin_lock(&BTRFS_I(inode)->lock);
1801                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1802                         state->start;
1803                 spin_unlock(&BTRFS_I(inode)->lock);
1804         }
1805 }
1806
1807 /*
1808  * extent_io.c clear_bit_hook, see set_bit_hook for why
1809  */
1810 static void btrfs_clear_bit_hook(void *private_data,
1811                                  struct extent_state *state,
1812                                  unsigned *bits)
1813 {
1814         struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1815         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1816         u64 len = state->end + 1 - state->start;
1817         u32 num_extents = count_max_extents(len);
1818
1819         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1820                 spin_lock(&inode->lock);
1821                 inode->defrag_bytes -= len;
1822                 spin_unlock(&inode->lock);
1823         }
1824
1825         /*
1826          * set_bit and clear_bit hooks normally require _irqsave/restore
1827          * but in this case, we are only testing for the DELALLOC
1828          * bit, which is only set or cleared with irqs on
1829          */
1830         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1831                 struct btrfs_root *root = inode->root;
1832                 bool do_list = !btrfs_is_free_space_inode(inode);
1833
1834                 spin_lock(&inode->lock);
1835                 btrfs_mod_outstanding_extents(inode, -num_extents);
1836                 spin_unlock(&inode->lock);
1837
1838                 /*
1839                  * We don't reserve metadata space for space cache inodes so we
1840                  * don't need to call btrfs_delalloc_release_metadata if there is an
1841                  * error.
1842                  */
1843                 if (*bits & EXTENT_CLEAR_META_RESV &&
1844                     root != fs_info->tree_root)
1845                         btrfs_delalloc_release_metadata(inode, len);
1846
1847                 /* For sanity tests. */
1848                 if (btrfs_is_testing(fs_info))
1849                         return;
1850
1851                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1852                     do_list && !(state->state & EXTENT_NORESERVE) &&
1853                     (*bits & EXTENT_CLEAR_DATA_RESV))
1854                         btrfs_free_reserved_data_space_noquota(
1855                                         &inode->vfs_inode,
1856                                         state->start, len);
1857
1858                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1859                                          fs_info->delalloc_batch);
1860                 spin_lock(&inode->lock);
1861                 inode->delalloc_bytes -= len;
1862                 if (do_list && inode->delalloc_bytes == 0 &&
1863                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1864                                         &inode->runtime_flags))
1865                         btrfs_del_delalloc_inode(root, inode);
1866                 spin_unlock(&inode->lock);
1867         }
1868
1869         if ((state->state & EXTENT_DELALLOC_NEW) &&
1870             (*bits & EXTENT_DELALLOC_NEW)) {
1871                 spin_lock(&inode->lock);
1872                 ASSERT(inode->new_delalloc_bytes >= len);
1873                 inode->new_delalloc_bytes -= len;
1874                 spin_unlock(&inode->lock);
1875         }
1876 }
1877
1878 /*
1879  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1880  * we don't create bios that span stripes or chunks
1881  *
1882  * return 1 if the page cannot be merged into the bio
1883  * return 0 if the page can be merged into the bio
1884  * return error otherwise
1885  */
1886 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1887                          size_t size, struct bio *bio,
1888                          unsigned long bio_flags)
1889 {
1890         struct inode *inode = page->mapping->host;
1891         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1892         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1893         u64 length = 0;
1894         u64 map_length;
1895         int ret;
1896
1897         if (bio_flags & EXTENT_BIO_COMPRESSED)
1898                 return 0;
1899
1900         length = bio->bi_iter.bi_size;
1901         map_length = length;
1902         ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1903                               NULL, 0);
1904         if (ret < 0)
1905                 return ret;
1906         if (map_length < length + size)
1907                 return 1;
1908         return 0;
1909 }
1910
1911 /*
1912  * in order to insert checksums into the metadata in large chunks,
1913  * we wait until bio submission time.   All the pages in the bio are
1914  * checksummed and sums are attached onto the ordered extent record.
1915  *
1916  * At IO completion time the csums attached to the ordered extent record
1917  * are inserted into the btree
1918  */
1919 static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1920                                     int mirror_num, unsigned long bio_flags,
1921                                     u64 bio_offset)
1922 {
1923         struct inode *inode = private_data;
1924         blk_status_t ret = 0;
1925
1926         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1927         BUG_ON(ret); /* -ENOMEM */
1928         return 0;
1929 }
1930
1931 /*
1932  * __btrfs_submit_bio_done is the second half of the async checksumming
1933  * path.  By the time it runs, all the pages in the bio have been
1934  * checksummed and the sums attached onto the ordered extent record.
1935  *
1936  * All that is left to do here is map the bio to its device(s) and
1937  * submit it, ending the bio with the error status if the mapping fails.
1938  */
1939 static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1940                           int mirror_num, unsigned long bio_flags,
1941                           u64 bio_offset)
1942 {
1943         struct inode *inode = private_data;
1944         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1945         blk_status_t ret;
1946
1947         ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1948         if (ret) {
1949                 bio->bi_status = ret;
1950                 bio_endio(bio);
1951         }
1952         return ret;
1953 }
1954
1955 /*
1956  * extent_io.c submission hook. This does the right thing for csum calculation
1957  * on write, or reading the csums from the tree before a read.
1958  *
1959  * Rules about async/sync submit,
1960  * a) read:                             sync submit
1961  *
1962  * b) write without checksum:           sync submit
1963  *
1964  * c) write with checksum:
1965  *    c-1) if bio is issued by fsync:   sync submit
1966  *         (sync_writers != 0)
1967  *
1968  *    c-2) if root is reloc root:       sync submit
1969  *         (only in case of buffered IO)
1970  *
1971  *    c-3) otherwise:                   async submit
1972  */
1973 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1974                                  int mirror_num, unsigned long bio_flags,
1975                                  u64 bio_offset)
1976 {
1977         struct inode *inode = private_data;
1978         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1979         struct btrfs_root *root = BTRFS_I(inode)->root;
1980         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1981         blk_status_t ret = 0;
1982         int skip_sum;
1983         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1984
1985         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1986
1987         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1988                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1989
1990         if (bio_op(bio) != REQ_OP_WRITE) {
1991                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1992                 if (ret)
1993                         goto out;
1994
1995                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1996                         ret = btrfs_submit_compressed_read(inode, bio,
1997                                                            mirror_num,
1998                                                            bio_flags);
1999                         goto out;
2000                 } else if (!skip_sum) {
2001                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2002                         if (ret)
2003                                 goto out;
2004                 }
2005                 goto mapit;
2006         } else if (async && !skip_sum) {
2007                 /* csum items have already been cloned */
2008                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2009                         goto mapit;
2010                 /* we're doing a write, do the async checksumming */
2011                 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2012                                           bio_offset, inode,
2013                                           __btrfs_submit_bio_start,
2014                                           __btrfs_submit_bio_done);
2015                 goto out;
2016         } else if (!skip_sum) {
2017                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2018                 if (ret)
2019                         goto out;
2020         }
2021
2022 mapit:
2023         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2024
2025 out:
2026         if (ret) {
2027                 bio->bi_status = ret;
2028                 bio_endio(bio);
2029         }
2030         return ret;
2031 }
2032
2033 /*
2034  * given a list of ordered sums, record them in the csum tree.  This happens
2035  * at IO completion time based on sums calculated at bio submission time.
2036  */
2037 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2038                              struct inode *inode, struct list_head *list)
2039 {
2040         struct btrfs_ordered_sum *sum;
2041         int ret;
2042
2043         list_for_each_entry(sum, list, list) {
2044                 trans->adding_csums = true;
2045                 ret = btrfs_csum_file_blocks(trans,
2046                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2047                 trans->adding_csums = false;
2048                 if (ret)
2049                         return ret;
2050         }
2051         return 0;
2052 }
2053
2054 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2055                               unsigned int extra_bits,
2056                               struct extent_state **cached_state, int dedupe)
2057 {
2058         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2059         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2060                                    extra_bits, cached_state);
2061 }
2062
2063 /* see btrfs_writepage_start_hook for details on why this is required */
2064 struct btrfs_writepage_fixup {
2065         struct page *page;
2066         struct btrfs_work work;
2067 };
2068
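/*
 * Worker for the fixup queue: fix up a page that was dirtied behind our
 * back.  It waits out any ordered extent covering the page, reserves
 * delalloc space and re-marks the range delalloc so that writeback can
 * proceed normally (see btrfs_writepage_start_hook() below).
 */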
2069 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2070 {
2071         struct btrfs_writepage_fixup *fixup;
2072         struct btrfs_ordered_extent *ordered;
2073         struct extent_state *cached_state = NULL;
2074         struct extent_changeset *data_reserved = NULL;
2075         struct page *page;
2076         struct inode *inode;
2077         u64 page_start;
2078         u64 page_end;
2079         int ret;
2080
2081         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2082         page = fixup->page;
2083 again:
2084         lock_page(page);
2085         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2086                 ClearPageChecked(page);
2087                 goto out_page;
2088         }
2089
2090         inode = page->mapping->host;
2091         page_start = page_offset(page);
2092         page_end = page_offset(page) + PAGE_SIZE - 1;
2093
2094         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2095                          &cached_state);
2096
2097         /* already ordered? We're done */
2098         if (PagePrivate2(page))
2099                 goto out;
2100
2101         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2102                                         PAGE_SIZE);
2103         if (ordered) {
2104                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2105                                      page_end, &cached_state);
2106                 unlock_page(page);
2107                 btrfs_start_ordered_extent(inode, ordered, 1);
2108                 btrfs_put_ordered_extent(ordered);
2109                 goto again;
2110         }
2111
2112         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2113                                            PAGE_SIZE);
2114         if (ret) {
2115                 mapping_set_error(page->mapping, ret);
2116                 end_extent_writepage(page, ret, page_start, page_end);
2117                 ClearPageChecked(page);
2118                 goto out;
2119         }
2120
2121         ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2122                                         &cached_state, 0);
2123         if (ret) {
2124                 mapping_set_error(page->mapping, ret);
2125                 end_extent_writepage(page, ret, page_start, page_end);
2126                 ClearPageChecked(page);
2127                 goto out;
2128         }
2129
2130         ClearPageChecked(page);
2131         set_page_dirty(page);
2132         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2133 out:
2134         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2135                              &cached_state);
2136 out_page:
2137         unlock_page(page);
2138         put_page(page);
2139         kfree(fixup);
2140         extent_changeset_free(data_reserved);
2141 }
2142
2143 /*
2144  * There are a few paths in the higher layers of the kernel that directly
2145  * set the page dirty bit without asking the filesystem if it is a
2146  * good idea.  This causes problems because we want to make sure COW
2147  * properly happens and the data=ordered rules are followed.
2148  *
2149  * In our case any range that doesn't have the ORDERED bit set
2150  * hasn't been properly setup for IO.  We kick off an async process
2151  * to fix it up.  The async helper will wait for ordered extents, set
2152  * the delalloc bit and make it safe to write the page.
2153  */
2154 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2155 {
2156         struct inode *inode = page->mapping->host;
2157         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2158         struct btrfs_writepage_fixup *fixup;
2159
2160         /* this page is properly in the ordered list */
2161         if (TestClearPagePrivate2(page))
2162                 return 0;
2163
2164         if (PageChecked(page))
2165                 return -EAGAIN;
2166
2167         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2168         if (!fixup)
2169                 return -EAGAIN;
2170
2171         SetPageChecked(page);
2172         get_page(page);
2173         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2174                         btrfs_writepage_fixup_worker, NULL, NULL);
2175         fixup->page = page;
2176         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2177         return -EBUSY;
2178 }
2179
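/*
 * Insert the file extent item for a finished ordered extent: drop any
 * old extents in the range while leaving the new one pinned in the
 * extent map cache, fill in the btrfs_file_extent_item fields and
 * account the released qgroup reservation for the new allocation.
 */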
2180 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2181                                        struct inode *inode, u64 file_pos,
2182                                        u64 disk_bytenr, u64 disk_num_bytes,
2183                                        u64 num_bytes, u64 ram_bytes,
2184                                        u8 compression, u8 encryption,
2185                                        u16 other_encoding, int extent_type)
2186 {
2187         struct btrfs_root *root = BTRFS_I(inode)->root;
2188         struct btrfs_file_extent_item *fi;
2189         struct btrfs_path *path;
2190         struct extent_buffer *leaf;
2191         struct btrfs_key ins;
2192         u64 qg_released;
2193         int extent_inserted = 0;
2194         int ret;
2195
2196         path = btrfs_alloc_path();
2197         if (!path)
2198                 return -ENOMEM;
2199
2200         /*
2201          * we may be replacing one extent in the tree with another.
2202          * The new extent is pinned in the extent map, and we don't want
2203          * to drop it from the cache until it is completely in the btree.
2204          *
2205          * So, tell btrfs_drop_extents to leave this extent in the cache.
2206          * The caller is expected to unpin it and allow it to be merged
2207          * with the others.
2208          */
2209         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2210                                    file_pos + num_bytes, NULL, 0,
2211                                    1, sizeof(*fi), &extent_inserted);
2212         if (ret)
2213                 goto out;
2214
2215         if (!extent_inserted) {
2216                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2217                 ins.offset = file_pos;
2218                 ins.type = BTRFS_EXTENT_DATA_KEY;
2219
2220                 path->leave_spinning = 1;
2221                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2222                                               sizeof(*fi));
2223                 if (ret)
2224                         goto out;
2225         }
2226         leaf = path->nodes[0];
2227         fi = btrfs_item_ptr(leaf, path->slots[0],
2228                             struct btrfs_file_extent_item);
2229         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2230         btrfs_set_file_extent_type(leaf, fi, extent_type);
2231         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2232         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2233         btrfs_set_file_extent_offset(leaf, fi, 0);
2234         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2235         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2236         btrfs_set_file_extent_compression(leaf, fi, compression);
2237         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2238         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2239
2240         btrfs_mark_buffer_dirty(leaf);
2241         btrfs_release_path(path);
2242
2243         inode_add_bytes(inode, num_bytes);
2244
2245         ins.objectid = disk_bytenr;
2246         ins.offset = disk_num_bytes;
2247         ins.type = BTRFS_EXTENT_ITEM_KEY;
2248
2249         /*
2250          * Release the reserved range from inode dirty range map, as it is
2251          * already moved into delayed_ref_head
2252          */
2253         ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2254         if (ret < 0)
2255                 goto out;
2256         qg_released = ret;
2257         ret = btrfs_alloc_reserved_file_extent(trans, root,
2258                                                btrfs_ino(BTRFS_I(inode)),
2259                                                file_pos, qg_released, &ins);
2260 out:
2261         btrfs_free_path(path);
2262
2263         return ret;
2264 }
2265
2266 /* snapshot-aware defrag */
2267 struct sa_defrag_extent_backref {
2268         struct rb_node node;
2269         struct old_sa_defrag_extent *old;
2270         u64 root_id;
2271         u64 inum;
2272         u64 file_pos;
2273         u64 extent_offset;
2274         u64 num_bytes;
2275         u64 generation;
2276 };
2277
2278 struct old_sa_defrag_extent {
2279         struct list_head list;
2280         struct new_sa_defrag_extent *new;
2281
2282         u64 extent_offset;
2283         u64 bytenr;
2284         u64 offset;
2285         u64 len;
2286         int count;
2287 };
2288
2289 struct new_sa_defrag_extent {
2290         struct rb_root root;
2291         struct list_head head;
2292         struct btrfs_path *path;
2293         struct inode *inode;
2294         u64 file_pos;
2295         u64 len;
2296         u64 bytenr;
2297         u64 disk_len;
2298         u8 compress_type;
2299 };
2300
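/*
 * Ordering for the backref rbtree: compare by root id, then inode
 * number, then file position.  Identical keys are legitimate; see the
 * diagram in the function body.
 */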
2301 static int backref_comp(struct sa_defrag_extent_backref *b1,
2302                         struct sa_defrag_extent_backref *b2)
2303 {
2304         if (b1->root_id < b2->root_id)
2305                 return -1;
2306         else if (b1->root_id > b2->root_id)
2307                 return 1;
2308
2309         if (b1->inum < b2->inum)
2310                 return -1;
2311         else if (b1->inum > b2->inum)
2312                 return 1;
2313
2314         if (b1->file_pos < b2->file_pos)
2315                 return -1;
2316         else if (b1->file_pos > b2->file_pos)
2317                 return 1;
2318
2319         /*
2320          * [------------------------------] ===> (a range of space)
2321          *     |<--->|   |<---->| =============> (fs/file tree A)
2322          * |<---------------------------->| ===> (fs/file tree B)
2323          *
2324          * A range of space can refer to two file extents in one tree while
2325          * refer to only one file extent in another tree.
2326          *
2327          * So we may process a disk offset more than once (two extents in A)
2328          * and land on the same extent (one extent in B), then insert two
2329          * identical backrefs (both referring to the extent in B).
2330          */
2331         return 0;
2332 }
2333
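/*
 * Insert a backref into the rbtree.  Ties (backref_comp() returning 0)
 * descend to the right, so duplicate backrefs are kept, not dropped.
 */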
2334 static void backref_insert(struct rb_root *root,
2335                            struct sa_defrag_extent_backref *backref)
2336 {
2337         struct rb_node **p = &root->rb_node;
2338         struct rb_node *parent = NULL;
2339         struct sa_defrag_extent_backref *entry;
2340         int ret;
2341
2342         while (*p) {
2343                 parent = *p;
2344                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2345
2346                 ret = backref_comp(backref, entry);
2347                 if (ret < 0)
2348                         p = &(*p)->rb_left;
2349                 else
2350                         p = &(*p)->rb_right;
2351         }
2352
2353         rb_link_node(&backref->node, parent, p);
2354         rb_insert_color(&backref->node, root);
2355 }
2356
2357 /*
2358  * Note the backref might have changed, and in this case we just return 0.
2359  */
2360 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2361                                        void *ctx)
2362 {
2363         struct btrfs_file_extent_item *extent;
2364         struct old_sa_defrag_extent *old = ctx;
2365         struct new_sa_defrag_extent *new = old->new;
2366         struct btrfs_path *path = new->path;
2367         struct btrfs_key key;
2368         struct btrfs_root *root;
2369         struct sa_defrag_extent_backref *backref;
2370         struct extent_buffer *leaf;
2371         struct inode *inode = new->inode;
2372         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2373         int slot;
2374         int ret;
2375         u64 extent_offset;
2376         u64 num_bytes;
2377
2378         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2379             inum == btrfs_ino(BTRFS_I(inode)))
2380                 return 0;
2381
2382         key.objectid = root_id;
2383         key.type = BTRFS_ROOT_ITEM_KEY;
2384         key.offset = (u64)-1;
2385
2386         root = btrfs_read_fs_root_no_name(fs_info, &key);
2387         if (IS_ERR(root)) {
2388                 if (PTR_ERR(root) == -ENOENT)
2389                         return 0;
2390                 WARN_ON(1);
2391                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2392                          inum, offset, root_id);
2393                 return PTR_ERR(root);
2394         }
2395
2396         key.objectid = inum;
2397         key.type = BTRFS_EXTENT_DATA_KEY;
2398         if (offset > (u64)-1 << 32)
2399                 key.offset = 0;
2400         else
2401                 key.offset = offset;
2402
2403         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2404         if (WARN_ON(ret < 0))
2405                 return ret;
2406         ret = 0;
2407
2408         while (1) {
2409                 cond_resched();
2410
2411                 leaf = path->nodes[0];
2412                 slot = path->slots[0];
2413
2414                 if (slot >= btrfs_header_nritems(leaf)) {
2415                         ret = btrfs_next_leaf(root, path);
2416                         if (ret < 0) {
2417                                 goto out;
2418                         } else if (ret > 0) {
2419                                 ret = 0;
2420                                 goto out;
2421                         }
2422                         continue;
2423                 }
2424
2425                 path->slots[0]++;
2426
2427                 btrfs_item_key_to_cpu(leaf, &key, slot);
2428
2429                 if (key.objectid > inum)
2430                         goto out;
2431
2432                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2433                         continue;
2434
2435                 extent = btrfs_item_ptr(leaf, slot,
2436                                         struct btrfs_file_extent_item);
2437
2438                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2439                         continue;
2440
2441                 /*
2442                  * 'offset' refers to the exact key.offset,
2443                  * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2444                  * (key.offset - extent_offset).
2445                  */
2446                 if (key.offset != offset)
2447                         continue;
2448
2449                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2450                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2451
2452                 if (extent_offset >= old->extent_offset + old->offset +
2453                     old->len || extent_offset + num_bytes <=
2454                     old->extent_offset + old->offset)
2455                         continue;
2456                 break;
2457         }
2458
2459         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2460         if (!backref) {
2461                 ret = -ENOMEM;
2462                 goto out;
2463         }
2464
2465         backref->root_id = root_id;
2466         backref->inum = inum;
2467         backref->file_pos = offset;
2468         backref->num_bytes = num_bytes;
2469         backref->extent_offset = extent_offset;
2470         backref->generation = btrfs_file_extent_generation(leaf, extent);
2471         backref->old = old;
2472         backref_insert(&new->root, backref);
2473         old->count++;
2474 out:
2475         btrfs_release_path(path);
2476         WARN_ON(ret);
2477         return ret;
2478 }
2479
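/*
 * Collect backrefs for every old extent via iterate_inodes_from_logical();
 * old extents that end up with no backrefs are dropped from the list.
 * Returns true when at least one backref remains to be relinked.
 */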
2480 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2481                                    struct new_sa_defrag_extent *new)
2482 {
2483         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2484         struct old_sa_defrag_extent *old, *tmp;
2485         int ret;
2486
2487         new->path = path;
2488
2489         list_for_each_entry_safe(old, tmp, &new->head, list) {
2490                 ret = iterate_inodes_from_logical(old->bytenr +
2491                                                   old->extent_offset, fs_info,
2492                                                   path, record_one_backref,
2493                                                   old, false);
2494                 if (ret < 0 && ret != -ENOENT)
2495                         return false;
2496
2497                 /* no backref to be processed for this extent */
2498                 if (!old->count) {
2499                         list_del(&old->list);
2500                         kfree(old);
2501                 }
2502         }
2503
2504         if (list_empty(&new->head))
2505                 return false;
2506
2507         return true;
2508 }
2509
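/*
 * An existing file extent is mergeable with the relinked one only if it
 * points at the same disk bytenr, is a plain regular extent and uses the
 * same compression, with no encryption or other encoding.
 */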
2510 static int relink_is_mergable(struct extent_buffer *leaf,
2511                               struct btrfs_file_extent_item *fi,
2512                               struct new_sa_defrag_extent *new)
2513 {
2514         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2515                 return 0;
2516
2517         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2518                 return 0;
2519
2520         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2521                 return 0;
2522
2523         if (btrfs_file_extent_encryption(leaf, fi) ||
2524             btrfs_file_extent_other_encoding(leaf, fi))
2525                 return 0;
2526
2527         return 1;
2528 }
2529
2530 /*
2531  * Note the backref might have changed, and in this case we just return 0.
2532  */
2533 static noinline int relink_extent_backref(struct btrfs_path *path,
2534                                  struct sa_defrag_extent_backref *prev,
2535                                  struct sa_defrag_extent_backref *backref)
2536 {
2537         struct btrfs_file_extent_item *extent;
2538         struct btrfs_file_extent_item *item;
2539         struct btrfs_ordered_extent *ordered;
2540         struct btrfs_trans_handle *trans;
2541         struct btrfs_root *root;
2542         struct btrfs_key key;
2543         struct extent_buffer *leaf;
2544         struct old_sa_defrag_extent *old = backref->old;
2545         struct new_sa_defrag_extent *new = old->new;
2546         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2547         struct inode *inode;
2548         struct extent_state *cached = NULL;
2549         int ret = 0;
2550         u64 start;
2551         u64 len;
2552         u64 lock_start;
2553         u64 lock_end;
2554         bool merge = false;
2555         int index;
2556
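        /*
         * A backref contiguous with the previously relinked one (same
         * root and inode, adjacent file offset) can extend that file
         * extent item instead of inserting a new one.
         */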
2557         if (prev && prev->root_id == backref->root_id &&
2558             prev->inum == backref->inum &&
2559             prev->file_pos + prev->num_bytes == backref->file_pos)
2560                 merge = true;
2561
2562         /* step 1: get root */
2563         key.objectid = backref->root_id;
2564         key.type = BTRFS_ROOT_ITEM_KEY;
2565         key.offset = (u64)-1;
2566
2567         index = srcu_read_lock(&fs_info->subvol_srcu);
2568
2569         root = btrfs_read_fs_root_no_name(fs_info, &key);
2570         if (IS_ERR(root)) {
2571                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2572                 if (PTR_ERR(root) == -ENOENT)
2573                         return 0;
2574                 return PTR_ERR(root);
2575         }
2576
2577         if (btrfs_root_readonly(root)) {
2578                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2579                 return 0;
2580         }
2581
2582         /* step 2: get inode */
2583         key.objectid = backref->inum;
2584         key.type = BTRFS_INODE_ITEM_KEY;
2585         key.offset = 0;
2586
2587         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2588         if (IS_ERR(inode)) {
2589                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2590                 return 0;
2591         }
2592
2593         srcu_read_unlock(&fs_info->subvol_srcu, index);
2594
2595         /* step 3: relink backref */
2596         lock_start = backref->file_pos;
2597         lock_end = backref->file_pos + backref->num_bytes - 1;
2598         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2599                          &cached);
2600
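        /*
         * A pending ordered extent means someone is still writing this
         * range; back off and leave the file extents untouched.
         */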
2601         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2602         if (ordered) {
2603                 btrfs_put_ordered_extent(ordered);
2604                 goto out_unlock;
2605         }
2606
2607         trans = btrfs_join_transaction(root);
2608         if (IS_ERR(trans)) {
2609                 ret = PTR_ERR(trans);
2610                 goto out_unlock;
2611         }
2612
2613         key.objectid = backref->inum;
2614         key.type = BTRFS_EXTENT_DATA_KEY;
2615         key.offset = backref->file_pos;
2616
2617         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2618         if (ret < 0) {
2619                 goto out_free_path;
2620         } else if (ret > 0) {
2621                 ret = 0;
2622                 goto out_free_path;
2623         }
2624
2625         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2626                                 struct btrfs_file_extent_item);
2627
2628         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2629             backref->generation)
2630                 goto out_free_path;
2631
2632         btrfs_release_path(path);
2633
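        /*
         * Clamp [start, start + len) to the intersection of the range
         * this backref covers and the defragged part of the old extent.
         */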
2634         start = backref->file_pos;
2635         if (backref->extent_offset < old->extent_offset + old->offset)
2636                 start += old->extent_offset + old->offset -
2637                          backref->extent_offset;
2638
2639         len = min(backref->extent_offset + backref->num_bytes,
2640                   old->extent_offset + old->offset + old->len);
2641         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2642
2643         ret = btrfs_drop_extents(trans, root, inode, start,
2644                                  start + len, 1);
2645         if (ret)
2646                 goto out_free_path;
2647 again:
2648         key.objectid = btrfs_ino(BTRFS_I(inode));
2649         key.type = BTRFS_EXTENT_DATA_KEY;
2650         key.offset = start;
2651
2652         path->leave_spinning = 1;
2653         if (merge) {
2654                 struct btrfs_file_extent_item *fi;
2655                 u64 extent_len;
2656                 struct btrfs_key found_key;
2657
2658                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2659                 if (ret < 0)
2660                         goto out_free_path;
2661
2662                 path->slots[0]--;
2663                 leaf = path->nodes[0];
2664                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2665
2666                 fi = btrfs_item_ptr(leaf, path->slots[0],
2667                                     struct btrfs_file_extent_item);
2668                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2669
2670                 if (extent_len + found_key.offset == start &&
2671                     relink_is_mergable(leaf, fi, new)) {
2672                         btrfs_set_file_extent_num_bytes(leaf, fi,
2673                                                         extent_len + len);
2674                         btrfs_mark_buffer_dirty(leaf);
2675                         inode_add_bytes(inode, len);
2676
2677                         ret = 1;
2678                         goto out_free_path;
2679                 } else {
2680                         merge = false;
2681                         btrfs_release_path(path);
2682                         goto again;
2683                 }
2684         }
2685
2686         ret = btrfs_insert_empty_item(trans, root, path, &key,
2687                                         sizeof(*extent));
2688         if (ret) {
2689                 btrfs_abort_transaction(trans, ret);
2690                 goto out_free_path;
2691         }
2692
2693         leaf = path->nodes[0];
2694         item = btrfs_item_ptr(leaf, path->slots[0],
2695                                 struct btrfs_file_extent_item);
2696         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2697         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2698         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2699         btrfs_set_file_extent_num_bytes(leaf, item, len);
2700         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2701         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2702         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2703         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2704         btrfs_set_file_extent_encryption(leaf, item, 0);
2705         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2706
2707         btrfs_mark_buffer_dirty(leaf);
2708         inode_add_bytes(inode, len);
2709         btrfs_release_path(path);
2710
2711         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2712                         new->disk_len, 0,
2713                         backref->root_id, backref->inum,
2714                         new->file_pos); /* start - extent_offset */
2715         if (ret) {
2716                 btrfs_abort_transaction(trans, ret);
2717                 goto out_free_path;
2718         }
2719
2720         ret = 1;
2721 out_free_path:
2722         btrfs_release_path(path);
2723         path->leave_spinning = 0;
2724         btrfs_end_transaction(trans);
2725 out_unlock:
2726         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2727                              &cached);
2728         iput(inode);
2729         return ret;
2730 }
2731
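/* Free a snapshot-aware defrag record and all old extents attached to it. */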
2732 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2733 {
2734         struct old_sa_defrag_extent *old, *tmp;
2735
2736         if (!new)
2737                 return;
2738
2739         list_for_each_entry_safe(old, tmp, &new->head, list) {
2740                 kfree(old);
2741         }
2742         kfree(new);
2743 }
2744
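/*
 * Relink the new (defragged) extent into every file position that used
 * to reference the old extents, processing backrefs in sorted order so
 * adjacent ones can be merged, then drop the defrag record.
 */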
2745 static void relink_file_extents(struct new_sa_defrag_extent *new)
2746 {
2747         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2748         struct btrfs_path *path;
2749         struct sa_defrag_extent_backref *backref;
2750         struct sa_defrag_extent_backref *prev = NULL;
2751         struct inode *inode;
2752         struct btrfs_root *root;
2753         struct rb_node *node;
2754         int ret;
2755
2756         inode = new->inode;
2757         root = BTRFS_I(inode)->root;
2758
2759         path = btrfs_alloc_path();
2760         if (!path)
2761                 return;
2762
2763         if (!record_extent_backrefs(path, new)) {
2764                 btrfs_free_path(path);
2765                 goto out;
2766         }
2767         btrfs_release_path(path);
2768
2769         while (1) {
2770                 node = rb_first(&new->root);
2771                 if (!node)
2772                         break;
2773                 rb_erase(node, &new->root);
2774
2775                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2776
2777                 ret = relink_extent_backref(path, prev, backref);
2778                 WARN_ON(ret < 0);
2779
2780                 kfree(prev);
2781
2782                 if (ret == 1)
2783                         prev = backref;
2784                 else
2785                         prev = NULL;
2786                 cond_resched();
2787         }
2788         kfree(prev);
2789
2790         btrfs_free_path(path);
2791 out:
2792         free_sa_defrag_extent(new);
2793
2794         atomic_dec(&fs_info->defrag_running);
2795         wake_up(&fs_info->transaction_wait);
2796 }
2797
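/*
 * Snapshot-aware defrag: before the ordered extent replaces them,
 * record every old file extent that overlaps the ordered range so
 * their backrefs can be relinked to the new extent later.  Returns
 * NULL (and records nothing) on any failure.
 */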
2798 static struct new_sa_defrag_extent *
2799 record_old_file_extents(struct inode *inode,
2800                         struct btrfs_ordered_extent *ordered)
2801 {
2802         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2803         struct btrfs_root *root = BTRFS_I(inode)->root;
2804         struct btrfs_path *path;
2805         struct btrfs_key key;
2806         struct old_sa_defrag_extent *old;
2807         struct new_sa_defrag_extent *new;
2808         int ret;
2809
2810         new = kmalloc(sizeof(*new), GFP_NOFS);
2811         if (!new)
2812                 return NULL;
2813
2814         new->inode = inode;
2815         new->file_pos = ordered->file_offset;
2816         new->len = ordered->len;
2817         new->bytenr = ordered->start;
2818         new->disk_len = ordered->disk_len;
2819         new->compress_type = ordered->compress_type;
2820         new->root = RB_ROOT;
2821         INIT_LIST_HEAD(&new->head);
2822
2823         path = btrfs_alloc_path();
2824         if (!path)
2825                 goto out_kfree;
2826
2827         key.objectid = btrfs_ino(BTRFS_I(inode));
2828         key.type = BTRFS_EXTENT_DATA_KEY;
2829         key.offset = new->file_pos;
2830
2831         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2832         if (ret < 0)
2833                 goto out_free_path;
2834         if (ret > 0 && path->slots[0] > 0)
2835                 path->slots[0]--;
2836
2837         /* find all the old extents covering this file range */
2838         while (1) {
2839                 struct btrfs_file_extent_item *extent;
2840                 struct extent_buffer *l;
2841                 int slot;
2842                 u64 num_bytes;
2843                 u64 offset;
2844                 u64 end;
2845                 u64 disk_bytenr;
2846                 u64 extent_offset;
2847
2848                 l = path->nodes[0];
2849                 slot = path->slots[0];
2850
2851                 if (slot >= btrfs_header_nritems(l)) {
2852                         ret = btrfs_next_leaf(root, path);
2853                         if (ret < 0)
2854                                 goto out_free_path;
2855                         else if (ret > 0)
2856                                 break;
2857                         continue;
2858                 }
2859
2860                 btrfs_item_key_to_cpu(l, &key, slot);
2861
2862                 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2863                         break;
2864                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2865                         break;
2866                 if (key.offset >= new->file_pos + new->len)
2867                         break;
2868
2869                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2870
2871                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2872                 if (key.offset + num_bytes < new->file_pos)
2873                         goto next;
2874
2875                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2876                 if (!disk_bytenr)
2877                         goto next;
2878
2879                 extent_offset = btrfs_file_extent_offset(l, extent);
2880
2881                 old = kmalloc(sizeof(*old), GFP_NOFS);
2882                 if (!old)
2883                         goto out_free_path;
2884
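                /* clamp the old extent to the part overlapping the new range */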
2885                 offset = max(new->file_pos, key.offset);
2886                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2887
2888                 old->bytenr = disk_bytenr;
2889                 old->extent_offset = extent_offset;
2890                 old->offset = offset - key.offset;
2891                 old->len = end - offset;
2892                 old->new = new;
2893                 old->count = 0;
2894                 list_add_tail(&old->list, &new->head);
2895 next:
2896                 path->slots[0]++;
2897                 cond_resched();
2898         }
2899
2900         btrfs_free_path(path);
2901         atomic_inc(&fs_info->defrag_running);
2902
2903         return new;
2904
2905 out_free_path:
2906         btrfs_free_path(path);
2907 out_kfree:
2908         free_sa_defrag_extent(new);
2909         return NULL;
2910 }
2911
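/*
 * The new extent was accounted as delalloc in its block group while the
 * write was in flight; drop that accounting now that it is on disk.
 */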
2912 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2913                                          u64 start, u64 len)
2914 {
2915         struct btrfs_block_group_cache *cache;
2916
2917         cache = btrfs_lookup_block_group(fs_info, start);
2918         ASSERT(cache);
2919
2920         spin_lock(&cache->lock);
2921         cache->delalloc_bytes -= len;
2922         spin_unlock(&cache->lock);
2923
2924         btrfs_put_block_group(cache);
2925 }
2926
2927 /* As ordered data IO finishes, this gets called so we can finish
2928  * an ordered extent if the range of bytes in the file it covers
2929  * is fully written.
2930  */
2931 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2932 {
2933         struct inode *inode = ordered_extent->inode;
2934         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2935         struct btrfs_root *root = BTRFS_I(inode)->root;
2936         struct btrfs_trans_handle *trans = NULL;
2937         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2938         struct extent_state *cached_state = NULL;
2939         struct new_sa_defrag_extent *new = NULL;
2940         int compress_type = 0;
2941         int ret = 0;
2942         u64 logical_len = ordered_extent->len;
2943         bool nolock;
2944         bool truncated = false;
2945         bool range_locked = false;
2946         bool clear_new_delalloc_bytes = false;
2947
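        /*
         * Only buffered COW writes (neither NOCOW, prealloc nor direct
         * IO) carry new delalloc byte accounting that needs clearing
         * when the ordered extent completes.
         */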
2948         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2949             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2950             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2951                 clear_new_delalloc_bytes = true;
2952
2953         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2954
2955         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2956                 ret = -EIO;
2957                 goto out;
2958         }
2959
2960         btrfs_free_io_failure_record(BTRFS_I(inode),
2961                         ordered_extent->file_offset,
2962                         ordered_extent->file_offset +
2963                         ordered_extent->len - 1);
2964
2965         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2966                 truncated = true;
2967                 logical_len = ordered_extent->truncated_len;
2968                 /* Truncated the entire extent, don't bother adding */
2969                 if (!logical_len)
2970                         goto out;
2971         }
2972
2973         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2974                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2975
2976                 /*
2977                  * For the mwrite case (mmap + memset to write), we still
2978                  * reserve space for the NOCOW range. As NOCOW won't cause
2979                  * a new delayed ref, just free the reserved space.
2980                  */
2981                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2982                                        ordered_extent->len);
2983                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2984                 if (nolock)
2985                         trans = btrfs_join_transaction_nolock(root);
2986                 else
2987                         trans = btrfs_join_transaction(root);
2988                 if (IS_ERR(trans)) {
2989                         ret = PTR_ERR(trans);
2990                         trans = NULL;
2991                         goto out;
2992                 }
2993                 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
2994                 ret = btrfs_update_inode_fallback(trans, root, inode);
2995                 if (ret) /* -ENOMEM or corruption */
2996                         btrfs_abort_transaction(trans, ret);
2997                 goto out;
2998         }
2999
3000         range_locked = true;
3001         lock_extent_bits(io_tree, ordered_extent->file_offset,
3002                          ordered_extent->file_offset + ordered_extent->len - 1,
3003                          &cached_state);
3004
3005         ret = test_range_bit(io_tree, ordered_extent->file_offset,
3006                         ordered_extent->file_offset + ordered_extent->len - 1,
3007                         EXTENT_DEFRAG, 0, cached_state);
3008         if (ret) {
3009                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
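                /*
                 * Snapshot-aware defrag is currently disabled: the
                 * "0 &&" makes this branch unreachable, so old extents
                 * are never recorded.
                 */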
3010                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3011                         /* the inode is shared */
3012                         new = record_old_file_extents(inode, ordered_extent);
3013
3014                 clear_extent_bit(io_tree, ordered_extent->file_offset,
3015                         ordered_extent->file_offset + ordered_extent->len - 1,
3016                         EXTENT_DEFRAG, 0, 0, &cached_state);
3017         }
3018
3019         if (nolock)
3020                 trans = btrfs_join_transaction_nolock(root);
3021         else
3022                 trans = btrfs_join_transaction(root);
3023         if (IS_ERR(trans)) {
3024                 ret = PTR_ERR(trans);
3025                 trans = NULL;
3026                 goto out;
3027         }
3028
3029         trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3030
3031         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3032                 compress_type = ordered_extent->compress_type;
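        /*
         * Preallocated extents are written in place: just mark the
         * range as written.  Everything else gets a new file extent
         * item pointing at the reserved extent.
         */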
3033         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3034                 BUG_ON(compress_type);
3035                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3036                                        ordered_extent->len);
3037                 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3038                                                 ordered_extent->file_offset,
3039                                                 ordered_extent->file_offset +
3040                                                 logical_len);
3041         } else {
3042                 BUG_ON(root == fs_info->tree_root);
3043                 ret = insert_reserved_file_extent(trans, inode,
3044                                                 ordered_extent->file_offset,
3045                                                 ordered_extent->start,
3046                                                 ordered_extent->disk_len,
3047                                                 logical_len, logical_len,
3048                                                 compress_type, 0, 0,
3049                                                 BTRFS_FILE_EXTENT_REG);
3050                 if (!ret)
3051                         btrfs_release_delalloc_bytes(fs_info,
3052                                                      ordered_extent->start,
3053                                                      ordered_extent->disk_len);
3054         }
3055         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3056                            ordered_extent->file_offset, ordered_extent->len,
3057                            trans->transid);
3058         if (ret < 0) {
3059                 btrfs_abort_transaction(trans, ret);
3060                 goto out;
3061         }
3062
3063         ret = add_pending_csums(trans, inode, &ordered_extent->list);
3064         if (ret) {
3065                 btrfs_abort_transaction(trans, ret);
3066                 goto out;
3067         }
3068
3069         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3070         ret = btrfs_update_inode_fallback(trans, root, inode);
3071         if (ret) { /* -ENOMEM or corruption */
3072                 btrfs_abort_transaction(trans, ret);
3073                 goto out;