btrfs: submit superblock io with REQ_META and REQ_PRIO
[sfrench/cifs-2.6.git] / fs / btrfs / inode.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/compat.h>
34 #include <linux/bit_spinlock.h>
35 #include <linux/xattr.h>
36 #include <linux/posix_acl.h>
37 #include <linux/falloc.h>
38 #include <linux/slab.h>
39 #include <linux/ratelimit.h>
40 #include <linux/mount.h>
41 #include <linux/btrfs.h>
42 #include <linux/blkdev.h>
43 #include <linux/posix_acl_xattr.h>
44 #include <linux/uio.h>
45 #include "ctree.h"
46 #include "disk-io.h"
47 #include "transaction.h"
48 #include "btrfs_inode.h"
49 #include "print-tree.h"
50 #include "ordered-data.h"
51 #include "xattr.h"
52 #include "tree-log.h"
53 #include "volumes.h"
54 #include "compression.h"
55 #include "locking.h"
56 #include "free-space-cache.h"
57 #include "inode-map.h"
58 #include "backref.h"
59 #include "hash.h"
60 #include "props.h"
61 #include "qgroup.h"
62 #include "dedupe.h"
63
/*
 * Bundles a key pointer with a root pointer so both can travel through a
 * single argument.  The users of this struct are outside this excerpt.
 */
struct btrfs_iget_args {
	struct btrfs_key	*location;	/* key being looked up */
	struct btrfs_root	*root;		/* root paired with that key */
};
68
69 struct btrfs_dio_data {
70         u64 outstanding_extents;
71         u64 reserve;
72         u64 unsubmitted_oe_range_start;
73         u64 unsubmitted_oe_range_end;
74         int overwrite;
75 };
76
77 static const struct inode_operations btrfs_dir_inode_operations;
78 static const struct inode_operations btrfs_symlink_inode_operations;
79 static const struct inode_operations btrfs_dir_ro_inode_operations;
80 static const struct inode_operations btrfs_special_inode_operations;
81 static const struct inode_operations btrfs_file_inode_operations;
82 static const struct address_space_operations btrfs_aops;
83 static const struct address_space_operations btrfs_symlink_aops;
84 static const struct file_operations btrfs_dir_file_operations;
85 static const struct extent_io_ops btrfs_extent_io_ops;
86
87 static struct kmem_cache *btrfs_inode_cachep;
88 struct kmem_cache *btrfs_trans_handle_cachep;
89 struct kmem_cache *btrfs_path_cachep;
90 struct kmem_cache *btrfs_free_space_cachep;
91
92 #define S_SHIFT 12
93 static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
94         [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
95         [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
96         [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
97         [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
98         [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
99         [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
100         [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
101 };
102
103 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
104 static int btrfs_truncate(struct inode *inode);
105 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
106 static noinline int cow_file_range(struct inode *inode,
107                                    struct page *locked_page,
108                                    u64 start, u64 end, u64 delalloc_end,
109                                    int *page_started, unsigned long *nr_written,
110                                    int unlock, struct btrfs_dedupe_hash *hash);
111 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
112                                        u64 orig_start, u64 block_start,
113                                        u64 block_len, u64 orig_block_len,
114                                        u64 ram_bytes, int compress_type,
115                                        int type);
116
117 static void __endio_write_update_ordered(struct inode *inode,
118                                          const u64 offset, const u64 bytes,
119                                          const bool uptodate);
120
121 /*
122  * Cleanup all submitted ordered extents in specified range to handle errors
123  * from the fill_dellaloc() callback.
124  *
125  * NOTE: caller must ensure that when an error happens, it can not call
126  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
127  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
128  * to be released, which we want to happen only when finishing the ordered
129  * extent (btrfs_finish_ordered_io()). Also note that the caller of the
130  * fill_delalloc() callback already does proper cleanup for the first page of
131  * the range, that is, it invokes the callback writepage_end_io_hook() for the
132  * range of the first page.
133  */
134 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
135                                                  const u64 offset,
136                                                  const u64 bytes)
137 {
138         return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
139                                             bytes - PAGE_SIZE, false);
140 }
141
/* Defined later in this file. */
static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test hook: point the io_tree of @inode at this file's extent io
 * operations table.
 */
void btrfs_test_inode_set_ops(struct inode *inode)
{
	struct btrfs_inode *binode = BTRFS_I(inode);

	binode->io_tree.ops = &btrfs_extent_io_ops;
}
#endif
150
/*
 * Set up the security attributes of a new inode: first the ACLs, then the
 * security xattrs.  The xattr step is skipped when the ACL step fails.
 *
 * Returns 0 on success or the error reported by the first step that failed.
 */
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
{
	int err = btrfs_init_acl(trans, inode, dir);

	if (err)
		return err;

	return btrfs_xattr_security_init(trans, inode, dir, qstr);
}
162
163 /*
164  * this does all the hard work for inserting an inline extent into
165  * the btree.  The caller should have done a btrfs_drop_extents so that
166  * no overlapping inline items exist in the btree
167  */
168 static int insert_inline_extent(struct btrfs_trans_handle *trans,
169                                 struct btrfs_path *path, int extent_inserted,
170                                 struct btrfs_root *root, struct inode *inode,
171                                 u64 start, size_t size, size_t compressed_size,
172                                 int compress_type,
173                                 struct page **compressed_pages)
174 {
175         struct extent_buffer *leaf;
176         struct page *page = NULL;
177         char *kaddr;
178         unsigned long ptr;
179         struct btrfs_file_extent_item *ei;
180         int ret;
181         size_t cur_size = size;
182         unsigned long offset;
183
184         if (compressed_size && compressed_pages)
185                 cur_size = compressed_size;
186
187         inode_add_bytes(inode, size);
188
189         if (!extent_inserted) {
190                 struct btrfs_key key;
191                 size_t datasize;
192
193                 key.objectid = btrfs_ino(BTRFS_I(inode));
194                 key.offset = start;
195                 key.type = BTRFS_EXTENT_DATA_KEY;
196
197                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
198                 path->leave_spinning = 1;
199                 ret = btrfs_insert_empty_item(trans, root, path, &key,
200                                               datasize);
201                 if (ret)
202                         goto fail;
203         }
204         leaf = path->nodes[0];
205         ei = btrfs_item_ptr(leaf, path->slots[0],
206                             struct btrfs_file_extent_item);
207         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
208         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
209         btrfs_set_file_extent_encryption(leaf, ei, 0);
210         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
211         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
212         ptr = btrfs_file_extent_inline_start(ei);
213
214         if (compress_type != BTRFS_COMPRESS_NONE) {
215                 struct page *cpage;
216                 int i = 0;
217                 while (compressed_size > 0) {
218                         cpage = compressed_pages[i];
219                         cur_size = min_t(unsigned long, compressed_size,
220                                        PAGE_SIZE);
221
222                         kaddr = kmap_atomic(cpage);
223                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
224                         kunmap_atomic(kaddr);
225
226                         i++;
227                         ptr += cur_size;
228                         compressed_size -= cur_size;
229                 }
230                 btrfs_set_file_extent_compression(leaf, ei,
231                                                   compress_type);
232         } else {
233                 page = find_get_page(inode->i_mapping,
234                                      start >> PAGE_SHIFT);
235                 btrfs_set_file_extent_compression(leaf, ei, 0);
236                 kaddr = kmap_atomic(page);
237                 offset = start & (PAGE_SIZE - 1);
238                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
239                 kunmap_atomic(kaddr);
240                 put_page(page);
241         }
242         btrfs_mark_buffer_dirty(leaf);
243         btrfs_release_path(path);
244
245         /*
246          * we're an inline extent, so nobody can
247          * extend the file past i_size without locking
248          * a page we already have locked.
249          *
250          * We must do any isize and inode updates
251          * before we unlock the pages.  Otherwise we
252          * could end up racing with unlink.
253          */
254         BTRFS_I(inode)->disk_i_size = inode->i_size;
255         ret = btrfs_update_inode(trans, root, inode);
256
257 fail:
258         return ret;
259 }
260
261
262 /*
263  * conditionally insert an inline extent into the file.  This
264  * does the checks required to make sure the data is small enough
265  * to fit as an inline extent.
266  */
267 static noinline int cow_file_range_inline(struct btrfs_root *root,
268                                           struct inode *inode, u64 start,
269                                           u64 end, size_t compressed_size,
270                                           int compress_type,
271                                           struct page **compressed_pages)
272 {
273         struct btrfs_fs_info *fs_info = root->fs_info;
274         struct btrfs_trans_handle *trans;
275         u64 isize = i_size_read(inode);
276         u64 actual_end = min(end + 1, isize);
277         u64 inline_len = actual_end - start;
278         u64 aligned_end = ALIGN(end, fs_info->sectorsize);
279         u64 data_len = inline_len;
280         int ret;
281         struct btrfs_path *path;
282         int extent_inserted = 0;
283         u32 extent_item_size;
284
285         if (compressed_size)
286                 data_len = compressed_size;
287
288         if (start > 0 ||
289             actual_end > fs_info->sectorsize ||
290             data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
291             (!compressed_size &&
292             (actual_end & (fs_info->sectorsize - 1)) == 0) ||
293             end + 1 < isize ||
294             data_len > fs_info->max_inline) {
295                 return 1;
296         }
297
298         path = btrfs_alloc_path();
299         if (!path)
300                 return -ENOMEM;
301
302         trans = btrfs_join_transaction(root);
303         if (IS_ERR(trans)) {
304                 btrfs_free_path(path);
305                 return PTR_ERR(trans);
306         }
307         trans->block_rsv = &fs_info->delalloc_block_rsv;
308
309         if (compressed_size && compressed_pages)
310                 extent_item_size = btrfs_file_extent_calc_inline_size(
311                    compressed_size);
312         else
313                 extent_item_size = btrfs_file_extent_calc_inline_size(
314                     inline_len);
315
316         ret = __btrfs_drop_extents(trans, root, inode, path,
317                                    start, aligned_end, NULL,
318                                    1, 1, extent_item_size, &extent_inserted);
319         if (ret) {
320                 btrfs_abort_transaction(trans, ret);
321                 goto out;
322         }
323
324         if (isize > actual_end)
325                 inline_len = min_t(u64, isize, actual_end);
326         ret = insert_inline_extent(trans, path, extent_inserted,
327                                    root, inode, start,
328                                    inline_len, compressed_size,
329                                    compress_type, compressed_pages);
330         if (ret && ret != -ENOSPC) {
331                 btrfs_abort_transaction(trans, ret);
332                 goto out;
333         } else if (ret == -ENOSPC) {
334                 ret = 1;
335                 goto out;
336         }
337
338         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
339         btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start);
340         btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
341 out:
342         /*
343          * Don't forget to free the reserved space, as for inlined extent
344          * it won't count as data extent, free them directly here.
345          * And at reserve time, it's always aligned to page size, so
346          * just free one page here.
347          */
348         btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
349         btrfs_free_path(path);
350         btrfs_end_transaction(trans);
351         return ret;
352 }
353
354 struct async_extent {
355         u64 start;
356         u64 ram_size;
357         u64 compressed_size;
358         struct page **pages;
359         unsigned long nr_pages;
360         int compress_type;
361         struct list_head list;
362 };
363
364 struct async_cow {
365         struct inode *inode;
366         struct btrfs_root *root;
367         struct page *locked_page;
368         u64 start;
369         u64 end;
370         struct list_head extents;
371         struct btrfs_work work;
372 };
373
374 static noinline int add_async_extent(struct async_cow *cow,
375                                      u64 start, u64 ram_size,
376                                      u64 compressed_size,
377                                      struct page **pages,
378                                      unsigned long nr_pages,
379                                      int compress_type)
380 {
381         struct async_extent *async_extent;
382
383         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
384         BUG_ON(!async_extent); /* -ENOMEM */
385         async_extent->start = start;
386         async_extent->ram_size = ram_size;
387         async_extent->compressed_size = compressed_size;
388         async_extent->pages = pages;
389         async_extent->nr_pages = nr_pages;
390         async_extent->compress_type = compress_type;
391         list_add_tail(&async_extent->list, &cow->extents);
392         return 0;
393 }
394
395 static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
396 {
397         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
398
399         /* force compress */
400         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
401                 return 1;
402         /* defrag ioctl */
403         if (BTRFS_I(inode)->defrag_compress)
404                 return 1;
405         /* bad compression ratios */
406         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
407                 return 0;
408         if (btrfs_test_opt(fs_info, COMPRESS) ||
409             BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
410             BTRFS_I(inode)->prop_compress)
411                 return btrfs_compress_heuristic(inode, start, end);
412         return 0;
413 }
414
415 static inline void inode_should_defrag(struct btrfs_inode *inode,
416                 u64 start, u64 end, u64 num_bytes, u64 small_write)
417 {
418         /* If this is a small write inside eof, kick off a defrag */
419         if (num_bytes < small_write &&
420             (start > 0 || end + 1 < inode->disk_i_size))
421                 btrfs_add_inode_defrag(NULL, inode);
422 }
423
424 /*
425  * we create compressed extents in two phases.  The first
426  * phase compresses a range of pages that have already been
427  * locked (both pages and state bits are locked).
428  *
429  * This is done inside an ordered work queue, and the compression
430  * is spread across many cpus.  The actual IO submission is step
431  * two, and the ordered work queue takes care of making sure that
432  * happens in the same order things were put onto the queue by
433  * writepages and friends.
434  *
435  * If this code finds it can't get good compression, it puts an
436  * entry onto the work queue to write the uncompressed bytes.  This
437  * makes sure that both compressed inodes and uncompressed inodes
438  * are written in the same order that the flusher thread sent them
439  * down.
440  */
441 static noinline void compress_file_range(struct inode *inode,
442                                         struct page *locked_page,
443                                         u64 start, u64 end,
444                                         struct async_cow *async_cow,
445                                         int *num_added)
446 {
447         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
448         struct btrfs_root *root = BTRFS_I(inode)->root;
449         u64 num_bytes;
450         u64 blocksize = fs_info->sectorsize;
451         u64 actual_end;
452         u64 isize = i_size_read(inode);
453         int ret = 0;
454         struct page **pages = NULL;
455         unsigned long nr_pages;
456         unsigned long total_compressed = 0;
457         unsigned long total_in = 0;
458         int i;
459         int will_compress;
460         int compress_type = fs_info->compress_type;
461         int redirty = 0;
462
463         inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
464                         SZ_16K);
465
466         actual_end = min_t(u64, isize, end + 1);
467 again:
468         will_compress = 0;
469         nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
470         BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
471         nr_pages = min_t(unsigned long, nr_pages,
472                         BTRFS_MAX_COMPRESSED / PAGE_SIZE);
473
474         /*
475          * we don't want to send crud past the end of i_size through
476          * compression, that's just a waste of CPU time.  So, if the
477          * end of the file is before the start of our current
478          * requested range of bytes, we bail out to the uncompressed
479          * cleanup code that can deal with all of this.
480          *
481          * It isn't really the fastest way to fix things, but this is a
482          * very uncommon corner.
483          */
484         if (actual_end <= start)
485                 goto cleanup_and_bail_uncompressed;
486
487         total_compressed = actual_end - start;
488
489         /*
490          * skip compression for a small file range(<=blocksize) that
491          * isn't an inline extent, since it doesn't save disk space at all.
492          */
493         if (total_compressed <= blocksize &&
494            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
495                 goto cleanup_and_bail_uncompressed;
496
497         total_compressed = min_t(unsigned long, total_compressed,
498                         BTRFS_MAX_UNCOMPRESSED);
499         num_bytes = ALIGN(end - start + 1, blocksize);
500         num_bytes = max(blocksize,  num_bytes);
501         total_in = 0;
502         ret = 0;
503
504         /*
505          * we do compression for mount -o compress and when the
506          * inode has not been flagged as nocompress.  This flag can
507          * change at any time if we discover bad compression ratios.
508          */
509         if (inode_need_compress(inode, start, end)) {
510                 WARN_ON(pages);
511                 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
512                 if (!pages) {
513                         /* just bail out to the uncompressed code */
514                         goto cont;
515                 }
516
517                 if (BTRFS_I(inode)->defrag_compress)
518                         compress_type = BTRFS_I(inode)->defrag_compress;
519                 else if (BTRFS_I(inode)->prop_compress)
520                         compress_type = BTRFS_I(inode)->prop_compress;
521
522                 /*
523                  * we need to call clear_page_dirty_for_io on each
524                  * page in the range.  Otherwise applications with the file
525                  * mmap'd can wander in and change the page contents while
526                  * we are compressing them.
527                  *
528                  * If the compression fails for any reason, we set the pages
529                  * dirty again later on.
530                  */
531                 extent_range_clear_dirty_for_io(inode, start, end);
532                 redirty = 1;
533                 ret = btrfs_compress_pages(compress_type,
534                                            inode->i_mapping, start,
535                                            pages,
536                                            &nr_pages,
537                                            &total_in,
538                                            &total_compressed);
539
540                 if (!ret) {
541                         unsigned long offset = total_compressed &
542                                 (PAGE_SIZE - 1);
543                         struct page *page = pages[nr_pages - 1];
544                         char *kaddr;
545
546                         /* zero the tail end of the last page, we might be
547                          * sending it down to disk
548                          */
549                         if (offset) {
550                                 kaddr = kmap_atomic(page);
551                                 memset(kaddr + offset, 0,
552                                        PAGE_SIZE - offset);
553                                 kunmap_atomic(kaddr);
554                         }
555                         will_compress = 1;
556                 }
557         }
558 cont:
559         if (start == 0) {
560                 /* lets try to make an inline extent */
561                 if (ret || total_in < (actual_end - start)) {
562                         /* we didn't compress the entire range, try
563                          * to make an uncompressed inline extent.
564                          */
565                         ret = cow_file_range_inline(root, inode, start, end,
566                                             0, BTRFS_COMPRESS_NONE, NULL);
567                 } else {
568                         /* try making a compressed inline extent */
569                         ret = cow_file_range_inline(root, inode, start, end,
570                                                     total_compressed,
571                                                     compress_type, pages);
572                 }
573                 if (ret <= 0) {
574                         unsigned long clear_flags = EXTENT_DELALLOC |
575                                 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
576                         unsigned long page_error_op;
577
578                         clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
579                         page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
580
581                         /*
582                          * inline extent creation worked or returned error,
583                          * we don't need to create any more async work items.
584                          * Unlock and free up our temp pages.
585                          */
586                         extent_clear_unlock_delalloc(inode, start, end, end,
587                                                      NULL, clear_flags,
588                                                      PAGE_UNLOCK |
589                                                      PAGE_CLEAR_DIRTY |
590                                                      PAGE_SET_WRITEBACK |
591                                                      page_error_op |
592                                                      PAGE_END_WRITEBACK);
593                         if (ret == 0)
594                                 btrfs_free_reserved_data_space_noquota(inode,
595                                                                start,
596                                                                end - start + 1);
597                         goto free_pages_out;
598                 }
599         }
600
601         if (will_compress) {
602                 /*
603                  * we aren't doing an inline extent round the compressed size
604                  * up to a block size boundary so the allocator does sane
605                  * things
606                  */
607                 total_compressed = ALIGN(total_compressed, blocksize);
608
609                 /*
610                  * one last check to make sure the compression is really a
611                  * win, compare the page count read with the blocks on disk,
612                  * compression must free at least one sector size
613                  */
614                 total_in = ALIGN(total_in, PAGE_SIZE);
615                 if (total_compressed + blocksize <= total_in) {
616                         num_bytes = total_in;
617                         *num_added += 1;
618
619                         /*
620                          * The async work queues will take care of doing actual
621                          * allocation on disk for these compressed pages, and
622                          * will submit them to the elevator.
623                          */
624                         add_async_extent(async_cow, start, num_bytes,
625                                         total_compressed, pages, nr_pages,
626                                         compress_type);
627
628                         if (start + num_bytes < end) {
629                                 start += num_bytes;
630                                 pages = NULL;
631                                 cond_resched();
632                                 goto again;
633                         }
634                         return;
635                 }
636         }
637         if (pages) {
638                 /*
639                  * the compression code ran but failed to make things smaller,
640                  * free any pages it allocated and our page pointer array
641                  */
642                 for (i = 0; i < nr_pages; i++) {
643                         WARN_ON(pages[i]->mapping);
644                         put_page(pages[i]);
645                 }
646                 kfree(pages);
647                 pages = NULL;
648                 total_compressed = 0;
649                 nr_pages = 0;
650
651                 /* flag the file so we don't compress in the future */
652                 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
653                     !(BTRFS_I(inode)->prop_compress)) {
654                         BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
655                 }
656         }
657 cleanup_and_bail_uncompressed:
658         /*
659          * No compression, but we still need to write the pages in the file
660          * we've been given so far.  redirty the locked page if it corresponds
661          * to our extent and set things up for the async work queue to run
662          * cow_file_range to do the normal delalloc dance.
663          */
664         if (page_offset(locked_page) >= start &&
665             page_offset(locked_page) <= end)
666                 __set_page_dirty_nobuffers(locked_page);
667                 /* unlocked later on in the async handlers */
668
669         if (redirty)
670                 extent_range_redirty_for_io(inode, start, end);
671         add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
672                          BTRFS_COMPRESS_NONE);
673         *num_added += 1;
674
675         return;
676
677 free_pages_out:
678         for (i = 0; i < nr_pages; i++) {
679                 WARN_ON(pages[i]->mapping);
680                 put_page(pages[i]);
681         }
682         kfree(pages);
683 }
684
685 static void free_async_extent_pages(struct async_extent *async_extent)
686 {
687         int i;
688
689         if (!async_extent->pages)
690                 return;
691
692         for (i = 0; i < async_extent->nr_pages; i++) {
693                 WARN_ON(async_extent->pages[i]->mapping);
694                 put_page(async_extent->pages[i]);
695         }
696         kfree(async_extent->pages);
697         async_extent->nr_pages = 0;
698         async_extent->pages = NULL;
699 }
700
701 /*
702  * phase two of compressed writeback.  This is the ordered portion
703  * of the code, which only gets called in the order the work was
704  * queued.  We walk all the async extents created by compress_file_range
705  * and send them down to the disk.
706  */
707 static noinline void submit_compressed_extents(struct inode *inode,
708                                               struct async_cow *async_cow)
709 {
710         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
711         struct async_extent *async_extent;
712         u64 alloc_hint = 0;
713         struct btrfs_key ins;
714         struct extent_map *em;
715         struct btrfs_root *root = BTRFS_I(inode)->root;
716         struct extent_io_tree *io_tree;
717         int ret = 0;
718
719 again:
720         while (!list_empty(&async_cow->extents)) {
721                 async_extent = list_entry(async_cow->extents.next,
722                                           struct async_extent, list);
723                 list_del(&async_extent->list);
724
725                 io_tree = &BTRFS_I(inode)->io_tree;
726
727 retry:
728                 /* did the compression code fall back to uncompressed IO? */
729                 if (!async_extent->pages) {
730                         int page_started = 0;
731                         unsigned long nr_written = 0;
732
733                         lock_extent(io_tree, async_extent->start,
734                                          async_extent->start +
735                                          async_extent->ram_size - 1);
736
737                         /* allocate blocks */
738                         ret = cow_file_range(inode, async_cow->locked_page,
739                                              async_extent->start,
740                                              async_extent->start +
741                                              async_extent->ram_size - 1,
742                                              async_extent->start +
743                                              async_extent->ram_size - 1,
744                                              &page_started, &nr_written, 0,
745                                              NULL);
746
747                         /* JDM XXX */
748
749                         /*
750                          * if page_started, cow_file_range inserted an
751                          * inline extent and took care of all the unlocking
752                          * and IO for us.  Otherwise, we need to submit
753                          * all those pages down to the drive.
754                          */
755                         if (!page_started && !ret)
756                                 extent_write_locked_range(io_tree,
757                                                   inode, async_extent->start,
758                                                   async_extent->start +
759                                                   async_extent->ram_size - 1,
760                                                   btrfs_get_extent,
761                                                   WB_SYNC_ALL);
762                         else if (ret)
763                                 unlock_page(async_cow->locked_page);
764                         kfree(async_extent);
765                         cond_resched();
766                         continue;
767                 }
768
769                 lock_extent(io_tree, async_extent->start,
770                             async_extent->start + async_extent->ram_size - 1);
771
772                 ret = btrfs_reserve_extent(root, async_extent->ram_size,
773                                            async_extent->compressed_size,
774                                            async_extent->compressed_size,
775                                            0, alloc_hint, &ins, 1, 1);
776                 if (ret) {
777                         free_async_extent_pages(async_extent);
778
779                         if (ret == -ENOSPC) {
780                                 unlock_extent(io_tree, async_extent->start,
781                                               async_extent->start +
782                                               async_extent->ram_size - 1);
783
784                                 /*
785                                  * we need to redirty the pages if we decide to
786                                  * fallback to uncompressed IO, otherwise we
787                                  * will not submit these pages down to lower
788                                  * layers.
789                                  */
790                                 extent_range_redirty_for_io(inode,
791                                                 async_extent->start,
792                                                 async_extent->start +
793                                                 async_extent->ram_size - 1);
794
795                                 goto retry;
796                         }
797                         goto out_free;
798                 }
799                 /*
800                  * here we're doing allocation and writeback of the
801                  * compressed pages
802                  */
803                 em = create_io_em(inode, async_extent->start,
804                                   async_extent->ram_size, /* len */
805                                   async_extent->start, /* orig_start */
806                                   ins.objectid, /* block_start */
807                                   ins.offset, /* block_len */
808                                   ins.offset, /* orig_block_len */
809                                   async_extent->ram_size, /* ram_bytes */
810                                   async_extent->compress_type,
811                                   BTRFS_ORDERED_COMPRESSED);
812                 if (IS_ERR(em))
813                         /* ret value is not necessary due to void function */
814                         goto out_free_reserve;
815                 free_extent_map(em);
816
817                 ret = btrfs_add_ordered_extent_compress(inode,
818                                                 async_extent->start,
819                                                 ins.objectid,
820                                                 async_extent->ram_size,
821                                                 ins.offset,
822                                                 BTRFS_ORDERED_COMPRESSED,
823                                                 async_extent->compress_type);
824                 if (ret) {
825                         btrfs_drop_extent_cache(BTRFS_I(inode),
826                                                 async_extent->start,
827                                                 async_extent->start +
828                                                 async_extent->ram_size - 1, 0);
829                         goto out_free_reserve;
830                 }
831                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
832
833                 /*
834                  * clear dirty, set writeback and unlock the pages.
835                  */
836                 extent_clear_unlock_delalloc(inode, async_extent->start,
837                                 async_extent->start +
838                                 async_extent->ram_size - 1,
839                                 async_extent->start +
840                                 async_extent->ram_size - 1,
841                                 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
842                                 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
843                                 PAGE_SET_WRITEBACK);
844                 if (btrfs_submit_compressed_write(inode,
845                                     async_extent->start,
846                                     async_extent->ram_size,
847                                     ins.objectid,
848                                     ins.offset, async_extent->pages,
849                                     async_extent->nr_pages)) {
850                         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
851                         struct page *p = async_extent->pages[0];
852                         const u64 start = async_extent->start;
853                         const u64 end = start + async_extent->ram_size - 1;
854
855                         p->mapping = inode->i_mapping;
856                         tree->ops->writepage_end_io_hook(p, start, end,
857                                                          NULL, 0);
858                         p->mapping = NULL;
859                         extent_clear_unlock_delalloc(inode, start, end, end,
860                                                      NULL, 0,
861                                                      PAGE_END_WRITEBACK |
862                                                      PAGE_SET_ERROR);
863                         free_async_extent_pages(async_extent);
864                 }
865                 alloc_hint = ins.objectid + ins.offset;
866                 kfree(async_extent);
867                 cond_resched();
868         }
869         return;
870 out_free_reserve:
871         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
872         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
873 out_free:
874         extent_clear_unlock_delalloc(inode, async_extent->start,
875                                      async_extent->start +
876                                      async_extent->ram_size - 1,
877                                      async_extent->start +
878                                      async_extent->ram_size - 1,
879                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
880                                      EXTENT_DELALLOC_NEW |
881                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
882                                      PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
883                                      PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
884                                      PAGE_SET_ERROR);
885         free_async_extent_pages(async_extent);
886         kfree(async_extent);
887         goto again;
888 }
889
890 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
891                                       u64 num_bytes)
892 {
893         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
894         struct extent_map *em;
895         u64 alloc_hint = 0;
896
897         read_lock(&em_tree->lock);
898         em = search_extent_mapping(em_tree, start, num_bytes);
899         if (em) {
900                 /*
901                  * if block start isn't an actual block number then find the
902                  * first block in this inode and use that as a hint.  If that
903                  * block is also bogus then just don't worry about it.
904                  */
905                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
906                         free_extent_map(em);
907                         em = search_extent_mapping(em_tree, 0, 0);
908                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
909                                 alloc_hint = em->block_start;
910                         if (em)
911                                 free_extent_map(em);
912                 } else {
913                         alloc_hint = em->block_start;
914                         free_extent_map(em);
915                 }
916         }
917         read_unlock(&em_tree->lock);
918
919         return alloc_hint;
920 }
921
922 /*
923  * when extent_io.c finds a delayed allocation range in the file,
924  * the call backs end up in this code.  The basic idea is to
925  * allocate extents on disk for the range, and create ordered data structs
926  * in ram to track those extents.
927  *
928  * locked_page is the page that writepage had locked already.  We use
929  * it to make sure we don't do extra locks or unlocks.
930  *
931  * *page_started is set to one if we unlock locked_page and do everything
932  * required to start IO on it.  It may be clean and already done with
933  * IO when we return.
934  */
935 static noinline int cow_file_range(struct inode *inode,
936                                    struct page *locked_page,
937                                    u64 start, u64 end, u64 delalloc_end,
938                                    int *page_started, unsigned long *nr_written,
939                                    int unlock, struct btrfs_dedupe_hash *hash)
940 {
941         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
942         struct btrfs_root *root = BTRFS_I(inode)->root;
943         u64 alloc_hint = 0;
944         u64 num_bytes;
945         unsigned long ram_size;
946         u64 disk_num_bytes;
947         u64 cur_alloc_size = 0;
948         u64 blocksize = fs_info->sectorsize;
949         struct btrfs_key ins;
950         struct extent_map *em;
951         unsigned clear_bits;
952         unsigned long page_ops;
953         bool extent_reserved = false;
954         int ret = 0;
955
956         if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
957                 WARN_ON_ONCE(1);
958                 ret = -EINVAL;
959                 goto out_unlock;
960         }
961
962         num_bytes = ALIGN(end - start + 1, blocksize);
963         num_bytes = max(blocksize,  num_bytes);
964         disk_num_bytes = num_bytes;
965
966         inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
967
968         if (start == 0) {
969                 /* lets try to make an inline extent */
970                 ret = cow_file_range_inline(root, inode, start, end, 0,
971                                         BTRFS_COMPRESS_NONE, NULL);
972                 if (ret == 0) {
973                         extent_clear_unlock_delalloc(inode, start, end,
974                                      delalloc_end, NULL,
975                                      EXTENT_LOCKED | EXTENT_DELALLOC |
976                                      EXTENT_DELALLOC_NEW |
977                                      EXTENT_DEFRAG, PAGE_UNLOCK |
978                                      PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
979                                      PAGE_END_WRITEBACK);
980                         btrfs_free_reserved_data_space_noquota(inode, start,
981                                                 end - start + 1);
982                         *nr_written = *nr_written +
983                              (end - start + PAGE_SIZE) / PAGE_SIZE;
984                         *page_started = 1;
985                         goto out;
986                 } else if (ret < 0) {
987                         goto out_unlock;
988                 }
989         }
990
991         BUG_ON(disk_num_bytes >
992                btrfs_super_total_bytes(fs_info->super_copy));
993
994         alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
995         btrfs_drop_extent_cache(BTRFS_I(inode), start,
996                         start + num_bytes - 1, 0);
997
998         while (disk_num_bytes > 0) {
999                 cur_alloc_size = disk_num_bytes;
1000                 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1001                                            fs_info->sectorsize, 0, alloc_hint,
1002                                            &ins, 1, 1);
1003                 if (ret < 0)
1004                         goto out_unlock;
1005                 cur_alloc_size = ins.offset;
1006                 extent_reserved = true;
1007
1008                 ram_size = ins.offset;
1009                 em = create_io_em(inode, start, ins.offset, /* len */
1010                                   start, /* orig_start */
1011                                   ins.objectid, /* block_start */
1012                                   ins.offset, /* block_len */
1013                                   ins.offset, /* orig_block_len */
1014                                   ram_size, /* ram_bytes */
1015                                   BTRFS_COMPRESS_NONE, /* compress_type */
1016                                   BTRFS_ORDERED_REGULAR /* type */);
1017                 if (IS_ERR(em))
1018                         goto out_reserve;
1019                 free_extent_map(em);
1020
1021                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1022                                                ram_size, cur_alloc_size, 0);
1023                 if (ret)
1024                         goto out_drop_extent_cache;
1025
1026                 if (root->root_key.objectid ==
1027                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1028                         ret = btrfs_reloc_clone_csums(inode, start,
1029                                                       cur_alloc_size);
1030                         /*
1031                          * Only drop cache here, and process as normal.
1032                          *
1033                          * We must not allow extent_clear_unlock_delalloc()
1034                          * at out_unlock label to free meta of this ordered
1035                          * extent, as its meta should be freed by
1036                          * btrfs_finish_ordered_io().
1037                          *
1038                          * So we must continue until @start is increased to
1039                          * skip current ordered extent.
1040                          */
1041                         if (ret)
1042                                 btrfs_drop_extent_cache(BTRFS_I(inode), start,
1043                                                 start + ram_size - 1, 0);
1044                 }
1045
1046                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1047
1048                 /* we're not doing compressed IO, don't unlock the first
1049                  * page (which the caller expects to stay locked), don't
1050                  * clear any dirty bits and don't set any writeback bits
1051                  *
1052                  * Do set the Private2 bit so we know this page was properly
1053                  * setup for writepage
1054                  */
1055                 page_ops = unlock ? PAGE_UNLOCK : 0;
1056                 page_ops |= PAGE_SET_PRIVATE2;
1057
1058                 extent_clear_unlock_delalloc(inode, start,
1059                                              start + ram_size - 1,
1060                                              delalloc_end, locked_page,
1061                                              EXTENT_LOCKED | EXTENT_DELALLOC,
1062                                              page_ops);
1063                 if (disk_num_bytes < cur_alloc_size)
1064                         disk_num_bytes = 0;
1065                 else
1066                         disk_num_bytes -= cur_alloc_size;
1067                 num_bytes -= cur_alloc_size;
1068                 alloc_hint = ins.objectid + ins.offset;
1069                 start += cur_alloc_size;
1070                 extent_reserved = false;
1071
1072                 /*
1073                  * btrfs_reloc_clone_csums() error, since start is increased
1074                  * extent_clear_unlock_delalloc() at out_unlock label won't
1075                  * free metadata of current ordered extent, we're OK to exit.
1076                  */
1077                 if (ret)
1078                         goto out_unlock;
1079         }
1080 out:
1081         return ret;
1082
1083 out_drop_extent_cache:
1084         btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
1085 out_reserve:
1086         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1087         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1088 out_unlock:
1089         clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1090                 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1091         page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
1092                 PAGE_END_WRITEBACK;
1093         /*
1094          * If we reserved an extent for our delalloc range (or a subrange) and
1095          * failed to create the respective ordered extent, then it means that
1096          * when we reserved the extent we decremented the extent's size from
1097          * the data space_info's bytes_may_use counter and incremented the
1098          * space_info's bytes_reserved counter by the same amount. We must make
1099          * sure extent_clear_unlock_delalloc() does not try to decrement again
1100          * the data space_info's bytes_may_use counter, therefore we do not pass
1101          * it the flag EXTENT_CLEAR_DATA_RESV.
1102          */
1103         if (extent_reserved) {
1104                 extent_clear_unlock_delalloc(inode, start,
1105                                              start + cur_alloc_size,
1106                                              start + cur_alloc_size,
1107                                              locked_page,
1108                                              clear_bits,
1109                                              page_ops);
1110                 start += cur_alloc_size;
1111                 if (start >= end)
1112                         goto out;
1113         }
1114         extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1115                                      locked_page,
1116                                      clear_bits | EXTENT_CLEAR_DATA_RESV,
1117                                      page_ops);
1118         goto out;
1119 }
1120
1121 /*
1122  * work queue call back to started compression on a file and pages
1123  */
1124 static noinline void async_cow_start(struct btrfs_work *work)
1125 {
1126         struct async_cow *async_cow;
1127         int num_added = 0;
1128         async_cow = container_of(work, struct async_cow, work);
1129
1130         compress_file_range(async_cow->inode, async_cow->locked_page,
1131                             async_cow->start, async_cow->end, async_cow,
1132                             &num_added);
1133         if (num_added == 0) {
1134                 btrfs_add_delayed_iput(async_cow->inode);
1135                 async_cow->inode = NULL;
1136         }
1137 }
1138
1139 /*
1140  * work queue call back to submit previously compressed pages
1141  */
1142 static noinline void async_cow_submit(struct btrfs_work *work)
1143 {
1144         struct btrfs_fs_info *fs_info;
1145         struct async_cow *async_cow;
1146         struct btrfs_root *root;
1147         unsigned long nr_pages;
1148
1149         async_cow = container_of(work, struct async_cow, work);
1150
1151         root = async_cow->root;
1152         fs_info = root->fs_info;
1153         nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1154                 PAGE_SHIFT;
1155
1156         /*
1157          * atomic_sub_return implies a barrier for waitqueue_active
1158          */
1159         if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1160             5 * SZ_1M &&
1161             waitqueue_active(&fs_info->async_submit_wait))
1162                 wake_up(&fs_info->async_submit_wait);
1163
1164         if (async_cow->inode)
1165                 submit_compressed_extents(async_cow->inode, async_cow);
1166 }
1167
1168 static noinline void async_cow_free(struct btrfs_work *work)
1169 {
1170         struct async_cow *async_cow;
1171         async_cow = container_of(work, struct async_cow, work);
1172         if (async_cow->inode)
1173                 btrfs_add_delayed_iput(async_cow->inode);
1174         kfree(async_cow);
1175 }
1176
1177 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1178                                 u64 start, u64 end, int *page_started,
1179                                 unsigned long *nr_written)
1180 {
1181         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1182         struct async_cow *async_cow;
1183         struct btrfs_root *root = BTRFS_I(inode)->root;
1184         unsigned long nr_pages;
1185         u64 cur_end;
1186
1187         clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1188                          1, 0, NULL, GFP_NOFS);
1189         while (start < end) {
1190                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1191                 BUG_ON(!async_cow); /* -ENOMEM */
1192                 async_cow->inode = igrab(inode);
1193                 async_cow->root = root;
1194                 async_cow->locked_page = locked_page;
1195                 async_cow->start = start;
1196
1197                 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1198                     !btrfs_test_opt(fs_info, FORCE_COMPRESS))
1199                         cur_end = end;
1200                 else
1201                         cur_end = min(end, start + SZ_512K - 1);
1202
1203                 async_cow->end = cur_end;
1204                 INIT_LIST_HEAD(&async_cow->extents);
1205
1206                 btrfs_init_work(&async_cow->work,
1207                                 btrfs_delalloc_helper,
1208                                 async_cow_start, async_cow_submit,
1209                                 async_cow_free);
1210
1211                 nr_pages = (cur_end - start + PAGE_SIZE) >>
1212                         PAGE_SHIFT;
1213                 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1214
1215                 btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1216
1217                 while (atomic_read(&fs_info->async_submit_draining) &&
1218                        atomic_read(&fs_info->async_delalloc_pages)) {
1219                         wait_event(fs_info->async_submit_wait,
1220                                    (atomic_read(&fs_info->async_delalloc_pages) ==
1221                                     0));
1222                 }
1223
1224                 *nr_written += nr_pages;
1225                 start = cur_end + 1;
1226         }
1227         *page_started = 1;
1228         return 0;
1229 }
1230
1231 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1232                                         u64 bytenr, u64 num_bytes)
1233 {
1234         int ret;
1235         struct btrfs_ordered_sum *sums;
1236         LIST_HEAD(list);
1237
1238         ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1239                                        bytenr + num_bytes - 1, &list, 0);
1240         if (ret == 0 && list_empty(&list))
1241                 return 0;
1242
1243         while (!list_empty(&list)) {
1244                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1245                 list_del(&sums->list);
1246                 kfree(sums);
1247         }
1248         return 1;
1249 }
1250
/*
 * Writeback callback for nocow ranges.  This checks for snapshots or COW
 * copies of the extents that exist in the file, and COWs the file as
 * required.
 *
 * If no COW copies or snapshots exist, we write directly to the existing
 * blocks on disk.
 */
1258 static noinline int run_delalloc_nocow(struct inode *inode,
1259                                        struct page *locked_page,
1260                               u64 start, u64 end, int *page_started, int force,
1261                               unsigned long *nr_written)
1262 {
1263         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1264         struct btrfs_root *root = BTRFS_I(inode)->root;
1265         struct extent_buffer *leaf;
1266         struct btrfs_path *path;
1267         struct btrfs_file_extent_item *fi;
1268         struct btrfs_key found_key;
1269         struct extent_map *em;
1270         u64 cow_start;
1271         u64 cur_offset;
1272         u64 extent_end;
1273         u64 extent_offset;
1274         u64 disk_bytenr;
1275         u64 num_bytes;
1276         u64 disk_num_bytes;
1277         u64 ram_bytes;
1278         int extent_type;
1279         int ret, err;
1280         int type;
1281         int nocow;
1282         int check_prev = 1;
1283         bool nolock;
1284         u64 ino = btrfs_ino(BTRFS_I(inode));
1285
1286         path = btrfs_alloc_path();
1287         if (!path) {
1288                 extent_clear_unlock_delalloc(inode, start, end, end,
1289                                              locked_page,
1290                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1291                                              EXTENT_DO_ACCOUNTING |
1292                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1293                                              PAGE_CLEAR_DIRTY |
1294                                              PAGE_SET_WRITEBACK |
1295                                              PAGE_END_WRITEBACK);
1296                 return -ENOMEM;
1297         }
1298
1299         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1300
1301         cow_start = (u64)-1;
1302         cur_offset = start;
1303         while (1) {
1304                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1305                                                cur_offset, 0);
1306                 if (ret < 0)
1307                         goto error;
1308                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1309                         leaf = path->nodes[0];
1310                         btrfs_item_key_to_cpu(leaf, &found_key,
1311                                               path->slots[0] - 1);
1312                         if (found_key.objectid == ino &&
1313                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1314                                 path->slots[0]--;
1315                 }
1316                 check_prev = 0;
1317 next_slot:
1318                 leaf = path->nodes[0];
1319                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1320                         ret = btrfs_next_leaf(root, path);
1321                         if (ret < 0)
1322                                 goto error;
1323                         if (ret > 0)
1324                                 break;
1325                         leaf = path->nodes[0];
1326                 }
1327
1328                 nocow = 0;
1329                 disk_bytenr = 0;
1330                 num_bytes = 0;
1331                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1332
1333                 if (found_key.objectid > ino)
1334                         break;
1335                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1336                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1337                         path->slots[0]++;
1338                         goto next_slot;
1339                 }
1340                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1341                     found_key.offset > end)
1342                         break;
1343
1344                 if (found_key.offset > cur_offset) {
1345                         extent_end = found_key.offset;
1346                         extent_type = 0;
1347                         goto out_check;
1348                 }
1349
1350                 fi = btrfs_item_ptr(leaf, path->slots[0],
1351                                     struct btrfs_file_extent_item);
1352                 extent_type = btrfs_file_extent_type(leaf, fi);
1353
1354                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1355                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1356                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1357                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1358                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1359                         extent_end = found_key.offset +
1360                                 btrfs_file_extent_num_bytes(leaf, fi);
1361                         disk_num_bytes =
1362                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1363                         if (extent_end <= start) {
1364                                 path->slots[0]++;
1365                                 goto next_slot;
1366                         }
1367                         if (disk_bytenr == 0)
1368                                 goto out_check;
1369                         if (btrfs_file_extent_compression(leaf, fi) ||
1370                             btrfs_file_extent_encryption(leaf, fi) ||
1371                             btrfs_file_extent_other_encoding(leaf, fi))
1372                                 goto out_check;
1373                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1374                                 goto out_check;
1375                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1376                                 goto out_check;
1377                         if (btrfs_cross_ref_exist(root, ino,
1378                                                   found_key.offset -
1379                                                   extent_offset, disk_bytenr))
1380                                 goto out_check;
1381                         disk_bytenr += extent_offset;
1382                         disk_bytenr += cur_offset - found_key.offset;
1383                         num_bytes = min(end + 1, extent_end) - cur_offset;
1384                         /*
1385                          * if there are pending snapshots for this root,
1386                          * we fall into common COW way.
1387                          */
1388                         if (!nolock) {
1389                                 err = btrfs_start_write_no_snapshotting(root);
1390                                 if (!err)
1391                                         goto out_check;
1392                         }
1393                         /*
1394                          * force cow if csum exists in the range.
1395                          * this ensure that csum for a given extent are
1396                          * either valid or do not exist.
1397                          */
1398                         if (csum_exist_in_range(fs_info, disk_bytenr,
1399                                                 num_bytes)) {
1400                                 if (!nolock)
1401                                         btrfs_end_write_no_snapshotting(root);
1402                                 goto out_check;
1403                         }
1404                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1405                                 if (!nolock)
1406                                         btrfs_end_write_no_snapshotting(root);
1407                                 goto out_check;
1408                         }
1409                         nocow = 1;
1410                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1411                         extent_end = found_key.offset +
1412                                 btrfs_file_extent_inline_len(leaf,
1413                                                      path->slots[0], fi);
1414                         extent_end = ALIGN(extent_end,
1415                                            fs_info->sectorsize);
1416                 } else {
1417                         BUG_ON(1);
1418                 }
1419 out_check:
1420                 if (extent_end <= start) {
1421                         path->slots[0]++;
1422                         if (!nolock && nocow)
1423                                 btrfs_end_write_no_snapshotting(root);
1424                         if (nocow)
1425                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1426                         goto next_slot;
1427                 }
1428                 if (!nocow) {
1429                         if (cow_start == (u64)-1)
1430                                 cow_start = cur_offset;
1431                         cur_offset = extent_end;
1432                         if (cur_offset > end)
1433                                 break;
1434                         path->slots[0]++;
1435                         goto next_slot;
1436                 }
1437
1438                 btrfs_release_path(path);
1439                 if (cow_start != (u64)-1) {
1440                         ret = cow_file_range(inode, locked_page,
1441                                              cow_start, found_key.offset - 1,
1442                                              end, page_started, nr_written, 1,
1443                                              NULL);
1444                         if (ret) {
1445                                 if (!nolock && nocow)
1446                                         btrfs_end_write_no_snapshotting(root);
1447                                 if (nocow)
1448                                         btrfs_dec_nocow_writers(fs_info,
1449                                                                 disk_bytenr);
1450                                 goto error;
1451                         }
1452                         cow_start = (u64)-1;
1453                 }
1454
1455                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1456                         u64 orig_start = found_key.offset - extent_offset;
1457
1458                         em = create_io_em(inode, cur_offset, num_bytes,
1459                                           orig_start,
1460                                           disk_bytenr, /* block_start */
1461                                           num_bytes, /* block_len */
1462                                           disk_num_bytes, /* orig_block_len */
1463                                           ram_bytes, BTRFS_COMPRESS_NONE,
1464                                           BTRFS_ORDERED_PREALLOC);
1465                         if (IS_ERR(em)) {
1466                                 if (!nolock && nocow)
1467                                         btrfs_end_write_no_snapshotting(root);
1468                                 if (nocow)
1469                                         btrfs_dec_nocow_writers(fs_info,
1470                                                                 disk_bytenr);
1471                                 ret = PTR_ERR(em);
1472                                 goto error;
1473                         }
1474                         free_extent_map(em);
1475                 }
1476
1477                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1478                         type = BTRFS_ORDERED_PREALLOC;
1479                 } else {
1480                         type = BTRFS_ORDERED_NOCOW;
1481                 }
1482
1483                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1484                                                num_bytes, num_bytes, type);
1485                 if (nocow)
1486                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1487                 BUG_ON(ret); /* -ENOMEM */
1488
1489                 if (root->root_key.objectid ==
1490                     BTRFS_DATA_RELOC_TREE_OBJECTID)
1491                         /*
1492                          * Error handled later, as we must prevent
1493                          * extent_clear_unlock_delalloc() in error handler
1494                          * from freeing metadata of created ordered extent.
1495                          */
1496                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1497                                                       num_bytes);
1498
1499                 extent_clear_unlock_delalloc(inode, cur_offset,
1500                                              cur_offset + num_bytes - 1, end,
1501                                              locked_page, EXTENT_LOCKED |
1502                                              EXTENT_DELALLOC |
1503                                              EXTENT_CLEAR_DATA_RESV,
1504                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1505
1506                 if (!nolock && nocow)
1507                         btrfs_end_write_no_snapshotting(root);
1508                 cur_offset = extent_end;
1509
1510                 /*
1511                  * btrfs_reloc_clone_csums() error, now we're OK to call error
1512                  * handler, as metadata for created ordered extent will only
1513                  * be freed by btrfs_finish_ordered_io().
1514                  */
1515                 if (ret)
1516                         goto error;
1517                 if (cur_offset > end)
1518                         break;
1519         }
1520         btrfs_release_path(path);
1521
1522         if (cur_offset <= end && cow_start == (u64)-1) {
1523                 cow_start = cur_offset;
1524                 cur_offset = end;
1525         }
1526
1527         if (cow_start != (u64)-1) {
1528                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1529                                      page_started, nr_written, 1, NULL);
1530                 if (ret)
1531                         goto error;
1532         }
1533
1534 error:
1535         if (ret && cur_offset < end)
1536                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1537                                              locked_page, EXTENT_LOCKED |
1538                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1539                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1540                                              PAGE_CLEAR_DIRTY |
1541                                              PAGE_SET_WRITEBACK |
1542                                              PAGE_END_WRITEBACK);
1543         btrfs_free_path(path);
1544         return ret;
1545 }
1546
1547 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1548 {
1549
1550         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1551             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1552                 return 0;
1553
1554         /*
1555          * @defrag_bytes is a hint value, no spinlock held here,
1556          * if is not zero, it means the file is defragging.
1557          * Force cow if given extent needs to be defragged.
1558          */
1559         if (BTRFS_I(inode)->defrag_bytes &&
1560             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1561                            EXTENT_DEFRAG, 0, NULL))
1562                 return 1;
1563
1564         return 0;
1565 }
1566
1567 /*
1568  * extent_io.c call back to do delayed allocation processing
1569  */
1570 static int run_delalloc_range(void *private_data, struct page *locked_page,
1571                               u64 start, u64 end, int *page_started,
1572                               unsigned long *nr_written)
1573 {
1574         struct inode *inode = private_data;
1575         int ret;
1576         int force_cow = need_force_cow(inode, start, end);
1577
1578         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1579                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1580                                          page_started, 1, nr_written);
1581         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1582                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1583                                          page_started, 0, nr_written);
1584         } else if (!inode_need_compress(inode, start, end)) {
1585                 ret = cow_file_range(inode, locked_page, start, end, end,
1586                                       page_started, nr_written, 1, NULL);
1587         } else {
1588                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1589                         &BTRFS_I(inode)->runtime_flags);
1590                 ret = cow_file_range_async(inode, locked_page, start, end,
1591                                            page_started, nr_written);
1592         }
1593         if (ret)
1594                 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1595         return ret;
1596 }
1597
1598 static void btrfs_split_extent_hook(void *private_data,
1599                                     struct extent_state *orig, u64 split)
1600 {
1601         struct inode *inode = private_data;
1602         u64 size;
1603
1604         /* not delalloc, ignore it */
1605         if (!(orig->state & EXTENT_DELALLOC))
1606                 return;
1607
1608         size = orig->end - orig->start + 1;
1609         if (size > BTRFS_MAX_EXTENT_SIZE) {
1610                 u32 num_extents;
1611                 u64 new_size;
1612
1613                 /*
1614                  * See the explanation in btrfs_merge_extent_hook, the same
1615                  * applies here, just in reverse.
1616                  */
1617                 new_size = orig->end - split + 1;
1618                 num_extents = count_max_extents(new_size);
1619                 new_size = split - orig->start;
1620                 num_extents += count_max_extents(new_size);
1621                 if (count_max_extents(size) >= num_extents)
1622                         return;
1623         }
1624
1625         spin_lock(&BTRFS_I(inode)->lock);
1626         BTRFS_I(inode)->outstanding_extents++;
1627         spin_unlock(&BTRFS_I(inode)->lock);
1628 }
1629
1630 /*
1631  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1632  * extents so we can keep track of new extents that are just merged onto old
1633  * extents, such as when we are doing sequential writes, so we can properly
1634  * account for the metadata space we'll need.
1635  */
1636 static void btrfs_merge_extent_hook(void *private_data,
1637                                     struct extent_state *new,
1638                                     struct extent_state *other)
1639 {
1640         struct inode *inode = private_data;
1641         u64 new_size, old_size;
1642         u32 num_extents;
1643
1644         /* not delalloc, ignore it */
1645         if (!(other->state & EXTENT_DELALLOC))
1646                 return;
1647
1648         if (new->start > other->start)
1649                 new_size = new->end - other->start + 1;
1650         else
1651                 new_size = other->end - new->start + 1;
1652
1653         /* we're not bigger than the max, unreserve the space and go */
1654         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1655                 spin_lock(&BTRFS_I(inode)->lock);
1656                 BTRFS_I(inode)->outstanding_extents--;
1657                 spin_unlock(&BTRFS_I(inode)->lock);
1658                 return;
1659         }
1660
1661         /*
1662          * We have to add up either side to figure out how many extents were
1663          * accounted for before we merged into one big extent.  If the number of
1664          * extents we accounted for is <= the amount we need for the new range
1665          * then we can return, otherwise drop.  Think of it like this
1666          *
1667          * [ 4k][MAX_SIZE]
1668          *
1669          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1670          * need 2 outstanding extents, on one side we have 1 and the other side
1671          * we have 1 so they are == and we can return.  But in this case
1672          *
1673          * [MAX_SIZE+4k][MAX_SIZE+4k]
1674          *
1675          * Each range on their own accounts for 2 extents, but merged together
1676          * they are only 3 extents worth of accounting, so we need to drop in
1677          * this case.
1678          */
1679         old_size = other->end - other->start + 1;
1680         num_extents = count_max_extents(old_size);
1681         old_size = new->end - new->start + 1;
1682         num_extents += count_max_extents(old_size);
1683         if (count_max_extents(new_size) >= num_extents)
1684                 return;
1685
1686         spin_lock(&BTRFS_I(inode)->lock);
1687         BTRFS_I(inode)->outstanding_extents--;
1688         spin_unlock(&BTRFS_I(inode)->lock);
1689 }
1690
1691 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1692                                       struct inode *inode)
1693 {
1694         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1695
1696         spin_lock(&root->delalloc_lock);
1697         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1698                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1699                               &root->delalloc_inodes);
1700                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1701                         &BTRFS_I(inode)->runtime_flags);
1702                 root->nr_delalloc_inodes++;
1703                 if (root->nr_delalloc_inodes == 1) {
1704                         spin_lock(&fs_info->delalloc_root_lock);
1705                         BUG_ON(!list_empty(&root->delalloc_root));
1706                         list_add_tail(&root->delalloc_root,
1707                                       &fs_info->delalloc_roots);
1708                         spin_unlock(&fs_info->delalloc_root_lock);
1709                 }
1710         }
1711         spin_unlock(&root->delalloc_lock);
1712 }
1713
1714 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1715                                      struct btrfs_inode *inode)
1716 {
1717         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1718
1719         spin_lock(&root->delalloc_lock);
1720         if (!list_empty(&inode->delalloc_inodes)) {
1721                 list_del_init(&inode->delalloc_inodes);
1722                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1723                           &inode->runtime_flags);
1724                 root->nr_delalloc_inodes--;
1725                 if (!root->nr_delalloc_inodes) {
1726                         spin_lock(&fs_info->delalloc_root_lock);
1727                         BUG_ON(list_empty(&root->delalloc_root));
1728                         list_del_init(&root->delalloc_root);
1729                         spin_unlock(&fs_info->delalloc_root_lock);
1730                 }
1731         }
1732         spin_unlock(&root->delalloc_lock);
1733 }
1734
1735 /*
1736  * extent_io.c set_bit_hook, used to track delayed allocation
1737  * bytes in this file, and to maintain the list of inodes that
1738  * have pending delalloc work to be done.
1739  */
1740 static void btrfs_set_bit_hook(void *private_data,
1741                                struct extent_state *state, unsigned *bits)
1742 {
1743         struct inode *inode = private_data;
1744
1745         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1746
1747         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1748                 WARN_ON(1);
1749         /*
1750          * set_bit and clear bit hooks normally require _irqsave/restore
1751          * but in this case, we are only testing for the DELALLOC
1752          * bit, which is only set or cleared with irqs on
1753          */
1754         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1755                 struct btrfs_root *root = BTRFS_I(inode)->root;
1756                 u64 len = state->end + 1 - state->start;
1757                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1758
1759                 if (*bits & EXTENT_FIRST_DELALLOC) {
1760                         *bits &= ~EXTENT_FIRST_DELALLOC;
1761                 } else {
1762                         spin_lock(&BTRFS_I(inode)->lock);
1763                         BTRFS_I(inode)->outstanding_extents++;
1764                         spin_unlock(&BTRFS_I(inode)->lock);
1765                 }
1766
1767                 /* For sanity tests */
1768                 if (btrfs_is_testing(fs_info))
1769                         return;
1770
1771                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1772                                          fs_info->delalloc_batch);
1773                 spin_lock(&BTRFS_I(inode)->lock);
1774                 BTRFS_I(inode)->delalloc_bytes += len;
1775                 if (*bits & EXTENT_DEFRAG)
1776                         BTRFS_I(inode)->defrag_bytes += len;
1777                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1778                                          &BTRFS_I(inode)->runtime_flags))
1779                         btrfs_add_delalloc_inodes(root, inode);
1780                 spin_unlock(&BTRFS_I(inode)->lock);
1781         }
1782
1783         if (!(state->state & EXTENT_DELALLOC_NEW) &&
1784             (*bits & EXTENT_DELALLOC_NEW)) {
1785                 spin_lock(&BTRFS_I(inode)->lock);
1786                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1787                         state->start;
1788                 spin_unlock(&BTRFS_I(inode)->lock);
1789         }
1790 }
1791
1792 /*
1793  * extent_io.c clear_bit_hook, see set_bit_hook for why
1794  */
1795 static void btrfs_clear_bit_hook(void *private_data,
1796                                  struct extent_state *state,
1797                                  unsigned *bits)
1798 {
1799         struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1800         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1801         u64 len = state->end + 1 - state->start;
1802         u32 num_extents = count_max_extents(len);
1803
1804         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1805                 spin_lock(&inode->lock);
1806                 inode->defrag_bytes -= len;
1807                 spin_unlock(&inode->lock);
1808         }
1809
1810         /*
1811          * set_bit and clear bit hooks normally require _irqsave/restore
1812          * but in this case, we are only testing for the DELALLOC
1813          * bit, which is only set or cleared with irqs on
1814          */
1815         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1816                 struct btrfs_root *root = inode->root;
1817                 bool do_list = !btrfs_is_free_space_inode(inode);
1818
1819                 if (*bits & EXTENT_FIRST_DELALLOC) {
1820                         *bits &= ~EXTENT_FIRST_DELALLOC;
1821                 } else if (!(*bits & EXTENT_CLEAR_META_RESV)) {
1822                         spin_lock(&inode->lock);
1823                         inode->outstanding_extents -= num_extents;
1824                         spin_unlock(&inode->lock);
1825                 }
1826
1827                 /*
1828                  * We don't reserve metadata space for space cache inodes so we
1829                  * don't need to call dellalloc_release_metadata if there is an
1830                  * error.
1831                  */
1832                 if (*bits & EXTENT_CLEAR_META_RESV &&
1833                     root != fs_info->tree_root)
1834                         btrfs_delalloc_release_metadata(inode, len);
1835
1836                 /* For sanity tests. */
1837                 if (btrfs_is_testing(fs_info))
1838                         return;
1839
1840                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1841                     do_list && !(state->state & EXTENT_NORESERVE) &&
1842                     (*bits & EXTENT_CLEAR_DATA_RESV))
1843                         btrfs_free_reserved_data_space_noquota(
1844                                         &inode->vfs_inode,
1845                                         state->start, len);
1846
1847                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1848                                          fs_info->delalloc_batch);
1849                 spin_lock(&inode->lock);
1850                 inode->delalloc_bytes -= len;
1851                 if (do_list && inode->delalloc_bytes == 0 &&
1852                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1853                                         &inode->runtime_flags))
1854                         btrfs_del_delalloc_inode(root, inode);
1855                 spin_unlock(&inode->lock);
1856         }
1857
1858         if ((state->state & EXTENT_DELALLOC_NEW) &&
1859             (*bits & EXTENT_DELALLOC_NEW)) {
1860                 spin_lock(&inode->lock);
1861                 ASSERT(inode->new_delalloc_bytes >= len);
1862                 inode->new_delalloc_bytes -= len;
1863                 spin_unlock(&inode->lock);
1864         }
1865 }
1866
1867 /*
1868  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1869  * we don't create bios that span stripes or chunks
1870  *
1871  * return 1 if page cannot be merged to bio
1872  * return 0 if page can be merged to bio
1873  * return error otherwise
1874  */
1875 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1876                          size_t size, struct bio *bio,
1877                          unsigned long bio_flags)
1878 {
1879         struct inode *inode = page->mapping->host;
1880         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1881         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1882         u64 length = 0;
1883         u64 map_length;
1884         int ret;
1885
1886         if (bio_flags & EXTENT_BIO_COMPRESSED)
1887                 return 0;
1888
1889         length = bio->bi_iter.bi_size;
1890         map_length = length;
1891         ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1892                               NULL, 0);
1893         if (ret < 0)
1894                 return ret;
1895         if (map_length < length + size)
1896                 return 1;
1897         return 0;
1898 }
1899
1900 /*
1901  * in order to insert checksums into the metadata in large chunks,
1902  * we wait until bio submission time.   All the pages in the bio are
1903  * checksummed and sums are attached onto the ordered extent record.
1904  *
1905  * At IO completion time the cums attached on the ordered extent record
1906  * are inserted into the btree
1907  */
1908 static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1909                                     int mirror_num, unsigned long bio_flags,
1910                                     u64 bio_offset)
1911 {
1912         struct inode *inode = private_data;
1913         blk_status_t ret = 0;
1914
1915         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1916         BUG_ON(ret); /* -ENOMEM */
1917         return 0;
1918 }
1919
1920 /*
1921  * in order to insert checksums into the metadata in large chunks,
1922  * we wait until bio submission time.   All the pages in the bio are
1923  * checksummed and sums are attached onto the ordered extent record.
1924  *
1925  * At IO completion time the cums attached on the ordered extent record
1926  * are inserted into the btree
1927  */
1928 static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1929                           int mirror_num, unsigned long bio_flags,
1930                           u64 bio_offset)
1931 {
1932         struct inode *inode = private_data;
1933         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1934         blk_status_t ret;
1935
1936         ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1937         if (ret) {
1938                 bio->bi_status = ret;
1939                 bio_endio(bio);
1940         }
1941         return ret;
1942 }
1943
1944 /*
1945  * extent_io.c submission hook. This does the right thing for csum calculation
1946  * on write, or reading the csums from the tree before a read
1947  */
1948 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1949                                  int mirror_num, unsigned long bio_flags,
1950                                  u64 bio_offset)
1951 {
1952         struct inode *inode = private_data;
1953         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1954         struct btrfs_root *root = BTRFS_I(inode)->root;
1955         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1956         blk_status_t ret = 0;
1957         int skip_sum;
1958         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1959
1960         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1961
1962         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1963                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1964
1965         if (bio_op(bio) != REQ_OP_WRITE) {
1966                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1967                 if (ret)
1968                         goto out;
1969
1970                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1971                         ret = btrfs_submit_compressed_read(inode, bio,
1972                                                            mirror_num,
1973                                                            bio_flags);
1974                         goto out;
1975                 } else if (!skip_sum) {
1976                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
1977                         if (ret)
1978                                 goto out;
1979                 }
1980                 goto mapit;
1981         } else if (async && !skip_sum) {
1982                 /* csum items have already been cloned */
1983                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1984                         goto mapit;
1985                 /* we're doing a write, do the async checksumming */
1986                 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
1987                                           bio_offset, inode,
1988                                           __btrfs_submit_bio_start,
1989                                           __btrfs_submit_bio_done);
1990                 goto out;
1991         } else if (!skip_sum) {
1992                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1993                 if (ret)
1994                         goto out;
1995         }
1996
1997 mapit:
1998         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
1999
2000 out:
2001         if (ret) {
2002                 bio->bi_status = ret;
2003                 bio_endio(bio);
2004         }
2005         return ret;
2006 }
2007
2008 /*
2009  * given a list of ordered sums record them in the inode.  This happens
2010  * at IO completion time based on sums calculated at bio submission time.
2011  */
2012 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2013                              struct inode *inode, struct list_head *list)
2014 {
2015         struct btrfs_ordered_sum *sum;
2016
2017         list_for_each_entry(sum, list, list) {
2018                 trans->adding_csums = 1;
2019                 btrfs_csum_file_blocks(trans,
2020                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2021                 trans->adding_csums = 0;
2022         }
2023         return 0;
2024 }
2025
2026 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2027                               struct extent_state **cached_state, int dedupe)
2028 {
2029         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2030         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2031                                    cached_state);
2032 }
2033
2034 /* see btrfs_writepage_start_hook for details on why this is required */
2035 struct btrfs_writepage_fixup {
2036         struct page *page;
2037         struct btrfs_work work;
2038 };
2039
2040 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2041 {
2042         struct btrfs_writepage_fixup *fixup;
2043         struct btrfs_ordered_extent *ordered;
2044         struct extent_state *cached_state = NULL;
2045         struct extent_changeset *data_reserved = NULL;
2046         struct page *page;
2047         struct inode *inode;
2048         u64 page_start;
2049         u64 page_end;
2050         int ret;
2051
2052         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2053         page = fixup->page;
2054 again:
2055         lock_page(page);
2056         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2057                 ClearPageChecked(page);
2058                 goto out_page;
2059         }
2060
2061         inode = page->mapping->host;
2062         page_start = page_offset(page);
2063         page_end = page_offset(page) + PAGE_SIZE - 1;
2064
2065         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2066                          &cached_state);
2067
2068         /* already ordered? We're done */
2069         if (PagePrivate2(page))
2070                 goto out;
2071
2072         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2073                                         PAGE_SIZE);
2074         if (ordered) {
2075                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2076                                      page_end, &cached_state, GFP_NOFS);
2077                 unlock_page(page);
2078                 btrfs_start_ordered_extent(inode, ordered, 1);
2079                 btrfs_put_ordered_extent(ordered);
2080                 goto again;
2081         }
2082
2083         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2084                                            PAGE_SIZE);
2085         if (ret) {
2086                 mapping_set_error(page->mapping, ret);
2087                 end_extent_writepage(page, ret, page_start, page_end);
2088                 ClearPageChecked(page);
2089                 goto out;
2090          }
2091
2092         btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state,
2093                                   0);
2094         ClearPageChecked(page);
2095         set_page_dirty(page);
2096 out:
2097         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2098                              &cached_state, GFP_NOFS);
2099 out_page:
2100         unlock_page(page);
2101         put_page(page);
2102         kfree(fixup);
2103         extent_changeset_free(data_reserved);
2104 }
2105
2106 /*
2107  * There are a few paths in the higher layers of the kernel that directly
2108  * set the page dirty bit without asking the filesystem if it is a
2109  * good idea.  This causes problems because we want to make sure COW
2110  * properly happens and the data=ordered rules are followed.
2111  *
2112  * In our case any range that doesn't have the ORDERED bit set
2113  * hasn't been properly setup for IO.  We kick off an async process
2114  * to fix it up.  The async helper will wait for ordered extents, set
2115  * the delalloc bit and make it safe to write the page.
2116  */
2117 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2118 {
2119         struct inode *inode = page->mapping->host;
2120         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2121         struct btrfs_writepage_fixup *fixup;
2122
2123         /* this page is properly in the ordered list */
2124         if (TestClearPagePrivate2(page))
2125                 return 0;
2126
2127         if (PageChecked(page))
2128                 return -EAGAIN;
2129
2130         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2131         if (!fixup)
2132                 return -EAGAIN;
2133
2134         SetPageChecked(page);
2135         get_page(page);
2136         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2137                         btrfs_writepage_fixup_worker, NULL, NULL);
2138         fixup->page = page;
2139         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2140         return -EBUSY;
2141 }
2142
2143 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2144                                        struct inode *inode, u64 file_pos,
2145                                        u64 disk_bytenr, u64 disk_num_bytes,
2146                                        u64 num_bytes, u64 ram_bytes,
2147                                        u8 compression, u8 encryption,
2148                                        u16 other_encoding, int extent_type)
2149 {
2150         struct btrfs_root *root = BTRFS_I(inode)->root;
2151         struct btrfs_file_extent_item *fi;
2152         struct btrfs_path *path;
2153         struct extent_buffer *leaf;
2154         struct btrfs_key ins;
2155         u64 qg_released;
2156         int extent_inserted = 0;
2157         int ret;
2158
2159         path = btrfs_alloc_path();
2160         if (!path)
2161                 return -ENOMEM;
2162
2163         /*
2164          * we may be replacing one extent in the tree with another.
2165          * The new extent is pinned in the extent map, and we don't want
2166          * to drop it from the cache until it is completely in the btree.
2167          *
2168          * So, tell btrfs_drop_extents to leave this extent in the cache.
2169          * the caller is expected to unpin it and allow it to be merged
2170          * with the others.
2171          */
2172         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2173                                    file_pos + num_bytes, NULL, 0,
2174                                    1, sizeof(*fi), &extent_inserted);
2175         if (ret)
2176                 goto out;
2177
2178         if (!extent_inserted) {
2179                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2180                 ins.offset = file_pos;
2181                 ins.type = BTRFS_EXTENT_DATA_KEY;
2182
2183                 path->leave_spinning = 1;
2184                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2185                                               sizeof(*fi));
2186                 if (ret)
2187                         goto out;
2188         }
2189         leaf = path->nodes[0];
2190         fi = btrfs_item_ptr(leaf, path->slots[0],
2191                             struct btrfs_file_extent_item);
2192         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2193         btrfs_set_file_extent_type(leaf, fi, extent_type);
2194         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2195         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2196         btrfs_set_file_extent_offset(leaf, fi, 0);
2197         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2198         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2199         btrfs_set_file_extent_compression(leaf, fi, compression);
2200         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2201         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2202
2203         btrfs_mark_buffer_dirty(leaf);
2204         btrfs_release_path(path);
2205
2206         inode_add_bytes(inode, num_bytes);
2207
2208         ins.objectid = disk_bytenr;
2209         ins.offset = disk_num_bytes;
2210         ins.type = BTRFS_EXTENT_ITEM_KEY;
2211
2212         /*
2213          * Release the reserved range from inode dirty range map, as it is
2214          * already moved into delayed_ref_head
2215          */
2216         ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2217         if (ret < 0)
2218                 goto out;
2219         qg_released = ret;
2220         ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
2221                         btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins);
2222 out:
2223         btrfs_free_path(path);
2224
2225         return ret;
2226 }
2227
2228 /* snapshot-aware defrag */
/*
 * One file extent item, in a root/inode other than the defragged one,
 * that references an old extent.  Allocated by record_one_backref() and
 * kept in the rb-tree rooted at new_sa_defrag_extent::root.
 */
struct sa_defrag_extent_backref {
        /* link in new_sa_defrag_extent::root, ordered by backref_comp() */
        struct rb_node node;
        /* the old extent this file extent item references */
        struct old_sa_defrag_extent *old;
        /* objectid of the root holding the referencing item */
        u64 root_id;
        /* inode number owning the referencing item */
        u64 inum;
        /* key.offset of the referencing file extent item */
        u64 file_pos;
        /* btrfs_file_extent_offset() of the referencing item */
        u64 extent_offset;
        /* btrfs_file_extent_num_bytes() of the referencing item */
        u64 num_bytes;
        /* generation of the item when recorded; rechecked before relinking */
        u64 generation;
};
2239
/*
 * One pre-defrag file extent of the defragged inode that overlaps the
 * newly written range.  Built by record_old_file_extents().
 */
struct old_sa_defrag_extent {
        /* link in new_sa_defrag_extent::head */
        struct list_head list;
        /* the new extent that replaces this one */
        struct new_sa_defrag_extent *new;

        /* btrfs_file_extent_offset() of the old file extent item */
        u64 extent_offset;
        /* disk bytenr of the old extent */
        u64 bytenr;
        /* start of the overlap, relative to the old item's key.offset */
        u64 offset;
        /* length of the overlap with the new range */
        u64 len;
        /* number of backrefs recorded against this old extent */
        int count;
};
2250
/*
 * The extent written by defrag, together with everything needed to relink
 * other references onto it.  Built by record_old_file_extents() from an
 * ordered extent and released by free_sa_defrag_extent().
 */
struct new_sa_defrag_extent {
        /* rb-tree of struct sa_defrag_extent_backref */
        struct rb_root root;
        /* list of struct old_sa_defrag_extent */
        struct list_head head;
        /* scratch path, set by record_extent_backrefs() for its callback */
        struct btrfs_path *path;
        /* the defragged inode */
        struct inode *inode;
        /* file offset of the new extent (ordered->file_offset) */
        u64 file_pos;
        /* logical length of the new extent (ordered->len) */
        u64 len;
        /* disk bytenr of the new extent (ordered->start) */
        u64 bytenr;
        /* on-disk length of the new extent (ordered->disk_len) */
        u64 disk_len;
        /* compression type of the new extent (ordered->compress_type) */
        u8 compress_type;
};
2262
2263 static int backref_comp(struct sa_defrag_extent_backref *b1,
2264                         struct sa_defrag_extent_backref *b2)
2265 {
2266         if (b1->root_id < b2->root_id)
2267                 return -1;
2268         else if (b1->root_id > b2->root_id)
2269                 return 1;
2270
2271         if (b1->inum < b2->inum)
2272                 return -1;
2273         else if (b1->inum > b2->inum)
2274                 return 1;
2275
2276         if (b1->file_pos < b2->file_pos)
2277                 return -1;
2278         else if (b1->file_pos > b2->file_pos)
2279                 return 1;
2280
2281         /*
2282          * [------------------------------] ===> (a range of space)
2283          *     |<--->|   |<---->| =============> (fs/file tree A)
2284          * |<---------------------------->| ===> (fs/file tree B)
2285          *
2286          * A range of space can refer to two file extents in one tree while
2287          * refer to only one file extent in another tree.
2288          *
2289          * So we may process a disk offset more than one time(two extents in A)
2290          * and locate at the same extent(one extent in B), then insert two same
2291          * backrefs(both refer to the extent in B).
2292          */
2293         return 0;
2294 }
2295
2296 static void backref_insert(struct rb_root *root,
2297                            struct sa_defrag_extent_backref *backref)
2298 {
2299         struct rb_node **p = &root->rb_node;
2300         struct rb_node *parent = NULL;
2301         struct sa_defrag_extent_backref *entry;
2302         int ret;
2303
2304         while (*p) {
2305                 parent = *p;
2306                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2307
2308                 ret = backref_comp(backref, entry);
2309                 if (ret < 0)
2310                         p = &(*p)->rb_left;
2311                 else
2312                         p = &(*p)->rb_right;
2313         }
2314
2315         rb_link_node(&backref->node, parent, p);
2316         rb_insert_color(&backref->node, root);
2317 }
2318
2319 /*
2320  * Note the backref might has changed, and in this case we just return 0.
2321  */
2322 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2323                                        void *ctx)
2324 {
2325         struct btrfs_file_extent_item *extent;
2326         struct old_sa_defrag_extent *old = ctx;
2327         struct new_sa_defrag_extent *new = old->new;
2328         struct btrfs_path *path = new->path;
2329         struct btrfs_key key;
2330         struct btrfs_root *root;
2331         struct sa_defrag_extent_backref *backref;
2332         struct extent_buffer *leaf;
2333         struct inode *inode = new->inode;
2334         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2335         int slot;
2336         int ret;
2337         u64 extent_offset;
2338         u64 num_bytes;
2339
2340         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2341             inum == btrfs_ino(BTRFS_I(inode)))
2342                 return 0;
2343
2344         key.objectid = root_id;
2345         key.type = BTRFS_ROOT_ITEM_KEY;
2346         key.offset = (u64)-1;
2347
2348         root = btrfs_read_fs_root_no_name(fs_info, &key);
2349         if (IS_ERR(root)) {
2350                 if (PTR_ERR(root) == -ENOENT)
2351                         return 0;
2352                 WARN_ON(1);
2353                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2354                          inum, offset, root_id);
2355                 return PTR_ERR(root);
2356         }
2357
2358         key.objectid = inum;
2359         key.type = BTRFS_EXTENT_DATA_KEY;
2360         if (offset > (u64)-1 << 32)
2361                 key.offset = 0;
2362         else
2363                 key.offset = offset;
2364
2365         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2366         if (WARN_ON(ret < 0))
2367                 return ret;
2368         ret = 0;
2369
2370         while (1) {
2371                 cond_resched();
2372
2373                 leaf = path->nodes[0];
2374                 slot = path->slots[0];
2375
2376                 if (slot >= btrfs_header_nritems(leaf)) {
2377                         ret = btrfs_next_leaf(root, path);
2378                         if (ret < 0) {
2379                                 goto out;
2380                         } else if (ret > 0) {
2381                                 ret = 0;
2382                                 goto out;
2383                         }
2384                         continue;
2385                 }
2386
2387                 path->slots[0]++;
2388
2389                 btrfs_item_key_to_cpu(leaf, &key, slot);
2390
2391                 if (key.objectid > inum)
2392                         goto out;
2393
2394                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2395                         continue;
2396
2397                 extent = btrfs_item_ptr(leaf, slot,
2398                                         struct btrfs_file_extent_item);
2399
2400                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2401                         continue;
2402
2403                 /*
2404                  * 'offset' refers to the exact key.offset,
2405                  * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2406                  * (key.offset - extent_offset).
2407                  */
2408                 if (key.offset != offset)
2409                         continue;
2410
2411                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2412                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2413
2414                 if (extent_offset >= old->extent_offset + old->offset +
2415                     old->len || extent_offset + num_bytes <=
2416                     old->extent_offset + old->offset)
2417                         continue;
2418                 break;
2419         }
2420
2421         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2422         if (!backref) {
2423                 ret = -ENOENT;
2424                 goto out;
2425         }
2426
2427         backref->root_id = root_id;
2428         backref->inum = inum;
2429         backref->file_pos = offset;
2430         backref->num_bytes = num_bytes;
2431         backref->extent_offset = extent_offset;
2432         backref->generation = btrfs_file_extent_generation(leaf, extent);
2433         backref->old = old;
2434         backref_insert(&new->root, backref);
2435         old->count++;
2436 out:
2437         btrfs_release_path(path);
2438         WARN_ON(ret);
2439         return ret;
2440 }
2441
2442 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2443                                    struct new_sa_defrag_extent *new)
2444 {
2445         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2446         struct old_sa_defrag_extent *old, *tmp;
2447         int ret;
2448
2449         new->path = path;
2450
2451         list_for_each_entry_safe(old, tmp, &new->head, list) {
2452                 ret = iterate_inodes_from_logical(old->bytenr +
2453                                                   old->extent_offset, fs_info,
2454                                                   path, record_one_backref,
2455                                                   old);
2456                 if (ret < 0 && ret != -ENOENT)
2457                         return false;
2458
2459                 /* no backref to be processed for this extent */
2460                 if (!old->count) {
2461                         list_del(&old->list);
2462                         kfree(old);
2463                 }
2464         }
2465
2466         if (list_empty(&new->head))
2467                 return false;
2468
2469         return true;
2470 }
2471
2472 static int relink_is_mergable(struct extent_buffer *leaf,
2473                               struct btrfs_file_extent_item *fi,
2474                               struct new_sa_defrag_extent *new)
2475 {
2476         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2477                 return 0;
2478
2479         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2480                 return 0;
2481
2482         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2483                 return 0;
2484
2485         if (btrfs_file_extent_encryption(leaf, fi) ||
2486             btrfs_file_extent_other_encoding(leaf, fi))
2487                 return 0;
2488
2489         return 1;
2490 }
2491
/*
 * Point one recorded reference (@backref) at the new, defragged extent.
 *
 * @path:    scratch path, released before returning
 * @prev:    the backref handled just before this one, or NULL; when it
 *           ends exactly where @backref starts in the same root/inode we
 *           first try to extend the previous file extent item instead of
 *           inserting a new one
 * @backref: the reference to relink
 *
 * Note that the backref might have changed since it was recorded; in
 * that case nothing is done and 0 is returned.
 *
 * Returns 1 when the reference was relinked (merged or inserted), 0 when
 * it was skipped, or a negative errno.
 */
static noinline int relink_extent_backref(struct btrfs_path *path,
                                 struct sa_defrag_extent_backref *prev,
                                 struct sa_defrag_extent_backref *backref)
{
        struct btrfs_file_extent_item *extent;
        struct btrfs_file_extent_item *item;
        struct btrfs_ordered_extent *ordered;
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct old_sa_defrag_extent *old = backref->old;
        struct new_sa_defrag_extent *new = old->new;
        struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
        struct inode *inode;
        struct extent_state *cached = NULL;
        int ret = 0;
        u64 start;
        u64 len;
        u64 lock_start;
        u64 lock_end;
        bool merge = false;
        int index;

        /* Contiguous with the previous backref in the same file? */
        if (prev && prev->root_id == backref->root_id &&
            prev->inum == backref->inum &&
            prev->file_pos + prev->num_bytes == backref->file_pos)
                merge = true;

        /* step 1: get root */
        key.objectid = backref->root_id;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;

        /* subvol_srcu is held across the root and inode lookups */
        index = srcu_read_lock(&fs_info->subvol_srcu);

        root = btrfs_read_fs_root_no_name(fs_info, &key);
        if (IS_ERR(root)) {
                srcu_read_unlock(&fs_info->subvol_srcu, index);
                /* a root that is gone is not an error, just skip it */
                if (PTR_ERR(root) == -ENOENT)
                        return 0;
                return PTR_ERR(root);
        }

        /* read-only roots are left untouched */
        if (btrfs_root_readonly(root)) {
                srcu_read_unlock(&fs_info->subvol_srcu, index);
                return 0;
        }

        /* step 2: get inode */
        key.objectid = backref->inum;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
        if (IS_ERR(inode)) {
                /* any inode lookup failure is treated as "skip" */
                srcu_read_unlock(&fs_info->subvol_srcu, index);
                return 0;
        }

        srcu_read_unlock(&fs_info->subvol_srcu, index);

        /* step 3: relink backref */
        lock_start = backref->file_pos;
        lock_end = backref->file_pos + backref->num_bytes - 1;
        lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
                         &cached);

        /* an ordered extent was found for this inode: leave it alone */
        ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
        if (ordered) {
                btrfs_put_ordered_extent(ordered);
                goto out_unlock;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_unlock;
        }

        key.objectid = backref->inum;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = backref->file_pos;

        /* re-read the item to make sure it is still what we recorded */
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0) {
                goto out_free_path;
        } else if (ret > 0) {
                /* the item no longer exists: skip */
                ret = 0;
                goto out_free_path;
        }

        extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                struct btrfs_file_extent_item);

        /* generation changed: the item was rewritten, skip (ret is 0) */
        if (btrfs_file_extent_generation(path->nodes[0], extent) !=
            backref->generation)
                goto out_free_path;

        btrfs_release_path(path);

        /*
         * Compute the file range [start, start + len) of this reference
         * that actually overlaps the defragged part of the old extent.
         */
        start = backref->file_pos;
        if (backref->extent_offset < old->extent_offset + old->offset)
                start += old->extent_offset + old->offset -
                         backref->extent_offset;

        len = min(backref->extent_offset + backref->num_bytes,
                  old->extent_offset + old->offset + old->len);
        len -= max(backref->extent_offset, old->extent_offset + old->offset);

        ret = btrfs_drop_extents(trans, root, inode, start,
                                 start + len, 1);
        if (ret)
                goto out_free_path;
again:
        key.objectid = btrfs_ino(BTRFS_I(inode));
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = start;

        path->leave_spinning = 1;
        if (merge) {
                struct btrfs_file_extent_item *fi;
                u64 extent_len;
                struct btrfs_key found_key;

                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
                if (ret < 0)
                        goto out_free_path;

                /*
                 * Step back to the item preceding 'start'.
                 * NOTE(review): slots[0] is decremented without checking
                 * that it is > 0 -- confirm this cannot underflow.
                 */
                path->slots[0]--;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                extent_len = btrfs_file_extent_num_bytes(leaf, fi);

                /* previous item ends at 'start' and is compatible: grow it */
                if (extent_len + found_key.offset == start &&
                    relink_is_mergable(leaf, fi, new)) {
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_len + len);
                        btrfs_mark_buffer_dirty(leaf);
                        inode_add_bytes(inode, len);

                        ret = 1;
                        goto out_free_path;
                } else {
                        /* cannot merge: retry as a plain insertion */
                        merge = false;
                        btrfs_release_path(path);
                        goto again;
                }
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                        sizeof(*extent));
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        /* fill in a new regular file extent item pointing at 'new' */
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_file_extent_item);
        btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
        btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
        btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
        btrfs_set_file_extent_num_bytes(leaf, item, len);
        btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
        btrfs_set_file_extent_generation(leaf, item, trans->transid);
        btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
        btrfs_set_file_extent_compression(leaf, item, new->compress_type);
        btrfs_set_file_extent_encryption(leaf, item, 0);
        btrfs_set_file_extent_other_encoding(leaf, item, 0);

        btrfs_mark_buffer_dirty(leaf);
        inode_add_bytes(inode, len);
        btrfs_release_path(path);

        /* the new item is one more reference on the new extent */
        ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr,
                        new->disk_len, 0,
                        backref->root_id, backref->inum,
                        new->file_pos); /* start - extent_offset */
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        ret = 1;
out_free_path:
        btrfs_release_path(path);
        path->leave_spinning = 0;
        btrfs_end_transaction(trans);
out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
                             &cached, GFP_NOFS);
        iput(inode);
        return ret;
}
2693
2694 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2695 {
2696         struct old_sa_defrag_extent *old, *tmp;
2697
2698         if (!new)
2699                 return;
2700
2701         list_for_each_entry_safe(old, tmp, &new->head, list) {
2702                 kfree(old);
2703         }
2704         kfree(new);
2705 }
2706
2707 static void relink_file_extents(struct new_sa_defrag_extent *new)
2708 {
2709         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2710         struct btrfs_path *path;
2711         struct sa_defrag_extent_backref *backref;
2712         struct sa_defrag_extent_backref *prev = NULL;
2713         struct inode *inode;
2714         struct btrfs_root *root;
2715         struct rb_node *node;
2716         int ret;
2717
2718         inode = new->inode;
2719         root = BTRFS_I(inode)->root;
2720
2721         path = btrfs_alloc_path();
2722         if (!path)
2723                 return;
2724
2725         if (!record_extent_backrefs(path, new)) {
2726                 btrfs_free_path(path);
2727                 goto out;
2728         }
2729         btrfs_release_path(path);
2730
2731         while (1) {
2732                 node = rb_first(&new->root);
2733                 if (!node)
2734                         break;
2735                 rb_erase(node, &new->root);
2736
2737                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2738
2739                 ret = relink_extent_backref(path, prev, backref);
2740                 WARN_ON(ret < 0);
2741
2742                 kfree(prev);
2743
2744                 if (ret == 1)
2745                         prev = backref;
2746                 else
2747                         prev = NULL;
2748                 cond_resched();
2749         }
2750         kfree(prev);
2751
2752         btrfs_free_path(path);
2753 out:
2754         free_sa_defrag_extent(new);
2755
2756         atomic_dec(&fs_info->defrag_running);
2757         wake_up(&fs_info->transaction_wait);
2758 }
2759
2760 static struct new_sa_defrag_extent *
2761 record_old_file_extents(struct inode *inode,
2762                         struct btrfs_ordered_extent *ordered)
2763 {
2764         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2765         struct btrfs_root *root = BTRFS_I(inode)->root;
2766         struct btrfs_path *path;
2767         struct btrfs_key key;
2768         struct old_sa_defrag_extent *old;
2769         struct new_sa_defrag_extent *new;
2770         int ret;
2771
2772         new = kmalloc(sizeof(*new), GFP_NOFS);
2773         if (!new)
2774                 return NULL;
2775
2776         new->inode = inode;
2777         new->file_pos = ordered->file_offset;
2778         new->len = ordered->len;
2779         new->bytenr = ordered->start;
2780         new->disk_len = ordered->disk_len;
2781         new->compress_type = ordered->compress_type;
2782         new->root = RB_ROOT;
2783         INIT_LIST_HEAD(&new->head);
2784
2785         path = btrfs_alloc_path();
2786         if (!path)
2787                 goto out_kfree;
2788
2789         key.objectid = btrfs_ino(BTRFS_I(inode));
2790         key.type = BTRFS_EXTENT_DATA_KEY;
2791         key.offset = new->file_pos;
2792
2793         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2794         if (ret < 0)
2795                 goto out_free_path;
2796         if (ret > 0 && path->slots[0] > 0)
2797                 path->slots[0]--;
2798
2799         /* find out all the old extents for the file range */
2800         while (1) {
2801                 struct btrfs_file_extent_item *extent;
2802                 struct extent_buffer *l;
2803                 int slot;
2804                 u64 num_bytes;
2805                 u64 offset;
2806                 u64 end;
2807                 u64 disk_bytenr;
2808                 u64 extent_offset;
2809
2810                 l = path->nodes[0];
2811                 slot = path->slots[0];
2812
2813                 if (slot >= btrfs_header_nritems(l)) {
2814                         ret = btrfs_next_leaf(root, path);
2815                         if (ret < 0)
2816                                 goto out_free_path;
2817                         else if (ret > 0)
2818                                 break;
2819                         continue;
2820                 }
2821
2822                 btrfs_item_key_to_cpu(l, &key, slot);
2823
2824                 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2825                         break;
2826                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2827                         break;
2828                 if (key.offset >= new->file_pos + new->len)
2829                         break;
2830
2831                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2832
2833                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2834                 if (key.offset + num_bytes < new->file_pos)
2835                         goto next;
2836
2837                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2838                 if (!disk_bytenr)
2839                         goto next;
2840
2841                 extent_offset = btrfs_file_extent_offset(l, extent);
2842
2843                 old = kmalloc(sizeof(*old), GFP_NOFS);
2844                 if (!old)
2845                         goto out_free_path;
2846
2847                 offset = max(new->file_pos, key.offset);
2848                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2849
2850                 old->bytenr = disk_bytenr;
2851                 old->extent_offset = extent_offset;
2852                 old->offset = offset - key.offset;
2853                 old->len = end - offset;
2854                 old->new = new;
2855                 old->count = 0;
2856                 list_add_tail(&old->list, &new->head);
2857 next:
2858                 path->slots[0]++;
2859                 cond_resched();
2860         }
2861
2862         btrfs_free_path(path);
2863         atomic_inc(&fs_info->defrag_running);
2864
2865         return new;
2866
2867 out_free_path:
2868         btrfs_free_path(path);
2869 out_kfree:
2870         free_sa_defrag_extent(new);
2871         return NULL;
2872 }
2873
2874 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2875                                          u64 start, u64 len)
2876 {
2877         struct btrfs_block_group_cache *cache;
2878
2879         cache = btrfs_lookup_block_group(fs_info, start);
2880         ASSERT(cache);
2881
2882         spin_lock(&cache->lock);
2883         cache->delalloc_bytes -= len;
2884         spin_unlock(&cache->lock);
2885
2886         btrfs_put_block_group(cache);
2887 }
2888
2889 /* as ordered data IO finishes, this gets called so we can finish
2890  * an ordered extent if the range of bytes in the file it covers are
2891  * fully written.
2892  */
2893 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2894 {
2895         struct inode *inode = ordered_extent->inode;
2896         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2897         struct btrfs_root *root = BTRFS_I(inode)->root;
2898         struct btrfs_trans_handle *trans = NULL;
2899         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2900         struct extent_state *cached_state = NULL;
2901         struct new_sa_defrag_extent *new = NULL;
2902         int compress_type = 0;
2903         int ret = 0;
2904         u64 logical_len = ordered_extent->len;
2905         bool nolock;
2906         bool truncated = false;
2907         bool range_locked = false;
2908         bool clear_new_delalloc_bytes = false;
2909
2910         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2911             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2912             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2913                 clear_new_delalloc_bytes = true;
2914
2915         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2916
2917         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2918                 ret = -EIO;
2919                 goto out;
2920         }
2921
2922         btrfs_free_io_failure_record(BTRFS_I(inode),
2923                         ordered_extent->file_offset,
2924                         ordered_extent->file_offset +
2925                         ordered_extent->len - 1);
2926
2927         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2928                 truncated = true;
2929                 logical_len = ordered_extent->truncated_len;
2930                 /* Truncated the entire extent, don't bother adding */
2931                 if (!logical_len)
2932                         goto out;
2933         }
2934
2935         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2936                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2937
2938                 /*
2939                  * For mwrite(mmap + memset to write) case, we still reserve
2940                  * space for NOCOW range.
2941                  * As NOCOW won't cause a new delayed ref, just free the space
2942                  */
2943                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2944                                        ordered_extent->len);
2945                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2946                 if (nolock)
2947                         trans = btrfs_join_transaction_nolock(root);
2948                 else
2949                         trans = btrfs_join_transaction(root);
2950                 if (IS_ERR(trans)) {
2951                         ret = PTR_ERR(trans);
2952                         trans = NULL;
2953                         goto out;
2954                 }
2955                 trans->block_rsv = &fs_info->delalloc_block_rsv;
2956                 ret = btrfs_update_inode_fallback(trans, root, inode);
2957                 if (ret) /* -ENOMEM or corruption */
2958                         btrfs_abort_transaction(trans, ret);
2959                 goto out;
2960         }
2961
2962         range_locked = true;
2963         lock_extent_bits(io_tree, ordered_extent->file_offset,
2964                          ordered_extent->file_offset + ordered_extent->len - 1,
2965                          &cached_state);
2966
2967         ret = test_range_bit(io_tree, ordered_extent->file_offset,
2968                         ordered_extent->file_offset + ordered_extent->len - 1,
2969                         EXTENT_DEFRAG, 0, cached_state);
2970         if (ret) {
2971                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2972                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2973                         /* the inode is shared */
2974                         new = record_old_file_extents(inode, ordered_extent);
2975
2976                 clear_extent_bit(io_tree, ordered_extent->file_offset,
2977                         ordered_extent->file_offset + ordered_extent->len - 1,
2978                         EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2979         }
2980
2981         if (nolock)
2982                 trans = btrfs_join_transaction_nolock(root);
2983         else
2984                 trans = btrfs_join_transaction(root);
2985         if (IS_ERR(trans)) {
2986                 ret = PTR_ERR(trans);
2987                 trans = NULL;
2988                 goto out;
2989         }
2990
2991         trans->block_rsv = &fs_info->delalloc_block_rsv;
2992
2993         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2994                 compress_type = ordered_extent->compress_type;
2995         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2996                 BUG_ON(compress_type);
2997                 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
2998                                                 ordered_extent->file_offset,
2999                                                 ordered_extent->file_offset +
3000                                                 logical_len);
3001         } else {
3002                 BUG_ON(root == fs_info->tree_root);
3003                 ret = insert_reserved_file_extent(trans, inode,
3004                                                 ordered_extent->file_offset,
3005                                                 ordered_extent->start,
3006                                                 ordered_extent->disk_len,
3007                                                 logical_len, logical_len,
3008                                                 compress_type, 0, 0,
3009                                                 BTRFS_FILE_EXTENT_REG);
3010                 if (!ret)
3011                         btrfs_release_delalloc_bytes(fs_info,
3012                                                      ordered_extent->start,
3013                                                      ordered_extent->disk_len);
3014         }
3015         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3016                            ordered_extent->file_offset, ordered_extent->len,
3017                            trans->transid);
3018         if (ret < 0) {
3019                 btrfs_abort_transaction(trans, ret);
3020                 goto out;
3021         }
3022
3023         add_pending_csums(trans, inode, &ordered_extent->list);
3024
3025         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3026         ret = btrfs_update_inode_fallback(trans, root, inode);
3027         if (ret) { /* -ENOMEM or corruption */
3028                 btrfs_abort_transaction(trans, ret);
3029                 goto out;
3030         }
3031         ret = 0;
3032 out:
3033         if (range_locked || clear_new_delalloc_bytes) {
3034                 unsigned int clear_bits = 0;
3035
3036                 if (range_locked)
3037                         clear_bits |= EXTENT_LOCKED;
3038                 if (clear_new_delalloc_bytes)
3039                         clear_bits |= EXTENT_DELALLOC_NEW;
3040                 clear_extent_bit(&BTRFS_I(inode)->io_tree,
3041                                  ordered_extent->file_offset,
3042                                  ordered_extent->file_offset +
3043                                  ordered_extent->len - 1,
3044                                  clear_bits,
3045                                  (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3046                                  0, &cached_state, GFP_NOFS);
3047         }
3048
3049         if (root != fs_info->tree_root)
3050                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3051                                 ordered_extent->len);
3052         if (trans)
3053                 btrfs_end_transaction(trans);
3054
3055         if (ret || truncated) {
3056                 u64 start, end;
3057
3058                 if (truncated)
3059                         start = ordered_extent->file_offset + logical_len;
3060                 else
3061                         start = ordered_extent->file_offset;
3062                 end = ordered_extent->file_offset + ordered_extent->len - 1;
3063                 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
3064
3065                 /* Drop the cache for the part of the extent we didn't write. */
3066                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3067
3068                 /*
3069                  * If the ordered extent had an IOERR or something else went
3070                  * wrong we need to return the space for this ordered extent
3071