/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

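/*
 * Arguments used to look up and initialize an inode in the inode cache:
 * the key identifying the inode and the root it lives in.
 */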
struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

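/*
 * State tracked across a direct IO write: the remaining reserved data
 * space and the subrange for which ordered extents have not yet been
 * submitted.
 */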
struct btrfs_dio_data {
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
	int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

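/* slab caches for btrfs inodes, transaction handles, paths and free space entries */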
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

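/*
 * Map the S_IFMT bits of an inode's i_mode to the BTRFS_FT_* file type
 * stored in directory items, e.g. btrfs_type_by_mode[S_IFREG >> S_SHIFT]
 * is BTRFS_FT_REG_FILE.
 */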
#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
				       u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the fill_delalloc() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
						 const u64 offset,
						 const u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		index++;
		if (!page)
			continue;
		ClearPagePrivate2(page);
		put_page(page);
	}
	return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
					    bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

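/*
 * Initialize the security context of a newly created inode: set up the
 * inherited ACLs first, then the security xattrs.
 */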
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

fail:
	return ret;
}

/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
					  struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &BTRFS_I(inode)->block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space; an inlined extent
	 * doesn't count as a data extent, so free the reservation
	 * directly here.  At reserve time it is always aligned to the
	 * page size, so just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

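/*
 * A single extent produced by the compression phase, queued for the
 * submission phase.  For an uncompressed fallback, pages is NULL.
 */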
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

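/*
 * One chunk of a delalloc range being processed asynchronously: the
 * compression phase fills @extents, the submission phase drains it.
 */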
struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

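/*
 * Decide whether a delalloc range should be compressed, honouring the
 * force-compress mount option, per-inode flags and properties, and the
 * compression heuristic.
 */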
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (BTRFS_I(inode)->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 blocksize = fs_info->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode, start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 */
		extent_range_clear_dirty_for_io(inode, start, end);
		redirty = 1;

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_SIZE - 1);
			struct page *page = pages[nr_pages - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* let's try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(root, inode, start, end,
					    0, BTRFS_COMPRESS_NONE, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(root, inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			goto free_pages_out;
		}
	}


	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression really is a
		 * win: compare the page count read with the blocks on disk;
		 * compression must free at least one sector
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed + blocksize <= total_in) {
			*num_added += 1;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_cow, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);
		/* unlocked later on in the async handlers */

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages; i++) {
		WARN_ON(pages[i]->mapping);
		put_page(pages[i]);
	}
	kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
					 async_extent->start +
					 async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fall back to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		em = create_io_em(inode, async_extent->start,
				  async_extent->ram_size, /* len */
				  async_extent->start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  async_extent->ram_size, /* ram_bytes */
				  async_extent->compress_type,
				  BTRFS_ORDERED_COMPRESSED);
		if (IS_ERR(em))
			/* ret value is not necessary due to void function */
			goto out_free_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(BTRFS_I(inode),
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		if (btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages,
				    async_cow->write_flags)) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

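/*
 * Pick a disk block allocation hint for @start by looking at nearby
 * extent mappings of the inode.
 */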
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size = 0;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;

	inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

	if (start == 0) {
		/* let's try to make an inline extent */
		ret = cow_file_range_inline(root, inode, start, end, 0,
					BTRFS_COMPRESS_NONE, NULL);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
				     delalloc_end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(BTRFS_I(inode), start,
			start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   fs_info->sectorsize, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset, /* len */
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  BTRFS_ORDERED_REGULAR /* type */);
		if (IS_ERR(em))
			goto out_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_cache(BTRFS_I(inode), start,
						start + ram_size - 1, 0);
		}

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		page_ops = unlock ? PAGE_UNLOCK : 0;
		page_ops |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1,
					     delalloc_end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		if (disk_num_bytes < cur_alloc_size)
			disk_num_bytes = 0;
		else
			disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * btrfs_reloc_clone_csums() error, since start is increased
		 * extent_clear_unlock_delalloc() at out_unlock label won't
		 * free metadata of current ordered extent, we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
		PAGE_END_WRITEBACK;
	/*
	 * If we reserved an extent for our delalloc range (or a subrange) and
	 * failed to create the respective ordered extent, then it means that
	 * when we reserved the extent we decremented the extent's size from
	 * the data space_info's bytes_may_use counter and incremented the
	 * space_info's bytes_reserved counter by the same amount. We must make
	 * sure extent_clear_unlock_delalloc() does not try to decrement again
	 * the data space_info's bytes_may_use counter, therefore we do not pass
	 * it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size,
					     start + cur_alloc_size,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
		if (start >= end)
			goto out;
	}
	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
				     locked_page,
				     clear_bits | EXTENT_CLEAR_DATA_RESV,
				     page_ops);
	goto out;
}

/*
 * work queue callback to start compression on a range of pages in a file
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
	}
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	fs_info = root->fs_info;
	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/*
	 * atomic_sub_return implies a barrier for waitqueue_active
	 */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M &&
	    waitqueue_active(&fs_info->async_submit_wait))
		wake_up(&fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

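/*
 * work queue callback to free an async_cow once both phases are done,
 * dropping the delayed inode reference if it is still held
 */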
static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);
	kfree(async_cow);
}

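/*
 * Split a delalloc range into chunks (at most 512K each when compression
 * may be attempted) and queue each chunk as async work; compression runs
 * in async_cow_start and IO submission in async_cow_submit.
 */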
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written,
				unsigned int write_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;
		async_cow->write_flags = write_flags;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
		    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
			cur_end = end;
		else
			cur_end = min(end, start + SZ_512K - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		btrfs_init_work(&async_cow->work,
				btrfs_delalloc_helper,
				async_cow_start, async_cow_submit,
				async_cow_free);

		nr_pages = (cur_end - start + PAGE_SIZE) >>
			PAGE_SHIFT;
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

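/*
 * Return 0 when no checksums exist for the byte range, nonzero otherwise;
 * a nonzero result forces the caller down the COW path.
 */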
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}
1262
1263 /*
1264  * Called for nocow writeback.  This checks for snapshots or COW copies
1265  * of the extents that exist in the file, and COWs the file as required.
1266  *
1267  * If no COW copies or snapshots exist, we write directly to the existing
1268  * blocks on disk.
1269  */
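     /*
      * Sketch of the walk below: file extent items covering [start, end] are
      * scanned in order; ranges that must be COWed are batched by remembering
      * the first such offset in cow_start and flushing the batch through
      * cow_file_range() once a nocow-capable extent (or the end of the range)
      * is reached.
      */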
1270 static noinline int run_delalloc_nocow(struct inode *inode,
1271                                        struct page *locked_page,
1272                               u64 start, u64 end, int *page_started, int force,
1273                               unsigned long *nr_written)
1274 {
1275         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1276         struct btrfs_root *root = BTRFS_I(inode)->root;
1277         struct extent_buffer *leaf;
1278         struct btrfs_path *path;
1279         struct btrfs_file_extent_item *fi;
1280         struct btrfs_key found_key;
1281         struct extent_map *em;
1282         u64 cow_start;
1283         u64 cur_offset;
1284         u64 extent_end;
1285         u64 extent_offset;
1286         u64 disk_bytenr;
1287         u64 num_bytes;
1288         u64 disk_num_bytes;
1289         u64 ram_bytes;
1290         int extent_type;
1291         int ret, err;
1292         int type;
1293         int nocow;
1294         int check_prev = 1;
1295         bool nolock;
1296         u64 ino = btrfs_ino(BTRFS_I(inode));
1297
1298         path = btrfs_alloc_path();
1299         if (!path) {
1300                 extent_clear_unlock_delalloc(inode, start, end, end,
1301                                              locked_page,
1302                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1303                                              EXTENT_DO_ACCOUNTING |
1304                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1305                                              PAGE_CLEAR_DIRTY |
1306                                              PAGE_SET_WRITEBACK |
1307                                              PAGE_END_WRITEBACK);
1308                 return -ENOMEM;
1309         }
1310
1311         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1312
1313         cow_start = (u64)-1;
1314         cur_offset = start;
1315         while (1) {
1316                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1317                                                cur_offset, 0);
1318                 if (ret < 0)
1319                         goto error;
1320                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1321                         leaf = path->nodes[0];
1322                         btrfs_item_key_to_cpu(leaf, &found_key,
1323                                               path->slots[0] - 1);
1324                         if (found_key.objectid == ino &&
1325                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1326                                 path->slots[0]--;
1327                 }
1328                 check_prev = 0;
1329 next_slot:
1330                 leaf = path->nodes[0];
1331                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1332                         ret = btrfs_next_leaf(root, path);
1333                         if (ret < 0)
1334                                 goto error;
1335                         if (ret > 0)
1336                                 break;
1337                         leaf = path->nodes[0];
1338                 }
1339
1340                 nocow = 0;
1341                 disk_bytenr = 0;
1342                 num_bytes = 0;
1343                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1344
1345                 if (found_key.objectid > ino)
1346                         break;
1347                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1348                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1349                         path->slots[0]++;
1350                         goto next_slot;
1351                 }
1352                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1353                     found_key.offset > end)
1354                         break;
1355
1356                 if (found_key.offset > cur_offset) {
1357                         extent_end = found_key.offset;
1358                         extent_type = 0;
1359                         goto out_check;
1360                 }
1361
1362                 fi = btrfs_item_ptr(leaf, path->slots[0],
1363                                     struct btrfs_file_extent_item);
1364                 extent_type = btrfs_file_extent_type(leaf, fi);
1365
1366                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1367                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1368                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1369                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1370                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1371                         extent_end = found_key.offset +
1372                                 btrfs_file_extent_num_bytes(leaf, fi);
1373                         disk_num_bytes =
1374                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1375                         if (extent_end <= start) {
1376                                 path->slots[0]++;
1377                                 goto next_slot;
1378                         }
1379                         if (disk_bytenr == 0)
1380                                 goto out_check;
1381                         if (btrfs_file_extent_compression(leaf, fi) ||
1382                             btrfs_file_extent_encryption(leaf, fi) ||
1383                             btrfs_file_extent_other_encoding(leaf, fi))
1384                                 goto out_check;
1385                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1386                                 goto out_check;
1387                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1388                                 goto out_check;
1389                         if (btrfs_cross_ref_exist(root, ino,
1390                                                   found_key.offset -
1391                                                   extent_offset, disk_bytenr))
1392                                 goto out_check;
1393                         disk_bytenr += extent_offset;
1394                         disk_bytenr += cur_offset - found_key.offset;
1395                         num_bytes = min(end + 1, extent_end) - cur_offset;
1396                         /*
1397                          * If there are pending snapshots for this root,
1398                          * fall back to the common COW path.
1399                          */
1400                         if (!nolock) {
1401                                 err = btrfs_start_write_no_snapshotting(root);
1402                                 if (!err)
1403                                         goto out_check;
1404                         }
1405                         /*
1406                          * Force COW if csums exist in the range.  This
1407                          * ensures that the csums for a given extent are
1408                          * either all valid or do not exist at all.
1409                          */
1410                         if (csum_exist_in_range(fs_info, disk_bytenr,
1411                                                 num_bytes)) {
1412                                 if (!nolock)
1413                                         btrfs_end_write_no_snapshotting(root);
1414                                 goto out_check;
1415                         }
1416                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1417                                 if (!nolock)
1418                                         btrfs_end_write_no_snapshotting(root);
1419                                 goto out_check;
1420                         }
1421                         nocow = 1;
1422                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1423                         extent_end = found_key.offset +
1424                                 btrfs_file_extent_inline_len(leaf,
1425                                                      path->slots[0], fi);
1426                         extent_end = ALIGN(extent_end,
1427                                            fs_info->sectorsize);
1428                 } else {
1429                         BUG_ON(1);
1430                 }
1431 out_check:
1432                 if (extent_end <= start) {
1433                         path->slots[0]++;
1434                         if (!nolock && nocow)
1435                                 btrfs_end_write_no_snapshotting(root);
1436                         if (nocow)
1437                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1438                         goto next_slot;
1439                 }
1440                 if (!nocow) {
1441                         if (cow_start == (u64)-1)
1442                                 cow_start = cur_offset;
1443                         cur_offset = extent_end;
1444                         if (cur_offset > end)
1445                                 break;
1446                         path->slots[0]++;
1447                         goto next_slot;
1448                 }
1449
1450                 btrfs_release_path(path);
1451                 if (cow_start != (u64)-1) {
1452                         ret = cow_file_range(inode, locked_page,
1453                                              cow_start, found_key.offset - 1,
1454                                              end, page_started, nr_written, 1,
1455                                              NULL);
1456                         if (ret) {
1457                                 if (!nolock && nocow)
1458                                         btrfs_end_write_no_snapshotting(root);
1459                                 if (nocow)
1460                                         btrfs_dec_nocow_writers(fs_info,
1461                                                                 disk_bytenr);
1462                                 goto error;
1463                         }
1464                         cow_start = (u64)-1;
1465                 }
1466
1467                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1468                         u64 orig_start = found_key.offset - extent_offset;
1469
1470                         em = create_io_em(inode, cur_offset, num_bytes,
1471                                           orig_start,
1472                                           disk_bytenr, /* block_start */
1473                                           num_bytes, /* block_len */
1474                                           disk_num_bytes, /* orig_block_len */
1475                                           ram_bytes, BTRFS_COMPRESS_NONE,
1476                                           BTRFS_ORDERED_PREALLOC);
1477                         if (IS_ERR(em)) {
1478                                 if (!nolock && nocow)
1479                                         btrfs_end_write_no_snapshotting(root);
1480                                 if (nocow)
1481                                         btrfs_dec_nocow_writers(fs_info,
1482                                                                 disk_bytenr);
1483                                 ret = PTR_ERR(em);
1484                                 goto error;
1485                         }
1486                         free_extent_map(em);
1487                 }
1488
1489                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1490                         type = BTRFS_ORDERED_PREALLOC;
1491                 } else {
1492                         type = BTRFS_ORDERED_NOCOW;
1493                 }
1494
1495                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1496                                                num_bytes, num_bytes, type);
1497                 if (nocow)
1498                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1499                 BUG_ON(ret); /* -ENOMEM */
1500
1501                 if (root->root_key.objectid ==
1502                     BTRFS_DATA_RELOC_TREE_OBJECTID)
1503                         /*
1504                          * The error is handled later, as we must prevent
1505                          * extent_clear_unlock_delalloc() in the error handler
1506                          * from freeing the metadata of the created ordered extent.
1507                          */
1508                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1509                                                       num_bytes);
1510
1511                 extent_clear_unlock_delalloc(inode, cur_offset,
1512                                              cur_offset + num_bytes - 1, end,
1513                                              locked_page, EXTENT_LOCKED |
1514                                              EXTENT_DELALLOC |
1515                                              EXTENT_CLEAR_DATA_RESV,
1516                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1517
1518                 if (!nolock && nocow)
1519                         btrfs_end_write_no_snapshotting(root);
1520                 cur_offset = extent_end;
1521
1522                 /*
1523                  * On btrfs_reloc_clone_csums() error it is now safe to call
1524                  * the error handler, as the metadata for the created ordered
1525                  * extent will only be freed by btrfs_finish_ordered_io().
1526                  */
1527                 if (ret)
1528                         goto error;
1529                 if (cur_offset > end)
1530                         break;
1531         }
1532         btrfs_release_path(path);
1533
1534         if (cur_offset <= end && cow_start == (u64)-1) {
1535                 cow_start = cur_offset;
1536                 cur_offset = end;
1537         }
1538
1539         if (cow_start != (u64)-1) {
1540                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1541                                      page_started, nr_written, 1, NULL);
1542                 if (ret)
1543                         goto error;
1544         }
1545
1546 error:
1547         if (ret && cur_offset < end)
1548                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1549                                              locked_page, EXTENT_LOCKED |
1550                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1551                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1552                                              PAGE_CLEAR_DIRTY |
1553                                              PAGE_SET_WRITEBACK |
1554                                              PAGE_END_WRITEBACK);
1555         btrfs_free_path(path);
1556         return ret;
1557 }
1558
1559 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1560 {
1562         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1563             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1564                 return 0;
1565
1566         /*
1567          * @defrag_bytes is a hint value; no spinlock is held here.
1568          * If it is non-zero, the file is being defragged.
1569          * Force COW if the given range needs to be defragged.
1570          */
1571         if (BTRFS_I(inode)->defrag_bytes &&
1572             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1573                            EXTENT_DEFRAG, 0, NULL))
1574                 return 1;
1575
1576         return 0;
1577 }
1578
1579 /*
1580  * extent_io.c callback that does the delayed allocation processing
1581  */
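     /*
      * Dispatch summary: NODATACOW inodes take run_delalloc_nocow() with
      * force=1 (regular extents may be overwritten in place), PREALLOC-only
      * inodes take it with force=0 (just preallocated extents qualify), data
      * not worth compressing takes the plain cow_file_range() path, and
      * everything else goes through the async compressed path.
      */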
1582 static int run_delalloc_range(void *private_data, struct page *locked_page,
1583                               u64 start, u64 end, int *page_started,
1584                               unsigned long *nr_written,
1585                               struct writeback_control *wbc)
1586 {
1587         struct inode *inode = private_data;
1588         int ret;
1589         int force_cow = need_force_cow(inode, start, end);
1590         unsigned int write_flags = wbc_to_write_flags(wbc);
1591
1592         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1593                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1594                                          page_started, 1, nr_written);
1595         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1596                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1597                                          page_started, 0, nr_written);
1598         } else if (!inode_need_compress(inode, start, end)) {
1599                 ret = cow_file_range(inode, locked_page, start, end, end,
1600                                       page_started, nr_written, 1, NULL);
1601         } else {
1602                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1603                         &BTRFS_I(inode)->runtime_flags);
1604                 ret = cow_file_range_async(inode, locked_page, start, end,
1605                                            page_started, nr_written,
1606                                            write_flags);
1607         }
1608         if (ret)
1609                 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1610         return ret;
1611 }
1612
1613 static void btrfs_split_extent_hook(void *private_data,
1614                                     struct extent_state *orig, u64 split)
1615 {
1616         struct inode *inode = private_data;
1617         u64 size;
1618
1619         /* not delalloc, ignore it */
1620         if (!(orig->state & EXTENT_DELALLOC))
1621                 return;
1622
1623         size = orig->end - orig->start + 1;
1624         if (size > BTRFS_MAX_EXTENT_SIZE) {
1625                 u32 num_extents;
1626                 u64 new_size;
1627
1628                 /*
1629                  * See the explanation in btrfs_merge_extent_hook, the same
1630                  * applies here, just in reverse.
1631                  */
1632                 new_size = orig->end - split + 1;
1633                 num_extents = count_max_extents(new_size);
1634                 new_size = split - orig->start;
1635                 num_extents += count_max_extents(new_size);
1636                 if (count_max_extents(size) >= num_extents)
1637                         return;
1638         }
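             /*
              * Worked example: a delalloc range of exactly
              * BTRFS_MAX_EXTENT_SIZE counts as one outstanding extent, yet
              * the two pieces left by a split each count as one, so we fall
              * through and add one below.  A range of twice that size split
              * down the middle was already charged two extents and the halves
              * still need 1 + 1, so the check above returns early.
              */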
1639
1640         spin_lock(&BTRFS_I(inode)->lock);
1641         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1642         spin_unlock(&BTRFS_I(inode)->lock);
1643 }
1644
1645 /*
1646  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1647  * extents.  It lets us notice when new extents are merged onto old ones,
1648  * as happens during sequential writes, so we can properly account for the
1649  * metadata space we'll need.
1650  */
1651 static void btrfs_merge_extent_hook(void *private_data,
1652                                     struct extent_state *new,
1653                                     struct extent_state *other)
1654 {
1655         struct inode *inode = private_data;
1656         u64 new_size, old_size;
1657         u32 num_extents;
1658
1659         /* not delalloc, ignore it */
1660         if (!(other->state & EXTENT_DELALLOC))
1661                 return;
1662
1663         if (new->start > other->start)
1664                 new_size = new->end - other->start + 1;
1665         else
1666                 new_size = other->end - new->start + 1;
1667
1668         /* we're not bigger than the max, unreserve the space and go */
1669         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1670                 spin_lock(&BTRFS_I(inode)->lock);
1671                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1672                 spin_unlock(&BTRFS_I(inode)->lock);
1673                 return;
1674         }
1675
1676         /*
1677          * We have to add up either side to figure out how many extents were
1678          * accounted for before we merged into one big extent.  If the number of
1679          * extents we accounted for is <= the amount we need for the new range
1680          * then we can return, otherwise drop.  Think of it like this
1681          *
1682          * [ 4k][MAX_SIZE]
1683          *
1684          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1685          * need 2 outstanding extents, on one side we have 1 and the other side
1686          * we have 1 so they are == and we can return.  But in this case
1687          *
1688          * [MAX_SIZE+4k][MAX_SIZE+4k]
1689          *
1690          * Each range on their own accounts for 2 extents, but merged together
1691          * they are only 3 extents worth of accounting, so we need to drop in
1692          * this case.
1693          */
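             /*
              * For the second case above the sums below come to 2 + 2 = 4
              * previously charged extents against 3 needed for the merged
              * range, so exactly one is dropped.  Merging can never save more
              * than one extent, because ceil(a/M) + ceil(b/M) exceeds
              * ceil((a+b)/M) by at most 1.
              */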
1694         old_size = other->end - other->start + 1;
1695         num_extents = count_max_extents(old_size);
1696         old_size = new->end - new->start + 1;
1697         num_extents += count_max_extents(old_size);
1698         if (count_max_extents(new_size) >= num_extents)
1699                 return;
1700
1701         spin_lock(&BTRFS_I(inode)->lock);
1702         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1703         spin_unlock(&BTRFS_I(inode)->lock);
1704 }
1705
1706 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1707                                       struct inode *inode)
1708 {
1709         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1710
1711         spin_lock(&root->delalloc_lock);
1712         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1713                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1714                               &root->delalloc_inodes);
1715                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1716                         &BTRFS_I(inode)->runtime_flags);
1717                 root->nr_delalloc_inodes++;
1718                 if (root->nr_delalloc_inodes == 1) {
1719                         spin_lock(&fs_info->delalloc_root_lock);
1720                         BUG_ON(!list_empty(&root->delalloc_root));
1721                         list_add_tail(&root->delalloc_root,
1722                                       &fs_info->delalloc_roots);
1723                         spin_unlock(&fs_info->delalloc_root_lock);
1724                 }
1725         }
1726         spin_unlock(&root->delalloc_lock);
1727 }
1728
1729 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1730                                      struct btrfs_inode *inode)
1731 {
1732         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1733
1734         spin_lock(&root->delalloc_lock);
1735         if (!list_empty(&inode->delalloc_inodes)) {
1736                 list_del_init(&inode->delalloc_inodes);
1737                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1738                           &inode->runtime_flags);
1739                 root->nr_delalloc_inodes--;
1740                 if (!root->nr_delalloc_inodes) {
1741                         spin_lock(&fs_info->delalloc_root_lock);
1742                         BUG_ON(list_empty(&root->delalloc_root));
1743                         list_del_init(&root->delalloc_root);
1744                         spin_unlock(&fs_info->delalloc_root_lock);
1745                 }
1746         }
1747         spin_unlock(&root->delalloc_lock);
1748 }
1749
1750 /*
1751  * extent_io.c set_bit_hook, used to track delayed allocation
1752  * bytes in this file, and to maintain the list of inodes that
1753  * have pending delalloc work to be done.
1754  */
1755 static void btrfs_set_bit_hook(void *private_data,
1756                                struct extent_state *state, unsigned *bits)
1757 {
1758         struct inode *inode = private_data;
1759
1760         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1761
1762         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1763                 WARN_ON(1);
1764         /*
1765          * set_bit and clear_bit hooks normally require _irqsave/restore
1766          * but in this case, we are only testing for the DELALLOC
1767          * bit, which is only set or cleared with irqs on
1768          */
1769         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1770                 struct btrfs_root *root = BTRFS_I(inode)->root;
1771                 u64 len = state->end + 1 - state->start;
1772                 u32 num_extents = count_max_extents(len);
1773                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1774
1775                 spin_lock(&BTRFS_I(inode)->lock);
1776                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1777                 spin_unlock(&BTRFS_I(inode)->lock);
1778
1779                 /* For sanity tests */
1780                 if (btrfs_is_testing(fs_info))
1781                         return;
1782
1783                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1784                                          fs_info->delalloc_batch);
1785                 spin_lock(&BTRFS_I(inode)->lock);
1786                 BTRFS_I(inode)->delalloc_bytes += len;
1787                 if (*bits & EXTENT_DEFRAG)
1788                         BTRFS_I(inode)->defrag_bytes += len;
1789                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1790                                          &BTRFS_I(inode)->runtime_flags))
1791                         btrfs_add_delalloc_inodes(root, inode);
1792                 spin_unlock(&BTRFS_I(inode)->lock);
1793         }
1794
1795         if (!(state->state & EXTENT_DELALLOC_NEW) &&
1796             (*bits & EXTENT_DELALLOC_NEW)) {
1797                 spin_lock(&BTRFS_I(inode)->lock);
1798                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1799                         state->start;
1800                 spin_unlock(&BTRFS_I(inode)->lock);
1801         }
1802 }
1803
1804 /*
1805  * extent_io.c clear_bit_hook, see set_bit_hook for why
1806  */
1807 static void btrfs_clear_bit_hook(void *private_data,
1808                                  struct extent_state *state,
1809                                  unsigned *bits)
1810 {
1811         struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1812         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1813         u64 len = state->end + 1 - state->start;
1814         u32 num_extents = count_max_extents(len);
1815
1816         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1817                 spin_lock(&inode->lock);
1818                 inode->defrag_bytes -= len;
1819                 spin_unlock(&inode->lock);
1820         }
1821
1822         /*
1823          * set_bit and clear_bit hooks normally require _irqsave/restore
1824          * but in this case, we are only testing for the DELALLOC
1825          * bit, which is only set or cleared with irqs on
1826          */
1827         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1828                 struct btrfs_root *root = inode->root;
1829                 bool do_list = !btrfs_is_free_space_inode(inode);
1830
1831                 spin_lock(&inode->lock);
1832                 btrfs_mod_outstanding_extents(inode, -num_extents);
1833                 spin_unlock(&inode->lock);
1834
1835                 /*
1836                  * We don't reserve metadata space for space cache inodes so we
1837                  * don't need to call btrfs_delalloc_release_metadata() if
1838                  * there is an error.
1839                  */
1840                 if (*bits & EXTENT_CLEAR_META_RESV &&
1841                     root != fs_info->tree_root)
1842                         btrfs_delalloc_release_metadata(inode, len);
1843
1844                 /* For sanity tests. */
1845                 if (btrfs_is_testing(fs_info))
1846                         return;
1847
1848                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1849                     do_list && !(state->state & EXTENT_NORESERVE) &&
1850                     (*bits & EXTENT_CLEAR_DATA_RESV))
1851                         btrfs_free_reserved_data_space_noquota(
1852                                         &inode->vfs_inode,
1853                                         state->start, len);
1854
1855                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1856                                          fs_info->delalloc_batch);
1857                 spin_lock(&inode->lock);
1858                 inode->delalloc_bytes -= len;
1859                 if (do_list && inode->delalloc_bytes == 0 &&
1860                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1861                                         &inode->runtime_flags))
1862                         btrfs_del_delalloc_inode(root, inode);
1863                 spin_unlock(&inode->lock);
1864         }
1865
1866         if ((state->state & EXTENT_DELALLOC_NEW) &&
1867             (*bits & EXTENT_DELALLOC_NEW)) {
1868                 spin_lock(&inode->lock);
1869                 ASSERT(inode->new_delalloc_bytes >= len);
1870                 inode->new_delalloc_bytes -= len;
1871                 spin_unlock(&inode->lock);
1872         }
1873 }
1874
1875 /*
1876  * extent_io.c merge_bio_hook.  This must check the chunk tree to make
1877  * sure we don't create bios that span stripes or chunks.
1878  *
1879  * return 1 if page cannot be merged to bio
1880  * return 0 if page can be merged to bio
1881  * return error otherwise
1882  */
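     /*
      * For illustration: if btrfs_map_block() reports that only 60K of the
      * mapping remains before a stripe boundary (map_length = 60K) and the
      * bio already holds 56K, appending an 8K page would give 64K > 60K, so
      * we return 1 and the caller starts a new bio at the boundary.
      */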
1883 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1884                          size_t size, struct bio *bio,
1885                          unsigned long bio_flags)
1886 {
1887         struct inode *inode = page->mapping->host;
1888         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1889         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1890         u64 length = 0;
1891         u64 map_length;
1892         int ret;
1893
1894         if (bio_flags & EXTENT_BIO_COMPRESSED)
1895                 return 0;
1896
1897         length = bio->bi_iter.bi_size;
1898         map_length = length;
1899         ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1900                               NULL, 0);
1901         if (ret < 0)
1902                 return ret;
1903         if (map_length < length + size)
1904                 return 1;
1905         return 0;
1906 }
1907
1908 /*
1909  * In order to insert checksums into the metadata in large chunks,
1910  * we wait until bio submission time.  All the pages in the bio are
1911  * checksummed and the sums are attached to the ordered extent record.
1912  *
1913  * At IO completion time the csums attached to the ordered extent record
1914  * are inserted into the btree.
1915  */
1916 static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1917                                     int mirror_num, unsigned long bio_flags,
1918                                     u64 bio_offset)
1919 {
1920         struct inode *inode = private_data;
1921         blk_status_t ret = 0;
1922
1923         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1924         BUG_ON(ret); /* -ENOMEM */
1925         return 0;
1926 }
1927
1928 /*
1929  * The second half of the async checksumming path.  By the time this is
1930  * called, __btrfs_submit_bio_start() has already checksummed the pages
1931  * and attached the sums to the ordered extent record, so all that is
1932  * left to do is map and submit the bio.
1933  *
1934  * Any mapping error is pushed back to the bio via bio_endio().
1935  */
1936 static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1937                           int mirror_num, unsigned long bio_flags,
1938                           u64 bio_offset)
1939 {
1940         struct inode *inode = private_data;
1941         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1942         blk_status_t ret;
1943
1944         ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1945         if (ret) {
1946                 bio->bi_status = ret;
1947                 bio_endio(bio);
1948         }
1949         return ret;
1950 }
1951
1952 /*
1953  * extent_io.c submission hook. This does the right thing for csum calculation
1954  * on write, or reading the csums from the tree before a read.
1955  *
1956  * Rules about async/sync submit,
1957  * a) read:                             sync submit
1958  *
1959  * b) write without checksum:           sync submit
1960  *
1961  * c) write with checksum:
1962  *    c-1) if bio is issued by fsync:   sync submit
1963  *         (sync_writers != 0)
1964  *
1965  *    c-2) if root is reloc root:       sync submit
1966  *         (only in case of buffered IO)
1967  *
1968  *    c-3) otherwise:                   async submit
1969  */
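     /*
      * The sync_writers counter is elevated for the duration of an fsync, so
      * case c-1 skips the async checksum offload in favour of lower latency
      * on the bios the fsync is about to wait for.
      */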
1970 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1971                                  int mirror_num, unsigned long bio_flags,
1972                                  u64 bio_offset)
1973 {
1974         struct inode *inode = private_data;
1975         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1976         struct btrfs_root *root = BTRFS_I(inode)->root;
1977         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1978         blk_status_t ret = 0;
1979         int skip_sum;
1980         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1981
1982         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1983
1984         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1985                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1986
1987         if (bio_op(bio) != REQ_OP_WRITE) {
1988                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1989                 if (ret)
1990                         goto out;
1991
1992                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1993                         ret = btrfs_submit_compressed_read(inode, bio,
1994                                                            mirror_num,
1995                                                            bio_flags);
1996                         goto out;
1997                 } else if (!skip_sum) {
1998                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
1999                         if (ret)
2000                                 goto out;
2001                 }
2002                 goto mapit;
2003         } else if (async && !skip_sum) {
2004                 /* csum items have already been cloned */
2005                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2006                         goto mapit;
2007                 /* we're doing a write, do the async checksumming */
2008                 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2009                                           bio_offset, inode,
2010                                           __btrfs_submit_bio_start,
2011                                           __btrfs_submit_bio_done);
2012                 goto out;
2013         } else if (!skip_sum) {
2014                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2015                 if (ret)
2016                         goto out;
2017         }
2018
2019 mapit:
2020         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2021
2022 out:
2023         if (ret) {
2024                 bio->bi_status = ret;
2025                 bio_endio(bio);
2026         }
2027         return ret;
2028 }
2029
2030 /*
2031  * given a list of ordered sums record them in the inode.  This happens
2032  * at IO completion time based on sums calculated at bio submission time.
2033  */
2034 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2035                              struct inode *inode, struct list_head *list)
2036 {
2037         struct btrfs_ordered_sum *sum;
2038
2039         list_for_each_entry(sum, list, list) {
2040                 trans->adding_csums = 1;
2041                 btrfs_csum_file_blocks(trans,
2042                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2043                 trans->adding_csums = 0;
2044         }
2045         return 0;
2046 }
2047
2048 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2049                               unsigned int extra_bits,
2050                               struct extent_state **cached_state, int dedupe)
2051 {
2052         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
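             /*
              * The WARN catches the classic off-by-one of passing an
              * exclusive end: 'end' is inclusive here, so it should never be
              * page aligned (a one-page range with 4K pages ends at 0xfff,
              * not 0x1000).
              */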
2053         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2054                                    extra_bits, cached_state);
2055 }
2056
2057 /* see btrfs_writepage_start_hook for details on why this is required */
2058 struct btrfs_writepage_fixup {
2059         struct page *page;
2060         struct btrfs_work work;
2061 };
2062
2063 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2064 {
2065         struct btrfs_writepage_fixup *fixup;
2066         struct btrfs_ordered_extent *ordered;
2067         struct extent_state *cached_state = NULL;
2068         struct extent_changeset *data_reserved = NULL;
2069         struct page *page;
2070         struct inode *inode;
2071         u64 page_start;
2072         u64 page_end;
2073         int ret;
2074
2075         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2076         page = fixup->page;
2077 again:
2078         lock_page(page);
2079         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2080                 ClearPageChecked(page);
2081                 goto out_page;
2082         }
2083
2084         inode = page->mapping->host;
2085         page_start = page_offset(page);
2086         page_end = page_offset(page) + PAGE_SIZE - 1;
2087
2088         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2089                          &cached_state);
2090
2091         /* already ordered? We're done */
2092         if (PagePrivate2(page))
2093                 goto out;
2094
2095         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2096                                         PAGE_SIZE);
2097         if (ordered) {
2098                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2099                                      page_end, &cached_state, GFP_NOFS);
2100                 unlock_page(page);
2101                 btrfs_start_ordered_extent(inode, ordered, 1);
2102                 btrfs_put_ordered_extent(ordered);
2103                 goto again;
2104         }
2105
2106         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2107                                            PAGE_SIZE);
2108         if (ret) {
2109                 mapping_set_error(page->mapping, ret);
2110                 end_extent_writepage(page, ret, page_start, page_end);
2111                 ClearPageChecked(page);
2112                 goto out;
2113         }
2114
2115         btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state,
2116                                   0);
2117         ClearPageChecked(page);
2118         set_page_dirty(page);
2119         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2120 out:
2121         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2122                              &cached_state, GFP_NOFS);
2123 out_page:
2124         unlock_page(page);
2125         put_page(page);
2126         kfree(fixup);
2127         extent_changeset_free(data_reserved);
2128 }
2129
2130 /*
2131  * There are a few paths in the higher layers of the kernel that directly
2132  * set the page dirty bit without asking the filesystem if it is a
2133  * good idea.  This causes problems because we want to make sure COW
2134  * properly happens and the data=ordered rules are followed.
2135  *
2136  * In our case any range that doesn't have the ORDERED bit set
2137  * hasn't been properly set up for IO.  We kick off an async process
2138  * to fix it up.  The async helper will wait for ordered extents, set
2139  * the delalloc bit and make it safe to write the page.
2140  */
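     /*
      * One illustrative way to get here: a page pinned via get_user_pages()
      * can be redirtied behind the filesystem's back after writeback has
      * cleaned it, leaving a dirty page with no delalloc reservation or
      * ordered extent backing it.
      */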
2141 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2142 {
2143         struct inode *inode = page->mapping->host;
2144         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2145         struct btrfs_writepage_fixup *fixup;
2146
2147         /* this page is properly in the ordered list */
2148         if (TestClearPagePrivate2(page))
2149                 return 0;
2150
2151         if (PageChecked(page))
2152                 return -EAGAIN;
2153
2154         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2155         if (!fixup)
2156                 return -EAGAIN;
2157
2158         SetPageChecked(page);
2159         get_page(page);
2160         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2161                         btrfs_writepage_fixup_worker, NULL, NULL);
2162         fixup->page = page;
2163         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2164         return -EBUSY;
2165 }
2166
2167 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2168                                        struct inode *inode, u64 file_pos,
2169                                        u64 disk_bytenr, u64 disk_num_bytes,
2170                                        u64 num_bytes, u64 ram_bytes,
2171                                        u8 compression, u8 encryption,
2172                                        u16 other_encoding, int extent_type)
2173 {
2174         struct btrfs_root *root = BTRFS_I(inode)->root;
2175         struct btrfs_file_extent_item *fi;
2176         struct btrfs_path *path;
2177         struct extent_buffer *leaf;
2178         struct btrfs_key ins;
2179         u64 qg_released;
2180         int extent_inserted = 0;
2181         int ret;
2182
2183         path = btrfs_alloc_path();
2184         if (!path)
2185                 return -ENOMEM;
2186
2187         /*
2188          * We may be replacing one extent in the tree with another.
2189          * The new extent is pinned in the extent map, and we don't want
2190          * to drop it from the cache until it is completely in the btree.
2191          *
2192          * So, tell btrfs_drop_extents to leave this extent in the cache.
2193          * The caller is expected to unpin it and allow it to be merged
2194          * with the others.
2195          */
2196         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2197                                    file_pos + num_bytes, NULL, 0,
2198                                    1, sizeof(*fi), &extent_inserted);
2199         if (ret)
2200                 goto out;
2201
2202         if (!extent_inserted) {
2203                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2204                 ins.offset = file_pos;
2205                 ins.type = BTRFS_EXTENT_DATA_KEY;
2206
2207                 path->leave_spinning = 1;
2208                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2209                                               sizeof(*fi));
2210                 if (ret)
2211                         goto out;
2212         }
2213         leaf = path->nodes[0];
2214         fi = btrfs_item_ptr(leaf, path->slots[0],
2215                             struct btrfs_file_extent_item);
2216         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2217         btrfs_set_file_extent_type(leaf, fi, extent_type);
2218         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2219         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2220         btrfs_set_file_extent_offset(leaf, fi, 0);
2221         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2222         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2223         btrfs_set_file_extent_compression(leaf, fi, compression);
2224         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2225         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2226
2227         btrfs_mark_buffer_dirty(leaf);
2228         btrfs_release_path(path);
2229
2230         inode_add_bytes(inode, num_bytes);
2231
2232         ins.objectid = disk_bytenr;
2233         ins.offset = disk_num_bytes;
2234         ins.type = BTRFS_EXTENT_ITEM_KEY;
2235
2236         /*
2237          * Release the reserved range from inode dirty range map, as it is
2238          * already moved into delayed_ref_head
2239          */
2240         ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2241         if (ret < 0)
2242                 goto out;
2243         qg_released = ret;
2244         ret = btrfs_alloc_reserved_file_extent(trans, root,
2245                                                btrfs_ino(BTRFS_I(inode)),
2246                                                file_pos, qg_released, &ins);
2247 out:
2248         btrfs_free_path(path);
2249
2250         return ret;
2251 }
2252
2253 /* snapshot-aware defrag */
2254 struct sa_defrag_extent_backref {
2255         struct rb_node node;
2256         struct old_sa_defrag_extent *old;
2257         u64 root_id;
2258         u64 inum;
2259         u64 file_pos;
2260         u64 extent_offset;
2261         u64 num_bytes;
2262         u64 generation;
2263 };
2264
2265 struct old_sa_defrag_extent {
2266         struct list_head list;
2267         struct new_sa_defrag_extent *new;
2268
2269         u64 extent_offset;
2270         u64 bytenr;
2271         u64 offset;
2272         u64 len;
2273         int count;
2274 };
2275
2276 struct new_sa_defrag_extent {
2277         struct rb_root root;
2278         struct list_head head;
2279         struct btrfs_path *path;
2280         struct inode *inode;
2281         u64 file_pos;
2282         u64 len;
2283         u64 bytenr;
2284         u64 disk_len;
2285         u8 compress_type;
2286 };
2287
2288 static int backref_comp(struct sa_defrag_extent_backref *b1,
2289                         struct sa_defrag_extent_backref *b2)
2290 {
2291         if (b1->root_id < b2->root_id)
2292                 return -1;
2293         else if (b1->root_id > b2->root_id)
2294                 return 1;
2295
2296         if (b1->inum < b2->inum)
2297                 return -1;
2298         else if (b1->inum > b2->inum)
2299                 return 1;
2300
2301         if (b1->file_pos < b2->file_pos)
2302                 return -1;
2303         else if (b1->file_pos > b2->file_pos)
2304                 return 1;
2305
2306         /*
2307          * [------------------------------] ===> (a range of space)
2308          *     |<--->|   |<---->| =============> (fs/file tree A)
2309          * |<---------------------------->| ===> (fs/file tree B)
2310          *
2311          * A range of space can refer to two file extents in one tree while
2312          * referring to only one file extent in another tree.
2313          *
2314          * So we may process a disk offset more than once (two extents in A)
2315          * that lands in the same extent (one extent in B), and then insert
2316          * two identical backrefs (both referring to the extent in B).
2317          */
2318         return 0;
2319 }
2320
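     /*
      * Note that equal keys (backref_comp() == 0) walk right, so the
      * duplicate backrefs described above coexist in the tree instead of
      * replacing one another.
      */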
2321 static void backref_insert(struct rb_root *root,
2322                            struct sa_defrag_extent_backref *backref)
2323 {
2324         struct rb_node **p = &root->rb_node;
2325         struct rb_node *parent = NULL;
2326         struct sa_defrag_extent_backref *entry;
2327         int ret;
2328
2329         while (*p) {
2330                 parent = *p;
2331                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2332
2333                 ret = backref_comp(backref, entry);
2334                 if (ret < 0)
2335                         p = &(*p)->rb_left;
2336                 else
2337                         p = &(*p)->rb_right;
2338         }
2339
2340         rb_link_node(&backref->node, parent, p);
2341         rb_insert_color(&backref->node, root);
2342 }
2343
2344 /*
2345  * Note the backref might have changed, and in this case we just return 0.
2346  */
2347 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2348                                        void *ctx)
2349 {
2350         struct btrfs_file_extent_item *extent;
2351         struct old_sa_defrag_extent *old = ctx;
2352         struct new_sa_defrag_extent *new = old->new;
2353         struct btrfs_path *path = new->path;
2354         struct btrfs_key key;
2355         struct btrfs_root *root;
2356         struct sa_defrag_extent_backref *backref;
2357         struct extent_buffer *leaf;
2358         struct inode *inode = new->inode;
2359         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2360         int slot;
2361         int ret;
2362         u64 extent_offset;
2363         u64 num_bytes;
2364
2365         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2366             inum == btrfs_ino(BTRFS_I(inode)))
2367                 return 0;
2368
2369         key.objectid = root_id;
2370         key.type = BTRFS_ROOT_ITEM_KEY;
2371         key.offset = (u64)-1;
2372
2373         root = btrfs_read_fs_root_no_name(fs_info, &key);
2374         if (IS_ERR(root)) {
2375                 if (PTR_ERR(root) == -ENOENT)
2376                         return 0;
2377                 WARN_ON(1);
2378                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2379                          inum, offset, root_id);
2380                 return PTR_ERR(root);
2381         }
2382
2383         key.objectid = inum;
2384         key.type = BTRFS_EXTENT_DATA_KEY;
2385         if (offset > (u64)-1 << 32)
2386                 key.offset = 0;
2387         else
2388                 key.offset = offset;
2389
2390         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2391         if (WARN_ON(ret < 0))
2392                 return ret;
2393         ret = 0;
2394
2395         while (1) {
2396                 cond_resched();
2397
2398                 leaf = path->nodes[0];
2399                 slot = path->slots[0];
2400
2401                 if (slot >= btrfs_header_nritems(leaf)) {
2402                         ret = btrfs_next_leaf(root, path);
2403                         if (ret < 0) {
2404                                 goto out;
2405                         } else if (ret > 0) {
2406                                 ret = 0;
2407                                 goto out;
2408                         }
2409                         continue;
2410                 }
2411
2412                 path->slots[0]++;
2413
2414                 btrfs_item_key_to_cpu(leaf, &key, slot);
2415
2416                 if (key.objectid > inum)
2417                         goto out;
2418
2419                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2420                         continue;
2421
2422                 extent = btrfs_item_ptr(leaf, slot,
2423                                         struct btrfs_file_extent_item);
2424
2425                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2426                         continue;
2427
2428                 /*
2429                  * 'offset' refers to the exact key.offset,
2430                  * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2431                  * (key.offset - extent_offset).
2432                  */
2433                 if (key.offset != offset)
2434                         continue;
2435
2436                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2437                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2438
2439                 if (extent_offset >= old->extent_offset + old->offset +
2440                     old->len || extent_offset + num_bytes <=
2441                     old->extent_offset + old->offset)
2442                         continue;
2443                 break;
2444         }
2445
2446         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2447         if (!backref) {
2448                 ret = -ENOMEM;
2449                 goto out;
2450         }
2451
2452         backref->root_id = root_id;
2453         backref->inum = inum;
2454         backref->file_pos = offset;
2455         backref->num_bytes = num_bytes;
2456         backref->extent_offset = extent_offset;
2457         backref->generation = btrfs_file_extent_generation(leaf, extent);
2458         backref->old = old;
2459         backref_insert(&new->root, backref);
2460         old->count++;
2461 out:
2462         btrfs_release_path(path);
2463         WARN_ON(ret);
2464         return ret;
2465 }
2466
2467 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2468                                    struct new_sa_defrag_extent *new)
2469 {
2470         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2471         struct old_sa_defrag_extent *old, *tmp;
2472         int ret;
2473
2474         new->path = path;
2475
2476         list_for_each_entry_safe(old, tmp, &new->head, list) {
2477                 ret = iterate_inodes_from_logical(old->bytenr +
2478                                                   old->extent_offset, fs_info,
2479                                                   path, record_one_backref,
2480                                                   old, false);
2481                 if (ret < 0 && ret != -ENOENT)
2482                         return false;
2483
2484                 /* no backref to be processed for this extent */
2485                 if (!old->count) {
2486                         list_del(&old->list);
2487                         kfree(old);
2488                 }
2489         }
2490
2491         if (list_empty(&new->head))
2492                 return false;
2493
2494         return true;
2495 }
2496
2497 static int relink_is_mergable(struct extent_buffer *leaf,
2498                               struct btrfs_file_extent_item *fi,
2499                               struct new_sa_defrag_extent *new)
2500 {
2501         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2502                 return 0;
2503
2504         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2505                 return 0;
2506
2507         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2508                 return 0;
2509
2510         if (btrfs_file_extent_encryption(leaf, fi) ||
2511             btrfs_file_extent_other_encoding(leaf, fi))
2512                 return 0;
2513
2514         return 1;
2515 }
2516
2517 /*
2518  * Note the backref might have changed, and in this case we just return 0.
2519  */
2520 static noinline int relink_extent_backref(struct btrfs_path *path,
2521                                  struct sa_defrag_extent_backref *prev,
2522                                  struct sa_defrag_extent_backref *backref)
2523 {
2524         struct btrfs_file_extent_item *extent;
2525         struct btrfs_file_extent_item *item;
2526         struct btrfs_ordered_extent *ordered;
2527         struct btrfs_trans_handle *trans;
2528         struct btrfs_root *root;
2529         struct btrfs_key key;
2530         struct extent_buffer *leaf;
2531         struct old_sa_defrag_extent *old = backref->old;
2532         struct new_sa_defrag_extent *new = old->new;
2533         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2534         struct inode *inode;
2535         struct extent_state *cached = NULL;
2536         int ret = 0;
2537         u64 start;
2538         u64 len;
2539         u64 lock_start;
2540         u64 lock_end;
2541         bool merge = false;
2542         int index;
2543
2544         if (prev && prev->root_id == backref->root_id &&
2545             prev->inum == backref->inum &&
2546             prev->file_pos + prev->num_bytes == backref->file_pos)
2547                 merge = true;
2548
2549         /* step 1: get root */
2550         key.objectid = backref->root_id;
2551         key.type = BTRFS_ROOT_ITEM_KEY;
2552         key.offset = (u64)-1;
2553
2554         index = srcu_read_lock(&fs_info->subvol_srcu);
2555
2556         root = btrfs_read_fs_root_no_name(fs_info, &key);
2557         if (IS_ERR(root)) {
2558                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2559                 if (PTR_ERR(root) == -ENOENT)
2560                         return 0;
2561                 return PTR_ERR(root);
2562         }
2563
2564         if (btrfs_root_readonly(root)) {
2565                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2566                 return 0;
2567         }
2568
2569         /* step 2: get inode */
2570         key.objectid = backref->inum;
2571         key.type = BTRFS_INODE_ITEM_KEY;
2572         key.offset = 0;
2573
2574         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2575         if (IS_ERR(inode)) {
2576                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2577                 return 0;
2578         }
2579
2580         srcu_read_unlock(&fs_info->subvol_srcu, index);
2581
2582         /* step 3: relink backref */
2583         lock_start = backref->file_pos;
2584         lock_end = backref->file_pos + backref->num_bytes - 1;
2585         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2586                          &cached);
2587
2588         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2589         if (ordered) {
2590                 btrfs_put_ordered_extent(ordered);
2591                 goto out_unlock;
2592         }
2593
2594         trans = btrfs_join_transaction(root);
2595         if (IS_ERR(trans)) {
2596                 ret = PTR_ERR(trans);
2597                 goto out_unlock;
2598         }
2599
2600         key.objectid = backref->inum;
2601         key.type = BTRFS_EXTENT_DATA_KEY;
2602         key.offset = backref->file_pos;
2603
2604         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2605         if (ret < 0) {
2606                 goto out_free_path;
2607         } else if (ret > 0) {
2608                 ret = 0;
2609                 goto out_free_path;
2610         }
2611
2612         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2613                                 struct btrfs_file_extent_item);
2614
2615         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2616             backref->generation)
2617                 goto out_free_path;
2618
2619         btrfs_release_path(path);
2620
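        /*
         * Compute the overlap between this backref's range and the
         * defragged part of the old extent, in this inode's file offsets:
         * push start forward if the old data begins beyond the backref's
         * extent_offset, and clamp len to the smaller end point.
         */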
2621         start = backref->file_pos;
2622         if (backref->extent_offset < old->extent_offset + old->offset)
2623                 start += old->extent_offset + old->offset -
2624                          backref->extent_offset;
2625
2626         len = min(backref->extent_offset + backref->num_bytes,
2627                   old->extent_offset + old->offset + old->len);
2628         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2629
2630         ret = btrfs_drop_extents(trans, root, inode, start,
2631                                  start + len, 1);
2632         if (ret)
2633                 goto out_free_path;
2634 again:
2635         key.objectid = btrfs_ino(BTRFS_I(inode));
2636         key.type = BTRFS_EXTENT_DATA_KEY;
2637         key.offset = start;
2638
2639         path->leave_spinning = 1;
2640         if (merge) {
2641                 struct btrfs_file_extent_item *fi;
2642                 u64 extent_len;
2643                 struct btrfs_key found_key;
2644
2645                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2646                 if (ret < 0)
2647                         goto out_free_path;
2648
2649                 path->slots[0]--;
2650                 leaf = path->nodes[0];
2651                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2652
2653                 fi = btrfs_item_ptr(leaf, path->slots[0],
2654                                     struct btrfs_file_extent_item);
2655                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2656
2657                 if (extent_len + found_key.offset == start &&
2658                     relink_is_mergable(leaf, fi, new)) {
2659                         btrfs_set_file_extent_num_bytes(leaf, fi,
2660                                                         extent_len + len);
2661                         btrfs_mark_buffer_dirty(leaf);
2662                         inode_add_bytes(inode, len);
2663
2664                         ret = 1;
2665                         goto out_free_path;
2666                 } else {
2667                         merge = false;
2668                         btrfs_release_path(path);
2669                         goto again;
2670                 }
2671         }
2672
2673         ret = btrfs_insert_empty_item(trans, root, path, &key,
2674                                         sizeof(*extent));
2675         if (ret) {
2676                 btrfs_abort_transaction(trans, ret);
2677                 goto out_free_path;
2678         }
2679
2680         leaf = path->nodes[0];
2681         item = btrfs_item_ptr(leaf, path->slots[0],
2682                                 struct btrfs_file_extent_item);
2683         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2684         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2685         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2686         btrfs_set_file_extent_num_bytes(leaf, item, len);
2687         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2688         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2689         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2690         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2691         btrfs_set_file_extent_encryption(leaf, item, 0);
2692         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2693
2694         btrfs_mark_buffer_dirty(leaf);
2695         inode_add_bytes(inode, len);
2696         btrfs_release_path(path);
2697
2698         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2699                         new->disk_len, 0,
2700                         backref->root_id, backref->inum,
2701                         new->file_pos); /* start - extent_offset */
2702         if (ret) {
2703                 btrfs_abort_transaction(trans, ret);
2704                 goto out_free_path;
2705         }
2706
2707         ret = 1;
2708 out_free_path:
2709         btrfs_release_path(path);
2710         path->leave_spinning = 0;
2711         btrfs_end_transaction(trans);
2712 out_unlock:
2713         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2714                              &cached, GFP_NOFS);
2715         iput(inode);
2716         return ret;
2717 }
2718
2719 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2720 {
2721         struct old_sa_defrag_extent *old, *tmp;
2722
2723         if (!new)
2724                 return;
2725
2726         list_for_each_entry_safe(old, tmp, &new->head, list) {
2727                 kfree(old);
2728         }
2729         kfree(new);
2730 }
2731
2732 static void relink_file_extents(struct new_sa_defrag_extent *new)
2733 {
2734         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2735         struct btrfs_path *path;
2736         struct sa_defrag_extent_backref *backref;
2737         struct sa_defrag_extent_backref *prev = NULL;
2738         struct inode *inode;
2739         struct btrfs_root *root;
2740         struct rb_node *node;
2741         int ret;
2742
2743         inode = new->inode;
2744         root = BTRFS_I(inode)->root;
2745
2746         path = btrfs_alloc_path();
2747         if (!path)
2748                 return;
2749
2750         if (!record_extent_backrefs(path, new)) {
2751                 btrfs_free_path(path);
2752                 goto out;
2753         }
2754         btrfs_release_path(path);
2755
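        /*
         * Walk the backrefs in sorted rb-tree order.  Only a backref for
         * which relink_extent_backref() returned 1 (a file extent item
         * was actually installed) is kept as @prev, the merge candidate
         * for the next iteration.
         */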
2756         while (1) {
2757                 node = rb_first(&new->root);
2758                 if (!node)
2759                         break;
2760                 rb_erase(node, &new->root);
2761
2762                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2763
2764                 ret = relink_extent_backref(path, prev, backref);
2765                 WARN_ON(ret < 0);
2766
2767                 kfree(prev);
2768
2769                 if (ret == 1)
2770                         prev = backref;
2771                 else
2772                         prev = NULL;
2773                 cond_resched();
2774         }
2775         kfree(prev);
2776
2777         btrfs_free_path(path);
2778 out:
2779         free_sa_defrag_extent(new);
2780
2781         atomic_dec(&fs_info->defrag_running);
2782         wake_up(&fs_info->transaction_wait);
2783 }
2784
2785 static struct new_sa_defrag_extent *
2786 record_old_file_extents(struct inode *inode,
2787                         struct btrfs_ordered_extent *ordered)
2788 {
2789         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2790         struct btrfs_root *root = BTRFS_I(inode)->root;
2791         struct btrfs_path *path;
2792         struct btrfs_key key;
2793         struct old_sa_defrag_extent *old;
2794         struct new_sa_defrag_extent *new;
2795         int ret;
2796
2797         new = kmalloc(sizeof(*new), GFP_NOFS);
2798         if (!new)
2799                 return NULL;
2800
2801         new->inode = inode;
2802         new->file_pos = ordered->file_offset;
2803         new->len = ordered->len;
2804         new->bytenr = ordered->start;
2805         new->disk_len = ordered->disk_len;
2806         new->compress_type = ordered->compress_type;
2807         new->root = RB_ROOT;
2808         INIT_LIST_HEAD(&new->head);
2809
2810         path = btrfs_alloc_path();
2811         if (!path)
2812                 goto out_kfree;
2813
2814         key.objectid = btrfs_ino(BTRFS_I(inode));
2815         key.type = BTRFS_EXTENT_DATA_KEY;
2816         key.offset = new->file_pos;
2817
2818         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2819         if (ret < 0)
2820                 goto out_free_path;
2821         if (ret > 0 && path->slots[0] > 0)
2822                 path->slots[0]--;
2823
2824         /* find out all the old extents for the file range */
2825         while (1) {
2826                 struct btrfs_file_extent_item *extent;
2827                 struct extent_buffer *l;
2828                 int slot;
2829                 u64 num_bytes;
2830                 u64 offset;
2831                 u64 end;
2832                 u64 disk_bytenr;
2833                 u64 extent_offset;
2834
2835                 l = path->nodes[0];
2836                 slot = path->slots[0];
2837
2838                 if (slot >= btrfs_header_nritems(l)) {
2839                         ret = btrfs_next_leaf(root, path);
2840                         if (ret < 0)
2841                                 goto out_free_path;
2842                         else if (ret > 0)
2843                                 break;
2844                         continue;
2845                 }
2846
2847                 btrfs_item_key_to_cpu(l, &key, slot);
2848
2849                 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2850                         break;
2851                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2852                         break;
2853                 if (key.offset >= new->file_pos + new->len)
2854                         break;
2855
2856                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2857
2858                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2859                 if (key.offset + num_bytes < new->file_pos)
2860                         goto next;
2861
2862                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2863                 if (!disk_bytenr)
2864                         goto next;
2865
2866                 extent_offset = btrfs_file_extent_offset(l, extent);
2867
2868                 old = kmalloc(sizeof(*old), GFP_NOFS);
2869                 if (!old)
2870                         goto out_free_path;
2871
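                /* clamp the old extent to the range the new extent covers */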
2872                 offset = max(new->file_pos, key.offset);
2873                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2874
2875                 old->bytenr = disk_bytenr;
2876                 old->extent_offset = extent_offset;
2877                 old->offset = offset - key.offset;
2878                 old->len = end - offset;
2879                 old->new = new;
2880                 old->count = 0;
2881                 list_add_tail(&old->list, &new->head);
2882 next:
2883                 path->slots[0]++;
2884                 cond_resched();
2885         }
2886
2887         btrfs_free_path(path);
2888         atomic_inc(&fs_info->defrag_running);
2889
2890         return new;
2891
2892 out_free_path:
2893         btrfs_free_path(path);
2894 out_kfree:
2895         free_sa_defrag_extent(new);
2896         return NULL;
2897 }
2898
2899 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2900                                          u64 start, u64 len)
2901 {
2902         struct btrfs_block_group_cache *cache;
2903
2904         cache = btrfs_lookup_block_group(fs_info, start);
2905         ASSERT(cache);
2906
2907         spin_lock(&cache->lock);
2908         cache->delalloc_bytes -= len;
2909         spin_unlock(&cache->lock);
2910
2911         btrfs_put_block_group(cache);
2912 }
2913
2914 /* as ordered data IO finishes, this gets called so we can finish
2915  * an ordered extent if the range of bytes in the file it covers is
2916  * fully written.
2917  */
2918 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2919 {
2920         struct inode *inode = ordered_extent->inode;
2921         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2922         struct btrfs_root *root = BTRFS_I(inode)->root;
2923         struct btrfs_trans_handle *trans = NULL;
2924         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2925         struct extent_state *cached_state = NULL;
2926         struct new_sa_defrag_extent *new = NULL;
2927         int compress_type = 0;
2928         int ret = 0;
2929         u64 logical_len = ordered_extent->len;
2930         bool nolock;
2931         bool truncated = false;
2932         bool range_locked = false;
2933         bool clear_new_delalloc_bytes = false;
2934
2935         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2936             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2937             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2938                 clear_new_delalloc_bytes = true;
2939
2940         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2941
2942         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2943                 ret = -EIO;
2944                 goto out;
2945         }
2946
2947         btrfs_free_io_failure_record(BTRFS_I(inode),
2948                         ordered_extent->file_offset,
2949                         ordered_extent->file_offset +
2950                         ordered_extent->len - 1);
2951
2952         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2953                 truncated = true;
2954                 logical_len = ordered_extent->truncated_len;
2955                 /* Truncated the entire extent, don't bother adding */
2956                 if (!logical_len)
2957                         goto out;
2958         }
2959
2960         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2961                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2962
2963                 /*
2964                  * For the mwrite (mmap + memset to write) case, we still
2965                  * reserve space for the NOCOW range.
2966                  * As NOCOW won't cause a new delayed ref, just free the space
2967                  */
2968                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2969                                        ordered_extent->len);
2970                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2971                 if (nolock)
2972                         trans = btrfs_join_transaction_nolock(root);
2973                 else
2974                         trans = btrfs_join_transaction(root);
2975                 if (IS_ERR(trans)) {
2976                         ret = PTR_ERR(trans);
2977                         trans = NULL;
2978                         goto out;
2979                 }
2980                 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
2981                 ret = btrfs_update_inode_fallback(trans, root, inode);
2982                 if (ret) /* -ENOMEM or corruption */
2983                         btrfs_abort_transaction(trans, ret);
2984                 goto out;
2985         }
2986
2987         range_locked = true;
2988         lock_extent_bits(io_tree, ordered_extent->file_offset,
2989                          ordered_extent->file_offset + ordered_extent->len - 1,
2990                          &cached_state);
2991
2992         ret = test_range_bit(io_tree, ordered_extent->file_offset,
2993                         ordered_extent->file_offset + ordered_extent->len - 1,
2994                         EXTENT_DEFRAG, 0, cached_state);
2995         if (ret) {
2996                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
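                /*
                 * Snapshot-aware defrag is currently disabled: the
                 * hardcoded 0 short-circuits this check, so
                 * record_old_file_extents() is never called.
                 */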
2997                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2998                         /* the inode is shared */
2999                         new = record_old_file_extents(inode, ordered_extent);
3000
3001                 clear_extent_bit(io_tree, ordered_extent->file_offset,
3002                         ordered_extent->file_offset + ordered_extent->len - 1,
3003                         EXTENT_DEFRAG, 0, 0, &cached_state);
3004         }
3005
3006         if (nolock)
3007                 trans = btrfs_join_transaction_nolock(root);
3008         else
3009                 trans = btrfs_join_transaction(root);
3010         if (IS_ERR(trans)) {
3011                 ret = PTR_ERR(trans);
3012                 trans = NULL;
3013                 goto out;
3014         }
3015
3016         trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3017
3018         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3019                 compress_type = ordered_extent->compress_type;
3020         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3021                 BUG_ON(compress_type);
3022                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3023                                        ordered_extent->len);
3024                 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3025                                                 ordered_extent->file_offset,
3026                                                 ordered_extent->file_offset +
3027                                                 logical_len);
3028         } else {
3029                 BUG_ON(root == fs_info->tree_root);
3030                 ret = insert_reserved_file_extent(trans, inode,
3031                                                 ordered_extent->file_offset,
3032                                                 ordered_extent->start,
3033                                                 ordered_extent->disk_len,
3034                                                 logical_len, logical_len,
3035                                                 compress_type, 0, 0,
3036                                                 BTRFS_FILE_EXTENT_REG);
3037                 if (!ret)
3038                         btrfs_release_delalloc_bytes(fs_info,
3039                                                      ordered_extent->start,
3040                                                      ordered_extent->disk_len);
3041         }
3042         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3043                            ordered_extent->file_offset, ordered_extent->len,
3044                            trans->transid);
3045         if (ret < 0) {
3046                 btrfs_abort_transaction(trans, ret);
3047                 goto out;
3048         }
3049
3050         add_pending_csums(trans, inode, &ordered_extent->list);
3051
3052         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3053         ret = btrfs_update_inode_fallback(trans, root, inode);
3054         if (ret) { /* -ENOMEM or corruption */
3055                 btrfs_abort_transaction(trans, ret);
3056                 goto out;
3057         }
3058         ret = 0;
3059 out:
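        /*
         * Common exit: unlock the extent range and/or clear the
         * new-delalloc bit if needed, end the transaction, and on error
         * or truncation give back any space the ordered extent still
         * holds before dropping our references on it.
         */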
3060         if (range_locked || clear_new_delalloc_bytes) {
3061                 unsigned int clear_bits = 0;
3062
3063                 if (range_locked)
3064                         clear_bits |= EXTENT_LOCKED;
3065                 if (clear_new_delalloc_bytes)
3066                         clear_bits |= EXTENT_DELALLOC_NEW;
3067                 clear_extent_bit(&BTRFS_I(inode)->io_tree,
3068                                  ordered_extent->file_offset,
3069                                  ordered_extent->file_offset +
3070                                  ordered_extent->len - 1,
3071                                  clear_bits,
3072                                  (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3073                                  0, &cached_state);
3074         }
3075
3076         if (trans)
3077                 btrfs_end_transaction(trans);
3078
3079         if (ret || truncated) {
3080                 u64 start, end;
3081
3082                 if (truncated)
3083                         start = ordered_extent->file_offset + logical_len;
3084                 else
3085                         start = ordered_extent->file_offset;
3086                 end = ordered_extent->file_offset + ordered_extent->len - 1;
3087                 clear_extent_uptodate(io_tree, start, end, NULL);
3088
3089                 /* Drop the cache for the part of the extent we didn't write. */
3090                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3091
3092                 /*
3093                  * If the ordered extent had an IOERR or something else went
3094                  * wrong we need to return the space for this ordered extent
3095                  * back to the allocator.  We only free the extent in the
3096                  * truncated case if we didn't write out the extent at all.
3097                  */
3098                 if ((ret || !logical_len) &&
3099                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3100                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3101                         btrfs_free_reserved_extent(fs_info,
3102                                                    ordered_extent->start,
3103                                                    ordered_extent->disk_len, 1);
3104         }
3105
3107         /*
3108          * This needs to be done to make sure anybody waiting knows we are done
3109          * updating everything for this ordered extent.
3110          */
3111         btrfs_remove_ordered_extent(inode, ordered_extent);
3112
3113         /* for snapshot-aware defrag */
3114         if (new) {
3115                 if (ret) {
3116                         free_sa_defrag_extent(new);
3117                         atomic_dec(&fs_info->defrag_running);
3118                 } else {
3119                         relink_file_extents(new);
3120                 }
3121         }
3122
3123         /* once for us */
3124         btrfs_put_ordered_extent(ordered_extent);
3125         /* once for the tree */
3126         btrfs_put_ordered_extent(ordered_extent);
3127
3128         return ret;
3129 }
3130
3131 static void finish_ordered_fn(struct btrfs_work *work)
3132 {
3133         struct btrfs_ordered_extent *ordered_extent;
3134         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3135         btrfs_finish_ordered_io(ordered_extent);
3136 }
3137
3138 static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3139                                 struct extent_state *state, int uptodate)
3140 {
3141         struct inode *inode = page->mapping->host;
3142         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3143         struct btrfs_ordered_extent *ordered_extent = NULL;
3144         struct btrfs_workqueue *wq;
3145         btrfs_work_func_t func;
3146
3147         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3148
3149         ClearPagePrivate2(page);
3150         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3151                                             end - start + 1, uptodate))
3152                 return;
3153
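        /*
         * btrfs_dec_test_ordered_pending() returned true, so all IO for
         * the ordered extent is done: queue btrfs_finish_ordered_io() on
         * the proper endio workqueue (free-space inodes use a dedicated
         * one).
         */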
3154         if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3155                 wq = fs_info->endio_freespace_worker;
3156                 func = btrfs_freespace_write_helper;
3157         } else {
3158                 wq = fs_info->endio_write_workers;
3159                 func = btrfs_endio_write_helper;
3160         }
3161
3162         btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3163                         NULL);
3164         btrfs_queue_work(wq, &ordered_extent->work);
3165 }
3166
3167 static int __readpage_endio_check(struct inode *inode,
3168                                   struct btrfs_io_bio *io_bio,
3169                                   int icsum, struct page *page,
3170                                   int pgoff, u64 start, size_t len)
3171 {
3172         char *kaddr;
3173         u32 csum_expected;
3174         u32 csum = ~(u32)0;
3175
3176         csum_expected = *(((u32 *)io_bio->csum) + icsum);
3177
3178         kaddr = kmap_atomic(page);
3179         csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3180         btrfs_csum_final(csum, (u8 *)&csum);
3181         if (csum != csum_expected)
3182                 goto zeroit;
3183
3184         kunmap_atomic(kaddr);
3185         return 0;
3186 zeroit:
3187         btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3188                                     io_bio->mirror_num);
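        /*
         * Overwrite the bad range with nonzero bytes (poison) before
         * failing the read with -EIO, so the stale page contents are not
         * exposed as valid data.
         */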
3189         memset(kaddr + pgoff, 1, len);
3190         flush_dcache_page(page);
3191         kunmap_atomic(kaddr);
3192         return -EIO;
3193 }
3194
3195 /*
3196  * when reads are done, we need to check csums to verify the data is correct.
3197  * If there's a match, we allow the bio to finish.  If not, the code in
3198  * extent_io.c will try to find good copies for us.
3199  */
3200 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3201                                       u64 phy_offset, struct page *page,
3202                                       u64 start, u64 end, int mirror)
3203 {
3204         size_t offset = start - page_offset(page);
3205         struct inode *inode = page->mapping->host;
3206         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3207         struct btrfs_root *root = BTRFS_I(inode)->root;
3208
3209         if (PageChecked(page)) {
3210                 ClearPageChecked(page);
3211                 return 0;
3212         }
3213
3214         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3215                 return 0;
3216
3217         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3218             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3219                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3220                 return 0;
3221         }
3222
3223         phy_offset >>= inode->i_sb->s_blocksize_bits;
3224         return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3225                                       start, (size_t)(end - start + 1));
3226 }
3227
3228 void btrfs_add_delayed_iput(struct inode *inode)
3229 {
3230         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3231         struct btrfs_inode *binode = BTRFS_I(inode);
3232
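        /*
         * atomic_add_unless() drops our reference unless it is the last
         * one; only a potentially final iput is deferred to the list.
         */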
3233         if (atomic_add_unless(&inode->i_count, -1, 1))
3234                 return;
3235
3236         spin_lock(&fs_info->delayed_iput_lock);
3237         if (binode->delayed_iput_count == 0) {
3238                 ASSERT(list_empty(&binode->delayed_iput));
3239                 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3240         } else {
3241                 binode->delayed_iput_count++;
3242         }
3243         spin_unlock(&fs_info->delayed_iput_lock);
3244 }
3245
3246 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3247 {
3248
3249         spin_lock(&fs_info->delayed_iput_lock);
3250         while (!list_empty(&fs_info->delayed_iputs)) {
3251                 struct btrfs_inode *inode;
3252
3253                 inode = list_first_entry(&fs_info->delayed_iputs,
3254                                 struct btrfs_inode, delayed_iput);
3255                 if (inode->delayed_iput_count) {
3256                         inode->delayed_iput_count--;
3257                         list_move_tail(&inode->delayed_iput,
3258                                         &fs_info->delayed_iputs);
3259                 } else {
3260                         list_del_init(&inode->delayed_iput);
3261                 }
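                /*
                 * iput() may do the final eviction and block, so drop the
                 * spinlock around it and retake it for the next entry.
                 */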
3262                 spin_unlock(&fs_info->delayed_iput_lock);
3263                 iput(&inode->vfs_inode);
3264                 spin_lock(&fs_info->delayed_iput_lock);
3265         }
3266         spin_unlock(&fs_info->delayed_iput_lock);
3267 }
3268
3269 /*
3270  * This is called at transaction commit time. If there are no orphan
3271  * files left in the subvolume, it removes the orphan item and frees the
3272  * block_rsv structure.
3273  */
3274 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3275                               struct btrfs_root *root)
3276 {
3277         struct btrfs_fs_info *fs_info = root->fs_info;
3278         struct btrfs_block_rsv *block_rsv;
3279         int ret;
3280
3281         if (atomic_read(&root->orphan_inodes) ||
3282             root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3283                 return;
3284
3285         spin_lock(&root->orphan_lock);
3286         if (atomic_read(&root->orphan_inodes)) {
3287                 spin_unlock(&root->orphan_lock);
3288                 return;
3289         }
3290
3291         if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3292                 spin_unlock(&root->orphan_lock);
3293                 return;
3294         }
3295
3296         block_rsv = root->orphan_block_rsv;
3297         root->orphan_block_rsv = NULL;
3298         spin_unlock(&root->orphan_lock);
3299
3300         if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3301             btrfs_root_refs(&root->root_item) > 0) {
3302                 ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
3303                                             root->root_key.objectid);
3304                 if (ret)
3305                         btrfs_abort_transaction(trans, ret);
3306                 else
3307                         clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3308                                   &root->state);
3309         }
3310
3311         if (block_rsv) {
3312                 WARN_ON(block_rsv->size > 0);
3313                 btrfs_free_block_rsv(fs_info, block_rsv);
3314         }
3315 }
3316
3317 /*
3318  * This creates an orphan entry for the given inode in case something goes
3319  * wrong in the middle of an unlink/truncate.
3320  *
3321  * NOTE: the caller of this function should reserve 5 units of metadata
3322  *       for this function.
3323  */
3324 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3325                 struct btrfs_inode *inode)
3326 {
3327         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3328         struct btrfs_root *root = inode->root;
3329         struct btrfs_block_rsv *block_rsv = NULL;
3330         int reserve = 0;
3331         int insert = 0;
3332         int ret;
3333
3334         if (!root->orphan_block_rsv) {
3335                 block_rsv = btrfs_alloc_block_rsv(fs_info,
3336                                                   BTRFS_BLOCK_RSV_TEMP);
3337                 if (!block_rsv)
3338                         return -ENOMEM;
3339         }
3340
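        /*
         * block_rsv was allocated above without orphan_lock held; install
         * it only if the root still has none, otherwise free our spare.
         */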
3341         spin_lock(&root->orphan_lock);
3342         if (!root->orphan_block_rsv) {
3343                 root->orphan_block_rsv = block_rsv;
3344         } else if (block_rsv) {
3345                 btrfs_free_block_rsv(fs_info, block_rsv);
3346                 block_rsv = NULL;
3347         }
3348
3349         if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3350                               &inode->runtime_flags)) {
3351 #if 0
3352                 /*
3353                  * For proper ENOSPC handling, we should do orphan
3354                  * cleanup when mounting. But this introduces a backward
3355                  * compatibility issue.
3356                  */
3357                 if (!xchg(&root->orphan_item_inserted, 1))
3358                         insert = 2;
3359                 else
3360                         insert = 1;
3361 #endif
3362                 insert = 1;
3363                 atomic_inc(&root->orphan_inodes);
3364         }
3365
3366         if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3367                               &inode->runtime_flags))
3368                 reserve = 1;
3369         spin_unlock(&root->orphan_lock);
3370
3371         /* grab metadata reservation from transaction handle */
3372         if (reserve) {
3373                 ret = btrfs_orphan_reserve_metadata(trans, inode);
3374                 ASSERT(!ret);
3375                 if (ret) {
3376                         atomic_dec(&root->orphan_inodes);
3377                         clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3378                                   &inode->runtime_flags);
3379                         if (insert)
3380                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3381                                           &inode->runtime_flags);
3382                         return ret;
3383                 }
3384         }
3385
3386         /* insert an orphan item to track this unlinked/truncated file */
3387         if (insert >= 1) {
3388                 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3389                 if (ret) {
3390                         atomic_dec(&root->orphan_inodes);
3391                         if (reserve) {
3392                                 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3393                                           &inode->runtime_flags);
3394                                 btrfs_orphan_release_metadata(inode);
3395                         }
3396                         if (ret != -EEXIST) {
3397                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3398                                           &inode->runtime_flags);
3399                                 btrfs_abort_transaction(trans, ret);
3400                                 return ret;
3401                         }
3402                 }
3403                 ret = 0;
3404         }
3405
3406         /* insert an orphan item to track that the subvolume contains orphan files */
3407         if (insert >= 2) {
3408                 ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
3409                                                root->root_key.objectid);
3410                 if (ret && ret != -EEXIST) {
3411                         btrfs_abort_transaction(trans, ret);
3412                         return ret;
3413                 }
3414         }
3415         return 0;
3416 }
3417
3418 /*
3419  * We have done the truncate/delete so we can go ahead and remove the orphan
3420  * item for this particular inode.
3421  */
3422 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3423                             struct btrfs_inode *inode)
3424 {
3425         struct btrfs_root *root = inode->root;
3426         int delete_item = 0;
3427         int release_rsv = 0;
3428         int ret = 0;
3429
3430         spin_lock(&root->orphan_lock);
3431         if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3432                                &inode->runtime_flags))
3433                 delete_item = 1;
3434
3435         if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3436                                &inode->runtime_flags))
3437                 release_rsv = 1;
3438         spin_unlock(&root->orphan_lock);
3439
3440         if (delete_item) {
3441                 atomic_dec(&root->orphan_inodes);
3442                 if (trans)
3443                         ret = btrfs_del_orphan_item(trans, root,
3444                                                     btrfs_ino(inode));
3445         }
3446
3447         if (release_rsv)
3448                 btrfs_orphan_release_metadata(inode);
3449
3450         return ret;
3451 }
3452
3453 /*
3454  * this cleans up any orphans that may be left on the list from the last use
3455  * of this root.
3456  */
3457 int btrfs_orphan_cleanup(struct btrfs_root *root)
3458 {
3459         struct btrfs_fs_info *fs_info = root->fs_info;
3460         struct btrfs_path *path;
3461         struct extent_buffer *leaf;
3462         struct btrfs_key key, found_key;
3463         struct btrfs_trans_handle *trans;
3464         struct inode *inode;
3465         u64 last_objectid = 0;
3466         int ret = 0, nr_unlink = 0, nr_truncate = 0;
3467
3468         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3469                 return 0;
3470
3471         path = btrfs_alloc_path();
3472         if (!path) {
3473                 ret = -ENOMEM;
3474                 goto out;
3475         }
3476         path->reada = READA_BACK;
3477
3478         key.objectid = BTRFS_ORPHAN_OBJECTID;
3479         key.type = BTRFS_ORPHAN_ITEM_KEY;
3480         key.offset = (u64)-1;
3481
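        /*
         * Orphan items live under (BTRFS_ORPHAN_OBJECTID, ORPHAN_ITEM_KEY,
         * inode number).  Each pass searches from the highest possible
         * offset and steps back to the highest remaining orphan item.
         */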
3482         while (1) {
3483                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3484                 if (ret < 0)
3485                         goto out;
3486
3487                 /*
3488                  * ret == 0 means we found what we were searching for, which
3489                  * is weird, but possible.  Only adjust the path if we didn't
3490                  * find the key, then check whether the item we landed on matches.
3491                  */
3492                 if (ret > 0) {
3493                         ret = 0;
3494                         if (path->slots[0] == 0)
3495                                 break;
3496                         path->slots[0]--;
3497                 }
3498
3499                 /* pull out the item */
3500                 leaf = path->nodes[0];
3501                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3502
3503                 /* make sure the item matches what we want */
3504                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3505                         break;
3506                 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3507                         break;
3508
3509                 /* release the path since we're done with it */
3510                 btrfs_release_path(path);
3511
3512                 /*
3513                  * This is basically btrfs_lookup, without the root-crossing
3514                  * logic; the inode number is stored in the offset field of
3515                  * the orphan item.
3516                  */
3517
3518                 if (found_key.offset == last_objectid) {
3519                         btrfs_err(fs_info,
3520                                   "Error removing orphan entry, stopping orphan cleanup");
3521                         ret = -EINVAL;
3522                         goto out;
3523                 }
3524
3525                 last_objectid = found_key.offset;
3526
3527                 found_key.objectid = found_key.offset;
3528                 found_key.type = BTRFS_INODE_ITEM_KEY;
3529                 found_key.offset = 0;
3530                 inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
3531                 ret = PTR_ERR_OR_ZERO(inode);
3532                 if (ret && ret != -ENOENT)
3533                         goto out;
3534
3535                 if (ret == -ENOENT && root == fs_info->tree_root) {
3536                         struct btrfs_root *dead_root;
3537                         struct btrfs_fs_info *fs_info = root->fs_info;
3538                         int is_dead_root = 0;
3539
3540                         /*
3541                          * This is an orphan in the tree root.  Currently these
3542                          * can come from two sources:
3543                          *  a) a snapshot deletion in progress
3544                          *  b) a free space cache inode
3545                          * We need to distinguish those two, as the snapshot
3546                          * orphan must not get deleted.
3547                          * find_dead_roots already ran before us, so if this
3548                          * is a snapshot deletion, we should find the root
3549                          * in the dead_roots list