btrfs: sink get_extent parameter to extent_write_full_page
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

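/*
 * Arguments threaded through iget5_locked() when looking up a btrfs
 * inode; see btrfs_iget() and its find/init callbacks.
 */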
struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

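/*
 * Per-call state for direct IO writes: how much space is still reserved
 * and which part of the ordered extent range has not yet been submitted.
 */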
struct btrfs_dio_data {
        u64 reserve;
        u64 unsubmitted_oe_range_start;
        u64 unsubmitted_oe_range_end;
        int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

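/*
 * Map the S_IFMT bits of an inode's mode to the btrfs directory entry
 * type stored on disk; indexed by (mode & S_IFMT) >> S_SHIFT.
 */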
#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static void __endio_write_update_ordered(struct inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the fill_delalloc() callback.
 *
 * NOTE: the caller must ensure that when an error happens, it does not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
                                                 const u64 offset,
                                                 const u64 bytes)
{
        unsigned long index = offset >> PAGE_SHIFT;
        unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
        struct page *page;

        while (index <= end_index) {
                page = find_get_page(inode->i_mapping, index);
                index++;
                if (!page)
                        continue;
                ClearPagePrivate2(page);
                put_page(page);
        }
        return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
                                            bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

fail:
        return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
                                          struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

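        /*
         * Return 1 (meaning "don't inline") unless the whole range can
         * live in a single inline item: it must start at offset 0, end
         * inside the first sector, cover the tail of the file
         * (end + 1 >= i_size), fit both the per-leaf inline limit and
         * the max_inline mount option, and uncompressed data must not
         * end exactly on a sector boundary, since inlining a full
         * sector saves nothing.
         */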
        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &BTRFS_I(inode)->block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
        /*
         * Don't forget to free the reserved space: an inlined extent
         * won't count as a data extent, so free the space directly here.
         * At reserve time it is always aligned to the page size, so
         * just free one page.
         */
        btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}

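/*
 * One chunk of a delalloc range queued by compress_file_range() for the
 * second (submission) phase; pages is NULL when the chunk fell back to
 * uncompressed IO.
 */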
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        unsigned int write_flags;
        struct list_head extents;
        struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* defrag ioctl */
        if (BTRFS_I(inode)->defrag_compress)
                return 1;
        /* bad compression ratios */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
            BTRFS_I(inode)->prop_compress)
                return btrfs_compress_heuristic(inode, start, end);
        return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
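        /*
         * One pass through this loop compresses at most
         * BTRFS_MAX_COMPRESSED (128K) worth of pages; larger ranges
         * come back around via the "again" label.
         */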
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode, start, end)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->defrag_compress)
                        compress_type = BTRFS_I(inode)->defrag_compress;
                else if (BTRFS_I(inode)->prop_compress)
                        compress_type = BTRFS_I(inode)->prop_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 */
                extent_range_clear_dirty_for_io(inode, start, end);
                redirty = 1;

                /* Compression level is applied here and only here */
                ret = btrfs_compress_pages(
                        compress_type | (fs_info->compress_level << 4),
                                           inode->i_mapping, start,
                                           pages,
                                           &nr_pages,
                                           &total_in,
                                           &total_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_SIZE - 1);
                        struct page *page = pages[nr_pages - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < actual_end) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(root, inode, start, end,
                                            0, BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                EXTENT_DO_ACCOUNTING;
                        unsigned long page_error_op;

                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned an error,
                         * so we don't need to create any more async work
                         * items.  Unlock and free up our temp pages.
                         *
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be done _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, clear_flags,
                                                     PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression really is a
                 * win: compare the page count read with the blocks on disk,
                 * compression must free at least one sector size
                 */
                total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed + blocksize <= total_in) {
                        *num_added += 1;

                        /*
                         * The async work queues will take care of doing actual
                         * allocation on disk for these compressed pages, and
                         * will submit them to the elevator.
                         */
                        add_async_extent(async_cow, start, total_in,
                                        total_compressed, pages, nr_pages,
                                        compress_type);

                        if (start + total_in < end) {
                                start += total_in;
                                pages = NULL;
                                cond_resched();
                                goto again;
                        }
                        return;
                }
        }
        }
        if (pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages; i++) {
                        WARN_ON(pages[i]->mapping);
                        put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->prop_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
cleanup_and_bail_uncompressed:
        /*
         * No compression, but we still need to write the pages in the file
         * we've been given so far.  redirty the locked page if it corresponds
         * to our extent and set things up for the async work queue to run
         * cow_file_range to do the normal delalloc dance.
         */
        if (page_offset(locked_page) >= start &&
            page_offset(locked_page) <= end)
                __set_page_dirty_nobuffers(locked_page);
                /* unlocked later on in the async handlers */

        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
                         BTRFS_COMPRESS_NONE);
        *num_added += 1;

        return;

free_pages_out:
        for (i = 0; i < nr_pages; i++) {
                WARN_ON(pages[i]->mapping);
                put_page(pages[i]);
        }
        kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->pages)
                return;

        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
                put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
        async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree;
        int ret = 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
                                             NULL);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(io_tree,
                                                  inode, async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        free_async_extent_pages(async_extent);

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fall back to uncompressed IO, otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                em = create_io_em(inode, async_extent->start,
                                  async_extent->ram_size, /* len */
                                  async_extent->start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  async_extent->ram_size, /* ram_bytes */
                                  async_extent->compress_type,
                                  BTRFS_ORDERED_COMPRESSED);
                if (IS_ERR(em))
                        /* ret value is not necessary due to void function */
                        goto out_free_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(BTRFS_I(inode),
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                if (btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages,
                                    async_cow->write_flags)) {
                        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
                        struct page *p = async_extent->pages[0];
                        const u64 start = async_extent->start;
                        const u64 end = start + async_extent->ram_size - 1;

                        p->mapping = inode->i_mapping;
                        tree->ops->writepage_end_io_hook(p, start, end,
                                                         NULL, 0);
                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, 0,
                                                     PAGE_END_WRITEBACK |
                                                     PAGE_SET_ERROR);
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        return;
out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
        u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        disk_num_bytes = num_bytes;

        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(root, inode, start, end, 0,
                                        BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        /*
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be run _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(fs_info->super_copy));

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);

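        /*
         * Allocate on-disk extents for the range, largest contiguous
         * chunk first; each iteration creates one ordered extent that
         * btrfs_finish_ordered_io() will complete once the IO is done.
         */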
        while (disk_num_bytes > 0) {
                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                extent_reserved = true;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em))
                        goto out_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
                         * Only drop the cache here, and process as normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at the out_unlock label to free the meta of this
                         * ordered extent, as its meta should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip the current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_cache(BTRFS_I(inode), start,
                                                start + ram_size - 1, 0);
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * set up for writepage
                 */
                page_ops = unlock ? PAGE_UNLOCK : 0;
                page_ops |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                if (disk_num_bytes < cur_alloc_size)
                        disk_num_bytes = 0;
                else
                        disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * On btrfs_reloc_clone_csums() error: since start was
                 * increased, extent_clear_unlock_delalloc() at the out_unlock
                 * label won't free the metadata of the current ordered
                 * extent, so we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                PAGE_END_WRITEBACK;
        /*
         * If we reserved an extent for our delalloc range (or a subrange) and
         * failed to create the respective ordered extent, then it means that
         * when we reserved the extent we decremented the extent's size from
         * the data space_info's bytes_may_use counter and incremented the
         * space_info's bytes_reserved counter by the same amount. We must make
         * sure extent_clear_unlock_delalloc() does not try to decrement again
         * the data space_info's bytes_may_use counter, therefore we do not pass
         * it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size,
                                             start + cur_alloc_size,
                                             locked_page,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
                if (start >= end)
                        goto out;
        }
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
        goto out;
}

/*
 * work queue callback to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info;
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        fs_info = root->fs_info;
        nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /*
         * atomic_sub_return implies a barrier for waitqueue_active
         */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written,
                                unsigned int write_flags)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL);
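        /*
         * Carve the range into chunks (at most SZ_512K each, unless
         * compression is off for this inode) and queue one async_cow
         * work item per chunk on the delalloc workqueue.
         */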
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;
                async_cow->write_flags = write_flags;

                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
                    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
                        cur_end = end;
                else
                        cur_end = min(end, start + SZ_512K - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                btrfs_init_work(&async_cow->work,
                                btrfs_delalloc_helper,
                                async_cow_start, async_cow_submit,
                                async_cow_free);

                nr_pages = (cur_end - start + PAGE_SIZE) >>
                        PAGE_SHIFT;
                atomic_add(nr_pages, &fs_info->async_delalloc_pages);

                btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}

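/*
 * Return 0 if no checksum items exist for the given logical byte range,
 * nonzero otherwise; a range with checksums cannot be overwritten in
 * place by a nocow write.
 */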
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        return 1;
}
1261
1262 /*
1263  * Called back for nocow writeback.  This checks for snapshots or COW copies
1264  * of the extents that exist in the file, and COWs the file as required.
1265  *
1266  * If no COW copies or snapshots exist, we write directly to the existing
1267  * blocks on disk.
1268  */
1269 static noinline int run_delalloc_nocow(struct inode *inode,
1270                                        struct page *locked_page,
1271                               u64 start, u64 end, int *page_started, int force,
1272                               unsigned long *nr_written)
1273 {
1274         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1275         struct btrfs_root *root = BTRFS_I(inode)->root;
1276         struct extent_buffer *leaf;
1277         struct btrfs_path *path;
1278         struct btrfs_file_extent_item *fi;
1279         struct btrfs_key found_key;
1280         struct extent_map *em;
1281         u64 cow_start;
1282         u64 cur_offset;
1283         u64 extent_end;
1284         u64 extent_offset;
1285         u64 disk_bytenr;
1286         u64 num_bytes;
1287         u64 disk_num_bytes;
1288         u64 ram_bytes;
1289         int extent_type;
1290         int ret, err;
1291         int type;
1292         int nocow;
1293         int check_prev = 1;
1294         bool nolock;
1295         u64 ino = btrfs_ino(BTRFS_I(inode));
1296
1297         path = btrfs_alloc_path();
1298         if (!path) {
1299                 extent_clear_unlock_delalloc(inode, start, end, end,
1300                                              locked_page,
1301                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1302                                              EXTENT_DO_ACCOUNTING |
1303                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1304                                              PAGE_CLEAR_DIRTY |
1305                                              PAGE_SET_WRITEBACK |
1306                                              PAGE_END_WRITEBACK);
1307                 return -ENOMEM;
1308         }
1309
1310         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1311
1312         cow_start = (u64)-1;
1313         cur_offset = start;
1314         while (1) {
1315                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1316                                                cur_offset, 0);
1317                 if (ret < 0)
1318                         goto error;
1319                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1320                         leaf = path->nodes[0];
1321                         btrfs_item_key_to_cpu(leaf, &found_key,
1322                                               path->slots[0] - 1);
1323                         if (found_key.objectid == ino &&
1324                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1325                                 path->slots[0]--;
1326                 }
1327                 check_prev = 0;
1328 next_slot:
1329                 leaf = path->nodes[0];
1330                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1331                         ret = btrfs_next_leaf(root, path);
1332                         if (ret < 0)
1333                                 goto error;
1334                         if (ret > 0)
1335                                 break;
1336                         leaf = path->nodes[0];
1337                 }
1338
1339                 nocow = 0;
1340                 disk_bytenr = 0;
1341                 num_bytes = 0;
1342                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1343
1344                 if (found_key.objectid > ino)
1345                         break;
1346                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1347                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1348                         path->slots[0]++;
1349                         goto next_slot;
1350                 }
1351                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1352                     found_key.offset > end)
1353                         break;
1354
1355                 if (found_key.offset > cur_offset) {
1356                         extent_end = found_key.offset;
1357                         extent_type = 0;
1358                         goto out_check;
1359                 }
1360
1361                 fi = btrfs_item_ptr(leaf, path->slots[0],
1362                                     struct btrfs_file_extent_item);
1363                 extent_type = btrfs_file_extent_type(leaf, fi);
1364
1365                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1366                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1367                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1368                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1369                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1370                         extent_end = found_key.offset +
1371                                 btrfs_file_extent_num_bytes(leaf, fi);
1372                         disk_num_bytes =
1373                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1374                         if (extent_end <= start) {
1375                                 path->slots[0]++;
1376                                 goto next_slot;
1377                         }
1378                         if (disk_bytenr == 0)
1379                                 goto out_check;
1380                         if (btrfs_file_extent_compression(leaf, fi) ||
1381                             btrfs_file_extent_encryption(leaf, fi) ||
1382                             btrfs_file_extent_other_encoding(leaf, fi))
1383                                 goto out_check;
1384                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1385                                 goto out_check;
1386                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1387                                 goto out_check;
1388                         if (btrfs_cross_ref_exist(root, ino,
1389                                                   found_key.offset -
1390                                                   extent_offset, disk_bytenr))
1391                                 goto out_check;
1392                         disk_bytenr += extent_offset;
1393                         disk_bytenr += cur_offset - found_key.offset;
1394                         num_bytes = min(end + 1, extent_end) - cur_offset;
1395                         /*
1396                          * If there are pending snapshots for this root,
1397                          * fall back to the common COW path.
1398                          */
1399                         if (!nolock) {
1400                                 err = btrfs_start_write_no_snapshotting(root);
1401                                 if (!err)
1402                                         goto out_check;
1403                         }
1404                         /*
1405                          * Force COW if csums exist in the range.  This
1406                          * ensures that the csums for a given extent are
1407                          * either all valid or do not exist at all.
1408                          */
1409                         if (csum_exist_in_range(fs_info, disk_bytenr,
1410                                                 num_bytes)) {
1411                                 if (!nolock)
1412                                         btrfs_end_write_no_snapshotting(root);
1413                                 goto out_check;
1414                         }
1415                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1416                                 if (!nolock)
1417                                         btrfs_end_write_no_snapshotting(root);
1418                                 goto out_check;
1419                         }
1420                         nocow = 1;
1421                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1422                         extent_end = found_key.offset +
1423                                 btrfs_file_extent_inline_len(leaf,
1424                                                      path->slots[0], fi);
1425                         extent_end = ALIGN(extent_end,
1426                                            fs_info->sectorsize);
1427                 } else {
1428                         BUG_ON(1);
1429                 }
1430 out_check:
1431                 if (extent_end <= start) {
1432                         path->slots[0]++;
1433                         if (!nolock && nocow)
1434                                 btrfs_end_write_no_snapshotting(root);
1435                         if (nocow)
1436                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1437                         goto next_slot;
1438                 }
1439                 if (!nocow) {
1440                         if (cow_start == (u64)-1)
1441                                 cow_start = cur_offset;
1442                         cur_offset = extent_end;
1443                         if (cur_offset > end)
1444                                 break;
1445                         path->slots[0]++;
1446                         goto next_slot;
1447                 }
1448
1449                 btrfs_release_path(path);
1450                 if (cow_start != (u64)-1) {
1451                         ret = cow_file_range(inode, locked_page,
1452                                              cow_start, found_key.offset - 1,
1453                                              end, page_started, nr_written, 1,
1454                                              NULL);
1455                         if (ret) {
1456                                 if (!nolock && nocow)
1457                                         btrfs_end_write_no_snapshotting(root);
1458                                 if (nocow)
1459                                         btrfs_dec_nocow_writers(fs_info,
1460                                                                 disk_bytenr);
1461                                 goto error;
1462                         }
1463                         cow_start = (u64)-1;
1464                 }
1465
1466                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1467                         u64 orig_start = found_key.offset - extent_offset;
1468
1469                         em = create_io_em(inode, cur_offset, num_bytes,
1470                                           orig_start,
1471                                           disk_bytenr, /* block_start */
1472                                           num_bytes, /* block_len */
1473                                           disk_num_bytes, /* orig_block_len */
1474                                           ram_bytes, BTRFS_COMPRESS_NONE,
1475                                           BTRFS_ORDERED_PREALLOC);
1476                         if (IS_ERR(em)) {
1477                                 if (!nolock && nocow)
1478                                         btrfs_end_write_no_snapshotting(root);
1479                                 if (nocow)
1480                                         btrfs_dec_nocow_writers(fs_info,
1481                                                                 disk_bytenr);
1482                                 ret = PTR_ERR(em);
1483                                 goto error;
1484                         }
1485                         free_extent_map(em);
1486                 }
1487
1488                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1489                         type = BTRFS_ORDERED_PREALLOC;
1490                 } else {
1491                         type = BTRFS_ORDERED_NOCOW;
1492                 }
1493
1494                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1495                                                num_bytes, num_bytes, type);
1496                 if (nocow)
1497                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1498                 BUG_ON(ret); /* -ENOMEM */
1499
1500                 if (root->root_key.objectid ==
1501                     BTRFS_DATA_RELOC_TREE_OBJECTID)
1502                         /*
1503                          * The error is handled later; we must keep the error
1504                          * handler's extent_clear_unlock_delalloc() from
1505                          * freeing the metadata of the created ordered extent.
1506                          */
1507                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1508                                                       num_bytes);
1509
1510                 extent_clear_unlock_delalloc(inode, cur_offset,
1511                                              cur_offset + num_bytes - 1, end,
1512                                              locked_page, EXTENT_LOCKED |
1513                                              EXTENT_DELALLOC |
1514                                              EXTENT_CLEAR_DATA_RESV,
1515                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1516
1517                 if (!nolock && nocow)
1518                         btrfs_end_write_no_snapshotting(root);
1519                 cur_offset = extent_end;
1520
1521                 /*
1522                  * On a btrfs_reloc_clone_csums() error we are now OK to call
1523                  * the error handler, as the metadata for the created ordered
1524                  * extent will only be freed by btrfs_finish_ordered_io().
1525                  */
1526                 if (ret)
1527                         goto error;
1528                 if (cur_offset > end)
1529                         break;
1530         }
1531         btrfs_release_path(path);
1532
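        /*
         * Editorial note: the scan loop can break out before covering the
         * whole range; anything left over, plus any COW run still open in
         * cow_start, is handed to cow_file_range() below.
         */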
1533         if (cur_offset <= end && cow_start == (u64)-1) {
1534                 cow_start = cur_offset;
1535                 cur_offset = end;
1536         }
1537
1538         if (cow_start != (u64)-1) {
1539                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1540                                      page_started, nr_written, 1, NULL);
1541                 if (ret)
1542                         goto error;
1543         }
1544
1545 error:
1546         if (ret && cur_offset < end)
1547                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1548                                              locked_page, EXTENT_LOCKED |
1549                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1550                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1551                                              PAGE_CLEAR_DIRTY |
1552                                              PAGE_SET_WRITEBACK |
1553                                              PAGE_END_WRITEBACK);
1554         btrfs_free_path(path);
1555         return ret;
1556 }
1557
1558 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1559 {
1561         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1562             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1563                 return 0;
1564
1565         /*
1566          * @defrag_bytes is a hint value; no spinlock is held here.
1567          * If it is non-zero, the file is being defragged.  Force COW
1568          * if the given extent needs to be defragged.
1569          */
1570         if (BTRFS_I(inode)->defrag_bytes &&
1571             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1572                            EXTENT_DEFRAG, 0, NULL))
1573                 return 1;
1574
1575         return 0;
1576 }
1577
1578 /*
1579  * extent_io.c callback to do delayed allocation processing
1580  */
1581 static int run_delalloc_range(void *private_data, struct page *locked_page,
1582                               u64 start, u64 end, int *page_started,
1583                               unsigned long *nr_written,
1584                               struct writeback_control *wbc)
1585 {
1586         struct inode *inode = private_data;
1587         int ret;
1588         int force_cow = need_force_cow(inode, start, end);
1589         unsigned int write_flags = wbc_to_write_flags(wbc);
1590
1591         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1592                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1593                                          page_started, 1, nr_written);
1594         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1595                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1596                                          page_started, 0, nr_written);
1597         } else if (!inode_need_compress(inode, start, end)) {
1598                 ret = cow_file_range(inode, locked_page, start, end, end,
1599                                       page_started, nr_written, 1, NULL);
1600         } else {
1601                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1602                         &BTRFS_I(inode)->runtime_flags);
1603                 ret = cow_file_range_async(inode, locked_page, start, end,
1604                                            page_started, nr_written,
1605                                            write_flags);
1606         }
1607         if (ret)
1608                 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1609         return ret;
1610 }
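/*
 * Editorial summary of the dispatch above:
 *
 *   NODATACOW inode, no forced COW  -> run_delalloc_nocow() with force = 1
 *   PREALLOC inode, no forced COW   -> run_delalloc_nocow() with force = 0
 *   compression not worthwhile      -> plain cow_file_range()
 *   otherwise                       -> cow_file_range_async(), whose worker
 *                                      threads also handle the compression
 */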
1611
1612 static void btrfs_split_extent_hook(void *private_data,
1613                                     struct extent_state *orig, u64 split)
1614 {
1615         struct inode *inode = private_data;
1616         u64 size;
1617
1618         /* not delalloc, ignore it */
1619         if (!(orig->state & EXTENT_DELALLOC))
1620                 return;
1621
1622         size = orig->end - orig->start + 1;
1623         if (size > BTRFS_MAX_EXTENT_SIZE) {
1624                 u32 num_extents;
1625                 u64 new_size;
1626
1627                 /*
1628                  * See the explanation in btrfs_merge_extent_hook, the same
1629                  * applies here, just in reverse.
1630                  */
1631                 new_size = orig->end - split + 1;
1632                 num_extents = count_max_extents(new_size);
1633                 new_size = split - orig->start;
1634                 num_extents += count_max_extents(new_size);
1635                 if (count_max_extents(size) >= num_extents)
1636                         return;
1637         }
1638
1639         spin_lock(&BTRFS_I(inode)->lock);
1640         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1641         spin_unlock(&BTRFS_I(inode)->lock);
1642 }
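/*
 * Worked example for the split hook above (editorial): splitting a 128K
 * delalloc extent at 64K turns one outstanding extent into two, so one more
 * outstanding extent is accounted.  Splitting a 2 * BTRFS_MAX_EXTENT_SIZE
 * extent at the midpoint changes nothing: it was already accounted as two
 * extents, and the two halves still need exactly one each, so the early
 * return is taken.
 */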
1643
1644 /*
1645  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1646  * extents: when new extents are merged onto old ones, such as during
1647  * sequential writes, we need to notice so we can properly account for
1648  * the metadata space we'll need.
1649  */
1650 static void btrfs_merge_extent_hook(void *private_data,
1651                                     struct extent_state *new,
1652                                     struct extent_state *other)
1653 {
1654         struct inode *inode = private_data;
1655         u64 new_size, old_size;
1656         u32 num_extents;
1657
1658         /* not delalloc, ignore it */
1659         if (!(other->state & EXTENT_DELALLOC))
1660                 return;
1661
1662         if (new->start > other->start)
1663                 new_size = new->end - other->start + 1;
1664         else
1665                 new_size = other->end - new->start + 1;
1666
1667         /* we're not bigger than the max, unreserve the space and go */
1668         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1669                 spin_lock(&BTRFS_I(inode)->lock);
1670                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1671                 spin_unlock(&BTRFS_I(inode)->lock);
1672                 return;
1673         }
1674
1675         /*
1676          * We have to add up either side to figure out how many extents were
1677          * accounted for before we merged into one big extent.  If the number of
1678          * extents we accounted for is <= the amount we need for the new range
1679          * then we can return, otherwise drop.  Think of it like this
1680          *
1681          * [ 4k][MAX_SIZE]
1682          *
1683          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1684          * need 2 outstanding extents, on one side we have 1 and the other side
1685          * we have 1 so they are == and we can return.  But in this case
1686          *
1687          * [MAX_SIZE+4k][MAX_SIZE+4k]
1688          *
1689          * Each range on their own accounts for 2 extents, but merged together
1690          * they are only 3 extents worth of accounting, so we need to drop in
1691          * this case.
1692          */
1693         old_size = other->end - other->start + 1;
1694         num_extents = count_max_extents(old_size);
1695         old_size = new->end - new->start + 1;
1696         num_extents += count_max_extents(old_size);
1697         if (count_max_extents(new_size) >= num_extents)
1698                 return;
1699
1700         spin_lock(&BTRFS_I(inode)->lock);
1701         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1702         spin_unlock(&BTRFS_I(inode)->lock);
1703 }
1704
1705 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1706                                       struct inode *inode)
1707 {
1708         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1709
1710         spin_lock(&root->delalloc_lock);
1711         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1712                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1713                               &root->delalloc_inodes);
1714                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1715                         &BTRFS_I(inode)->runtime_flags);
1716                 root->nr_delalloc_inodes++;
1717                 if (root->nr_delalloc_inodes == 1) {
1718                         spin_lock(&fs_info->delalloc_root_lock);
1719                         BUG_ON(!list_empty(&root->delalloc_root));
1720                         list_add_tail(&root->delalloc_root,
1721                                       &fs_info->delalloc_roots);
1722                         spin_unlock(&fs_info->delalloc_root_lock);
1723                 }
1724         }
1725         spin_unlock(&root->delalloc_lock);
1726 }
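/*
 * Editorial note: the delalloc bookkeeping is two-level.  Inodes with
 * pending delalloc sit on their root's ->delalloc_inodes list, and a root
 * with at least one such inode sits on fs_info->delalloc_roots; the add
 * helper above and the del helper below keep both levels in sync.
 */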
1727
1728 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1729                                      struct btrfs_inode *inode)
1730 {
1731         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1732
1733         spin_lock(&root->delalloc_lock);
1734         if (!list_empty(&inode->delalloc_inodes)) {
1735                 list_del_init(&inode->delalloc_inodes);
1736                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1737                           &inode->runtime_flags);
1738                 root->nr_delalloc_inodes--;
1739                 if (!root->nr_delalloc_inodes) {
1740                         spin_lock(&fs_info->delalloc_root_lock);
1741                         BUG_ON(list_empty(&root->delalloc_root));
1742                         list_del_init(&root->delalloc_root);
1743                         spin_unlock(&fs_info->delalloc_root_lock);
1744                 }
1745         }
1746         spin_unlock(&root->delalloc_lock);
1747 }
1748
1749 /*
1750  * extent_io.c set_bit_hook, used to track delayed allocation
1751  * bytes in this file, and to maintain the list of inodes that
1752  * have pending delalloc work to be done.
1753  */
1754 static void btrfs_set_bit_hook(void *private_data,
1755                                struct extent_state *state, unsigned *bits)
1756 {
1757         struct inode *inode = private_data;
1758
1759         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1760
1761         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1762                 WARN_ON(1);
1763         /*
1764          * set_bit and clear_bit hooks normally require _irqsave/restore
1765          * but in this case, we are only testing for the DELALLOC
1766          * bit, which is only set or cleared with irqs on
1767          */
1768         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1769                 struct btrfs_root *root = BTRFS_I(inode)->root;
1770                 u64 len = state->end + 1 - state->start;
1771                 u32 num_extents = count_max_extents(len);
1772                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1773
1774                 spin_lock(&BTRFS_I(inode)->lock);
1775                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1776                 spin_unlock(&BTRFS_I(inode)->lock);
1777
1778                 /* For sanity tests */
1779                 if (btrfs_is_testing(fs_info))
1780                         return;
1781
1782                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1783                                          fs_info->delalloc_batch);
1784                 spin_lock(&BTRFS_I(inode)->lock);
1785                 BTRFS_I(inode)->delalloc_bytes += len;
1786                 if (*bits & EXTENT_DEFRAG)
1787                         BTRFS_I(inode)->defrag_bytes += len;
1788                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1789                                          &BTRFS_I(inode)->runtime_flags))
1790                         btrfs_add_delalloc_inodes(root, inode);
1791                 spin_unlock(&BTRFS_I(inode)->lock);
1792         }
1793
1794         if (!(state->state & EXTENT_DELALLOC_NEW) &&
1795             (*bits & EXTENT_DELALLOC_NEW)) {
1796                 spin_lock(&BTRFS_I(inode)->lock);
1797                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1798                         state->start;
1799                 spin_unlock(&BTRFS_I(inode)->lock);
1800         }
1801 }
1802
1803 /*
1804  * extent_io.c clear_bit_hook, see set_bit_hook for why
1805  */
1806 static void btrfs_clear_bit_hook(void *private_data,
1807                                  struct extent_state *state,
1808                                  unsigned *bits)
1809 {
1810         struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1811         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1812         u64 len = state->end + 1 - state->start;
1813         u32 num_extents = count_max_extents(len);
1814
1815         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1816                 spin_lock(&inode->lock);
1817                 inode->defrag_bytes -= len;
1818                 spin_unlock(&inode->lock);
1819         }
1820
1821         /*
1822          * set_bit and clear_bit hooks normally require _irqsave/restore
1823          * but in this case, we are only testing for the DELALLOC
1824          * bit, which is only set or cleared with irqs on
1825          */
1826         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1827                 struct btrfs_root *root = inode->root;
1828                 bool do_list = !btrfs_is_free_space_inode(inode);
1829
1830                 spin_lock(&inode->lock);
1831                 btrfs_mod_outstanding_extents(inode, -num_extents);
1832                 spin_unlock(&inode->lock);
1833
1834                 /*
1835                  * We don't reserve metadata space for space cache inodes so we
1836          * don't need to call btrfs_delalloc_release_metadata if there is an
1837                  * error.
1838                  */
1839                 if (*bits & EXTENT_CLEAR_META_RESV &&
1840                     root != fs_info->tree_root)
1841                         btrfs_delalloc_release_metadata(inode, len);
1842
1843                 /* For sanity tests. */
1844                 if (btrfs_is_testing(fs_info))
1845                         return;
1846
1847                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1848                     do_list && !(state->state & EXTENT_NORESERVE) &&
1849                     (*bits & EXTENT_CLEAR_DATA_RESV))
1850                         btrfs_free_reserved_data_space_noquota(
1851                                         &inode->vfs_inode,
1852                                         state->start, len);
1853
1854                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1855                                          fs_info->delalloc_batch);
1856                 spin_lock(&inode->lock);
1857                 inode->delalloc_bytes -= len;
1858                 if (do_list && inode->delalloc_bytes == 0 &&
1859                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1860                                         &inode->runtime_flags))
1861                         btrfs_del_delalloc_inode(root, inode);
1862                 spin_unlock(&inode->lock);
1863         }
1864
1865         if ((state->state & EXTENT_DELALLOC_NEW) &&
1866             (*bits & EXTENT_DELALLOC_NEW)) {
1867                 spin_lock(&inode->lock);
1868                 ASSERT(inode->new_delalloc_bytes >= len);
1869                 inode->new_delalloc_bytes -= len;
1870                 spin_unlock(&inode->lock);
1871         }
1872 }
1873
1874 /*
1875  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1876  * we don't create bios that span stripes or chunks
1877  *
1878  * return 1 if the page cannot be merged into the bio
1879  * return 0 if the page can be merged into the bio
1880  * return error otherwise
1881  */
1882 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1883                          size_t size, struct bio *bio,
1884                          unsigned long bio_flags)
1885 {
1886         struct inode *inode = page->mapping->host;
1887         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1888         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1889         u64 length = 0;
1890         u64 map_length;
1891         int ret;
1892
1893         if (bio_flags & EXTENT_BIO_COMPRESSED)
1894                 return 0;
1895
1896         length = bio->bi_iter.bi_size;
1897         map_length = length;
1898         ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1899                               NULL, 0);
1900         if (ret < 0)
1901                 return ret;
1902         if (map_length < length + size)
1903                 return 1;
1904         return 0;
1905 }
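/*
 * Illustrative example for the merge check above (editorial, device
 * geometry made up): if btrfs_map_block() reports that only 60K can be
 * mapped contiguously from the bio's logical start, a bio already 57K long
 * cannot take another 4K page, since 57K + 4K > 60K, so 1 is returned and
 * the page must start a new bio.
 */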
1906
1907 /*
1908  * in order to insert checksums into the metadata in large chunks,
1909  * we wait until bio submission time.  All the pages in the bio are
1910  * checksummed and sums are attached onto the ordered extent record.
1911  *
1912  * At IO completion time the csums attached to the ordered extent record
1913  * are inserted into the btree.
1914  */
1915 static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1916                                     int mirror_num, unsigned long bio_flags,
1917                                     u64 bio_offset)
1918 {
1919         struct inode *inode = private_data;
1920         blk_status_t ret = 0;
1921
1922         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1923         BUG_ON(ret); /* -ENOMEM */
1924         return 0;
1925 }
1926
1927 /*
1928  * Runs once the async checksumming started in __btrfs_submit_bio_start
1929  * has finished: map the bio to the underlying device(s) and submit it.
1930  *
1931  * On failure the error is recorded in bio->bi_status and the bio is
1932  * ended here before the status is returned.
1934  */
1935 static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1936                           int mirror_num, unsigned long bio_flags,
1937                           u64 bio_offset)
1938 {
1939         struct inode *inode = private_data;
1940         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1941         blk_status_t ret;
1942
1943         ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1944         if (ret) {
1945                 bio->bi_status = ret;
1946                 bio_endio(bio);
1947         }
1948         return ret;
1949 }
1950
1951 /*
1952  * extent_io.c submission hook. This does the right thing for csum calculation
1953  * on write, or reading the csums from the tree before a read.
1954  *
1955  * Rules about async/sync submit,
1956  * a) read:                             sync submit
1957  *
1958  * b) write without checksum:           sync submit
1959  *
1960  * c) write with checksum:
1961  *    c-1) if bio is issued by fsync:   sync submit
1962  *         (sync_writers != 0)
1963  *
1964  *    c-2) if root is reloc root:       sync submit
1965  *         (only in case of buffered IO)
1966  *
1967  *    c-3) otherwise:                   async submit
1968  */
1969 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1970                                  int mirror_num, unsigned long bio_flags,
1971                                  u64 bio_offset)
1972 {
1973         struct inode *inode = private_data;
1974         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1975         struct btrfs_root *root = BTRFS_I(inode)->root;
1976         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1977         blk_status_t ret = 0;
1978         int skip_sum;
1979         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1980
1981         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1982
1983         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1984                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1985
1986         if (bio_op(bio) != REQ_OP_WRITE) {
1987                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1988                 if (ret)
1989                         goto out;
1990
1991                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1992                         ret = btrfs_submit_compressed_read(inode, bio,
1993                                                            mirror_num,
1994                                                            bio_flags);
1995                         goto out;
1996                 } else if (!skip_sum) {
1997                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
1998                         if (ret)
1999                                 goto out;
2000                 }
2001                 goto mapit;
2002         } else if (async && !skip_sum) {
2003                 /* csum items have already been cloned */
2004                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2005                         goto mapit;
2006                 /* we're doing a write, do the async checksumming */
2007                 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2008                                           bio_offset, inode,
2009                                           __btrfs_submit_bio_start,
2010                                           __btrfs_submit_bio_done);
2011                 goto out;
2012         } else if (!skip_sum) {
2013                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2014                 if (ret)
2015                         goto out;
2016         }
2017
2018 mapit:
2019         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2020
2021 out:
2022         if (ret) {
2023                 bio->bi_status = ret;
2024                 bio_endio(bio);
2025         }
2026         return ret;
2027 }
2028
2029 /*
2030  * given a list of ordered sums, record them in the inode.  This happens
2031  * at IO completion time based on sums calculated at bio submission time.
2032  */
2033 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2034                              struct inode *inode, struct list_head *list)
2035 {
2036         struct btrfs_ordered_sum *sum;
2037
2038         list_for_each_entry(sum, list, list) {
2039                 trans->adding_csums = 1;
2040                 btrfs_csum_file_blocks(trans,
2041                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2042                 trans->adding_csums = 0;
2043         }
2044         return 0;
2045 }
2046
2047 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2048                               unsigned int extra_bits,
2049                               struct extent_state **cached_state, int dedupe)
2050 {
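        /* 'end' is an inclusive last byte, so it must never be page aligned */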
2051         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2052         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2053                                    extra_bits, cached_state);
2054 }
2055
2056 /* see btrfs_writepage_start_hook for details on why this is required */
2057 struct btrfs_writepage_fixup {
2058         struct page *page;
2059         struct btrfs_work work;
2060 };
2061
2062 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2063 {
2064         struct btrfs_writepage_fixup *fixup;
2065         struct btrfs_ordered_extent *ordered;
2066         struct extent_state *cached_state = NULL;
2067         struct extent_changeset *data_reserved = NULL;
2068         struct page *page;
2069         struct inode *inode;
2070         u64 page_start;
2071         u64 page_end;
2072         int ret;
2073
2074         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2075         page = fixup->page;
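        /*
         * Editorial note: if an ordered extent already covers the page we
         * drop our locks, wait for it to complete and retry from the top,
         * since writeback may have raced with this worker.
         */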
2076 again:
2077         lock_page(page);
2078         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2079                 ClearPageChecked(page);
2080                 goto out_page;
2081         }
2082
2083         inode = page->mapping->host;
2084         page_start = page_offset(page);
2085         page_end = page_offset(page) + PAGE_SIZE - 1;
2086
2087         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2088                          &cached_state);
2089
2090         /* already ordered? We're done */
2091         if (PagePrivate2(page))
2092                 goto out;
2093
2094         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2095                                         PAGE_SIZE);
2096         if (ordered) {
2097                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2098                                      page_end, &cached_state, GFP_NOFS);
2099                 unlock_page(page);
2100                 btrfs_start_ordered_extent(inode, ordered, 1);
2101                 btrfs_put_ordered_extent(ordered);
2102                 goto again;
2103         }
2104
2105         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2106                                            PAGE_SIZE);
2107         if (ret) {
2108                 mapping_set_error(page->mapping, ret);
2109                 end_extent_writepage(page, ret, page_start, page_end);
2110                 ClearPageChecked(page);
2111                 goto out;
2112         }
2113
2114         btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state,
2115                                   0);
2116         ClearPageChecked(page);
2117         set_page_dirty(page);
2118         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2119 out:
2120         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2121                              &cached_state, GFP_NOFS);
2122 out_page:
2123         unlock_page(page);
2124         put_page(page);
2125         kfree(fixup);
2126         extent_changeset_free(data_reserved);
2127 }
2128
2129 /*
2130  * There are a few paths in the higher layers of the kernel that directly
2131  * set the page dirty bit without asking the filesystem if it is a
2132  * good idea.  This causes problems because we want to make sure COW
2133  * properly happens and the data=ordered rules are followed.
2134  *
2135  * In our case any range that doesn't have the ORDERED bit set
2136  * hasn't been properly set up for IO.  We kick off an async process
2137  * to fix it up.  The async helper will wait for ordered extents, set
2138  * the delalloc bit and make it safe to write the page.
2139  */
2140 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2141 {
2142         struct inode *inode = page->mapping->host;
2143         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2144         struct btrfs_writepage_fixup *fixup;
2145
2146         /* this page is properly in the ordered list */
2147         if (TestClearPagePrivate2(page))
2148                 return 0;
2149
2150         if (PageChecked(page))
2151                 return -EAGAIN;
2152
2153         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2154         if (!fixup)
2155                 return -EAGAIN;
2156
2157         SetPageChecked(page);
2158         get_page(page);
2159         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2160                         btrfs_writepage_fixup_worker, NULL, NULL);
2161         fixup->page = page;
2162         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2163         return -EBUSY;
2164 }
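/*
 * Editorial note on the returns above: 0 means the page was already covered
 * by an ordered extent and writeback may proceed; -EAGAIN tells the caller
 * to redirty and back off (a fixup is already pending, or the allocation
 * failed); -EBUSY means a fixup worker now owns the page reference.
 */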
2165
2166 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2167                                        struct inode *inode, u64 file_pos,
2168                                        u64 disk_bytenr, u64 disk_num_bytes,
2169                                        u64 num_bytes, u64 ram_bytes,
2170                                        u8 compression, u8 encryption,
2171                                        u16 other_encoding, int extent_type)
2172 {
2173         struct btrfs_root *root = BTRFS_I(inode)->root;
2174         struct btrfs_file_extent_item *fi;
2175         struct btrfs_path *path;
2176         struct extent_buffer *leaf;
2177         struct btrfs_key ins;
2178         u64 qg_released;
2179         int extent_inserted = 0;
2180         int ret;
2181
2182         path = btrfs_alloc_path();
2183         if (!path)
2184                 return -ENOMEM;
2185
2186         /*
2187          * we may be replacing one extent in the tree with another.
2188          * The new extent is pinned in the extent map, and we don't want
2189          * to drop it from the cache until it is completely in the btree.
2190          *
2191          * So, tell btrfs_drop_extents to leave this extent in the cache.
2192          * the caller is expected to unpin it and allow it to be merged
2193          * with the others.
2194          */
2195         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2196                                    file_pos + num_bytes, NULL, 0,
2197                                    1, sizeof(*fi), &extent_inserted);
2198         if (ret)
2199                 goto out;
2200
2201         if (!extent_inserted) {
2202                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2203                 ins.offset = file_pos;
2204                 ins.type = BTRFS_EXTENT_DATA_KEY;
2205
2206                 path->leave_spinning = 1;
2207                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2208                                               sizeof(*fi));
2209                 if (ret)
2210                         goto out;
2211         }
2212         leaf = path->nodes[0];
2213         fi = btrfs_item_ptr(leaf, path->slots[0],
2214                             struct btrfs_file_extent_item);
2215         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2216         btrfs_set_file_extent_type(leaf, fi, extent_type);
2217         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2218         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2219         btrfs_set_file_extent_offset(leaf, fi, 0);
2220         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2221         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2222         btrfs_set_file_extent_compression(leaf, fi, compression);
2223         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2224         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2225
2226         btrfs_mark_buffer_dirty(leaf);
2227         btrfs_release_path(path);
2228
2229         inode_add_bytes(inode, num_bytes);
2230
2231         ins.objectid = disk_bytenr;
2232         ins.offset = disk_num_bytes;
2233         ins.type = BTRFS_EXTENT_ITEM_KEY;
2234
2235         /*
2236          * Release the reserved range from the inode's dirty range map, as
2237          * it has already been moved into the delayed_ref_head.
2238          */
2239         ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2240         if (ret < 0)
2241                 goto out;
2242         qg_released = ret;
2243         ret = btrfs_alloc_reserved_file_extent(trans, root,
2244                                                btrfs_ino(BTRFS_I(inode)),
2245                                                file_pos, qg_released, &ins);
2246 out:
2247         btrfs_free_path(path);
2248
2249         return ret;
2250 }
2251
2252 /* snapshot-aware defrag */
2253 struct sa_defrag_extent_backref {
2254         struct rb_node node;
2255         struct old_sa_defrag_extent *old;
2256         u64 root_id;
2257         u64 inum;
2258         u64 file_pos;
2259         u64 extent_offset;
2260         u64 num_bytes;
2261         u64 generation;
2262 };
2263
2264 struct old_sa_defrag_extent {
2265         struct list_head list;
2266         struct new_sa_defrag_extent *new;
2267
2268         u64 extent_offset;
2269         u64 bytenr;
2270         u64 offset;
2271         u64 len;
2272         int count;
2273 };
2274
2275 struct new_sa_defrag_extent {
2276         struct rb_root root;
2277         struct list_head head;
2278         struct btrfs_path *path;
2279         struct inode *inode;
2280         u64 file_pos;
2281         u64 len;
2282         u64 bytenr;
2283         u64 disk_len;
2284         u8 compress_type;
2285 };
2286
2287 static int backref_comp(struct sa_defrag_extent_backref *b1,
2288                         struct sa_defrag_extent_backref *b2)
2289 {
2290         if (b1->root_id < b2->root_id)
2291                 return -1;
2292         else if (b1->root_id > b2->root_id)
2293                 return 1;
2294
2295         if (b1->inum < b2->inum)
2296                 return -1;
2297         else if (b1->inum > b2->inum)
2298                 return 1;
2299
2300         if (b1->file_pos < b2->file_pos)
2301                 return -1;
2302         else if (b1->file_pos > b2->file_pos)
2303                 return 1;
2304
2305         /*
2306          * [------------------------------] ===> (a range of space)
2307          *     |<--->|   |<---->| =============> (fs/file tree A)
2308          * |<---------------------------->| ===> (fs/file tree B)
2309          *
2310          * A range of space can refer to two file extents in one tree while
2311          * referring to only one file extent in another tree.
2312          *
2313          * So we may process a disk offset more than once (two extents in A)
2314          * that lands on the same extent (one extent in B), and then insert
2315          * two identical backrefs (both referring to the extent in B).
2316          */
2317         return 0;
2318 }
2319
2320 static void backref_insert(struct rb_root *root,
2321                            struct sa_defrag_extent_backref *backref)
2322 {
2323         struct rb_node **p = &root->rb_node;
2324         struct rb_node *parent = NULL;
2325         struct sa_defrag_extent_backref *entry;
2326         int ret;
2327
2328         while (*p) {
2329                 parent = *p;
2330                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2331
2332                 ret = backref_comp(backref, entry);
2333                 if (ret < 0)
2334                         p = &(*p)->rb_left;
2335                 else
2336                         p = &(*p)->rb_right;
2337         }
2338
2339         rb_link_node(&backref->node, parent, p);
2340         rb_insert_color(&backref->node, root);
2341 }
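/*
 * Editorial note: backref_comp() may return 0 for two distinct backrefs
 * (see the diagram in its comment).  Equal keys take the right branch
 * above, so such duplicates end up adjacent in the rbtree instead of
 * being rejected.
 */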
2342
2343 /*
2344  * Note the backref might have changed, and in this case we just return 0.
2345  */
2346 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2347                                        void *ctx)
2348 {
2349         struct btrfs_file_extent_item *extent;
2350         struct old_sa_defrag_extent *old = ctx;
2351         struct new_sa_defrag_extent *new = old->new;
2352         struct btrfs_path *path = new->path;
2353         struct btrfs_key key;
2354         struct btrfs_root *root;
2355         struct sa_defrag_extent_backref *backref;
2356         struct extent_buffer *leaf;
2357         struct inode *inode = new->inode;
2358         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2359         int slot;
2360         int ret;
2361         u64 extent_offset;
2362         u64 num_bytes;
2363
2364         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2365             inum == btrfs_ino(BTRFS_I(inode)))
2366                 return 0;
2367
2368         key.objectid = root_id;
2369         key.type = BTRFS_ROOT_ITEM_KEY;
2370         key.offset = (u64)-1;
2371
2372         root = btrfs_read_fs_root_no_name(fs_info, &key);
2373         if (IS_ERR(root)) {
2374                 if (PTR_ERR(root) == -ENOENT)
2375                         return 0;
2376                 WARN_ON(1);
2377                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2378                          inum, offset, root_id);
2379                 return PTR_ERR(root);
2380         }
2381
2382         key.objectid = inum;
2383         key.type = BTRFS_EXTENT_DATA_KEY;
2384         if (offset > (u64)-1 << 32)
2385                 key.offset = 0;
2386         else
2387                 key.offset = offset;
2388
2389         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2390         if (WARN_ON(ret < 0))
2391                 return ret;
2392         ret = 0;
2393
2394         while (1) {
2395                 cond_resched();
2396
2397                 leaf = path->nodes[0];
2398                 slot = path->slots[0];
2399
2400                 if (slot >= btrfs_header_nritems(leaf)) {
2401                         ret = btrfs_next_leaf(root, path);
2402                         if (ret < 0) {
2403                                 goto out;
2404                         } else if (ret > 0) {
2405                                 ret = 0;
2406                                 goto out;
2407                         }
2408                         continue;
2409                 }
2410
2411                 path->slots[0]++;
2412
2413                 btrfs_item_key_to_cpu(leaf, &key, slot);
2414
2415                 if (key.objectid > inum)
2416                         goto out;
2417
2418                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2419                         continue;
2420
2421                 extent = btrfs_item_ptr(leaf, slot,
2422                                         struct btrfs_file_extent_item);
2423
2424                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2425                         continue;
2426
2427                 /*
2428                  * 'offset' refers to the exact key.offset,
2429                  * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2430                  * (key.offset - extent_offset).
2431                  */
2432                 if (key.offset != offset)
2433                         continue;
2434
2435                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2436                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2437
2438                 if (extent_offset >= old->extent_offset + old->offset +
2439                     old->len || extent_offset + num_bytes <=
2440                     old->extent_offset + old->offset)
2441                         continue;
2442                 break;
2443         }
2444
2445         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2446         if (!backref) {
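                /*
                 * Editorial note: returning -ENOENT (rather than -ENOMEM)
                 * looks deliberate here; record_extent_backrefs() filters
                 * -ENOENT as non-fatal, so a failed allocation just means
                 * this backref is skipped.
                 */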
2447                 ret = -ENOENT;
2448                 goto out;
2449         }
2450
2451         backref->root_id = root_id;
2452         backref->inum = inum;
2453         backref->file_pos = offset;
2454         backref->num_bytes = num_bytes;
2455         backref->extent_offset = extent_offset;
2456         backref->generation = btrfs_file_extent_generation(leaf, extent);
2457         backref->old = old;
2458         backref_insert(&new->root, backref);
2459         old->count++;
2460 out:
2461         btrfs_release_path(path);
2462         WARN_ON(ret);
2463         return ret;
2464 }
2465
2466 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2467                                    struct new_sa_defrag_extent *new)
2468 {
2469         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2470         struct old_sa_defrag_extent *old, *tmp;
2471         int ret;
2472
2473         new->path = path;
2474
2475         list_for_each_entry_safe(old, tmp, &new->head, list) {
2476                 ret = iterate_inodes_from_logical(old->bytenr +
2477                                                   old->extent_offset, fs_info,
2478                                                   path, record_one_backref,
2479                                                   old, false);
2480                 if (ret < 0 && ret != -ENOENT)
2481                         return false;
2482
2483                 /* no backref to be processed for this extent */
2484                 if (!old->count) {
2485                         list_del(&old->list);
2486                         kfree(old);
2487                 }
2488         }
2489
2490         if (list_empty(&new->head))
2491                 return false;
2492
2493         return true;
2494 }
2495
2496 static int relink_is_mergable(struct extent_buffer *leaf,
2497                               struct btrfs_file_extent_item *fi,
2498                               struct new_sa_defrag_extent *new)
2499 {
2500         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2501                 return 0;
2502
2503         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2504                 return 0;
2505
2506         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2507                 return 0;
2508
2509         if (btrfs_file_extent_encryption(leaf, fi) ||
2510             btrfs_file_extent_other_encoding(leaf, fi))
2511                 return 0;
2512
2513         return 1;
2514 }
2515
2516 /*
2517  * Note the backref might have changed, and in this case we just return 0.
2518  */
2519 static noinline int relink_extent_backref(struct btrfs_path *path,
2520                                  struct sa_defrag_extent_backref *prev,
2521                                  struct sa_defrag_extent_backref *backref)
2522 {
2523         struct btrfs_file_extent_item *extent;
2524         struct btrfs_file_extent_item *item;
2525         struct btrfs_ordered_extent *ordered;
2526         struct btrfs_trans_handle *trans;
2527         struct btrfs_root *root;
2528         struct btrfs_key key;
2529         struct extent_buffer *leaf;
2530         struct old_sa_defrag_extent *old = backref->old;
2531         struct new_sa_defrag_extent *new = old->new;
2532         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2533         struct inode *inode;
2534         struct extent_state *cached = NULL;
2535         int ret = 0;
2536         u64 start;
2537         u64 len;
2538         u64 lock_start;
2539         u64 lock_end;
2540         bool merge = false;
2541         int index;
2542
2543         if (prev && prev->root_id == backref->root_id &&
2544             prev->inum == backref->inum &&
2545             prev->file_pos + prev->num_bytes == backref->file_pos)
2546                 merge = true;
2547
2548         /* step 1: get root */
2549         key.objectid = backref->root_id;
2550         key.type = BTRFS_ROOT_ITEM_KEY;
2551         key.offset = (u64)-1;
2552
2553         index = srcu_read_lock(&fs_info->subvol_srcu);
2554
2555         root = btrfs_read_fs_root_no_name(fs_info, &key);
2556         if (IS_ERR(root)) {
2557                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2558                 if (PTR_ERR(root) == -ENOENT)
2559                         return 0;
2560                 return PTR_ERR(root);
2561         }
2562
2563         if (btrfs_root_readonly(root)) {
2564                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2565                 return 0;
2566         }
2567
2568         /* step 2: get inode */
2569         key.objectid = backref->inum;
2570         key.type = BTRFS_INODE_ITEM_KEY;
2571         key.offset = 0;
2572
2573         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2574         if (IS_ERR(inode)) {
2575                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2576                 return 0;
2577         }
2578
2579         srcu_read_unlock(&fs_info->subvol_srcu, index);
2580
2581         /* step 3: relink backref */
2582         lock_start = backref->file_pos;
2583         lock_end = backref->file_pos + backref->num_bytes - 1;
2584         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2585                          &cached);
2586
2587         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2588         if (ordered) {
2589                 btrfs_put_ordered_extent(ordered);
2590                 goto out_unlock;
2591         }
2592
2593         trans = btrfs_join_transaction(root);
2594         if (IS_ERR(trans)) {
2595                 ret = PTR_ERR(trans);
2596                 goto out_unlock;
2597         }
2598
2599         key.objectid = backref->inum;
2600         key.type = BTRFS_EXTENT_DATA_KEY;
2601         key.offset = backref->file_pos;
2602
2603         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2604         if (ret < 0) {
2605                 goto out_free_path;
2606         } else if (ret > 0) {
2607                 ret = 0;
2608                 goto out_free_path;
2609         }
2610
2611         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2612                                 struct btrfs_file_extent_item);
2613
2614         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2615             backref->generation)
2616                 goto out_free_path;
2617
2618         btrfs_release_path(path);
2619
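        /*
         * Clamp the relink to the intersection of this backref's range
         * and the part of the old extent that overlapped the defragged
         * range: @start becomes the file position where the overlap
         * begins and @len its length.
         */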
2620         start = backref->file_pos;
2621         if (backref->extent_offset < old->extent_offset + old->offset)
2622                 start += old->extent_offset + old->offset -
2623                          backref->extent_offset;
2624
2625         len = min(backref->extent_offset + backref->num_bytes,
2626                   old->extent_offset + old->offset + old->len);
2627         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2628
2629         ret = btrfs_drop_extents(trans, root, inode, start,
2630                                  start + len, 1);
2631         if (ret)
2632                 goto out_free_path;
2633 again:
2634         key.objectid = btrfs_ino(BTRFS_I(inode));
2635         key.type = BTRFS_EXTENT_DATA_KEY;
2636         key.offset = start;
2637
2638         path->leave_spinning = 1;
2639         if (merge) {
2640                 struct btrfs_file_extent_item *fi;
2641                 u64 extent_len;
2642                 struct btrfs_key found_key;
2643
2644                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2645                 if (ret < 0)
2646                         goto out_free_path;
2647
2648                 path->slots[0]--;
2649                 leaf = path->nodes[0];
2650                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2651
2652                 fi = btrfs_item_ptr(leaf, path->slots[0],
2653                                     struct btrfs_file_extent_item);
2654                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2655
2656                 if (extent_len + found_key.offset == start &&
2657                     relink_is_mergable(leaf, fi, new)) {
2658                         btrfs_set_file_extent_num_bytes(leaf, fi,
2659                                                         extent_len + len);
2660                         btrfs_mark_buffer_dirty(leaf);
2661                         inode_add_bytes(inode, len);
2662
2663                         ret = 1;
2664                         goto out_free_path;
2665                 } else {
2666                         merge = false;
2667                         btrfs_release_path(path);
2668                         goto again;
2669                 }
2670         }
2671
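        /*
         * No neighbour to merge with: insert a fresh file extent item
         * pointing at the new (defragged) extent and take an extra
         * reference on it below.
         */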
2672         ret = btrfs_insert_empty_item(trans, root, path, &key,
2673                                         sizeof(*extent));
2674         if (ret) {
2675                 btrfs_abort_transaction(trans, ret);
2676                 goto out_free_path;
2677         }
2678
2679         leaf = path->nodes[0];
2680         item = btrfs_item_ptr(leaf, path->slots[0],
2681                                 struct btrfs_file_extent_item);
2682         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2683         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2684         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2685         btrfs_set_file_extent_num_bytes(leaf, item, len);
2686         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2687         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2688         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2689         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2690         btrfs_set_file_extent_encryption(leaf, item, 0);
2691         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2692
2693         btrfs_mark_buffer_dirty(leaf);
2694         inode_add_bytes(inode, len);
2695         btrfs_release_path(path);
2696
2697         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2698                         new->disk_len, 0,
2699                         backref->root_id, backref->inum,
2700                         new->file_pos); /* start - extent_offset */
2701         if (ret) {
2702                 btrfs_abort_transaction(trans, ret);
2703                 goto out_free_path;
2704         }
2705
2706         ret = 1;
2707 out_free_path:
2708         btrfs_release_path(path);
2709         path->leave_spinning = 0;
2710         btrfs_end_transaction(trans);
2711 out_unlock:
2712         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2713                              &cached, GFP_NOFS);
2714         iput(inode);
2715         return ret;
2716 }
2717
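/* Free a new_sa_defrag_extent along with any old extents still on its list */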
2718 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2719 {
2720         struct old_sa_defrag_extent *old, *tmp;
2721
2722         if (!new)
2723                 return;
2724
2725         list_for_each_entry_safe(old, tmp, &new->head, list) {
2726                 kfree(old);
2727         }
2728         kfree(new);
2729 }
2730
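/*
 * Relink all collected backrefs to the newly written (defragged)
 * extent.  Backrefs are pulled from the rb-tree in sorted order so
 * that adjacent file extent items of the same inode can be merged;
 * when done, the defrag_running counter is dropped and anyone waiting
 * on transaction_wait is woken.
 */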
2731 static void relink_file_extents(struct new_sa_defrag_extent *new)
2732 {
2733         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2734         struct btrfs_path *path;
2735         struct sa_defrag_extent_backref *backref;
2736         struct sa_defrag_extent_backref *prev = NULL;
2737         struct inode *inode;
2738         struct btrfs_root *root;
2739         struct rb_node *node;
2740         int ret;
2741
2742         inode = new->inode;
2743         root = BTRFS_I(inode)->root;
2744
2745         path = btrfs_alloc_path();
2746         if (!path)
2747                 return;
2748
2749         if (!record_extent_backrefs(path, new)) {
2750                 btrfs_free_path(path);
2751                 goto out;
2752         }
2753         btrfs_release_path(path);
2754
2755         while (1) {
2756                 node = rb_first(&new->root);
2757                 if (!node)
2758                         break;
2759                 rb_erase(node, &new->root);
2760
2761                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2762
2763                 ret = relink_extent_backref(path, prev, backref);
2764                 WARN_ON(ret < 0);
2765
2766                 kfree(prev);
2767
2768                 if (ret == 1)
2769                         prev = backref;
2770                 else
2771                         prev = NULL;
2772                 cond_resched();
2773         }
2774         kfree(prev);
2775
2776         btrfs_free_path(path);
2777 out:
2778         free_sa_defrag_extent(new);
2779
2780         atomic_dec(&fs_info->defrag_running);
2781         wake_up(&fs_info->transaction_wait);
2782 }
2783
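/*
 * Scan the file extent items overlapping @ordered and record each one
 * as an "old" extent hanging off a freshly allocated
 * new_sa_defrag_extent, for snapshot-aware defrag to relink later.
 * On success this bumps fs_info->defrag_running, which
 * relink_file_extents() eventually drops; returns NULL on allocation
 * or tree-search failure.
 */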
2784 static struct new_sa_defrag_extent *
2785 record_old_file_extents(struct inode *inode,
2786                         struct btrfs_ordered_extent *ordered)
2787 {
2788         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2789         struct btrfs_root *root = BTRFS_I(inode)->root;
2790         struct btrfs_path *path;
2791         struct btrfs_key key;
2792         struct old_sa_defrag_extent *old;
2793         struct new_sa_defrag_extent *new;
2794         int ret;
2795
2796         new = kmalloc(sizeof(*new), GFP_NOFS);
2797         if (!new)
2798                 return NULL;
2799
2800         new->inode = inode;
2801         new->file_pos = ordered->file_offset;
2802         new->len = ordered->len;
2803         new->bytenr = ordered->start;
2804         new->disk_len = ordered->disk_len;
2805         new->compress_type = ordered->compress_type;
2806         new->root = RB_ROOT;
2807         INIT_LIST_HEAD(&new->head);
2808
2809         path = btrfs_alloc_path();
2810         if (!path)
2811                 goto out_kfree;
2812
2813         key.objectid = btrfs_ino(BTRFS_I(inode));
2814         key.type = BTRFS_EXTENT_DATA_KEY;
2815         key.offset = new->file_pos;
2816
2817         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2818         if (ret < 0)
2819                 goto out_free_path;
2820         if (ret > 0 && path->slots[0] > 0)
2821                 path->slots[0]--;
2822
2823         /* find out all the old extents for the file range */
2824         while (1) {
2825                 struct btrfs_file_extent_item *extent;
2826                 struct extent_buffer *l;
2827                 int slot;
2828                 u64 num_bytes;
2829                 u64 offset;
2830                 u64 end;
2831                 u64 disk_bytenr;
2832                 u64 extent_offset;
2833
2834                 l = path->nodes[0];
2835                 slot = path->slots[0];
2836
2837                 if (slot >= btrfs_header_nritems(l)) {
2838                         ret = btrfs_next_leaf(root, path);
2839                         if (ret < 0)
2840                                 goto out_free_path;
2841                         else if (ret > 0)
2842                                 break;
2843                         continue;
2844                 }
2845
2846                 btrfs_item_key_to_cpu(l, &key, slot);
2847
2848                 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2849                         break;
2850                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2851                         break;
2852                 if (key.offset >= new->file_pos + new->len)
2853                         break;
2854
2855                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2856
2857                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2858                 if (key.offset + num_bytes < new->file_pos)
2859                         goto next;
2860
2861                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2862                 if (!disk_bytenr)
2863                         goto next;
2864
2865                 extent_offset = btrfs_file_extent_offset(l, extent);
2866
2867                 old = kmalloc(sizeof(*old), GFP_NOFS);
2868                 if (!old)
2869                         goto out_free_path;
2870
2871                 offset = max(new->file_pos, key.offset);
2872                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2873
2874                 old->bytenr = disk_bytenr;
2875                 old->extent_offset = extent_offset;
2876                 old->offset = offset - key.offset;
2877                 old->len = end - offset;
2878                 old->new = new;
2879                 old->count = 0;
2880                 list_add_tail(&old->list, &new->head);
2881 next:
2882                 path->slots[0]++;
2883                 cond_resched();
2884         }
2885
2886         btrfs_free_path(path);
2887         atomic_inc(&fs_info->defrag_running);
2888
2889         return new;
2890
2891 out_free_path:
2892         btrfs_free_path(path);
2893 out_kfree:
2894         free_sa_defrag_extent(new);
2895         return NULL;
2896 }
2897
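/*
 * The ordered extent starting at @start has been written out: subtract
 * its length from the owning block group's delalloc_bytes counter.
 */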
2898 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2899                                          u64 start, u64 len)
2900 {
2901         struct btrfs_block_group_cache *cache;
2902
2903         cache = btrfs_lookup_block_group(fs_info, start);
2904         ASSERT(cache);
2905
2906         spin_lock(&cache->lock);
2907         cache->delalloc_bytes -= len;
2908         spin_unlock(&cache->lock);
2909
2910         btrfs_put_block_group(cache);
2911 }
2912
2913 /* as ordered data IO finishes, this gets called so we can finish
2914  * an ordered extent if the range of bytes in the file it covers is
2915  * fully written.
2916  */
2917 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2918 {
2919         struct inode *inode = ordered_extent->inode;
2920         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2921         struct btrfs_root *root = BTRFS_I(inode)->root;
2922         struct btrfs_trans_handle *trans = NULL;
2923         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2924         struct extent_state *cached_state = NULL;
2925         struct new_sa_defrag_extent *new = NULL;
2926         int compress_type = 0;
2927         int ret = 0;
2928         u64 logical_len = ordered_extent->len;
2929         bool nolock;
2930         bool truncated = false;
2931         bool range_locked = false;
2932         bool clear_new_delalloc_bytes = false;
2933
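        /*
         * Writes that went through the normal buffered COW path are
         * the ones tracked with EXTENT_DELALLOC_NEW, so remember to
         * clear that bit once this ordered extent is finished; nocow,
         * prealloc and direct IO writes never set it.
         */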
2934         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2935             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2936             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2937                 clear_new_delalloc_bytes = true;
2938
2939         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2940
2941         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2942                 ret = -EIO;
2943                 goto out;
2944         }
2945
2946         btrfs_free_io_failure_record(BTRFS_I(inode),
2947                         ordered_extent->file_offset,
2948                         ordered_extent->file_offset +
2949                         ordered_extent->len - 1);
2950
2951         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2952                 truncated = true;
2953                 logical_len = ordered_extent->truncated_len;
2954                 /* Truncated the entire extent, don't bother adding */
2955                 if (!logical_len)
2956                         goto out;
2957         }
2958
2959         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2960                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2961
2962                 /*
2963                  * For the mwrite (mmap + memset to write) case, we still
2964                  * reserve space for the NOCOW range.  As NOCOW won't cause
2965                  * a new delayed ref, just free the reserved space.
2966                  */
2967                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2968                                        ordered_extent->len);
2969                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2970                 if (nolock)
2971                         trans = btrfs_join_transaction_nolock(root);
2972                 else
2973                         trans = btrfs_join_transaction(root);
2974                 if (IS_ERR(trans)) {
2975                         ret = PTR_ERR(trans);
2976                         trans = NULL;
2977                         goto out;
2978                 }
2979                 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
2980                 ret = btrfs_update_inode_fallback(trans, root, inode);
2981                 if (ret) /* -ENOMEM or corruption */
2982                         btrfs_abort_transaction(trans, ret);
2983                 goto out;
2984         }
2985
2986         range_locked = true;
2987         lock_extent_bits(io_tree, ordered_extent->file_offset,
2988                          ordered_extent->file_offset + ordered_extent->len - 1,
2989                          &cached_state);
2990
2991         ret = test_range_bit(io_tree, ordered_extent->file_offset,
2992                         ordered_extent->file_offset + ordered_extent->len - 1,
2993                         EXTENT_DEFRAG, 0, cached_state);
2994         if (ret) {
2995                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
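                /*
                 * Snapshot-aware defrag is currently disabled: the
                 * "0 &&" below makes this branch unreachable, so no
                 * old extents are ever recorded here.
                 */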
2996                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2997                         /* the inode is shared */
2998                         new = record_old_file_extents(inode, ordered_extent);
2999
3000                 clear_extent_bit(io_tree, ordered_extent->file_offset,
3001                         ordered_extent->file_offset + ordered_extent->len - 1,
3002                         EXTENT_DEFRAG, 0, 0, &cached_state);
3003         }
3004
3005         if (nolock)
3006                 trans = btrfs_join_transaction_nolock(root);
3007         else
3008                 trans = btrfs_join_transaction(root);
3009         if (IS_ERR(trans)) {
3010                 ret = PTR_ERR(trans);
3011                 trans = NULL;
3012                 goto out;
3013         }
3014
3015         trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3016
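        /*
         * Preallocated extents already own their disk space, so the
         * matching file extent item(s) are simply flipped to "written".
         * Everything else gets a brand new file extent item pointing at
         * the space reserved for this ordered extent.
         */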
3017         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3018                 compress_type = ordered_extent->compress_type;
3019         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3020                 BUG_ON(compress_type);
3021                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3022                                        ordered_extent->len);
3023                 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3024                                                 ordered_extent->file_offset,
3025                                                 ordered_extent->file_offset +
3026                                                 logical_len);
3027         } else {
3028                 BUG_ON(root == fs_info->tree_root);
3029                 ret = insert_reserved_file_extent(trans, inode,
3030                                                 ordered_extent->file_offset,
3031                                                 ordered_extent->start,
3032                                                 ordered_extent->disk_len,
3033                                                 logical_len, logical_len,
3034                                                 compress_type, 0, 0,
3035                                                 BTRFS_FILE_EXTENT_REG);
3036                 if (!ret)
3037                         btrfs_release_delalloc_bytes(fs_info,
3038                                                      ordered_extent->start,
3039                                                      ordered_extent->disk_len);
3040         }
3041         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3042                            ordered_extent->file_offset, ordered_extent->len,
3043                            trans->transid);
3044         if (ret < 0) {
3045                 btrfs_abort_transaction(trans, ret);
3046                 goto out;
3047         }
3048
3049         add_pending_csums(trans, inode, &ordered_extent->list);
3050
3051         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3052         ret = btrfs_update_inode_fallback(trans, root, inode);
3053         if (ret) { /* -ENOMEM or corruption */
3054                 btrfs_abort_transaction(trans, ret);
3055                 goto out;
3056         }
3057         ret = 0;
3058 out:
3059         if (range_locked || clear_new_delalloc_bytes) {
3060                 unsigned int clear_bits = 0;
3061
3062                 if (range_locked)
3063                         clear_bits |= EXTENT_LOCKED;
3064                 if (clear_new_delalloc_bytes)
3065                         clear_bits |= EXTENT_DELALLOC_NEW;
3066                 clear_extent_bit(&BTRFS_I(inode)->io_tree,
3067                                  ordered_extent->file_offset,
3068                                  ordered_extent->file_offset +
3069                                  ordered_extent->len - 1,
3070                                  clear_bits,
3071                                  (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3072                                  0, &cached_state);
3073         }