Btrfs: fix xattr loss after power failure
[sfrench/cifs-2.6.git] / fs / btrfs / tree-log.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/blkdev.h>
9 #include <linux/list_sort.h>
10 #include <linux/iversion.h>
11 #include "ctree.h"
12 #include "tree-log.h"
13 #include "disk-io.h"
14 #include "locking.h"
15 #include "print-tree.h"
16 #include "backref.h"
17 #include "compression.h"
18 #include "qgroup.h"
19 #include "inode-map.h"
20
21 /* magic values for the inode_only field in btrfs_log_inode:
22  *
23  * LOG_INODE_ALL means to log everything
24  * LOG_INODE_EXISTS means to log just enough to recreate the inode
25  * during log replay
26  */
27 #define LOG_INODE_ALL 0
28 #define LOG_INODE_EXISTS 1
29 #define LOG_OTHER_INODE 2
30
31 /*
32  * directory trouble cases
33  *
34  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
35  * log, we must force a full commit before doing an fsync of the directory
36  * where the unlink was done.
37  * ---> record transid of last unlink/rename per directory
38  *
39  * mkdir foo/some_dir
40  * normal commit
41  * rename foo/some_dir foo2/some_dir
42  * mkdir foo/some_dir
43  * fsync foo/some_dir/some_file
44  *
45  * The fsync above will unlink the original some_dir without recording
46  * it in its new location (foo2).  After a crash, some_dir will be gone
47  * unless the fsync of some_file forces a full commit
48  *
49  * 2) we must log any new names for any file or dir that is in the fsync
50  * log. ---> check inode while renaming/linking.
51  *
52  * 2a) we must log any new names for any file or dir during rename
53  * when the directory they are being removed from was logged.
54  * ---> check inode and old parent dir during rename
55  *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
58  *
59  * 3) after a crash, we must go through any directories with a link count
60  * of zero and redo the rm -rf
61  *
62  * mkdir f1/foo
63  * normal commit
64  * rm -rf f1/foo
65  * fsync(f1)
66  *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only its parent dir.  After a crash the rm -rf must
69  * be replayed.  This must be able to recurse down the entire
70  * directory tree.  The inode link count fixup code takes care of the
71  * ugly details.
72  */
73
74 /*
 * Stages for the tree walking.  The first stage (0) only pins down the
 * blocks we find.  The second stage (1) makes sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
82  */
83 #define LOG_WALK_PIN_ONLY 0
84 #define LOG_WALK_REPLAY_INODES 1
85 #define LOG_WALK_REPLAY_DIR_INDEX 2
86 #define LOG_WALK_REPLAY_ALL 3
87
88 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
89                            struct btrfs_root *root, struct btrfs_inode *inode,
90                            int inode_only,
91                            const loff_t start,
92                            const loff_t end,
93                            struct btrfs_log_ctx *ctx);
94 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
95                              struct btrfs_root *root,
96                              struct btrfs_path *path, u64 objectid);
97 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
98                                        struct btrfs_root *root,
99                                        struct btrfs_root *log,
100                                        struct btrfs_path *path,
101                                        u64 dirid, int del_all);
102
103 /*
104  * tree logging is a special write ahead log used to make sure that
105  * fsyncs and O_SYNCs can happen without doing full tree commits.
106  *
107  * Full tree commits are expensive because they require commonly
108  * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
110  *
111  * Instead of doing a tree commit on every fsync, we use the
112  * key ranges and transaction ids to find items for a given file or directory
113  * that have changed in this transaction.  Those items are copied into
114  * a special tree (one per subvolume root), that tree is written to disk
115  * and then the fsync is considered complete.
116  *
117  * After a crash, items are copied out of the log-tree back into the
118  * subvolume tree.  Any file data extents found are recorded in the extent
119  * allocation tree, and the log-tree freed.
120  *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
124  */
125
126 /*
127  * start a sub transaction and setup the log tree
128  * this increments the log tree writer count to make the people
129  * syncing the tree wait for us to finish
130  */
131 static int start_log_trans(struct btrfs_trans_handle *trans,
132                            struct btrfs_root *root,
133                            struct btrfs_log_ctx *ctx)
134 {
135         struct btrfs_fs_info *fs_info = root->fs_info;
136         int ret = 0;
137
138         mutex_lock(&root->log_mutex);
139
140         if (root->log_root) {
141                 if (btrfs_need_log_full_commit(fs_info, trans)) {
142                         ret = -EAGAIN;
143                         goto out;
144                 }
145
146                 if (!root->log_start_pid) {
147                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
148                         root->log_start_pid = current->pid;
149                 } else if (root->log_start_pid != current->pid) {
150                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
151                 }
152         } else {
153                 mutex_lock(&fs_info->tree_log_mutex);
154                 if (!fs_info->log_root_tree)
155                         ret = btrfs_init_log_root_tree(trans, fs_info);
156                 mutex_unlock(&fs_info->tree_log_mutex);
157                 if (ret)
158                         goto out;
159
160                 ret = btrfs_add_log_tree(trans, root);
161                 if (ret)
162                         goto out;
163
164                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
165                 root->log_start_pid = current->pid;
166         }
167
168         atomic_inc(&root->log_batch);
169         atomic_inc(&root->log_writers);
170         if (ctx) {
171                 int index = root->log_transid % 2;
172                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
173                 ctx->log_transid = root->log_transid;
174         }
175
176 out:
177         mutex_unlock(&root->log_mutex);
178         return ret;
179 }
180
181 /*
182  * returns 0 if there was a log transaction running and we were able
183  * to join, or returns -ENOENT if there were not transactions
184  * in progress
185  */
186 static int join_running_log_trans(struct btrfs_root *root)
187 {
188         int ret = -ENOENT;
189
190         smp_mb();
191         if (!root->log_root)
192                 return -ENOENT;
193
194         mutex_lock(&root->log_mutex);
195         if (root->log_root) {
196                 ret = 0;
197                 atomic_inc(&root->log_writers);
198         }
199         mutex_unlock(&root->log_mutex);
200         return ret;
201 }
202
203 /*
204  * This either makes the current running log transaction wait
205  * until you call btrfs_end_log_trans() or it makes any future
206  * log transactions wait until you call btrfs_end_log_trans()
207  */
208 int btrfs_pin_log_trans(struct btrfs_root *root)
209 {
210         int ret = -ENOENT;
211
212         mutex_lock(&root->log_mutex);
213         atomic_inc(&root->log_writers);
214         mutex_unlock(&root->log_mutex);
215         return ret;
216 }
217
218 /*
219  * indicate we're done making changes to the log tree
220  * and wake up anyone waiting to do a sync
221  */
222 void btrfs_end_log_trans(struct btrfs_root *root)
223 {
224         if (atomic_dec_and_test(&root->log_writers)) {
225                 /*
226                  * Implicit memory barrier after atomic_dec_and_test
227                  */
228                 if (waitqueue_active(&root->log_writer_wait))
229                         wake_up(&root->log_writer_wait);
230         }
231 }
232
233
234 /*
235  * the walk control struct is used to pass state down the chain when
236  * processing the log tree.  The stage field tells us which part
237  * of the log tree processing we are currently doing.  The others
238  * are state fields used for that specific part
239  */
240 struct walk_control {
241         /* should we free the extent on disk when done?  This is used
242          * at transaction commit time while freeing a log tree
243          */
244         int free;
245
246         /* should we write out the extent buffer?  This is used
247          * while flushing the log tree to disk during a sync
248          */
249         int write;
250
251         /* should we wait for the extent buffer io to finish?  Also used
252          * while flushing the log tree to disk for a sync
253          */
254         int wait;
255
256         /* pin only walk, we record which extents on disk belong to the
257          * log trees
258          */
259         int pin;
260
261         /* what stage of the replay code we're currently in */
262         int stage;
263
264         /* the root we are currently replaying */
265         struct btrfs_root *replay_dest;
266
267         /* the trans handle for the current replay */
268         struct btrfs_trans_handle *trans;
269
270         /* the function that gets used to process blocks we find in the
271          * tree.  Note the extent_buffer might not be up to date when it is
272          * passed in, and it must be checked or read if you need the data
273          * inside it
274          */
275         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
276                             struct walk_control *wc, u64 gen, int level);
277 };
278
279 /*
280  * process_func used to pin down extents, write them or wait on them
281  */
282 static int process_one_buffer(struct btrfs_root *log,
283                               struct extent_buffer *eb,
284                               struct walk_control *wc, u64 gen, int level)
285 {
286         struct btrfs_fs_info *fs_info = log->fs_info;
287         int ret = 0;
288
289         /*
290          * If this fs is mixed then we need to be able to process the leaves to
291          * pin down any logged extents, so we have to read the block.
292          */
293         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
294                 ret = btrfs_read_buffer(eb, gen, level, NULL);
295                 if (ret)
296                         return ret;
297         }
298
299         if (wc->pin)
300                 ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
301                                                       eb->len);
302
303         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
304                 if (wc->pin && btrfs_header_level(eb) == 0)
305                         ret = btrfs_exclude_logged_extents(fs_info, eb);
306                 if (wc->write)
307                         btrfs_write_tree_block(eb);
308                 if (wc->wait)
309                         btrfs_wait_tree_block_writeback(eb);
310         }
311         return ret;
312 }
313
314 /*
315  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
316  * to the src data we are copying out.
317  *
318  * root is the tree we are copying into, and path is a scratch
319  * path for use in this function (it should be released on entry and
320  * will be released on exit).
321  *
322  * If the key is already in the destination tree the existing item is
323  * overwritten.  If the existing item isn't big enough, it is extended.
324  * If it is too large, it is truncated.
325  *
326  * If the key isn't in the destination yet, a new item is inserted.
327  */
328 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
329                                    struct btrfs_root *root,
330                                    struct btrfs_path *path,
331                                    struct extent_buffer *eb, int slot,
332                                    struct btrfs_key *key)
333 {
334         struct btrfs_fs_info *fs_info = root->fs_info;
335         int ret;
336         u32 item_size;
337         u64 saved_i_size = 0;
338         int save_old_i_size = 0;
339         unsigned long src_ptr;
340         unsigned long dst_ptr;
341         int overwrite_root = 0;
342         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
343
344         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
345                 overwrite_root = 1;
346
347         item_size = btrfs_item_size_nr(eb, slot);
348         src_ptr = btrfs_item_ptr_offset(eb, slot);
349
350         /* look for the key in the destination tree */
351         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
352         if (ret < 0)
353                 return ret;
354
355         if (ret == 0) {
356                 char *src_copy;
357                 char *dst_copy;
358                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
359                                                   path->slots[0]);
360                 if (dst_size != item_size)
361                         goto insert;
362
363                 if (item_size == 0) {
364                         btrfs_release_path(path);
365                         return 0;
366                 }
367                 dst_copy = kmalloc(item_size, GFP_NOFS);
368                 src_copy = kmalloc(item_size, GFP_NOFS);
369                 if (!dst_copy || !src_copy) {
370                         btrfs_release_path(path);
371                         kfree(dst_copy);
372                         kfree(src_copy);
373                         return -ENOMEM;
374                 }
375
376                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
377
378                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
379                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
380                                    item_size);
381                 ret = memcmp(dst_copy, src_copy, item_size);
382
383                 kfree(dst_copy);
384                 kfree(src_copy);
385                 /*
386                  * they have the same contents, just return, this saves
387                  * us from cowing blocks in the destination tree and doing
388                  * extra writes that may not have been done by a previous
389                  * sync
390                  */
391                 if (ret == 0) {
392                         btrfs_release_path(path);
393                         return 0;
394                 }
395
396                 /*
397                  * We need to load the old nbytes into the inode so when we
398                  * replay the extents we've logged we get the right nbytes.
399                  */
400                 if (inode_item) {
401                         struct btrfs_inode_item *item;
402                         u64 nbytes;
403                         u32 mode;
404
405                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
406                                               struct btrfs_inode_item);
407                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
408                         item = btrfs_item_ptr(eb, slot,
409                                               struct btrfs_inode_item);
410                         btrfs_set_inode_nbytes(eb, item, nbytes);
411
412                         /*
413                          * If this is a directory we need to reset the i_size to
414                          * 0 so that we can set it up properly when replaying
415                          * the rest of the items in this log.
416                          */
417                         mode = btrfs_inode_mode(eb, item);
418                         if (S_ISDIR(mode))
419                                 btrfs_set_inode_size(eb, item, 0);
420                 }
421         } else if (inode_item) {
422                 struct btrfs_inode_item *item;
423                 u32 mode;
424
425                 /*
426                  * New inode, set nbytes to 0 so that the nbytes comes out
427                  * properly when we replay the extents.
428                  */
429                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
430                 btrfs_set_inode_nbytes(eb, item, 0);
431
432                 /*
433                  * If this is a directory we need to reset the i_size to 0 so
434                  * that we can set it up properly when replaying the rest of
435                  * the items in this log.
436                  */
437                 mode = btrfs_inode_mode(eb, item);
438                 if (S_ISDIR(mode))
439                         btrfs_set_inode_size(eb, item, 0);
440         }
441 insert:
442         btrfs_release_path(path);
443         /* try to insert the key into the destination tree */
444         path->skip_release_on_error = 1;
445         ret = btrfs_insert_empty_item(trans, root, path,
446                                       key, item_size);
447         path->skip_release_on_error = 0;
448
449         /* make sure any existing item is the correct size */
450         if (ret == -EEXIST || ret == -EOVERFLOW) {
451                 u32 found_size;
452                 found_size = btrfs_item_size_nr(path->nodes[0],
453                                                 path->slots[0]);
454                 if (found_size > item_size)
455                         btrfs_truncate_item(fs_info, path, item_size, 1);
456                 else if (found_size < item_size)
457                         btrfs_extend_item(fs_info, path,
458                                           item_size - found_size);
459         } else if (ret) {
460                 return ret;
461         }
462         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
463                                         path->slots[0]);
464
465         /* don't overwrite an existing inode if the generation number
466          * was logged as zero.  This is done when the tree logging code
467          * is just logging an inode to make sure it exists after recovery.
468          *
469          * Also, don't overwrite i_size on directories during replay.
470          * log replay inserts and removes directory items based on the
471          * state of the tree found in the subvolume, and i_size is modified
472          * as it goes
473          */
474         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
475                 struct btrfs_inode_item *src_item;
476                 struct btrfs_inode_item *dst_item;
477
478                 src_item = (struct btrfs_inode_item *)src_ptr;
479                 dst_item = (struct btrfs_inode_item *)dst_ptr;
480
481                 if (btrfs_inode_generation(eb, src_item) == 0) {
482                         struct extent_buffer *dst_eb = path->nodes[0];
483                         const u64 ino_size = btrfs_inode_size(eb, src_item);
484
485                         /*
486                          * For regular files an ino_size == 0 is used only when
487                          * logging that an inode exists, as part of a directory
488                          * fsync, and the inode wasn't fsynced before. In this
489                          * case don't set the size of the inode in the fs/subvol
490                          * tree, otherwise we would be throwing valid data away.
491                          */
492                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
493                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
494                             ino_size != 0) {
495                                 struct btrfs_map_token token;
496
497                                 btrfs_init_map_token(&token);
498                                 btrfs_set_token_inode_size(dst_eb, dst_item,
499                                                            ino_size, &token);
500                         }
501                         goto no_copy;
502                 }
503
504                 if (overwrite_root &&
505                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
506                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
507                         save_old_i_size = 1;
508                         saved_i_size = btrfs_inode_size(path->nodes[0],
509                                                         dst_item);
510                 }
511         }
512
513         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
514                            src_ptr, item_size);
515
516         if (save_old_i_size) {
517                 struct btrfs_inode_item *dst_item;
518                 dst_item = (struct btrfs_inode_item *)dst_ptr;
519                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
520         }
521
522         /* make sure the generation is filled in */
523         if (key->type == BTRFS_INODE_ITEM_KEY) {
524                 struct btrfs_inode_item *dst_item;
525                 dst_item = (struct btrfs_inode_item *)dst_ptr;
526                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
527                         btrfs_set_inode_generation(path->nodes[0], dst_item,
528                                                    trans->transid);
529                 }
530         }
531 no_copy:
532         btrfs_mark_buffer_dirty(path->nodes[0]);
533         btrfs_release_path(path);
534         return 0;
535 }
536
537 /*
538  * simple helper to read an inode off the disk from a given root
539  * This can only be called for subvolume roots and not for the log
540  */
541 static noinline struct inode *read_one_inode(struct btrfs_root *root,
542                                              u64 objectid)
543 {
544         struct btrfs_key key;
545         struct inode *inode;
546
547         key.objectid = objectid;
548         key.type = BTRFS_INODE_ITEM_KEY;
549         key.offset = 0;
550         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
551         if (IS_ERR(inode)) {
552                 inode = NULL;
553         } else if (is_bad_inode(inode)) {
554                 iput(inode);
555                 inode = NULL;
556         }
557         return inode;
558 }
559
560 /* replays a single extent in 'eb' at 'slot' with 'key' into the
561  * subvolume 'root'.  path is released on entry and should be released
562  * on exit.
563  *
564  * extents in the log tree have not been allocated out of the extent
565  * tree yet.  So, this completes the allocation, taking a reference
566  * as required if the extent already exists or creating a new extent
567  * if it isn't in the extent allocation tree yet.
568  *
569  * The extent is inserted into the file, dropping any existing extents
570  * from the file that overlap the new one.
571  */
572 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
573                                       struct btrfs_root *root,
574                                       struct btrfs_path *path,
575                                       struct extent_buffer *eb, int slot,
576                                       struct btrfs_key *key)
577 {
578         struct btrfs_fs_info *fs_info = root->fs_info;
579         int found_type;
580         u64 extent_end;
581         u64 start = key->offset;
582         u64 nbytes = 0;
583         struct btrfs_file_extent_item *item;
584         struct inode *inode = NULL;
585         unsigned long size;
586         int ret = 0;
587
588         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
589         found_type = btrfs_file_extent_type(eb, item);
590
591         if (found_type == BTRFS_FILE_EXTENT_REG ||
592             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
593                 nbytes = btrfs_file_extent_num_bytes(eb, item);
594                 extent_end = start + nbytes;
595
596                 /*
597                  * We don't add to the inodes nbytes if we are prealloc or a
598                  * hole.
599                  */
600                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
601                         nbytes = 0;
602         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
603                 size = btrfs_file_extent_inline_len(eb, slot, item);
604                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
605                 extent_end = ALIGN(start + size,
606                                    fs_info->sectorsize);
607         } else {
608                 ret = 0;
609                 goto out;
610         }
611
612         inode = read_one_inode(root, key->objectid);
613         if (!inode) {
614                 ret = -EIO;
615                 goto out;
616         }
617
618         /*
619          * first check to see if we already have this extent in the
620          * file.  This must be done before the btrfs_drop_extents run
621          * so we don't try to drop this extent.
622          */
623         ret = btrfs_lookup_file_extent(trans, root, path,
624                         btrfs_ino(BTRFS_I(inode)), start, 0);
625
626         if (ret == 0 &&
627             (found_type == BTRFS_FILE_EXTENT_REG ||
628              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
629                 struct btrfs_file_extent_item cmp1;
630                 struct btrfs_file_extent_item cmp2;
631                 struct btrfs_file_extent_item *existing;
632                 struct extent_buffer *leaf;
633
634                 leaf = path->nodes[0];
635                 existing = btrfs_item_ptr(leaf, path->slots[0],
636                                           struct btrfs_file_extent_item);
637
638                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
639                                    sizeof(cmp1));
640                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
641                                    sizeof(cmp2));
642
643                 /*
644                  * we already have a pointer to this exact extent,
645                  * we don't have to do anything
646                  */
647                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
648                         btrfs_release_path(path);
649                         goto out;
650                 }
651         }
652         btrfs_release_path(path);
653
654         /* drop any overlapping extents */
655         ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
656         if (ret)
657                 goto out;
658
659         if (found_type == BTRFS_FILE_EXTENT_REG ||
660             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
661                 u64 offset;
662                 unsigned long dest_offset;
663                 struct btrfs_key ins;
664
665                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
666                     btrfs_fs_incompat(fs_info, NO_HOLES))
667                         goto update_inode;
668
669                 ret = btrfs_insert_empty_item(trans, root, path, key,
670                                               sizeof(*item));
671                 if (ret)
672                         goto out;
673                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
674                                                     path->slots[0]);
675                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
676                                 (unsigned long)item,  sizeof(*item));
677
678                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
679                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
680                 ins.type = BTRFS_EXTENT_ITEM_KEY;
681                 offset = key->offset - btrfs_file_extent_offset(eb, item);
682
683                 /*
684                  * Manually record dirty extent, as here we did a shallow
685                  * file extent item copy and skip normal backref update,
686                  * but modifying extent tree all by ourselves.
687                  * So need to manually record dirty extent for qgroup,
688                  * as the owner of the file extent changed from log tree
689                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
690                  */
691                 ret = btrfs_qgroup_trace_extent(trans, fs_info,
692                                 btrfs_file_extent_disk_bytenr(eb, item),
693                                 btrfs_file_extent_disk_num_bytes(eb, item),
694                                 GFP_NOFS);
695                 if (ret < 0)
696                         goto out;
697
698                 if (ins.objectid > 0) {
699                         u64 csum_start;
700                         u64 csum_end;
701                         LIST_HEAD(ordered_sums);
702                         /*
703                          * is this extent already allocated in the extent
704                          * allocation tree?  If so, just add a reference
705                          */
706                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
707                                                 ins.offset);
708                         if (ret == 0) {
709                                 ret = btrfs_inc_extent_ref(trans, root,
710                                                 ins.objectid, ins.offset,
711                                                 0, root->root_key.objectid,
712                                                 key->objectid, offset);
713                                 if (ret)
714                                         goto out;
715                         } else {
716                                 /*
717                                  * insert the extent pointer in the extent
718                                  * allocation tree
719                                  */
720                                 ret = btrfs_alloc_logged_file_extent(trans,
721                                                 fs_info,
722                                                 root->root_key.objectid,
723                                                 key->objectid, offset, &ins);
724                                 if (ret)
725                                         goto out;
726                         }
727                         btrfs_release_path(path);
728
729                         if (btrfs_file_extent_compression(eb, item)) {
730                                 csum_start = ins.objectid;
731                                 csum_end = csum_start + ins.offset;
732                         } else {
733                                 csum_start = ins.objectid +
734                                         btrfs_file_extent_offset(eb, item);
735                                 csum_end = csum_start +
736                                         btrfs_file_extent_num_bytes(eb, item);
737                         }
738
739                         ret = btrfs_lookup_csums_range(root->log_root,
740                                                 csum_start, csum_end - 1,
741                                                 &ordered_sums, 0);
742                         if (ret)
743                                 goto out;
744                         /*
745                          * Now delete all existing cums in the csum root that
746                          * cover our range. We do this because we can have an
747                          * extent that is completely referenced by one file
748                          * extent item and partially referenced by another
749                          * file extent item (like after using the clone or
750                          * extent_same ioctls). In this case if we end up doing
751                          * the replay of the one that partially references the
752                          * extent first, and we do not do the csum deletion
753                          * below, we can get 2 csum items in the csum tree that
754                          * overlap each other. For example, imagine our log has
755                          * the two following file extent items:
756                          *
757                          * key (257 EXTENT_DATA 409600)
758                          *     extent data disk byte 12845056 nr 102400
759                          *     extent data offset 20480 nr 20480 ram 102400
760                          *
761                          * key (257 EXTENT_DATA 819200)
762                          *     extent data disk byte 12845056 nr 102400
763                          *     extent data offset 0 nr 102400 ram 102400
764                          *
765                          * Where the second one fully references the 100K extent
766                          * that starts at disk byte 12845056, and the log tree
767                          * has a single csum item that covers the entire range
768                          * of the extent:
769                          *
770                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
771                          *
772                          * After the first file extent item is replayed, the
773                          * csum tree gets the following csum item:
774                          *
775                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
776                          *
777                          * Which covers the 20K sub-range starting at offset 20K
778                          * of our extent. Now when we replay the second file
779                          * extent item, if we do not delete existing csum items
780                          * that cover any of its blocks, we end up getting two
781                          * csum items in our csum tree that overlap each other:
782                          *
783                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
784                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
785                          *
786                          * Which is a problem, because after this anyone trying
787                          * to lookup up for the checksum of any block of our
788                          * extent starting at an offset of 40K or higher, will
789                          * end up looking at the second csum item only, which
790                          * does not contain the checksum for any block starting
791                          * at offset 40K or higher of our extent.
792                          */
793                         while (!list_empty(&ordered_sums)) {
794                                 struct btrfs_ordered_sum *sums;
795                                 sums = list_entry(ordered_sums.next,
796                                                 struct btrfs_ordered_sum,
797                                                 list);
798                                 if (!ret)
799                                         ret = btrfs_del_csums(trans, fs_info,
800                                                               sums->bytenr,
801                                                               sums->len);
802                                 if (!ret)
803                                         ret = btrfs_csum_file_blocks(trans,
804                                                 fs_info->csum_root, sums);
805                                 list_del(&sums->list);
806                                 kfree(sums);
807                         }
808                         if (ret)
809                                 goto out;
810                 } else {
811                         btrfs_release_path(path);
812                 }
813         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
814                 /* inline extents are easy, we just overwrite them */
815                 ret = overwrite_item(trans, root, path, eb, slot, key);
816                 if (ret)
817                         goto out;
818         }
819
820         inode_add_bytes(inode, nbytes);
821 update_inode:
822         ret = btrfs_update_inode(trans, root, inode);
823 out:
824         if (inode)
825                 iput(inode);
826         return ret;
827 }
828
829 /*
830  * when cleaning up conflicts between the directory names in the
831  * subvolume, directory names in the log and directory names in the
832  * inode back references, we may have to unlink inodes from directories.
833  *
834  * This is a helper function to do the unlink of a specific directory
835  * item
836  */
837 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
838                                       struct btrfs_root *root,
839                                       struct btrfs_path *path,
840                                       struct btrfs_inode *dir,
841                                       struct btrfs_dir_item *di)
842 {
843         struct inode *inode;
844         char *name;
845         int name_len;
846         struct extent_buffer *leaf;
847         struct btrfs_key location;
848         int ret;
849
850         leaf = path->nodes[0];
851
852         btrfs_dir_item_key_to_cpu(leaf, di, &location);
853         name_len = btrfs_dir_name_len(leaf, di);
854         name = kmalloc(name_len, GFP_NOFS);
855         if (!name)
856                 return -ENOMEM;
857
858         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
859         btrfs_release_path(path);
860
861         inode = read_one_inode(root, location.objectid);
862         if (!inode) {
863                 ret = -EIO;
864                 goto out;
865         }
866
867         ret = link_to_fixup_dir(trans, root, path, location.objectid);
868         if (ret)
869                 goto out;
870
871         ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
872                         name_len);
873         if (ret)
874                 goto out;
875         else
876                 ret = btrfs_run_delayed_items(trans);
877 out:
878         kfree(name);
879         iput(inode);
880         return ret;
881 }
882
883 /*
884  * helper function to see if a given name and sequence number found
885  * in an inode back reference are already in a directory and correctly
886  * point to this inode
887  */
888 static noinline int inode_in_dir(struct btrfs_root *root,
889                                  struct btrfs_path *path,
890                                  u64 dirid, u64 objectid, u64 index,
891                                  const char *name, int name_len)
892 {
893         struct btrfs_dir_item *di;
894         struct btrfs_key location;
895         int match = 0;
896
897         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
898                                          index, name, name_len, 0);
899         if (di && !IS_ERR(di)) {
900                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
901                 if (location.objectid != objectid)
902                         goto out;
903         } else
904                 goto out;
905         btrfs_release_path(path);
906
907         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
908         if (di && !IS_ERR(di)) {
909                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
910                 if (location.objectid != objectid)
911                         goto out;
912         } else
913                 goto out;
914         match = 1;
915 out:
916         btrfs_release_path(path);
917         return match;
918 }
919
920 /*
921  * helper function to check a log tree for a named back reference in
922  * an inode.  This is used to decide if a back reference that is
923  * found in the subvolume conflicts with what we find in the log.
924  *
925  * inode backreferences may have multiple refs in a single item,
926  * during replay we process one reference at a time, and we don't
927  * want to delete valid links to a file from the subvolume if that
928  * link is also in the log.
929  */
930 static noinline int backref_in_log(struct btrfs_root *log,
931                                    struct btrfs_key *key,
932                                    u64 ref_objectid,
933                                    const char *name, int namelen)
934 {
935         struct btrfs_path *path;
936         struct btrfs_inode_ref *ref;
937         unsigned long ptr;
938         unsigned long ptr_end;
939         unsigned long name_ptr;
940         int found_name_len;
941         int item_size;
942         int ret;
943         int match = 0;
944
945         path = btrfs_alloc_path();
946         if (!path)
947                 return -ENOMEM;
948
949         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
950         if (ret != 0)
951                 goto out;
952
953         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
954
955         if (key->type == BTRFS_INODE_EXTREF_KEY) {
956                 if (btrfs_find_name_in_ext_backref(path->nodes[0],
957                                                    path->slots[0],
958                                                    ref_objectid,
959                                                    name, namelen, NULL))
960                         match = 1;
961
962                 goto out;
963         }
964
965         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
966         ptr_end = ptr + item_size;
967         while (ptr < ptr_end) {
968                 ref = (struct btrfs_inode_ref *)ptr;
969                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
970                 if (found_name_len == namelen) {
971                         name_ptr = (unsigned long)(ref + 1);
972                         ret = memcmp_extent_buffer(path->nodes[0], name,
973                                                    name_ptr, namelen);
974                         if (ret == 0) {
975                                 match = 1;
976                                 goto out;
977                         }
978                 }
979                 ptr = (unsigned long)(ref + 1) + found_name_len;
980         }
981 out:
982         btrfs_free_path(path);
983         return match;
984 }
985
986 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
987                                   struct btrfs_root *root,
988                                   struct btrfs_path *path,
989                                   struct btrfs_root *log_root,
990                                   struct btrfs_inode *dir,
991                                   struct btrfs_inode *inode,
992                                   u64 inode_objectid, u64 parent_objectid,
993                                   u64 ref_index, char *name, int namelen,
994                                   int *search_done)
995 {
996         int ret;
997         char *victim_name;
998         int victim_name_len;
999         struct extent_buffer *leaf;
1000         struct btrfs_dir_item *di;
1001         struct btrfs_key search_key;
1002         struct btrfs_inode_extref *extref;
1003
1004 again:
1005         /* Search old style refs */
1006         search_key.objectid = inode_objectid;
1007         search_key.type = BTRFS_INODE_REF_KEY;
1008         search_key.offset = parent_objectid;
1009         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1010         if (ret == 0) {
1011                 struct btrfs_inode_ref *victim_ref;
1012                 unsigned long ptr;
1013                 unsigned long ptr_end;
1014
1015                 leaf = path->nodes[0];
1016
1017                 /* are we trying to overwrite a back ref for the root directory
1018                  * if so, just jump out, we're done
1019                  */
1020                 if (search_key.objectid == search_key.offset)
1021                         return 1;
1022
1023                 /* check all the names in this back reference to see
1024                  * if they are in the log.  if so, we allow them to stay
1025                  * otherwise they must be unlinked as a conflict
1026                  */
1027                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1028                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1029                 while (ptr < ptr_end) {
1030                         victim_ref = (struct btrfs_inode_ref *)ptr;
1031                         victim_name_len = btrfs_inode_ref_name_len(leaf,
1032                                                                    victim_ref);
1033                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1034                         if (!victim_name)
1035                                 return -ENOMEM;
1036
1037                         read_extent_buffer(leaf, victim_name,
1038                                            (unsigned long)(victim_ref + 1),
1039                                            victim_name_len);
1040
1041                         if (!backref_in_log(log_root, &search_key,
1042                                             parent_objectid,
1043                                             victim_name,
1044                                             victim_name_len)) {
1045                                 inc_nlink(&inode->vfs_inode);
1046                                 btrfs_release_path(path);
1047
1048                                 ret = btrfs_unlink_inode(trans, root, dir, inode,
1049                                                 victim_name, victim_name_len);
1050                                 kfree(victim_name);
1051                                 if (ret)
1052                                         return ret;
1053                                 ret = btrfs_run_delayed_items(trans);
1054                                 if (ret)
1055                                         return ret;
1056                                 *search_done = 1;
1057                                 goto again;
1058                         }
1059                         kfree(victim_name);
1060
1061                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1062                 }
1063
1064                 /*
1065                  * NOTE: we have searched root tree and checked the
1066                  * corresponding ref, it does not need to check again.
1067                  */
1068                 *search_done = 1;
1069         }
1070         btrfs_release_path(path);
1071
1072         /* Same search but for extended refs */
1073         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1074                                            inode_objectid, parent_objectid, 0,
1075                                            0);
1076         if (!IS_ERR_OR_NULL(extref)) {
1077                 u32 item_size;
1078                 u32 cur_offset = 0;
1079                 unsigned long base;
1080                 struct inode *victim_parent;
1081
1082                 leaf = path->nodes[0];
1083
1084                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1085                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1086
1087                 while (cur_offset < item_size) {
1088                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1089
1090                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1091
1092                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1093                                 goto next;
1094
1095                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1096                         if (!victim_name)
1097                                 return -ENOMEM;
1098                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1099                                            victim_name_len);
1100
1101                         search_key.objectid = inode_objectid;
1102                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1103                         search_key.offset = btrfs_extref_hash(parent_objectid,
1104                                                               victim_name,
1105                                                               victim_name_len);
1106                         ret = 0;
1107                         if (!backref_in_log(log_root, &search_key,
1108                                             parent_objectid, victim_name,
1109                                             victim_name_len)) {
1110                                 ret = -ENOENT;
1111                                 victim_parent = read_one_inode(root,
1112                                                 parent_objectid);
1113                                 if (victim_parent) {
1114                                         inc_nlink(&inode->vfs_inode);
1115                                         btrfs_release_path(path);
1116
1117                                         ret = btrfs_unlink_inode(trans, root,
1118                                                         BTRFS_I(victim_parent),
1119                                                         inode,
1120                                                         victim_name,
1121                                                         victim_name_len);
1122                                         if (!ret)
1123                                                 ret = btrfs_run_delayed_items(
1124                                                                   trans);
1125                                 }
1126                                 iput(victim_parent);
1127                                 kfree(victim_name);
1128                                 if (ret)
1129                                         return ret;
1130                                 *search_done = 1;
1131                                 goto again;
1132                         }
1133                         kfree(victim_name);
1134 next:
1135                         cur_offset += victim_name_len + sizeof(*extref);
1136                 }
1137                 *search_done = 1;
1138         }
1139         btrfs_release_path(path);
1140
1141         /* look for a conflicting sequence number */
1142         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1143                                          ref_index, name, namelen, 0);
1144         if (di && !IS_ERR(di)) {
1145                 ret = drop_one_dir_item(trans, root, path, dir, di);
1146                 if (ret)
1147                         return ret;
1148         }
1149         btrfs_release_path(path);
1150
1151         /* look for a conflicing name */
1152         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1153                                    name, namelen, 0);
1154         if (di && !IS_ERR(di)) {
1155                 ret = drop_one_dir_item(trans, root, path, dir, di);
1156                 if (ret)
1157                         return ret;
1158         }
1159         btrfs_release_path(path);
1160
1161         return 0;
1162 }
1163
1164 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1165                              u32 *namelen, char **name, u64 *index,
1166                              u64 *parent_objectid)
1167 {
1168         struct btrfs_inode_extref *extref;
1169
1170         extref = (struct btrfs_inode_extref *)ref_ptr;
1171
1172         *namelen = btrfs_inode_extref_name_len(eb, extref);
1173         *name = kmalloc(*namelen, GFP_NOFS);
1174         if (*name == NULL)
1175                 return -ENOMEM;
1176
1177         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1178                            *namelen);
1179
1180         if (index)
1181                 *index = btrfs_inode_extref_index(eb, extref);
1182         if (parent_objectid)
1183                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1184
1185         return 0;
1186 }
1187
1188 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1189                           u32 *namelen, char **name, u64 *index)
1190 {
1191         struct btrfs_inode_ref *ref;
1192
1193         ref = (struct btrfs_inode_ref *)ref_ptr;
1194
1195         *namelen = btrfs_inode_ref_name_len(eb, ref);
1196         *name = kmalloc(*namelen, GFP_NOFS);
1197         if (*name == NULL)
1198                 return -ENOMEM;
1199
1200         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1201
1202         if (index)
1203                 *index = btrfs_inode_ref_index(eb, ref);
1204
1205         return 0;
1206 }
1207
1208 /*
1209  * Take an inode reference item from the log tree and iterate all names from the
1210  * inode reference item in the subvolume tree with the same key (if it exists).
1211  * For any name that is not in the inode reference item from the log tree, do a
1212  * proper unlink of that name (that is, remove its entry from the inode
1213  * reference item and both dir index keys).
1214  */
1215 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1216                                  struct btrfs_root *root,
1217                                  struct btrfs_path *path,
1218                                  struct btrfs_inode *inode,
1219                                  struct extent_buffer *log_eb,
1220                                  int log_slot,
1221                                  struct btrfs_key *key)
1222 {
1223         int ret;
1224         unsigned long ref_ptr;
1225         unsigned long ref_end;
1226         struct extent_buffer *eb;
1227
1228 again:
1229         btrfs_release_path(path);
1230         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1231         if (ret > 0) {
1232                 ret = 0;
1233                 goto out;
1234         }
1235         if (ret < 0)
1236                 goto out;
1237
1238         eb = path->nodes[0];
1239         ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1240         ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
1241         while (ref_ptr < ref_end) {
1242                 char *name = NULL;
1243                 int namelen;
1244                 u64 parent_id;
1245
1246                 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1247                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1248                                                 NULL, &parent_id);
1249                 } else {
1250                         parent_id = key->offset;
1251                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1252                                              NULL);
1253                 }
1254                 if (ret)
1255                         goto out;
1256
1257                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1258                         ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
1259                                                              parent_id, name,
1260                                                              namelen, NULL);
1261                 else
1262                         ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
1263                                                          namelen, NULL);
1264
1265                 if (!ret) {
1266                         struct inode *dir;
1267
1268                         btrfs_release_path(path);
1269                         dir = read_one_inode(root, parent_id);
1270                         if (!dir) {
1271                                 ret = -ENOENT;
1272                                 kfree(name);
1273                                 goto out;
1274                         }
1275                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
1276                                                  inode, name, namelen);
1277                         kfree(name);
1278                         iput(dir);
1279                         if (ret)
1280                                 goto out;
1281                         goto again;
1282                 }
1283
1284                 kfree(name);
1285                 ref_ptr += namelen;
1286                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1287                         ref_ptr += sizeof(struct btrfs_inode_extref);
1288                 else
1289                         ref_ptr += sizeof(struct btrfs_inode_ref);
1290         }
1291         ret = 0;
1292  out:
1293         btrfs_release_path(path);
1294         return ret;
1295 }
1296
1297 /*
1298  * replay one inode back reference item found in the log tree.
1299  * eb, slot and key refer to the buffer and key found in the log tree.
1300  * root is the destination we are replaying into, and path is for temp
1301  * use by this function.  (it should be released on return).
1302  */
1303 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1304                                   struct btrfs_root *root,
1305                                   struct btrfs_root *log,
1306                                   struct btrfs_path *path,
1307                                   struct extent_buffer *eb, int slot,
1308                                   struct btrfs_key *key)
1309 {
1310         struct inode *dir = NULL;
1311         struct inode *inode = NULL;
1312         unsigned long ref_ptr;
1313         unsigned long ref_end;
1314         char *name = NULL;
1315         int namelen;
1316         int ret;
1317         int search_done = 0;
1318         int log_ref_ver = 0;
1319         u64 parent_objectid;
1320         u64 inode_objectid;
1321         u64 ref_index = 0;
1322         int ref_struct_size;
1323
1324         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1325         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1326
1327         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1328                 struct btrfs_inode_extref *r;
1329
1330                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1331                 log_ref_ver = 1;
1332                 r = (struct btrfs_inode_extref *)ref_ptr;
1333                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1334         } else {
1335                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1336                 parent_objectid = key->offset;
1337         }
1338         inode_objectid = key->objectid;
1339
1340         /*
1341          * it is possible that we didn't log all the parent directories
1342          * for a given inode.  If we don't find the dir, just don't
1343          * copy the back ref in.  The link count fixup code will take
1344          * care of the rest
1345          */
1346         dir = read_one_inode(root, parent_objectid);
1347         if (!dir) {
1348                 ret = -ENOENT;
1349                 goto out;
1350         }
1351
1352         inode = read_one_inode(root, inode_objectid);
1353         if (!inode) {
1354                 ret = -EIO;
1355                 goto out;
1356         }
1357
1358         while (ref_ptr < ref_end) {
1359                 if (log_ref_ver) {
1360                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1361                                                 &ref_index, &parent_objectid);
1362                         /*
1363                          * parent object can change from one array
1364                          * item to another.
1365                          */
1366                         if (!dir)
1367                                 dir = read_one_inode(root, parent_objectid);
1368                         if (!dir) {
1369                                 ret = -ENOENT;
1370                                 goto out;
1371                         }
1372                 } else {
1373                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1374                                              &ref_index);
1375                 }
1376                 if (ret)
1377                         goto out;
1378
1379                 /* if we already have a perfect match, we're done */
1380                 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1381                                         btrfs_ino(BTRFS_I(inode)), ref_index,
1382                                         name, namelen)) {
1383                         /*
1384                          * look for a conflicting back reference in the
1385                          * metadata. if we find one we have to unlink that name
1386                          * of the file before we add our new link.  Later on, we
1387                          * overwrite any existing back reference, and we don't
1388                          * want to create dangling pointers in the directory.
1389                          */
1390
1391                         if (!search_done) {
1392                                 ret = __add_inode_ref(trans, root, path, log,
1393                                                       BTRFS_I(dir),
1394                                                       BTRFS_I(inode),
1395                                                       inode_objectid,
1396                                                       parent_objectid,
1397                                                       ref_index, name, namelen,
1398                                                       &search_done);
1399                                 if (ret) {
1400                                         if (ret == 1)
1401                                                 ret = 0;
1402                                         goto out;
1403                                 }
1404                         }
1405
1406                         /* insert our name */
1407                         ret = btrfs_add_link(trans, BTRFS_I(dir),
1408                                         BTRFS_I(inode),
1409                                         name, namelen, 0, ref_index);
1410                         if (ret)
1411                                 goto out;
1412
1413                         btrfs_update_inode(trans, root, inode);
1414                 }
1415
1416                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1417                 kfree(name);
1418                 name = NULL;
1419                 if (log_ref_ver) {
1420                         iput(dir);
1421                         dir = NULL;
1422                 }
1423         }
1424
1425         /*
1426          * Before we overwrite the inode reference item in the subvolume tree
1427          * with the item from the log tree, we must unlink all names from the
1428          * parent directory that are in the subvolume's tree inode reference
1429          * item, otherwise we end up with an inconsistent subvolume tree where
1430          * dir index entries exist for a name but there is no inode reference
1431          * item with the same name.
1432          */
1433         ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1434                                     key);
1435         if (ret)
1436                 goto out;
1437
1438         /* finally write the back reference in the inode */
1439         ret = overwrite_item(trans, root, path, eb, slot, key);
1440 out:
1441         btrfs_release_path(path);
1442         kfree(name);
1443         iput(dir);
1444         iput(inode);
1445         return ret;
1446 }
1447
1448 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1449                               struct btrfs_root *root, u64 ino)
1450 {
1451         int ret;
1452
1453         ret = btrfs_insert_orphan_item(trans, root, ino);
1454         if (ret == -EEXIST)
1455                 ret = 0;
1456
1457         return ret;
1458 }
1459
1460 static int count_inode_extrefs(struct btrfs_root *root,
1461                 struct btrfs_inode *inode, struct btrfs_path *path)
1462 {
1463         int ret = 0;
1464         int name_len;
1465         unsigned int nlink = 0;
1466         u32 item_size;
1467         u32 cur_offset = 0;
1468         u64 inode_objectid = btrfs_ino(inode);
1469         u64 offset = 0;
1470         unsigned long ptr;
1471         struct btrfs_inode_extref *extref;
1472         struct extent_buffer *leaf;
1473
1474         while (1) {
1475                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1476                                             &extref, &offset);
1477                 if (ret)
1478                         break;
1479
1480                 leaf = path->nodes[0];
1481                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1482                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1483                 cur_offset = 0;
1484
1485                 while (cur_offset < item_size) {
1486                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1487                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1488
1489                         nlink++;
1490
1491                         cur_offset += name_len + sizeof(*extref);
1492                 }
1493
1494                 offset++;
1495                 btrfs_release_path(path);
1496         }
1497         btrfs_release_path(path);
1498
1499         if (ret < 0 && ret != -ENOENT)
1500                 return ret;
1501         return nlink;
1502 }
1503
1504 static int count_inode_refs(struct btrfs_root *root,
1505                         struct btrfs_inode *inode, struct btrfs_path *path)
1506 {
1507         int ret;
1508         struct btrfs_key key;
1509         unsigned int nlink = 0;
1510         unsigned long ptr;
1511         unsigned long ptr_end;
1512         int name_len;
1513         u64 ino = btrfs_ino(inode);
1514
1515         key.objectid = ino;
1516         key.type = BTRFS_INODE_REF_KEY;
1517         key.offset = (u64)-1;
1518
1519         while (1) {
1520                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1521                 if (ret < 0)
1522                         break;
1523                 if (ret > 0) {
1524                         if (path->slots[0] == 0)
1525                                 break;
1526                         path->slots[0]--;
1527                 }
1528 process_slot:
1529                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1530                                       path->slots[0]);
1531                 if (key.objectid != ino ||
1532                     key.type != BTRFS_INODE_REF_KEY)
1533                         break;
1534                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1535                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1536                                                    path->slots[0]);
1537                 while (ptr < ptr_end) {
1538                         struct btrfs_inode_ref *ref;
1539
1540                         ref = (struct btrfs_inode_ref *)ptr;
1541                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1542                                                             ref);
1543                         ptr = (unsigned long)(ref + 1) + name_len;
1544                         nlink++;
1545                 }
1546
1547                 if (key.offset == 0)
1548                         break;
1549                 if (path->slots[0] > 0) {
1550                         path->slots[0]--;
1551                         goto process_slot;
1552                 }
1553                 key.offset--;
1554                 btrfs_release_path(path);
1555         }
1556         btrfs_release_path(path);
1557
1558         return nlink;
1559 }
1560
1561 /*
1562  * There are a few corners where the link count of the file can't
1563  * be properly maintained during replay.  So, instead of adding
1564  * lots of complexity to the log code, we just scan the backrefs
1565  * for any file that has been through replay.
1566  *
1567  * The scan will update the link count on the inode to reflect the
1568  * number of back refs found.  If it goes down to zero, the iput
1569  * will free the inode.
1570  */
1571 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1572                                            struct btrfs_root *root,
1573                                            struct inode *inode)
1574 {
1575         struct btrfs_path *path;
1576         int ret;
1577         u64 nlink = 0;
1578         u64 ino = btrfs_ino(BTRFS_I(inode));
1579
1580         path = btrfs_alloc_path();
1581         if (!path)
1582                 return -ENOMEM;
1583
1584         ret = count_inode_refs(root, BTRFS_I(inode), path);
1585         if (ret < 0)
1586                 goto out;
1587
1588         nlink = ret;
1589
1590         ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1591         if (ret < 0)
1592                 goto out;
1593
1594         nlink += ret;
1595
1596         ret = 0;
1597
1598         if (nlink != inode->i_nlink) {
1599                 set_nlink(inode, nlink);
1600                 btrfs_update_inode(trans, root, inode);
1601         }
1602         BTRFS_I(inode)->index_cnt = (u64)-1;
1603
1604         if (inode->i_nlink == 0) {
1605                 if (S_ISDIR(inode->i_mode)) {
1606                         ret = replay_dir_deletes(trans, root, NULL, path,
1607                                                  ino, 1);
1608                         if (ret)
1609                                 goto out;
1610                 }
1611                 ret = insert_orphan_item(trans, root, ino);
1612         }
1613
1614 out:
1615         btrfs_free_path(path);
1616         return ret;
1617 }
1618
1619 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1620                                             struct btrfs_root *root,
1621                                             struct btrfs_path *path)
1622 {
1623         int ret;
1624         struct btrfs_key key;
1625         struct inode *inode;
1626
1627         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1628         key.type = BTRFS_ORPHAN_ITEM_KEY;
1629         key.offset = (u64)-1;
1630         while (1) {
1631                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1632                 if (ret < 0)
1633                         break;
1634
1635                 if (ret == 1) {
1636                         if (path->slots[0] == 0)
1637                                 break;
1638                         path->slots[0]--;
1639                 }
1640
1641                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1642                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1643                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1644                         break;
1645
1646                 ret = btrfs_del_item(trans, root, path);
1647                 if (ret)
1648                         goto out;
1649
1650                 btrfs_release_path(path);
1651                 inode = read_one_inode(root, key.offset);
1652                 if (!inode)
1653                         return -EIO;
1654
1655                 ret = fixup_inode_link_count(trans, root, inode);
1656                 iput(inode);
1657                 if (ret)
1658                         goto out;
1659
1660                 /*
1661                  * fixup on a directory may create new entries,
1662                  * make sure we always look for the highset possible
1663                  * offset
1664                  */
1665                 key.offset = (u64)-1;
1666         }
1667         ret = 0;
1668 out:
1669         btrfs_release_path(path);
1670         return ret;
1671 }
1672
1673
1674 /*
1675  * record a given inode in the fixup dir so we can check its link
1676  * count when replay is done.  The link count is incremented here
1677  * so the inode won't go away until we check it
1678  */
1679 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1680                                       struct btrfs_root *root,
1681                                       struct btrfs_path *path,
1682                                       u64 objectid)
1683 {
1684         struct btrfs_key key;
1685         int ret = 0;
1686         struct inode *inode;
1687
1688         inode = read_one_inode(root, objectid);
1689         if (!inode)
1690                 return -EIO;
1691
1692         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1693         key.type = BTRFS_ORPHAN_ITEM_KEY;
1694         key.offset = objectid;
1695
1696         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1697
1698         btrfs_release_path(path);
1699         if (ret == 0) {
1700                 if (!inode->i_nlink)
1701                         set_nlink(inode, 1);
1702                 else
1703                         inc_nlink(inode);
1704                 ret = btrfs_update_inode(trans, root, inode);
1705         } else if (ret == -EEXIST) {
1706                 ret = 0;
1707         } else {
1708                 BUG(); /* Logic Error */
1709         }
1710         iput(inode);
1711
1712         return ret;
1713 }
1714
1715 /*
1716  * when replaying the log for a directory, we only insert names
1717  * for inodes that actually exist.  This means an fsync on a directory
1718  * does not implicitly fsync all the new files in it
1719  */
1720 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1721                                     struct btrfs_root *root,
1722                                     u64 dirid, u64 index,
1723                                     char *name, int name_len,
1724                                     struct btrfs_key *location)
1725 {
1726         struct inode *inode;
1727         struct inode *dir;
1728         int ret;
1729
1730         inode = read_one_inode(root, location->objectid);
1731         if (!inode)
1732                 return -ENOENT;
1733
1734         dir = read_one_inode(root, dirid);
1735         if (!dir) {
1736                 iput(inode);
1737                 return -EIO;
1738         }
1739
1740         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1741                         name_len, 1, index);
1742
1743         /* FIXME, put inode into FIXUP list */
1744
1745         iput(inode);
1746         iput(dir);
1747         return ret;
1748 }
1749
1750 /*
1751  * Return true if an inode reference exists in the log for the given name,
1752  * inode and parent inode.
1753  */
1754 static bool name_in_log_ref(struct btrfs_root *log_root,
1755                             const char *name, const int name_len,
1756                             const u64 dirid, const u64 ino)
1757 {
1758         struct btrfs_key search_key;
1759
1760         search_key.objectid = ino;
1761         search_key.type = BTRFS_INODE_REF_KEY;
1762         search_key.offset = dirid;
1763         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1764                 return true;
1765
1766         search_key.type = BTRFS_INODE_EXTREF_KEY;
1767         search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1768         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1769                 return true;
1770
1771         return false;
1772 }
1773
1774 /*
1775  * take a single entry in a log directory item and replay it into
1776  * the subvolume.
1777  *
1778  * if a conflicting item exists in the subdirectory already,
1779  * the inode it points to is unlinked and put into the link count
1780  * fix up tree.
1781  *
1782  * If a name from the log points to a file or directory that does
1783  * not exist in the FS, it is skipped.  fsyncs on directories
1784  * do not force down inodes inside that directory, just changes to the
1785  * names or unlinks in a directory.
1786  *
1787  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1788  * non-existing inode) and 1 if the name was replayed.
1789  */
1790 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1791                                     struct btrfs_root *root,
1792                                     struct btrfs_path *path,
1793                                     struct extent_buffer *eb,
1794                                     struct btrfs_dir_item *di,
1795                                     struct btrfs_key *key)
1796 {
1797         char *name;
1798         int name_len;
1799         struct btrfs_dir_item *dst_di;
1800         struct btrfs_key found_key;
1801         struct btrfs_key log_key;
1802         struct inode *dir;
1803         u8 log_type;
1804         int exists;
1805         int ret = 0;
1806         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1807         bool name_added = false;
1808
1809         dir = read_one_inode(root, key->objectid);
1810         if (!dir)
1811                 return -EIO;
1812
1813         name_len = btrfs_dir_name_len(eb, di);
1814         name = kmalloc(name_len, GFP_NOFS);
1815         if (!name) {
1816                 ret = -ENOMEM;
1817                 goto out;
1818         }
1819
1820         log_type = btrfs_dir_type(eb, di);
1821         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1822                    name_len);
1823
1824         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1825         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1826         if (exists == 0)
1827                 exists = 1;
1828         else
1829                 exists = 0;
1830         btrfs_release_path(path);
1831
1832         if (key->type == BTRFS_DIR_ITEM_KEY) {
1833                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1834                                        name, name_len, 1);
1835         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1836                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1837                                                      key->objectid,
1838                                                      key->offset, name,
1839                                                      name_len, 1);
1840         } else {
1841                 /* Corruption */
1842                 ret = -EINVAL;
1843                 goto out;
1844         }
1845         if (IS_ERR_OR_NULL(dst_di)) {
1846                 /* we need a sequence number to insert, so we only
1847                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1848                  */
1849                 if (key->type != BTRFS_DIR_INDEX_KEY)
1850                         goto out;
1851                 goto insert;
1852         }
1853
1854         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1855         /* the existing item matches the logged item */
1856         if (found_key.objectid == log_key.objectid &&
1857             found_key.type == log_key.type &&
1858             found_key.offset == log_key.offset &&
1859             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1860                 update_size = false;
1861                 goto out;
1862         }
1863
1864         /*
1865          * don't drop the conflicting directory entry if the inode
1866          * for the new entry doesn't exist
1867          */
1868         if (!exists)
1869                 goto out;
1870
1871         ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
1872         if (ret)
1873                 goto out;
1874
1875         if (key->type == BTRFS_DIR_INDEX_KEY)
1876                 goto insert;
1877 out:
1878         btrfs_release_path(path);
1879         if (!ret && update_size) {
1880                 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
1881                 ret = btrfs_update_inode(trans, root, dir);
1882         }
1883         kfree(name);
1884         iput(dir);
1885         if (!ret && name_added)
1886                 ret = 1;
1887         return ret;
1888
1889 insert:
1890         if (name_in_log_ref(root->log_root, name, name_len,
1891                             key->objectid, log_key.objectid)) {
1892                 /* The dentry will be added later. */
1893                 ret = 0;
1894                 update_size = false;
1895                 goto out;
1896         }
1897         btrfs_release_path(path);
1898         ret = insert_one_name(trans, root, key->objectid, key->offset,
1899                               name, name_len, &log_key);
1900         if (ret && ret != -ENOENT && ret != -EEXIST)
1901                 goto out;
1902         if (!ret)
1903                 name_added = true;
1904         update_size = false;
1905         ret = 0;
1906         goto out;
1907 }
1908
1909 /*
1910  * find all the names in a directory item and reconcile them into
1911  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1912  * one name in a directory item, but the same code gets used for
1913  * both directory index types
1914  */
1915 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1916                                         struct btrfs_root *root,
1917                                         struct btrfs_path *path,
1918                                         struct extent_buffer *eb, int slot,
1919                                         struct btrfs_key *key)
1920 {
1921         int ret = 0;
1922         u32 item_size = btrfs_item_size_nr(eb, slot);
1923         struct btrfs_dir_item *di;
1924         int name_len;
1925         unsigned long ptr;
1926         unsigned long ptr_end;
1927         struct btrfs_path *fixup_path = NULL;
1928
1929         ptr = btrfs_item_ptr_offset(eb, slot);
1930         ptr_end = ptr + item_size;
1931         while (ptr < ptr_end) {
1932                 di = (struct btrfs_dir_item *)ptr;
1933                 name_len = btrfs_dir_name_len(eb, di);
1934                 ret = replay_one_name(trans, root, path, eb, di, key);
1935                 if (ret < 0)
1936                         break;
1937                 ptr = (unsigned long)(di + 1);
1938                 ptr += name_len;
1939
1940                 /*
1941                  * If this entry refers to a non-directory (directories can not
1942                  * have a link count > 1) and it was added in the transaction
1943                  * that was not committed, make sure we fixup the link count of
1944                  * the inode it the entry points to. Otherwise something like
1945                  * the following would result in a directory pointing to an
1946                  * inode with a wrong link that does not account for this dir
1947                  * entry:
1948                  *
1949                  * mkdir testdir
1950                  * touch testdir/foo
1951                  * touch testdir/bar
1952                  * sync
1953                  *
1954                  * ln testdir/bar testdir/bar_link
1955                  * ln testdir/foo testdir/foo_link
1956                  * xfs_io -c "fsync" testdir/bar
1957                  *
1958                  * <power failure>
1959                  *
1960                  * mount fs, log replay happens
1961                  *
1962                  * File foo would remain with a link count of 1 when it has two
1963                  * entries pointing to it in the directory testdir. This would
1964                  * make it impossible to ever delete the parent directory has
1965                  * it would result in stale dentries that can never be deleted.
1966                  */
1967                 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1968                         struct btrfs_key di_key;
1969
1970                         if (!fixup_path) {
1971                                 fixup_path = btrfs_alloc_path();
1972                                 if (!fixup_path) {
1973                                         ret = -ENOMEM;
1974                                         break;
1975                                 }
1976                         }
1977
1978                         btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1979                         ret = link_to_fixup_dir(trans, root, fixup_path,
1980                                                 di_key.objectid);
1981                         if (ret)
1982                                 break;
1983                 }
1984                 ret = 0;
1985         }
1986         btrfs_free_path(fixup_path);
1987         return ret;
1988 }
1989
1990 /*
1991  * directory replay has two parts.  There are the standard directory
1992  * items in the log copied from the subvolume, and range items
1993  * created in the log while the subvolume was logged.
1994  *
1995  * The range items tell us which parts of the key space the log
1996  * is authoritative for.  During replay, if a key in the subvolume
1997  * directory is in a logged range item, but not actually in the log
1998  * that means it was deleted from the directory before the fsync
1999  * and should be removed.
2000  */
2001 static noinline int find_dir_range(struct btrfs_root *root,
2002                                    struct btrfs_path *path,
2003                                    u64 dirid, int key_type,
2004                                    u64 *start_ret, u64 *end_ret)
2005 {
2006         struct btrfs_key key;
2007         u64 found_end;
2008         struct btrfs_dir_log_item *item;
2009         int ret;
2010         int nritems;
2011
2012         if (*start_ret == (u64)-1)
2013                 return 1;
2014
2015         key.objectid = dirid;
2016         key.type = key_type;
2017         key.offset = *start_ret;
2018
2019         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2020         if (ret < 0)
2021                 goto out;
2022         if (ret > 0) {
2023                 if (path->slots[0] == 0)
2024                         goto out;
2025                 path->slots[0]--;
2026         }
2027         if (ret != 0)
2028                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2029
2030         if (key.type != key_type || key.objectid != dirid) {
2031                 ret = 1;
2032                 goto next;
2033         }
2034         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2035                               struct btrfs_dir_log_item);
2036         found_end = btrfs_dir_log_end(path->nodes[0], item);
2037
2038         if (*start_ret >= key.offset && *start_ret <= found_end) {
2039                 ret = 0;
2040                 *start_ret = key.offset;
2041                 *end_ret = found_end;
2042                 goto out;
2043         }
2044         ret = 1;
2045 next:
2046         /* check the next slot in the tree to see if it is a valid item */
2047         nritems = btrfs_header_nritems(path->nodes[0]);
2048         path->slots[0]++;
2049         if (path->slots[0] >= nritems) {
2050                 ret = btrfs_next_leaf(root, path);
2051                 if (ret)
2052                         goto out;
2053         }
2054
2055         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2056
2057         if (key.type != key_type || key.objectid != dirid) {
2058                 ret = 1;
2059                 goto out;
2060         }
2061         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2062                               struct btrfs_dir_log_item);
2063         found_end = btrfs_dir_log_end(path->nodes[0], item);
2064         *start_ret = key.offset;
2065         *end_ret = found_end;
2066         ret = 0;
2067 out:
2068         btrfs_release_path(path);
2069         return ret;
2070 }
2071
2072 /*
2073  * this looks for a given directory item in the log.  If the directory
2074  * item is not in the log, the item is removed and the inode it points
2075  * to is unlinked
2076  */
2077 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2078                                       struct btrfs_root *root,
2079                                       struct btrfs_root *log,
2080                                       struct btrfs_path *path,
2081                                       struct btrfs_path *log_path,
2082                                       struct inode *dir,
2083                                       struct btrfs_key *dir_key)
2084 {
2085         int ret;
2086         struct extent_buffer *eb;
2087         int slot;
2088         u32 item_size;
2089         struct btrfs_dir_item *di;
2090         struct btrfs_dir_item *log_di;
2091         int name_len;
2092         unsigned long ptr;
2093         unsigned long ptr_end;
2094         char *name;
2095         struct inode *inode;
2096         struct btrfs_key location;
2097
2098 again:
2099         eb = path->nodes[0];
2100         slot = path->slots[0];
2101         item_size = btrfs_item_size_nr(eb, slot);
2102         ptr = btrfs_item_ptr_offset(eb, slot);
2103         ptr_end = ptr + item_size;
2104         while (ptr < ptr_end) {
2105                 di = (struct btrfs_dir_item *)ptr;
2106                 name_len = btrfs_dir_name_len(eb, di);
2107                 name = kmalloc(name_len, GFP_NOFS);
2108                 if (!name) {
2109                         ret = -ENOMEM;
2110                         goto out;
2111                 }
2112                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
2113                                   name_len);
2114                 log_di = NULL;
2115                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2116                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
2117                                                        dir_key->objectid,
2118                                                        name, name_len, 0);
2119                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2120                         log_di = btrfs_lookup_dir_index_item(trans, log,
2121                                                      log_path,
2122                                                      dir_key->objectid,
2123                                                      dir_key->offset,
2124                                                      name, name_len, 0);
2125                 }
2126                 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2127                         btrfs_dir_item_key_to_cpu(eb, di, &location);
2128                         btrfs_release_path(path);
2129                         btrfs_release_path(log_path);
2130                         inode = read_one_inode(root, location.objectid);
2131                         if (!inode) {
2132                                 kfree(name);
2133                                 return -EIO;
2134                         }
2135
2136                         ret = link_to_fixup_dir(trans, root,
2137                                                 path, location.objectid);
2138                         if (ret) {
2139                                 kfree(name);
2140                                 iput(inode);
2141                                 goto out;
2142                         }
2143
2144                         inc_nlink(inode);
2145                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2146                                         BTRFS_I(inode), name, name_len);
2147                         if (!ret)
2148                                 ret = btrfs_run_delayed_items(trans);
2149                         kfree(name);
2150                         iput(inode);
2151                         if (ret)
2152                                 goto out;
2153
2154                         /* there might still be more names under this key
2155                          * check and repeat if required
2156                          */
2157                         ret = btrfs_search_slot(NULL, root, dir_key, path,
2158                                                 0, 0);
2159                         if (ret == 0)
2160                                 goto again;
2161                         ret = 0;
2162                         goto out;
2163                 } else if (IS_ERR(log_di)) {
2164                         kfree(name);
2165                         return PTR_ERR(log_di);
2166                 }
2167                 btrfs_release_path(log_path);
2168                 kfree(name);
2169
2170                 ptr = (unsigned long)(di + 1);
2171                 ptr += name_len;
2172         }
2173         ret = 0;
2174 out:
2175         btrfs_release_path(path);
2176         btrfs_release_path(log_path);
2177         return ret;
2178 }
2179
2180 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2181                               struct btrfs_root *root,
2182                               struct btrfs_root *log,
2183                               struct btrfs_path *path,
2184                               const u64 ino)
2185 {
2186         struct btrfs_key search_key;
2187         struct btrfs_path *log_path;
2188         int i;
2189         int nritems;
2190         int ret;
2191
2192         log_path = btrfs_alloc_path();
2193         if (!log_path)
2194                 return -ENOMEM;
2195
2196         search_key.objectid = ino;
2197         search_key.type = BTRFS_XATTR_ITEM_KEY;
2198         search_key.offset = 0;
2199 again:
2200         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2201         if (ret < 0)
2202                 goto out;
2203 process_leaf:
2204         nritems = btrfs_header_nritems(path->nodes[0]);
2205         for (i = path->slots[0]; i < nritems; i++) {
2206                 struct btrfs_key key;
2207                 struct btrfs_dir_item *di;
2208                 struct btrfs_dir_item *log_di;
2209                 u32 total_size;
2210                 u32 cur;
2211
2212                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2213                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2214                         ret = 0;
2215                         goto out;
2216                 }
2217
2218                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2219                 total_size = btrfs_item_size_nr(path->nodes[0], i);
2220                 cur = 0;
2221                 while (cur < total_size) {
2222                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2223                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2224                         u32 this_len = sizeof(*di) + name_len + data_len;
2225                         char *name;
2226
2227                         name = kmalloc(name_len, GFP_NOFS);
2228                         if (!name) {
2229                                 ret = -ENOMEM;
2230                                 goto out;
2231                         }
2232                         read_extent_buffer(path->nodes[0], name,
2233                                            (unsigned long)(di + 1), name_len);
2234
2235                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2236                                                     name, name_len, 0);
2237                         btrfs_release_path(log_path);
2238                         if (!log_di) {
2239                                 /* Doesn't exist in log tree, so delete it. */
2240                                 btrfs_release_path(path);
2241                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2242                                                         name, name_len, -1);
2243                                 kfree(name);
2244                                 if (IS_ERR(di)) {
2245                                         ret = PTR_ERR(di);
2246                                         goto out;
2247                                 }
2248                                 ASSERT(di);
2249                                 ret = btrfs_delete_one_dir_name(trans, root,
2250                                                                 path, di);
2251                                 if (ret)
2252                                         goto out;
2253                                 btrfs_release_path(path);
2254                                 search_key = key;
2255                                 goto again;
2256                         }
2257                         kfree(name);
2258                         if (IS_ERR(log_di)) {
2259                                 ret = PTR_ERR(log_di);
2260                                 goto out;
2261                         }
2262                         cur += this_len;
2263                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2264                 }
2265         }
2266         ret = btrfs_next_leaf(root, path);
2267         if (ret > 0)
2268                 ret = 0;
2269         else if (ret == 0)
2270                 goto process_leaf;
2271 out:
2272         btrfs_free_path(log_path);
2273         btrfs_release_path(path);
2274         return ret;
2275 }
2276
2277
2278 /*
2279  * deletion replay happens before we copy any new directory items
2280  * out of the log or out of backreferences from inodes.  It
2281  * scans the log to find ranges of keys that log is authoritative for,
2282  * and then scans the directory to find items in those ranges that are
2283  * not present in the log.
2284  *
2285  * Anything we don't find in the log is unlinked and removed from the
2286  * directory.
2287  */
2288 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2289                                        struct btrfs_root *root,
2290                                        struct btrfs_root *log,
2291                                        struct btrfs_path *path,
2292                                        u64 dirid, int del_all)
2293 {
2294         u64 range_start;
2295         u64 range_end;
2296         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2297         int ret = 0;
2298         struct btrfs_key dir_key;
2299         struct btrfs_key found_key;
2300         struct btrfs_path *log_path;
2301         struct inode *dir;
2302
2303         dir_key.objectid = dirid;
2304         dir_key.type = BTRFS_DIR_ITEM_KEY;
2305         log_path = btrfs_alloc_path();
2306         if (!log_path)
2307                 return -ENOMEM;
2308
2309         dir = read_one_inode(root, dirid);
2310         /* it isn't an error if the inode isn't there, that can happen
2311          * because we replay the deletes before we copy in the inode item
2312          * from the log
2313          */
2314         if (!dir) {
2315                 btrfs_free_path(log_path);
2316                 return 0;
2317         }
2318 again:
2319         range_start = 0;
2320         range_end = 0;
2321         while (1) {
2322                 if (del_all)
2323                         range_end = (u64)-1;
2324                 else {
2325                         ret = find_dir_range(log, path, dirid, key_type,
2326                                              &range_start, &range_end);
2327                         if (ret != 0)
2328                                 break;
2329                 }
2330
2331                 dir_key.offset = range_start;
2332                 while (1) {
2333                         int nritems;
2334                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2335                                                 0, 0);
2336                         if (ret < 0)
2337                                 goto out;
2338
2339                         nritems = btrfs_header_nritems(path->nodes[0]);
2340                         if (path->slots[0] >= nritems) {
2341                                 ret = btrfs_next_leaf(root, path);
2342                                 if (ret == 1)
2343                                         break;
2344                                 else if (ret < 0)
2345                                         goto out;
2346                         }
2347                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2348                                               path->slots[0]);
2349                         if (found_key.objectid != dirid ||
2350                             found_key.type != dir_key.type)
2351                                 goto next_type;
2352
2353                         if (found_key.offset > range_end)
2354                                 break;
2355
2356                         ret = check_item_in_log(trans, root, log, path,
2357                                                 log_path, dir,
2358                                                 &found_key);
2359                         if (ret)
2360                                 goto out;
2361                         if (found_key.offset == (u64)-1)
2362                                 break;
2363                         dir_key.offset = found_key.offset + 1;
2364                 }
2365                 btrfs_release_path(path);
2366                 if (range_end == (u64)-1)
2367                         break;
2368                 range_start = range_end + 1;
2369         }
2370
2371 next_type:
2372         ret = 0;
2373         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2374                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
2375                 dir_key.type = BTRFS_DIR_INDEX_KEY;
2376                 btrfs_release_path(path);
2377                 goto again;
2378         }
2379 out:
2380         btrfs_release_path(path);
2381         btrfs_free_path(log_path);
2382         iput(dir);
2383         return ret;
2384 }
2385
2386 /*
2387  * the process_func used to replay items from the log tree.  This
2388  * gets called in two different stages.  The first stage just looks
2389  * for inodes and makes sure they are all copied into the subvolume.
2390  *
2391  * The second stage copies all the other item types from the log into
2392  * the subvolume.  The two stage approach is slower, but gets rid of
2393  * lots of complexity around inodes referencing other inodes that exist
2394  * only in the log (references come from either directory items or inode
2395  * back refs).
2396  */
2397 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2398                              struct walk_control *wc, u64 gen, int level)
2399 {
2400         int nritems;
2401         struct btrfs_path *path;
2402         struct btrfs_root *root = wc->replay_dest;
2403         struct btrfs_key key;
2404         int i;
2405         int ret;
2406
2407         ret = btrfs_read_buffer(eb, gen, level, NULL);
2408         if (ret)
2409                 return ret;
2410
2411         level = btrfs_header_level(eb);
2412
2413         if (level != 0)
2414                 return 0;
2415
2416         path = btrfs_alloc_path();
2417         if (!path)
2418                 return -ENOMEM;
2419
2420         nritems = btrfs_header_nritems(eb);
2421         for (i = 0; i < nritems; i++) {
2422                 btrfs_item_key_to_cpu(eb, &key, i);
2423
2424                 /* inode keys are done during the first stage */
2425                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2426                     wc->stage == LOG_WALK_REPLAY_INODES) {
2427                         struct btrfs_inode_item *inode_item;
2428                         u32 mode;
2429
2430                         inode_item = btrfs_item_ptr(eb, i,
2431                                             struct btrfs_inode_item);
2432                         ret = replay_xattr_deletes(wc->trans, root, log,
2433                                                    path, key.objectid);
2434                         if (ret)
2435                                 break;
2436                         mode = btrfs_inode_mode(eb, inode_item);
2437                         if (S_ISDIR(mode)) {
2438                                 ret = replay_dir_deletes(wc->trans,
2439                                          root, log, path, key.objectid, 0);
2440                                 if (ret)
2441                                         break;
2442                         }
2443                         ret = overwrite_item(wc->trans, root, path,
2444                                              eb, i, &key);
2445                         if (ret)
2446                                 break;
2447
2448                         /*
2449                          * Before replaying extents, truncate the inode to its
2450                          * size. We need to do it now and not after log replay
2451                          * because before an fsync we can have prealloc extents
2452                          * added beyond the inode's i_size. If we did it after,
2453                          * through orphan cleanup for example, we would drop
2454                          * those prealloc extents just after replaying them.
2455                          */
2456                         if (S_ISREG(mode)) {
2457                                 struct inode *inode;
2458                                 u64 from;
2459
2460                                 inode = read_one_inode(root, key.objectid);
2461                                 if (!inode) {
2462                                         ret = -EIO;
2463                                         break;
2464                                 }
2465                                 from = ALIGN(i_size_read(inode),
2466                                              root->fs_info->sectorsize);
2467                                 ret = btrfs_drop_extents(wc->trans, root, inode,
2468                                                          from, (u64)-1, 1);
2469                                 /*
2470                                  * If the nlink count is zero here, the iput
2471                                  * will free the inode.  We bump it to make
2472                                  * sure it doesn't get freed until the link
2473                                  * count fixup is done.
2474                                  */
2475                                 if (!ret) {
2476                                         if (inode->i_nlink == 0)
2477                                                 inc_nlink(inode);
2478                                         /* Update link count and nbytes. */
2479                                         ret = btrfs_update_inode(wc->trans,
2480                                                                  root, inode);
2481                                 }
2482                                 iput(inode);
2483                                 if (ret)
2484                                         break;
2485                         }
2486
2487                         ret = link_to_fixup_dir(wc->trans, root,
2488                                                 path, key.objectid);
2489                         if (ret)
2490                                 break;
2491                 }
2492
2493                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2494                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2495                         ret = replay_one_dir_item(wc->trans, root, path,
2496                                                   eb, i, &key);
2497                         if (ret)
2498                                 break;
2499                 }
2500
2501                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2502                         continue;
2503
2504                 /* these keys are simply copied */
2505                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2506                         ret = overwrite_item(wc->trans, root, path,
2507                                              eb, i, &key);
2508                         if (ret)
2509                                 break;
2510                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2511                            key.type == BTRFS_INODE_EXTREF_KEY) {
2512                         ret = add_inode_ref(wc->trans, root, log, path,
2513                                             eb, i, &key);
2514                         if (ret && ret != -ENOENT)
2515                                 break;
2516                         ret = 0;
2517                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2518                         ret = replay_one_extent(wc->trans, root, path,
2519                                                 eb, i, &key);
2520                         if (ret)
2521                                 break;
2522                 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2523                         ret = replay_one_dir_item(wc->trans, root, path,
2524                                                   eb, i, &key);
2525                         if (ret)
2526                                 break;
2527                 }
2528         }
2529         btrfs_free_path(path);
2530         return ret;
2531 }
2532
2533 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2534                                    struct btrfs_root *root,
2535                                    struct btrfs_path *path, int *level,
2536                                    struct walk_control *wc)
2537 {
2538         struct btrfs_fs_info *fs_info = root->fs_info;
2539         u64 root_owner;
2540         u64 bytenr;
2541         u64 ptr_gen;
2542         struct extent_buffer *next;
2543         struct extent_buffer *cur;
2544         struct extent_buffer *parent;
2545         u32 blocksize;
2546         int ret = 0;
2547
2548         WARN_ON(*level < 0);
2549         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2550
2551         while (*level > 0) {
2552                 struct btrfs_key first_key;
2553
2554                 WARN_ON(*level < 0);
2555                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2556                 cur = path->nodes[*level];
2557
2558                 WARN_ON(btrfs_header_level(cur) != *level);
2559
2560                 if (path->slots[*level] >=
2561                     btrfs_header_nritems(cur))
2562                         break;
2563
2564                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2565                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2566                 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2567                 blocksize = fs_info->nodesize;
2568
2569                 parent = path->nodes[*level];
2570                 root_owner = btrfs_header_owner(parent);
2571
2572                 next = btrfs_find_create_tree_block(fs_info, bytenr);
2573                 if (IS_ERR(next))
2574                         return PTR_ERR(next);
2575
2576                 if (*level == 1) {
2577                         ret = wc->process_func(root, next, wc, ptr_gen,
2578                                                *level - 1);
2579                         if (ret) {
2580                                 free_extent_buffer(next);
2581                                 return ret;
2582                         }
2583
2584                         path->slots[*level]++;
2585                         if (wc->free) {
2586                                 ret = btrfs_read_buffer(next, ptr_gen,
2587                                                         *level - 1, &first_key);
2588                                 if (ret) {
2589                                         free_extent_buffer(next);
2590                                         return ret;
2591                                 }
2592
2593                                 if (trans) {
2594                                         btrfs_tree_lock(next);
2595                                         btrfs_set_lock_blocking(next);
2596                                         clean_tree_block(fs_info, next);
2597                                         btrfs_wait_tree_block_writeback(next);
2598                                         btrfs_tree_unlock(next);
2599                                 } else {
2600                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2601                                                 clear_extent_buffer_dirty(next);
2602                                 }
2603
2604                                 WARN_ON(root_owner !=
2605                                         BTRFS_TREE_LOG_OBJECTID);
2606                                 ret = btrfs_free_and_pin_reserved_extent(
2607                                                         fs_info, bytenr,
2608                                                         blocksize);
2609                                 if (ret) {
2610                                         free_extent_buffer(next);
2611                                         return ret;
2612                                 }
2613                         }
2614                         free_extent_buffer(next);
2615                         continue;
2616                 }
2617                 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2618                 if (ret) {
2619                         free_extent_buffer(next);
2620                         return ret;
2621                 }
2622
2623                 WARN_ON(*level <= 0);
2624                 if (path->nodes[*level-1])
2625                         free_extent_buffer(path->nodes[*level-1]);
2626                 path->nodes[*level-1] = next;
2627                 *level = btrfs_header_level(next);
2628                 path->slots[*level] = 0;
2629                 cond_resched();
2630         }
2631         WARN_ON(*level < 0);
2632         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2633
2634         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2635
2636         cond_resched();
2637         return 0;
2638 }
2639
2640 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2641                                  struct btrfs_root *root,
2642                                  struct btrfs_path *path, int *level,
2643                                  struct walk_control *wc)
2644 {
2645         struct btrfs_fs_info *fs_info = root->fs_info;
2646         u64 root_owner;
2647         int i;
2648         int slot;
2649         int ret;
2650
2651         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2652                 slot = path->slots[i];
2653                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2654                         path->slots[i]++;
2655                         *level = i;
2656                         WARN_ON(*level == 0);
2657                         return 0;
2658                 } else {
2659                         struct extent_buffer *parent;
2660                         if (path->nodes[*level] == root->node)
2661                                 parent = path->nodes[*level];
2662                         else
2663                                 parent = path->nodes[*level + 1];
2664
2665                         root_owner = btrfs_header_owner(parent);
2666                         ret = wc->process_func(root, path->nodes[*level], wc,
2667                                  btrfs_header_generation(path->nodes[*level]),
2668                                  *level);
2669                         if (ret)
2670                                 return ret;
2671
2672                         if (wc->free) {
2673                                 struct extent_buffer *next;
2674
2675                                 next = path->nodes[*level];
2676
2677                                 if (trans) {
2678                                         btrfs_tree_lock(next);
2679                                         btrfs_set_lock_blocking(next);
2680                                         clean_tree_block(fs_info, next);
2681                                         btrfs_wait_tree_block_writeback(next);
2682                                         btrfs_tree_unlock(next);
2683                                 } else {
2684                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2685                                                 clear_extent_buffer_dirty(next);
2686                                 }
2687
2688                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2689                                 ret = btrfs_free_and_pin_reserved_extent(
2690                                                 fs_info,
2691                                                 path->nodes[*level]->start,
2692                                                 path->nodes[*level]->len);
2693                                 if (ret)
2694                                         return ret;
2695                         }
2696                         free_extent_buffer(path->nodes[*level]);
2697                         path->nodes[*level] = NULL;
2698                         *level = i + 1;
2699                 }
2700         }
2701         return 1;
2702 }
2703
2704 /*
2705  * drop the reference count on the tree rooted at 'snap'.  This traverses
2706  * the tree freeing any blocks that have a ref count of zero after being
2707  * decremented.
2708  */
2709 static int walk_log_tree(struct btrfs_trans_handle *trans,
2710                          struct btrfs_root *log, struct walk_control *wc)
2711 {
2712         struct btrfs_fs_info *fs_info = log->fs_info;
2713         int ret = 0;
2714         int wret;
2715         int level;
2716         struct btrfs_path *path;
2717         int orig_level;
2718
2719         path = btrfs_alloc_path();
2720         if (!path)
2721                 return -ENOMEM;
2722
2723         level = btrfs_header_level(log->node);
2724         orig_level = level;
2725         path->nodes[level] = log->node;
2726         extent_buffer_get(log->node);
2727         path->slots[level] = 0;
2728
2729         while (1) {
2730                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2731                 if (wret > 0)
2732                         break;
2733                 if (wret < 0) {
2734                         ret = wret;
2735                         goto out;
2736                 }
2737
2738                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2739                 if (wret > 0)
2740                         break;
2741                 if (wret < 0) {
2742                         ret = wret;
2743                         goto out;
2744                 }
2745         }
2746
2747         /* was the root node processed? if not, catch it here */
2748         if (path->nodes[orig_level]) {
2749                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2750                          btrfs_header_generation(path->nodes[orig_level]),
2751                          orig_level);
2752                 if (ret)
2753                         goto out;
2754                 if (wc->free) {
2755                         struct extent_buffer *next;
2756
2757                         next = path->nodes[orig_level];
2758
2759                         if (trans) {
2760                                 btrfs_tree_lock(next);
2761                                 btrfs_set_lock_blocking(next);
2762                                 clean_tree_block(fs_info, next);
2763                                 btrfs_wait_tree_block_writeback(next);
2764                                 btrfs_tree_unlock(next);
2765                         } else {
2766                                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2767                                         clear_extent_buffer_dirty(next);
2768                         }
2769
2770                         WARN_ON(log->root_key.objectid !=
2771                                 BTRFS_TREE_LOG_OBJECTID);
2772                         ret = btrfs_free_and_pin_reserved_extent(fs_info,
2773                                                         next->start, next->len);
2774                         if (ret)
2775                                 goto out;
2776                 }
2777         }
2778
2779 out:
2780         btrfs_free_path(path);
2781         return ret;
2782 }
2783
2784 /*
2785  * helper function to update the item for a given subvolumes log root
2786  * in the tree of log roots
2787  */
2788 static int update_log_root(struct btrfs_trans_handle *trans,
2789                            struct btrfs_root *log)
2790 {
2791         struct btrfs_fs_info *fs_info = log->fs_info;
2792         int ret;
2793
2794         if (log->log_transid == 1) {
2795                 /* insert root item on the first sync */
2796                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2797                                 &log->root_key, &log->root_item);
2798         } else {
2799                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2800                                 &log->root_key, &log->root_item);
2801         }
2802         return ret;
2803 }
2804
2805 static void wait_log_commit(struct btrfs_root *root, int transid)
2806 {
2807         DEFINE_WAIT(wait);
2808         int index = transid % 2;
2809
2810         /*
2811          * we only allow two pending log transactions at a time,
2812          * so we know that if ours is more than 2 older than the
2813          * current transaction, we're done
2814          */
2815         for (;;) {
2816                 prepare_to_wait(&root->log_commit_wait[index],
2817                                 &wait, TASK_UNINTERRUPTIBLE);
2818
2819                 if (!(root->log_transid_committed < transid &&
2820                       atomic_read(&root->log_commit[index])))
2821                         break;
2822
2823                 mutex_unlock(&root->log_mutex);
2824                 schedule();
2825                 mutex_lock(&root->log_mutex);
2826         }
2827         finish_wait(&root->log_commit_wait[index], &wait);
2828 }
2829
2830 static void wait_for_writer(struct btrfs_root *root)
2831 {
2832         DEFINE_WAIT(wait);
2833
2834         for (;;) {
2835                 prepare_to_wait(&root->log_writer_wait, &wait,
2836                                 TASK_UNINTERRUPTIBLE);
2837                 if (!atomic_read(&root->log_writers))
2838                         break;
2839
2840                 mutex_unlock(&root->log_mutex);
2841                 schedule();
2842                 mutex_lock(&root->log_mutex);
2843         }
2844         finish_wait(&root->log_writer_wait, &wait);
2845 }
2846
2847 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2848                                         struct btrfs_log_ctx *ctx)
2849 {
2850         if (!ctx)
2851                 return;
2852
2853         mutex_lock(&root->log_mutex);
2854         list_del_init(&ctx->list);
2855         mutex_unlock(&root->log_mutex);
2856 }
2857
2858 /* 
2859  * Invoked in log mutex context, or be sure there is no other task which
2860  * can access the list.
2861  */
2862 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2863                                              int index, int error)
2864 {
2865         struct btrfs_log_ctx *ctx;
2866         struct btrfs_log_ctx *safe;
2867
2868         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2869                 list_del_init(&ctx->list);
2870                 ctx->log_ret = error;
2871         }
2872
2873         INIT_LIST_HEAD(&root->log_ctxs[index]);
2874 }
2875
2876 /*
2877  * btrfs_sync_log sends a given tree log down to the disk and
2878  * updates the super blocks to record it.  When this call is done,
2879  * you know that any inodes previously logged are safely on disk only
2880  * if it returns 0.
2881  *
2882  * Any other return value means you need to call btrfs_commit_transaction.
2883  * Some of the edge cases for fsyncing directories that have had unlinks
2884  * or renames done in the past mean that sometimes the only safe
2885  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2886  * that has happened.
2887  */
2888 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2889                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2890 {
2891         int index1;
2892         int index2;
2893         int mark;
2894         int ret;
2895         struct btrfs_fs_info *fs_info = root->fs_info;
2896         struct btrfs_root *log = root->log_root;
2897         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2898         int log_transid = 0;
2899         struct btrfs_log_ctx root_log_ctx;
2900         struct blk_plug plug;
2901
2902         mutex_lock(&root->log_mutex);
2903         log_transid = ctx->log_transid;
2904         if (root->log_transid_committed >= log_transid) {
2905                 mutex_unlock(&root->log_mutex);
2906                 return ctx->log_ret;
2907         }
2908
2909         index1 = log_transid % 2;
2910         if (atomic_read(&root->log_commit[index1])) {
2911                 wait_log_commit(root, log_transid);
2912                 mutex_unlock(&root->log_mutex);
2913                 return ctx->log_ret;
2914         }
2915         ASSERT(log_transid == root->log_transid);
2916         atomic_set(&root->log_commit[index1], 1);
2917
2918         /* wait for previous tree log sync to complete */
2919         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2920                 wait_log_commit(root, log_transid - 1);
2921
2922         while (1) {
2923                 int batch = atomic_read(&root->log_batch);
2924                 /* when we're on an ssd, just kick the log commit out */
2925                 if (!btrfs_test_opt(fs_info, SSD) &&
2926                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2927                         mutex_unlock(&root->log_mutex);
2928                         schedule_timeout_uninterruptible(1);
2929                         mutex_lock(&root->log_mutex);
2930                 }
2931                 wait_for_writer(root);
2932                 if (batch == atomic_read(&root->log_batch))
2933                         break;
2934         }
2935
2936         /* bail out if we need to do a full commit */
2937         if (btrfs_need_log_full_commit(fs_info, trans)) {
2938                 ret = -EAGAIN;
2939                 btrfs_free_logged_extents(log, log_transid);
2940                 mutex_unlock(&root->log_mutex);
2941                 goto out;
2942         }
2943
2944         if (log_transid % 2 == 0)
2945                 mark = EXTENT_DIRTY;
2946         else
2947                 mark = EXTENT_NEW;
2948
2949         /* we start IO on  all the marked extents here, but we don't actually
2950          * wait for them until later.
2951          */
2952         blk_start_plug(&plug);
2953         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2954         if (ret) {
2955                 blk_finish_plug(&plug);
2956                 btrfs_abort_transaction(trans, ret);
2957                 btrfs_free_logged_extents(log, log_transid);
2958                 btrfs_set_log_full_commit(fs_info, trans);
2959                 mutex_unlock(&root->log_mutex);
2960                 goto out;
2961         }
2962
2963         btrfs_set_root_node(&log->root_item, log->node);
2964
2965         root->log_transid++;
2966         log->log_transid = root->log_transid;
2967         root->log_start_pid = 0;
2968         /*
2969          * IO has been started, blocks of the log tree have WRITTEN flag set
2970          * in their headers. new modifications of the log will be written to
2971          * new positions. so it's safe to allow log writers to go in.
2972          */
2973         mutex_unlock(&root->log_mutex);
2974
2975         btrfs_init_log_ctx(&root_log_ctx, NULL);
2976
2977         mutex_lock(&log_root_tree->log_mutex);
2978         atomic_inc(&log_root_tree->log_batch);
2979         atomic_inc(&log_root_tree->log_writers);
2980
2981         index2 = log_root_tree->log_transid % 2;
2982         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2983         root_log_ctx.log_transid = log_root_tree->log_transid;
2984
2985         mutex_unlock(&log_root_tree->log_mutex);
2986
2987         ret = update_log_root(trans, log);
2988
2989         mutex_lock(&log_root_tree->log_mutex);
2990         if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2991                 /*
2992                  * Implicit memory barrier after atomic_dec_and_test
2993                  */
2994                 if (waitqueue_active(&log_root_tree->log_writer_wait))
2995                         wake_up(&log_root_tree->log_writer_wait);
2996         }
2997