1920c2149f883208920013002fbd98eabd68c747
[sfrench/cifs-2.6.git] / fs / btrfs / tree-log.c
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/blkdev.h>
22 #include <linux/list_sort.h>
23 #include "tree-log.h"
24 #include "disk-io.h"
25 #include "locking.h"
26 #include "print-tree.h"
27 #include "backref.h"
28 #include "hash.h"
29 #include "compression.h"
30 #include "qgroup.h"
31
32 /* magic values for the inode_only field in btrfs_log_inode:
33  *
34  * LOG_INODE_ALL means to log everything
35  * LOG_INODE_EXISTS means to log just enough to recreate the inode
36  * during log replay
37  */
38 #define LOG_INODE_ALL 0
39 #define LOG_INODE_EXISTS 1
40 #define LOG_OTHER_INODE 2
41
42 /*
43  * directory trouble cases
44  *
45  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
46  * log, we must force a full commit before doing an fsync of the directory
47  * where the unlink was done.
48  * ---> record transid of last unlink/rename per directory
49  *
50  * mkdir foo/some_dir
51  * normal commit
52  * rename foo/some_dir foo2/some_dir
53  * mkdir foo/some_dir
54  * fsync foo/some_dir/some_file
55  *
56  * The fsync above will unlink the original some_dir without recording
57  * it in its new location (foo2).  After a crash, some_dir will be gone
58  * unless the fsync of some_file forces a full commit
59  *
60  * 2) we must log any new names for any file or dir that is in the fsync
61  * log. ---> check inode while renaming/linking.
62  *
63  * 2a) we must log any new names for any file or dir during rename
64  * when the directory they are being removed from was logged.
65  * ---> check inode and old parent dir during rename
66  *
67  *  2a is actually the more important variant.  Without the extra logging
68  *  a crash might unlink the old name without recreating the new one
69  *
70  * 3) after a crash, we must go through any directories with a link count
71  * of zero and redo the rm -rf
72  *
73  * mkdir f1/foo
74  * normal commit
75  * rm -rf f1/foo
76  * fsync(f1)
77  *
78  * The directory f1/foo was fully removed from the FS, but fsync was never
79  * called on f1/foo, only its parent dir.  After a crash the rm -rf must
80  * be replayed.  This must be able to recurse down the entire
81  * directory tree.  The inode link count fixup code takes care of the
82  * ugly details.
83  */
84
85 /*
86  * stages for the tree walking.  The first
87  * stage (0) is to only pin down the blocks we find
88  * the second stage (1) is to make sure that all the inodes
89  * we find in the log are created in the subvolume.
90  *
91  * The last stage is to deal with directories and links and extents
92  * and all the other fun semantics
93  */
94 #define LOG_WALK_PIN_ONLY 0
95 #define LOG_WALK_REPLAY_INODES 1
96 #define LOG_WALK_REPLAY_DIR_INDEX 2
97 #define LOG_WALK_REPLAY_ALL 3
98
99 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
100                            struct btrfs_root *root, struct btrfs_inode *inode,
101                            int inode_only,
102                            const loff_t start,
103                            const loff_t end,
104                            struct btrfs_log_ctx *ctx);
105 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
106                              struct btrfs_root *root,
107                              struct btrfs_path *path, u64 objectid);
108 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
109                                        struct btrfs_root *root,
110                                        struct btrfs_root *log,
111                                        struct btrfs_path *path,
112                                        u64 dirid, int del_all);
113
114 /*
115  * tree logging is a special write ahead log used to make sure that
116  * fsyncs and O_SYNCs can happen without doing full tree commits.
117  *
118  * Full tree commits are expensive because they require commonly
119  * modified blocks to be recowed, creating many dirty pages in the
120  * extent tree and a 4x-6x higher write load than ext3.
121  *
122  * Instead of doing a tree commit on every fsync, we use the
123  * key ranges and transaction ids to find items for a given file or directory
124  * that have changed in this transaction.  Those items are copied into
125  * a special tree (one per subvolume root), that tree is written to disk
126  * and then the fsync is considered complete.
127  *
128  * After a crash, items are copied out of the log-tree back into the
129  * subvolume tree.  Any file data extents found are recorded in the extent
130  * allocation tree, and the log-tree freed.
131  *
132  * The log tree is read three times: once to pin down all the extents it is
133  * using in ram, once to create all the inodes logged in the tree
134  * and once to do all the other items.
135  */
136
137 /*
138  * start a sub transaction and setup the log tree
139  * this increments the log tree writer count to make the people
140  * syncing the tree wait for us to finish
141  */
142 static int start_log_trans(struct btrfs_trans_handle *trans,
143                            struct btrfs_root *root,
144                            struct btrfs_log_ctx *ctx)
145 {
146         struct btrfs_fs_info *fs_info = root->fs_info;
147         int ret = 0;
148
149         mutex_lock(&root->log_mutex);
150
151         if (root->log_root) {
152                 if (btrfs_need_log_full_commit(fs_info, trans)) {
153                         ret = -EAGAIN;
154                         goto out;
155                 }
156
157                 if (!root->log_start_pid) {
158                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
159                         root->log_start_pid = current->pid;
160                 } else if (root->log_start_pid != current->pid) {
161                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
162                 }
163         } else {
164                 mutex_lock(&fs_info->tree_log_mutex);
165                 if (!fs_info->log_root_tree)
166                         ret = btrfs_init_log_root_tree(trans, fs_info);
167                 mutex_unlock(&fs_info->tree_log_mutex);
168                 if (ret)
169                         goto out;
170
171                 ret = btrfs_add_log_tree(trans, root);
172                 if (ret)
173                         goto out;
174
175                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
176                 root->log_start_pid = current->pid;
177         }
178
179         atomic_inc(&root->log_batch);
180         atomic_inc(&root->log_writers);
181         if (ctx) {
182                 int index = root->log_transid % 2;
183                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
184                 ctx->log_transid = root->log_transid;
185         }
186
187 out:
188         mutex_unlock(&root->log_mutex);
189         return ret;
190 }
191
192 /*
193  * returns 0 if there was a log transaction running and we were able
194  * to join, or returns -ENOENT if there were not transactions
195  * in progress
196  */
197 static int join_running_log_trans(struct btrfs_root *root)
198 {
199         int ret = -ENOENT;
200
201         smp_mb();
202         if (!root->log_root)
203                 return -ENOENT;
204
205         mutex_lock(&root->log_mutex);
206         if (root->log_root) {
207                 ret = 0;
208                 atomic_inc(&root->log_writers);
209         }
210         mutex_unlock(&root->log_mutex);
211         return ret;
212 }
213
214 /*
215  * This either makes the current running log transaction wait
216  * until you call btrfs_end_log_trans() or it makes any future
217  * log transactions wait until you call btrfs_end_log_trans()
218  */
219 int btrfs_pin_log_trans(struct btrfs_root *root)
220 {
221         int ret = -ENOENT;
222
223         mutex_lock(&root->log_mutex);
224         atomic_inc(&root->log_writers);
225         mutex_unlock(&root->log_mutex);
226         return ret;
227 }
228
229 /*
230  * indicate we're done making changes to the log tree
231  * and wake up anyone waiting to do a sync
232  */
233 void btrfs_end_log_trans(struct btrfs_root *root)
234 {
235         if (atomic_dec_and_test(&root->log_writers)) {
236                 /*
237                  * Implicit memory barrier after atomic_dec_and_test
238                  */
239                 if (waitqueue_active(&root->log_writer_wait))
240                         wake_up(&root->log_writer_wait);
241         }
242 }
243
244
245 /*
246  * The walk control struct carries state down the call chain while the
247  * log tree is being processed.  The stage field tells us which part
248  * of the log tree processing we are currently doing; the other fields
249  * are state used by that specific part.
250  */
251 struct walk_control {
252         /* Should we free the extent on disk when done?  This is used
253          * at transaction commit time while freeing a log tree.
254          */
255         int free;
256
257         /* Should we write out the extent buffer?  This is used
258          * while flushing the log tree to disk during a sync.
259          */
260         int write;
261
262         /* Should we wait for the extent buffer io to finish?  Also used
263          * while flushing the log tree to disk for a sync.
264          */
265         int wait;
266
267         /* Pin-only walk: we record which extents on disk belong to the
268          * log trees.
269          */
270         int pin;
271
272         /* Which stage of the replay code we're currently in (presumably a LOG_WALK_* value). */
273         int stage;
274
275         /* The root we are currently replaying. */
276         struct btrfs_root *replay_dest;
277
278         /* The trans handle for the current replay. */
279         struct btrfs_trans_handle *trans;
280
281         /* The function that gets used to process blocks we find in the
282          * tree.  Note the extent_buffer might not be up to date when it is
283          * passed in, and it must be checked or read if you need the data
284          * inside it.
285          */
286         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
287                             struct walk_control *wc, u64 gen);
288 };
289
290 /*
291  * process_func used to pin down extents, write them or wait on them
292  */
293 static int process_one_buffer(struct btrfs_root *log,
294                               struct extent_buffer *eb,
295                               struct walk_control *wc, u64 gen)
296 {
297         struct btrfs_fs_info *fs_info = log->fs_info;
298         int ret = 0;
299
300         /*
301          * If this fs is mixed then we need to be able to process the leaves to
302          * pin down any logged extents, so we have to read the block.
303          */
304         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
305                 ret = btrfs_read_buffer(eb, gen);
306                 if (ret)
307                         return ret;
308         }
309
310         if (wc->pin)
311                 ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
312                                                       eb->len);
313
314         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
315                 if (wc->pin && btrfs_header_level(eb) == 0)
316                         ret = btrfs_exclude_logged_extents(fs_info, eb);
317                 if (wc->write)
318                         btrfs_write_tree_block(eb);
319                 if (wc->wait)
320                         btrfs_wait_tree_block_writeback(eb);
321         }
322         return ret;
323 }
324
325 /*
326  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
327  * to the src data we are copying out.
328  *
329  * root is the tree we are copying into, and path is a scratch
330  * path for use in this function (it should be released on entry and
331  * will be released on exit).
332  *
333  * If the key is already in the destination tree the existing item is
334  * overwritten.  If the existing item isn't big enough, it is extended.
335  * If it is too large, it is truncated.
336  *
337  * If the key isn't in the destination yet, a new item is inserted.
338  */
339 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
340                                    struct btrfs_root *root,
341                                    struct btrfs_path *path,
342                                    struct extent_buffer *eb, int slot,
343                                    struct btrfs_key *key)
344 {
345         struct btrfs_fs_info *fs_info = root->fs_info;
346         int ret;
347         u32 item_size;
348         u64 saved_i_size = 0;
349         int save_old_i_size = 0;
350         unsigned long src_ptr;
351         unsigned long dst_ptr;
352         int overwrite_root = 0;
353         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
354
355         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
356                 overwrite_root = 1;
357
358         item_size = btrfs_item_size_nr(eb, slot);
359         src_ptr = btrfs_item_ptr_offset(eb, slot);
360
361         /* look for the key in the destination tree */
362         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
363         if (ret < 0)
364                 return ret;
365
366         if (ret == 0) {
367                 char *src_copy;
368                 char *dst_copy;
369                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
370                                                   path->slots[0]);
371                 if (dst_size != item_size)
372                         goto insert;
373
374                 if (item_size == 0) {
375                         btrfs_release_path(path);
376                         return 0;
377                 }
378                 dst_copy = kmalloc(item_size, GFP_NOFS);
379                 src_copy = kmalloc(item_size, GFP_NOFS);
380                 if (!dst_copy || !src_copy) {
381                         btrfs_release_path(path);
382                         kfree(dst_copy);
383                         kfree(src_copy);
384                         return -ENOMEM;
385                 }
386
387                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
388
389                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
390                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
391                                    item_size);
392                 ret = memcmp(dst_copy, src_copy, item_size);
393
394                 kfree(dst_copy);
395                 kfree(src_copy);
396                 /*
397                  * they have the same contents, just return, this saves
398                  * us from cowing blocks in the destination tree and doing
399                  * extra writes that may not have been done by a previous
400                  * sync
401                  */
402                 if (ret == 0) {
403                         btrfs_release_path(path);
404                         return 0;
405                 }
406
407                 /*
408                  * We need to load the old nbytes into the inode so when we
409                  * replay the extents we've logged we get the right nbytes.
410                  */
411                 if (inode_item) {
412                         struct btrfs_inode_item *item;
413                         u64 nbytes;
414                         u32 mode;
415
416                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
417                                               struct btrfs_inode_item);
418                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
419                         item = btrfs_item_ptr(eb, slot,
420                                               struct btrfs_inode_item);
421                         btrfs_set_inode_nbytes(eb, item, nbytes);
422
423                         /*
424                          * If this is a directory we need to reset the i_size to
425                          * 0 so that we can set it up properly when replaying
426                          * the rest of the items in this log.
427                          */
428                         mode = btrfs_inode_mode(eb, item);
429                         if (S_ISDIR(mode))
430                                 btrfs_set_inode_size(eb, item, 0);
431                 }
432         } else if (inode_item) {
433                 struct btrfs_inode_item *item;
434                 u32 mode;
435
436                 /*
437                  * New inode, set nbytes to 0 so that the nbytes comes out
438                  * properly when we replay the extents.
439                  */
440                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
441                 btrfs_set_inode_nbytes(eb, item, 0);
442
443                 /*
444                  * If this is a directory we need to reset the i_size to 0 so
445                  * that we can set it up properly when replaying the rest of
446                  * the items in this log.
447                  */
448                 mode = btrfs_inode_mode(eb, item);
449                 if (S_ISDIR(mode))
450                         btrfs_set_inode_size(eb, item, 0);
451         }
452 insert:
453         btrfs_release_path(path);
454         /* try to insert the key into the destination tree */
455         path->skip_release_on_error = 1;
456         ret = btrfs_insert_empty_item(trans, root, path,
457                                       key, item_size);
458         path->skip_release_on_error = 0;
459
460         /* make sure any existing item is the correct size */
461         if (ret == -EEXIST || ret == -EOVERFLOW) {
462                 u32 found_size;
463                 found_size = btrfs_item_size_nr(path->nodes[0],
464                                                 path->slots[0]);
465                 if (found_size > item_size)
466                         btrfs_truncate_item(fs_info, path, item_size, 1);
467                 else if (found_size < item_size)
468                         btrfs_extend_item(fs_info, path,
469                                           item_size - found_size);
470         } else if (ret) {
471                 return ret;
472         }
473         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
474                                         path->slots[0]);
475
476         /* don't overwrite an existing inode if the generation number
477          * was logged as zero.  This is done when the tree logging code
478          * is just logging an inode to make sure it exists after recovery.
479          *
480          * Also, don't overwrite i_size on directories during replay.
481          * log replay inserts and removes directory items based on the
482          * state of the tree found in the subvolume, and i_size is modified
483          * as it goes
484          */
485         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
486                 struct btrfs_inode_item *src_item;
487                 struct btrfs_inode_item *dst_item;
488
489                 src_item = (struct btrfs_inode_item *)src_ptr;
490                 dst_item = (struct btrfs_inode_item *)dst_ptr;
491
492                 if (btrfs_inode_generation(eb, src_item) == 0) {
493                         struct extent_buffer *dst_eb = path->nodes[0];
494                         const u64 ino_size = btrfs_inode_size(eb, src_item);
495
496                         /*
497                          * For regular files an ino_size == 0 is used only when
498                          * logging that an inode exists, as part of a directory
499                          * fsync, and the inode wasn't fsynced before. In this
500                          * case don't set the size of the inode in the fs/subvol
501                          * tree, otherwise we would be throwing valid data away.
502                          */
503                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
504                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
505                             ino_size != 0) {
506                                 struct btrfs_map_token token;
507
508                                 btrfs_init_map_token(&token);
509                                 btrfs_set_token_inode_size(dst_eb, dst_item,
510                                                            ino_size, &token);
511                         }
512                         goto no_copy;
513                 }
514
515                 if (overwrite_root &&
516                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
517                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
518                         save_old_i_size = 1;
519                         saved_i_size = btrfs_inode_size(path->nodes[0],
520                                                         dst_item);
521                 }
522         }
523
524         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
525                            src_ptr, item_size);
526
527         if (save_old_i_size) {
528                 struct btrfs_inode_item *dst_item;
529                 dst_item = (struct btrfs_inode_item *)dst_ptr;
530                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
531         }
532
533         /* make sure the generation is filled in */
534         if (key->type == BTRFS_INODE_ITEM_KEY) {
535                 struct btrfs_inode_item *dst_item;
536                 dst_item = (struct btrfs_inode_item *)dst_ptr;
537                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
538                         btrfs_set_inode_generation(path->nodes[0], dst_item,
539                                                    trans->transid);
540                 }
541         }
542 no_copy:
543         btrfs_mark_buffer_dirty(path->nodes[0]);
544         btrfs_release_path(path);
545         return 0;
546 }
547
548 /*
549  * simple helper to read an inode off the disk from a given root
550  * This can only be called for subvolume roots and not for the log
551  */
552 static noinline struct inode *read_one_inode(struct btrfs_root *root,
553                                              u64 objectid)
554 {
555         struct btrfs_key key;
556         struct inode *inode;
557
558         key.objectid = objectid;
559         key.type = BTRFS_INODE_ITEM_KEY;
560         key.offset = 0;
561         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
562         if (IS_ERR(inode)) {
563                 inode = NULL;
564         } else if (is_bad_inode(inode)) {
565                 iput(inode);
566                 inode = NULL;
567         }
568         return inode;
569 }
570
571 /* replays a single extent in 'eb' at 'slot' with 'key' into the
572  * subvolume 'root'.  path is released on entry and should be released
573  * on exit.
574  *
575  * extents in the log tree have not been allocated out of the extent
576  * tree yet.  So, this completes the allocation, taking a reference
577  * as required if the extent already exists or creating a new extent
578  * if it isn't in the extent allocation tree yet.
579  *
580  * The extent is inserted into the file, dropping any existing extents
581  * from the file that overlap the new one.
582  */
583 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
584                                       struct btrfs_root *root,
585                                       struct btrfs_path *path,
586                                       struct extent_buffer *eb, int slot,
587                                       struct btrfs_key *key)
588 {
589         struct btrfs_fs_info *fs_info = root->fs_info;
590         int found_type;
591         u64 extent_end;
592         u64 start = key->offset;
593         u64 nbytes = 0;
594         struct btrfs_file_extent_item *item;
595         struct inode *inode = NULL;
596         unsigned long size;
597         int ret = 0;
598
599         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
600         found_type = btrfs_file_extent_type(eb, item);
601
602         if (found_type == BTRFS_FILE_EXTENT_REG ||
603             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
604                 nbytes = btrfs_file_extent_num_bytes(eb, item);
605                 extent_end = start + nbytes;
606
607                 /*
608                  * We don't add to the inodes nbytes if we are prealloc or a
609                  * hole.
610                  */
611                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
612                         nbytes = 0;
613         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
614                 size = btrfs_file_extent_inline_len(eb, slot, item);
615                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
616                 extent_end = ALIGN(start + size,
617                                    fs_info->sectorsize);
618         } else {
619                 ret = 0;
620                 goto out;
621         }
622
623         inode = read_one_inode(root, key->objectid);
624         if (!inode) {
625                 ret = -EIO;
626                 goto out;
627         }
628
629         /*
630          * first check to see if we already have this extent in the
631          * file.  This must be done before the btrfs_drop_extents run
632          * so we don't try to drop this extent.
633          */
634         ret = btrfs_lookup_file_extent(trans, root, path,
635                         btrfs_ino(BTRFS_I(inode)), start, 0);
636
637         if (ret == 0 &&
638             (found_type == BTRFS_FILE_EXTENT_REG ||
639              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
640                 struct btrfs_file_extent_item cmp1;
641                 struct btrfs_file_extent_item cmp2;
642                 struct btrfs_file_extent_item *existing;
643                 struct extent_buffer *leaf;
644
645                 leaf = path->nodes[0];
646                 existing = btrfs_item_ptr(leaf, path->slots[0],
647                                           struct btrfs_file_extent_item);
648
649                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
650                                    sizeof(cmp1));
651                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
652                                    sizeof(cmp2));
653
654                 /*
655                  * we already have a pointer to this exact extent,
656                  * we don't have to do anything
657                  */
658                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
659                         btrfs_release_path(path);
660                         goto out;
661                 }
662         }
663         btrfs_release_path(path);
664
665         /* drop any overlapping extents */
666         ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
667         if (ret)
668                 goto out;
669
670         if (found_type == BTRFS_FILE_EXTENT_REG ||
671             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
672                 u64 offset;
673                 unsigned long dest_offset;
674                 struct btrfs_key ins;
675
676                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
677                     btrfs_fs_incompat(fs_info, NO_HOLES))
678                         goto update_inode;
679
680                 ret = btrfs_insert_empty_item(trans, root, path, key,
681                                               sizeof(*item));
682                 if (ret)
683                         goto out;
684                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
685                                                     path->slots[0]);
686                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
687                                 (unsigned long)item,  sizeof(*item));
688
689                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
690                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
691                 ins.type = BTRFS_EXTENT_ITEM_KEY;
692                 offset = key->offset - btrfs_file_extent_offset(eb, item);
693
694                 /*
695                  * Manually record dirty extent, as here we did a shallow
696                  * file extent item copy and skip normal backref update,
697                  * but modifying extent tree all by ourselves.
698                  * So need to manually record dirty extent for qgroup,
699                  * as the owner of the file extent changed from log tree
700                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
701                  */
702                 ret = btrfs_qgroup_trace_extent(trans, fs_info,
703                                 btrfs_file_extent_disk_bytenr(eb, item),
704                                 btrfs_file_extent_disk_num_bytes(eb, item),
705                                 GFP_NOFS);
706                 if (ret < 0)
707                         goto out;
708
709                 if (ins.objectid > 0) {
710                         u64 csum_start;
711                         u64 csum_end;
712                         LIST_HEAD(ordered_sums);
713                         /*
714                          * is this extent already allocated in the extent
715                          * allocation tree?  If so, just add a reference
716                          */
717                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
718                                                 ins.offset);
719                         if (ret == 0) {
720                                 ret = btrfs_inc_extent_ref(trans, root,
721                                                 ins.objectid, ins.offset,
722                                                 0, root->root_key.objectid,
723                                                 key->objectid, offset);
724                                 if (ret)
725                                         goto out;
726                         } else {
727                                 /*
728                                  * insert the extent pointer in the extent
729                                  * allocation tree
730                                  */
731                                 ret = btrfs_alloc_logged_file_extent(trans,
732                                                 fs_info,
733                                                 root->root_key.objectid,
734                                                 key->objectid, offset, &ins);
735                                 if (ret)
736                                         goto out;
737                         }
738                         btrfs_release_path(path);
739
740                         if (btrfs_file_extent_compression(eb, item)) {
741                                 csum_start = ins.objectid;
742                                 csum_end = csum_start + ins.offset;
743                         } else {
744                                 csum_start = ins.objectid +
745                                         btrfs_file_extent_offset(eb, item);
746                                 csum_end = csum_start +
747                                         btrfs_file_extent_num_bytes(eb, item);
748                         }
749
750                         ret = btrfs_lookup_csums_range(root->log_root,
751                                                 csum_start, csum_end - 1,
752                                                 &ordered_sums, 0);
753                         if (ret)
754                                 goto out;
755                         /*
756                          * Now delete all existing csums in the csum root that
757                          * cover our range. We do this because we can have an
758                          * extent that is completely referenced by one file
759                          * extent item and partially referenced by another
760                          * file extent item (like after using the clone or
761                          * extent_same ioctls). In this case if we end up doing
762                          * the replay of the one that partially references the
763                          * extent first, and we do not do the csum deletion
764                          * below, we can get 2 csum items in the csum tree that
765                          * overlap each other. For example, imagine our log has
766                          * the two following file extent items:
767                          *
768                          * key (257 EXTENT_DATA 409600)
769                          *     extent data disk byte 12845056 nr 102400
770                          *     extent data offset 20480 nr 20480 ram 102400
771                          *
772                          * key (257 EXTENT_DATA 819200)
773                          *     extent data disk byte 12845056 nr 102400
774                          *     extent data offset 0 nr 102400 ram 102400
775                          *
776                          * Where the second one fully references the 100K extent
777                          * that starts at disk byte 12845056, and the log tree
778                          * has a single csum item that covers the entire range
779                          * of the extent:
780                          *
781                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
782                          *
783                          * After the first file extent item is replayed, the
784                          * csum tree gets the following csum item:
785                          *
786                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
787                          *
788                          * Which covers the 20K sub-range starting at offset 20K
789                          * of our extent. Now when we replay the second file
790                          * extent item, if we do not delete existing csum items
791                          * that cover any of its blocks, we end up getting two
792                          * csum items in our csum tree that overlap each other:
793                          *
794                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
795                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
796                          *
797                          * Which is a problem, because after this anyone trying
798                          * to look up the checksum of any block of our
799                          * extent starting at an offset of 40K or higher, will
800                          * end up looking at the second csum item only, which
801                          * does not contain the checksum for any block starting
802                          * at offset 40K or higher of our extent.
803                          */
804                         while (!list_empty(&ordered_sums)) {
805                                 struct btrfs_ordered_sum *sums;
806                                 sums = list_entry(ordered_sums.next,
807                                                 struct btrfs_ordered_sum,
808                                                 list);
809                                 if (!ret)
810                                         ret = btrfs_del_csums(trans, fs_info,
811                                                               sums->bytenr,
812                                                               sums->len);
813                                 if (!ret)
814                                         ret = btrfs_csum_file_blocks(trans,
815                                                 fs_info->csum_root, sums);
816                                 list_del(&sums->list);
817                                 kfree(sums);
818                         }
819                         if (ret)
820                                 goto out;
821                 } else {
822                         btrfs_release_path(path);
823                 }
824         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
825                 /* inline extents are easy, we just overwrite them */
826                 ret = overwrite_item(trans, root, path, eb, slot, key);
827                 if (ret)
828                         goto out;
829         }
830
831         inode_add_bytes(inode, nbytes);
832 update_inode:
833         ret = btrfs_update_inode(trans, root, inode);
834 out:
835         if (inode)
836                 iput(inode);
837         return ret;
838 }
839
840 /*
841  * when cleaning up conflicts between the directory names in the
842  * subvolume, directory names in the log and directory names in the
843  * inode back references, we may have to unlink inodes from directories.
844  *
845  * This is a helper function to do the unlink of a specific directory
846  * item
847  */
848 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
849                                       struct btrfs_root *root,
850                                       struct btrfs_path *path,
851                                       struct btrfs_inode *dir,
852                                       struct btrfs_dir_item *di)
853 {
854         struct btrfs_fs_info *fs_info = root->fs_info;
855         struct inode *inode;
856         char *name;
857         int name_len;
858         struct extent_buffer *leaf;
859         struct btrfs_key location;
860         int ret;
861
862         leaf = path->nodes[0];
863
864         btrfs_dir_item_key_to_cpu(leaf, di, &location);
865         name_len = btrfs_dir_name_len(leaf, di);
866         name = kmalloc(name_len, GFP_NOFS);
867         if (!name)
868                 return -ENOMEM;
869
870         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
871         btrfs_release_path(path);
872
873         inode = read_one_inode(root, location.objectid);
874         if (!inode) {
875                 ret = -EIO;
876                 goto out;
877         }
878
879         ret = link_to_fixup_dir(trans, root, path, location.objectid);
880         if (ret)
881                 goto out;
882
883         ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
884                         name_len);
885         if (ret)
886                 goto out;
887         else
888                 ret = btrfs_run_delayed_items(trans, fs_info);
889 out:
890         kfree(name);
891         iput(inode);
892         return ret;
893 }
894
895 /*
896  * helper function to see if a given name and sequence number found
897  * in an inode back reference are already in a directory and correctly
898  * point to this inode
899  */
900 static noinline int inode_in_dir(struct btrfs_root *root,
901                                  struct btrfs_path *path,
902                                  u64 dirid, u64 objectid, u64 index,
903                                  const char *name, int name_len)
904 {
905         struct btrfs_dir_item *di;
906         struct btrfs_key location;
907         int match = 0;
908
909         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
910                                          index, name, name_len, 0);
911         if (di && !IS_ERR(di)) {
912                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
913                 if (location.objectid != objectid)
914                         goto out;
915         } else
916                 goto out;
917         btrfs_release_path(path);
918
919         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
920         if (di && !IS_ERR(di)) {
921                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
922                 if (location.objectid != objectid)
923                         goto out;
924         } else
925                 goto out;
926         match = 1;
927 out:
928         btrfs_release_path(path);
929         return match;
930 }
931
932 /*
933  * helper function to check a log tree for a named back reference in
934  * an inode.  This is used to decide if a back reference that is
935  * found in the subvolume conflicts with what we find in the log.
936  *
937  * inode backreferences may have multiple refs in a single item,
938  * during replay we process one reference at a time, and we don't
939  * want to delete valid links to a file from the subvolume if that
940  * link is also in the log.
941  */
942 static noinline int backref_in_log(struct btrfs_root *log,
943                                    struct btrfs_key *key,
944                                    u64 ref_objectid,
945                                    const char *name, int namelen)
946 {
947         struct btrfs_path *path;
948         struct btrfs_inode_ref *ref;
949         unsigned long ptr;
950         unsigned long ptr_end;
951         unsigned long name_ptr;
952         int found_name_len;
953         int item_size;
954         int ret;
955         int match = 0;
956
957         path = btrfs_alloc_path();
958         if (!path)
959                 return -ENOMEM;
960
961         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
962         if (ret != 0)
963                 goto out;
964
965         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
966
967         if (key->type == BTRFS_INODE_EXTREF_KEY) {
968                 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
969                                                    name, namelen, NULL))
970                         match = 1;
971
972                 goto out;
973         }
974
975         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
976         ptr_end = ptr + item_size;
977         while (ptr < ptr_end) {
978                 ref = (struct btrfs_inode_ref *)ptr;
979                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
980                 if (found_name_len == namelen) {
981                         name_ptr = (unsigned long)(ref + 1);
982                         ret = memcmp_extent_buffer(path->nodes[0], name,
983                                                    name_ptr, namelen);
984                         if (ret == 0) {
985                                 match = 1;
986                                 goto out;
987                         }
988                 }
989                 ptr = (unsigned long)(ref + 1) + found_name_len;
990         }
991 out:
992         btrfs_free_path(path);
993         return match;
994 }
995
996 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
997                                   struct btrfs_root *root,
998                                   struct btrfs_path *path,
999                                   struct btrfs_root *log_root,
1000                                   struct btrfs_inode *dir,
1001                                   struct btrfs_inode *inode,
1002                                   u64 inode_objectid, u64 parent_objectid,
1003                                   u64 ref_index, char *name, int namelen,
1004                                   int *search_done)
1005 {
1006         struct btrfs_fs_info *fs_info = root->fs_info;
1007         int ret;
1008         char *victim_name;
1009         int victim_name_len;
1010         struct extent_buffer *leaf;
1011         struct btrfs_dir_item *di;
1012         struct btrfs_key search_key;
1013         struct btrfs_inode_extref *extref;
1014
1015 again:
1016         /* Search old style refs */
1017         search_key.objectid = inode_objectid;
1018         search_key.type = BTRFS_INODE_REF_KEY;
1019         search_key.offset = parent_objectid;
1020         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1021         if (ret == 0) {
1022                 struct btrfs_inode_ref *victim_ref;
1023                 unsigned long ptr;
1024                 unsigned long ptr_end;
1025
1026                 leaf = path->nodes[0];
1027
1028                 /* are we trying to overwrite a back ref for the root directory
1029                  * if so, just jump out, we're done
1030                  */
1031                 if (search_key.objectid == search_key.offset)
1032                         return 1;
1033
1034                 /* check all the names in this back reference to see
1035                  * if they are in the log.  if so, we allow them to stay
1036                  * otherwise they must be unlinked as a conflict
1037                  */
1038                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1039                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1040                 while (ptr < ptr_end) {
1041                         victim_ref = (struct btrfs_inode_ref *)ptr;
1042                         victim_name_len = btrfs_inode_ref_name_len(leaf,
1043                                                                    victim_ref);
1044                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1045                         if (!victim_name)
1046                                 return -ENOMEM;
1047
1048                         read_extent_buffer(leaf, victim_name,
1049                                            (unsigned long)(victim_ref + 1),
1050                                            victim_name_len);
1051
1052                         if (!backref_in_log(log_root, &search_key,
1053                                             parent_objectid,
1054                                             victim_name,
1055                                             victim_name_len)) {
1056                                 inc_nlink(&inode->vfs_inode);
1057                                 btrfs_release_path(path);
1058
1059                                 ret = btrfs_unlink_inode(trans, root, dir, inode,
1060                                                 victim_name, victim_name_len);
1061                                 kfree(victim_name);
1062                                 if (ret)
1063                                         return ret;
1064                                 ret = btrfs_run_delayed_items(trans, fs_info);
1065                                 if (ret)
1066                                         return ret;
1067                                 *search_done = 1;
1068                                 goto again;
1069                         }
1070                         kfree(victim_name);
1071
1072                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1073                 }
1074
1075                 /*
1076                  * NOTE: we have searched root tree and checked the
1077                  * corresponding ref, it does not need to check again.
1078                  */
1079                 *search_done = 1;
1080         }
1081         btrfs_release_path(path);
1082
1083         /* Same search but for extended refs */
1084         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1085                                            inode_objectid, parent_objectid, 0,
1086                                            0);
1087         if (!IS_ERR_OR_NULL(extref)) {
1088                 u32 item_size;
1089                 u32 cur_offset = 0;
1090                 unsigned long base;
1091                 struct inode *victim_parent;
1092
1093                 leaf = path->nodes[0];
1094
1095                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1096                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1097
1098                 while (cur_offset < item_size) {
1099                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1100
1101                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1102
1103                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1104                                 goto next;
1105
1106                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1107                         if (!victim_name)
1108                                 return -ENOMEM;
1109                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1110                                            victim_name_len);
1111
1112                         search_key.objectid = inode_objectid;
1113                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1114                         search_key.offset = btrfs_extref_hash(parent_objectid,
1115                                                               victim_name,
1116                                                               victim_name_len);
1117                         ret = 0;
1118                         if (!backref_in_log(log_root, &search_key,
1119                                             parent_objectid, victim_name,
1120                                             victim_name_len)) {
1121                                 ret = -ENOENT;
1122                                 victim_parent = read_one_inode(root,
1123                                                 parent_objectid);
1124                                 if (victim_parent) {
1125                                         inc_nlink(&inode->vfs_inode);
1126                                         btrfs_release_path(path);
1127
1128                                         ret = btrfs_unlink_inode(trans, root,
1129                                                         BTRFS_I(victim_parent),
1130                                                         inode,
1131                                                         victim_name,
1132                                                         victim_name_len);
1133                                         if (!ret)
1134                                                 ret = btrfs_run_delayed_items(
1135                                                                   trans,
1136                                                                   fs_info);
1137                                 }
1138                                 iput(victim_parent);
1139                                 kfree(victim_name);
1140                                 if (ret)
1141                                         return ret;
1142                                 *search_done = 1;
1143                                 goto again;
1144                         }
1145                         kfree(victim_name);
1146 next:
1147                         cur_offset += victim_name_len + sizeof(*extref);
1148                 }
1149                 *search_done = 1;
1150         }
1151         btrfs_release_path(path);
1152
1153         /* look for a conflicting sequence number */
1154         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1155                                          ref_index, name, namelen, 0);
1156         if (di && !IS_ERR(di)) {
1157                 ret = drop_one_dir_item(trans, root, path, dir, di);
1158                 if (ret)
1159                         return ret;
1160         }
1161         btrfs_release_path(path);
1162
1163         /* look for a conflicing name */
1164         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1165                                    name, namelen, 0);
1166         if (di && !IS_ERR(di)) {
1167                 ret = drop_one_dir_item(trans, root, path, dir, di);
1168                 if (ret)
1169                         return ret;
1170         }
1171         btrfs_release_path(path);
1172
1173         return 0;
1174 }
1175
1176 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1177                              u32 *namelen, char **name, u64 *index,
1178                              u64 *parent_objectid)
1179 {
1180         struct btrfs_inode_extref *extref;
1181
1182         extref = (struct btrfs_inode_extref *)ref_ptr;
1183
1184         *namelen = btrfs_inode_extref_name_len(eb, extref);
1185         *name = kmalloc(*namelen, GFP_NOFS);
1186         if (*name == NULL)
1187                 return -ENOMEM;
1188
1189         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1190                            *namelen);
1191
1192         *index = btrfs_inode_extref_index(eb, extref);
1193         if (parent_objectid)
1194                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1195
1196         return 0;
1197 }
1198
1199 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1200                           u32 *namelen, char **name, u64 *index)
1201 {
1202         struct btrfs_inode_ref *ref;
1203
1204         ref = (struct btrfs_inode_ref *)ref_ptr;
1205
1206         *namelen = btrfs_inode_ref_name_len(eb, ref);
1207         *name = kmalloc(*namelen, GFP_NOFS);
1208         if (*name == NULL)
1209                 return -ENOMEM;
1210
1211         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1212
1213         *index = btrfs_inode_ref_index(eb, ref);
1214
1215         return 0;
1216 }
1217
1218 /*
1219  * replay one inode back reference item found in the log tree.
1220  * eb, slot and key refer to the buffer and key found in the log tree.
1221  * root is the destination we are replaying into, and path is for temp
1222  * use by this function.  (it should be released on return).
1223  */
1224 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1225                                   struct btrfs_root *root,
1226                                   struct btrfs_root *log,
1227                                   struct btrfs_path *path,
1228                                   struct extent_buffer *eb, int slot,
1229                                   struct btrfs_key *key)
1230 {
1231         struct inode *dir = NULL;
1232         struct inode *inode = NULL;
1233         unsigned long ref_ptr;
1234         unsigned long ref_end;
1235         char *name = NULL;
1236         int namelen;
1237         int ret;
1238         int search_done = 0;
1239         int log_ref_ver = 0;
1240         u64 parent_objectid;
1241         u64 inode_objectid;
1242         u64 ref_index = 0;
1243         int ref_struct_size;
1244
1245         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1246         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1247
1248         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1249                 struct btrfs_inode_extref *r;
1250
1251                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1252                 log_ref_ver = 1;
1253                 r = (struct btrfs_inode_extref *)ref_ptr;
1254                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1255         } else {
1256                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1257                 parent_objectid = key->offset;
1258         }
1259         inode_objectid = key->objectid;
1260
1261         /*
1262          * it is possible that we didn't log all the parent directories
1263          * for a given inode.  If we don't find the dir, just don't
1264          * copy the back ref in.  The link count fixup code will take
1265          * care of the rest
1266          */
1267         dir = read_one_inode(root, parent_objectid);
1268         if (!dir) {
1269                 ret = -ENOENT;
1270                 goto out;
1271         }
1272
1273         inode = read_one_inode(root, inode_objectid);
1274         if (!inode) {
1275                 ret = -EIO;
1276                 goto out;
1277         }
1278
1279         while (ref_ptr < ref_end) {
1280                 if (log_ref_ver) {
1281                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1282                                                 &ref_index, &parent_objectid);
1283                         /*
1284                          * parent object can change from one array
1285                          * item to another.
1286                          */
1287                         if (!dir)
1288                                 dir = read_one_inode(root, parent_objectid);
1289                         if (!dir) {
1290                                 ret = -ENOENT;
1291                                 goto out;
1292                         }
1293                 } else {
1294                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1295                                              &ref_index);
1296                 }
1297                 if (ret)
1298                         goto out;
1299
1300                 /* if we already have a perfect match, we're done */
1301                 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1302                                         btrfs_ino(BTRFS_I(inode)), ref_index,
1303                                         name, namelen)) {
1304                         /*
1305                          * look for a conflicting back reference in the
1306                          * metadata. if we find one we have to unlink that name
1307                          * of the file before we add our new link.  Later on, we
1308                          * overwrite any existing back reference, and we don't
1309                          * want to create dangling pointers in the directory.
1310                          */
1311
1312                         if (!search_done) {
1313                                 ret = __add_inode_ref(trans, root, path, log,
1314                                                       BTRFS_I(dir),
1315                                                       BTRFS_I(inode),
1316                                                       inode_objectid,
1317                                                       parent_objectid,
1318                                                       ref_index, name, namelen,
1319                                                       &search_done);
1320                                 if (ret) {
1321                                         if (ret == 1)
1322                                                 ret = 0;
1323                                         goto out;
1324                                 }
1325                         }
1326
1327                         /* insert our name */
1328                         ret = btrfs_add_link(trans, BTRFS_I(dir),
1329                                         BTRFS_I(inode),
1330                                         name, namelen, 0, ref_index);
1331                         if (ret)
1332                                 goto out;
1333
1334                         btrfs_update_inode(trans, root, inode);
1335                 }
1336
1337                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1338                 kfree(name);
1339                 name = NULL;
1340                 if (log_ref_ver) {
1341                         iput(dir);
1342                         dir = NULL;
1343                 }
1344         }
1345
1346         /* finally write the back reference in the inode */
1347         ret = overwrite_item(trans, root, path, eb, slot, key);
1348 out:
1349         btrfs_release_path(path);
1350         kfree(name);
1351         iput(dir);
1352         iput(inode);
1353         return ret;
1354 }
1355
1356 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1357                               struct btrfs_root *root, u64 ino)
1358 {
1359         int ret;
1360
1361         ret = btrfs_insert_orphan_item(trans, root, ino);
1362         if (ret == -EEXIST)
1363                 ret = 0;
1364
1365         return ret;
1366 }
1367
1368 static int count_inode_extrefs(struct btrfs_root *root,
1369                 struct btrfs_inode *inode, struct btrfs_path *path)
1370 {
1371         int ret = 0;
1372         int name_len;
1373         unsigned int nlink = 0;
1374         u32 item_size;
1375         u32 cur_offset = 0;
1376         u64 inode_objectid = btrfs_ino(inode);
1377         u64 offset = 0;
1378         unsigned long ptr;
1379         struct btrfs_inode_extref *extref;
1380         struct extent_buffer *leaf;
1381
1382         while (1) {
1383                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1384                                             &extref, &offset);
1385                 if (ret)
1386                         break;
1387
1388                 leaf = path->nodes[0];
1389                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1390                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1391                 cur_offset = 0;
1392
1393                 while (cur_offset < item_size) {
1394                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1395                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1396
1397                         nlink++;
1398
1399                         cur_offset += name_len + sizeof(*extref);
1400                 }
1401
1402                 offset++;
1403                 btrfs_release_path(path);
1404         }
1405         btrfs_release_path(path);
1406
1407         if (ret < 0 && ret != -ENOENT)
1408                 return ret;
1409         return nlink;
1410 }
1411
1412 static int count_inode_refs(struct btrfs_root *root,
1413                         struct btrfs_inode *inode, struct btrfs_path *path)
1414 {
1415         int ret;
1416         struct btrfs_key key;
1417         unsigned int nlink = 0;
1418         unsigned long ptr;
1419         unsigned long ptr_end;
1420         int name_len;
1421         u64 ino = btrfs_ino(inode);
1422
1423         key.objectid = ino;
1424         key.type = BTRFS_INODE_REF_KEY;
1425         key.offset = (u64)-1;
1426
1427         while (1) {
1428                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1429                 if (ret < 0)
1430                         break;
1431                 if (ret > 0) {
1432                         if (path->slots[0] == 0)
1433                                 break;
1434                         path->slots[0]--;
1435                 }
1436 process_slot:
1437                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1438                                       path->slots[0]);
1439                 if (key.objectid != ino ||
1440                     key.type != BTRFS_INODE_REF_KEY)
1441                         break;
1442                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1443                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1444                                                    path->slots[0]);
1445                 while (ptr < ptr_end) {
1446                         struct btrfs_inode_ref *ref;
1447
1448                         ref = (struct btrfs_inode_ref *)ptr;
1449                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1450                                                             ref);
1451                         ptr = (unsigned long)(ref + 1) + name_len;
1452                         nlink++;
1453                 }
1454
1455                 if (key.offset == 0)
1456                         break;
1457                 if (path->slots[0] > 0) {
1458                         path->slots[0]--;
1459                         goto process_slot;
1460                 }
1461                 key.offset--;
1462                 btrfs_release_path(path);
1463         }
1464         btrfs_release_path(path);
1465
1466         return nlink;
1467 }
1468
1469 /*
1470  * There are a few corners where the link count of the file can't
1471  * be properly maintained during replay.  So, instead of adding
1472  * lots of complexity to the log code, we just scan the backrefs
1473  * for any file that has been through replay.
1474  *
1475  * The scan will update the link count on the inode to reflect the
1476  * number of back refs found.  If it goes down to zero, the iput
1477  * will free the inode.
1478  */
1479 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1480                                            struct btrfs_root *root,
1481                                            struct inode *inode)
1482 {
1483         struct btrfs_path *path;
1484         int ret;
1485         u64 nlink = 0;
1486         u64 ino = btrfs_ino(BTRFS_I(inode));
1487
1488         path = btrfs_alloc_path();
1489         if (!path)
1490                 return -ENOMEM;
1491
1492         ret = count_inode_refs(root, BTRFS_I(inode), path);
1493         if (ret < 0)
1494                 goto out;
1495
1496         nlink = ret;
1497
1498         ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1499         if (ret < 0)
1500                 goto out;
1501
1502         nlink += ret;
1503
1504         ret = 0;
1505
1506         if (nlink != inode->i_nlink) {
1507                 set_nlink(inode, nlink);
1508                 btrfs_update_inode(trans, root, inode);
1509         }
1510         BTRFS_I(inode)->index_cnt = (u64)-1;
1511
1512         if (inode->i_nlink == 0) {
1513                 if (S_ISDIR(inode->i_mode)) {
1514                         ret = replay_dir_deletes(trans, root, NULL, path,
1515                                                  ino, 1);
1516                         if (ret)
1517                                 goto out;
1518                 }
1519                 ret = insert_orphan_item(trans, root, ino);
1520         }
1521
1522 out:
1523         btrfs_free_path(path);
1524         return ret;
1525 }
1526
1527 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1528                                             struct btrfs_root *root,
1529                                             struct btrfs_path *path)
1530 {
1531         int ret;
1532         struct btrfs_key key;
1533         struct inode *inode;
1534
1535         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1536         key.type = BTRFS_ORPHAN_ITEM_KEY;
1537         key.offset = (u64)-1;
1538         while (1) {
1539                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1540                 if (ret < 0)
1541                         break;
1542
1543                 if (ret == 1) {
1544                         if (path->slots[0] == 0)
1545                                 break;
1546                         path->slots[0]--;
1547                 }
1548
1549                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1550                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1551                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1552                         break;
1553
1554                 ret = btrfs_del_item(trans, root, path);
1555                 if (ret)
1556                         goto out;
1557
1558                 btrfs_release_path(path);
1559                 inode = read_one_inode(root, key.offset);
1560                 if (!inode)
1561                         return -EIO;
1562
1563                 ret = fixup_inode_link_count(trans, root, inode);
1564                 iput(inode);
1565                 if (ret)
1566                         goto out;
1567
1568                 /*
1569                  * fixup on a directory may create new entries,
1570                  * make sure we always look for the highset possible
1571                  * offset
1572                  */
1573                 key.offset = (u64)-1;
1574         }
1575         ret = 0;
1576 out:
1577         btrfs_release_path(path);
1578         return ret;
1579 }
1580
1581
1582 /*
1583  * record a given inode in the fixup dir so we can check its link
1584  * count when replay is done.  The link count is incremented here
1585  * so the inode won't go away until we check it
1586  */
1587 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1588                                       struct btrfs_root *root,
1589                                       struct btrfs_path *path,
1590                                       u64 objectid)
1591 {
1592         struct btrfs_key key;
1593         int ret = 0;
1594         struct inode *inode;
1595
1596         inode = read_one_inode(root, objectid);
1597         if (!inode)
1598                 return -EIO;
1599
1600         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1601         key.type = BTRFS_ORPHAN_ITEM_KEY;
1602         key.offset = objectid;
1603
1604         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1605
1606         btrfs_release_path(path);
1607         if (ret == 0) {
1608                 if (!inode->i_nlink)
1609                         set_nlink(inode, 1);
1610                 else
1611                         inc_nlink(inode);
1612                 ret = btrfs_update_inode(trans, root, inode);
1613         } else if (ret == -EEXIST) {
1614                 ret = 0;
1615         } else {
1616                 BUG(); /* Logic Error */
1617         }
1618         iput(inode);
1619
1620         return ret;
1621 }
1622
1623 /*
1624  * when replaying the log for a directory, we only insert names
1625  * for inodes that actually exist.  This means an fsync on a directory
1626  * does not implicitly fsync all the new files in it
1627  */
1628 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1629                                     struct btrfs_root *root,
1630                                     u64 dirid, u64 index,
1631                                     char *name, int name_len,
1632                                     struct btrfs_key *location)
1633 {
1634         struct inode *inode;
1635         struct inode *dir;
1636         int ret;
1637
1638         inode = read_one_inode(root, location->objectid);
1639         if (!inode)
1640                 return -ENOENT;
1641
1642         dir = read_one_inode(root, dirid);
1643         if (!dir) {
1644                 iput(inode);
1645                 return -EIO;
1646         }
1647
1648         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1649                         name_len, 1, index);
1650
1651         /* FIXME, put inode into FIXUP list */
1652
1653         iput(inode);
1654         iput(dir);
1655         return ret;
1656 }
1657
1658 /*
1659  * Return true if an inode reference exists in the log for the given name,
1660  * inode and parent inode.
1661  */
1662 static bool name_in_log_ref(struct btrfs_root *log_root,
1663                             const char *name, const int name_len,
1664                             const u64 dirid, const u64 ino)
1665 {
1666         struct btrfs_key search_key;
1667
1668         search_key.objectid = ino;
1669         search_key.type = BTRFS_INODE_REF_KEY;
1670         search_key.offset = dirid;
1671         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1672                 return true;
1673
1674         search_key.type = BTRFS_INODE_EXTREF_KEY;
1675         search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1676         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1677                 return true;
1678
1679         return false;
1680 }
1681
1682 /*
1683  * take a single entry in a log directory item and replay it into
1684  * the subvolume.
1685  *
1686  * if a conflicting item exists in the subdirectory already,
1687  * the inode it points to is unlinked and put into the link count
1688  * fix up tree.
1689  *
1690  * If a name from the log points to a file or directory that does
1691  * not exist in the FS, it is skipped.  fsyncs on directories
1692  * do not force down inodes inside that directory, just changes to the
1693  * names or unlinks in a directory.
1694  *
1695  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1696  * non-existing inode) and 1 if the name was replayed.
1697  */
1698 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1699                                     struct btrfs_root *root,
1700                                     struct btrfs_path *path,
1701                                     struct extent_buffer *eb,
1702                                     struct btrfs_dir_item *di,
1703                                     struct btrfs_key *key)
1704 {
1705         char *name;
1706         int name_len;
1707         struct btrfs_dir_item *dst_di;
1708         struct btrfs_key found_key;
1709         struct btrfs_key log_key;
1710         struct inode *dir;
1711         u8 log_type;
1712         int exists;
1713         int ret = 0;
1714         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1715         bool name_added = false;
1716
1717         dir = read_one_inode(root, key->objectid);
1718         if (!dir)
1719                 return -EIO;
1720
1721         name_len = btrfs_dir_name_len(eb, di);
1722         name = kmalloc(name_len, GFP_NOFS);
1723         if (!name) {
1724                 ret = -ENOMEM;
1725                 goto out;
1726         }
1727
1728         log_type = btrfs_dir_type(eb, di);
1729         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1730                    name_len);
1731
1732         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1733         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1734         if (exists == 0)
1735                 exists = 1;
1736         else
1737                 exists = 0;
1738         btrfs_release_path(path);
1739
1740         if (key->type == BTRFS_DIR_ITEM_KEY) {
1741                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1742                                        name, name_len, 1);
1743         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1744                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1745                                                      key->objectid,
1746                                                      key->offset, name,
1747                                                      name_len, 1);
1748         } else {
1749                 /* Corruption */
1750                 ret = -EINVAL;
1751                 goto out;
1752         }
1753         if (IS_ERR_OR_NULL(dst_di)) {
1754                 /* we need a sequence number to insert, so we only
1755                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1756                  */
1757                 if (key->type != BTRFS_DIR_INDEX_KEY)
1758                         goto out;
1759                 goto insert;
1760         }
1761
1762         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1763         /* the existing item matches the logged item */
1764         if (found_key.objectid == log_key.objectid &&
1765             found_key.type == log_key.type &&
1766             found_key.offset == log_key.offset &&
1767             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1768                 update_size = false;
1769                 goto out;
1770         }
1771
1772         /*
1773          * don't drop the conflicting directory entry if the inode
1774          * for the new entry doesn't exist
1775          */
1776         if (!exists)
1777                 goto out;
1778
1779         ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
1780         if (ret)
1781                 goto out;
1782
1783         if (key->type == BTRFS_DIR_INDEX_KEY)
1784                 goto insert;
1785 out:
1786         btrfs_release_path(path);
1787         if (!ret && update_size) {
1788                 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
1789                 ret = btrfs_update_inode(trans, root, dir);
1790         }
1791         kfree(name);
1792         iput(dir);
1793         if (!ret && name_added)
1794                 ret = 1;
1795         return ret;
1796
1797 insert:
1798         if (name_in_log_ref(root->log_root, name, name_len,
1799                             key->objectid, log_key.objectid)) {
1800                 /* The dentry will be added later. */
1801                 ret = 0;
1802                 update_size = false;
1803                 goto out;
1804         }
1805         btrfs_release_path(path);
1806         ret = insert_one_name(trans, root, key->objectid, key->offset,
1807                               name, name_len, &log_key);
1808         if (ret && ret != -ENOENT && ret != -EEXIST)
1809                 goto out;
1810         if (!ret)
1811                 name_added = true;
1812         update_size = false;
1813         ret = 0;
1814         goto out;
1815 }
1816
1817 /*
1818  * find all the names in a directory item and reconcile them into
1819  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1820  * one name in a directory item, but the same code gets used for
1821  * both directory index types
1822  */
1823 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1824                                         struct btrfs_root *root,
1825                                         struct btrfs_path *path,
1826                                         struct extent_buffer *eb, int slot,
1827                                         struct btrfs_key *key)
1828 {
1829         int ret = 0;
1830         u32 item_size = btrfs_item_size_nr(eb, slot);
1831         struct btrfs_dir_item *di;
1832         int name_len;
1833         unsigned long ptr;
1834         unsigned long ptr_end;
1835         struct btrfs_path *fixup_path = NULL;
1836
1837         ptr = btrfs_item_ptr_offset(eb, slot);
1838         ptr_end = ptr + item_size;
1839         while (ptr < ptr_end) {
1840                 di = (struct btrfs_dir_item *)ptr;
1841                 name_len = btrfs_dir_name_len(eb, di);
1842                 ret = replay_one_name(trans, root, path, eb, di, key);
1843                 if (ret < 0)
1844                         break;
1845                 ptr = (unsigned long)(di + 1);
1846                 ptr += name_len;
1847
1848                 /*
1849                  * If this entry refers to a non-directory (directories can not
1850                  * have a link count > 1) and it was added in the transaction
1851                  * that was not committed, make sure we fixup the link count of
1852                  * the inode it the entry points to. Otherwise something like
1853                  * the following would result in a directory pointing to an
1854                  * inode with a wrong link that does not account for this dir
1855                  * entry:
1856                  *
1857                  * mkdir testdir
1858                  * touch testdir/foo
1859                  * touch testdir/bar
1860                  * sync
1861                  *
1862                  * ln testdir/bar testdir/bar_link
1863                  * ln testdir/foo testdir/foo_link
1864                  * xfs_io -c "fsync" testdir/bar
1865                  *
1866                  * <power failure>
1867                  *
1868                  * mount fs, log replay happens
1869                  *
1870                  * File foo would remain with a link count of 1 when it has two
1871                  * entries pointing to it in the directory testdir. This would
1872                  * make it impossible to ever delete the parent directory has
1873                  * it would result in stale dentries that can never be deleted.
1874                  */
1875                 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1876                         struct btrfs_key di_key;
1877
1878                         if (!fixup_path) {
1879                                 fixup_path = btrfs_alloc_path();
1880                                 if (!fixup_path) {
1881                                         ret = -ENOMEM;
1882                                         break;
1883                                 }
1884                         }
1885
1886                         btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1887                         ret = link_to_fixup_dir(trans, root, fixup_path,
1888                                                 di_key.objectid);
1889                         if (ret)
1890                                 break;
1891                 }
1892                 ret = 0;
1893         }
1894         btrfs_free_path(fixup_path);
1895         return ret;
1896 }
1897
1898 /*
1899  * directory replay has two parts.  There are the standard directory
1900  * items in the log copied from the subvolume, and range items
1901  * created in the log while the subvolume was logged.
1902  *
1903  * The range items tell us which parts of the key space the log
1904  * is authoritative for.  During replay, if a key in the subvolume
1905  * directory is in a logged range item, but not actually in the log
1906  * that means it was deleted from the directory before the fsync
1907  * and should be removed.
1908  */
1909 static noinline int find_dir_range(struct btrfs_root *root,
1910                                    struct btrfs_path *path,
1911                                    u64 dirid, int key_type,
1912                                    u64 *start_ret, u64 *end_ret)
1913 {
1914         struct btrfs_key key;
1915         u64 found_end;
1916         struct btrfs_dir_log_item *item;
1917         int ret;
1918         int nritems;
1919
1920         if (*start_ret == (u64)-1)
1921                 return 1;
1922
1923         key.objectid = dirid;
1924         key.type = key_type;
1925         key.offset = *start_ret;
1926
1927         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1928         if (ret < 0)
1929                 goto out;
1930         if (ret > 0) {
1931                 if (path->slots[0] == 0)
1932                         goto out;
1933                 path->slots[0]--;
1934         }
1935         if (ret != 0)
1936                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1937
1938         if (key.type != key_type || key.objectid != dirid) {
1939                 ret = 1;
1940                 goto next;
1941         }
1942         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1943                               struct btrfs_dir_log_item);
1944         found_end = btrfs_dir_log_end(path->nodes[0], item);
1945
1946         if (*start_ret >= key.offset && *start_ret <= found_end) {
1947                 ret = 0;
1948                 *start_ret = key.offset;
1949                 *end_ret = found_end;
1950                 goto out;
1951         }
1952         ret = 1;
1953 next:
1954         /* check the next slot in the tree to see if it is a valid item */
1955         nritems = btrfs_header_nritems(path->nodes[0]);
1956         path->slots[0]++;
1957         if (path->slots[0] >= nritems) {
1958                 ret = btrfs_next_leaf(root, path);
1959                 if (ret)
1960                         goto out;
1961         }
1962
1963         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1964
1965         if (key.type != key_type || key.objectid != dirid) {
1966                 ret = 1;
1967                 goto out;
1968         }
1969         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1970                               struct btrfs_dir_log_item);
1971         found_end = btrfs_dir_log_end(path->nodes[0], item);
1972         *start_ret = key.offset;
1973         *end_ret = found_end;
1974         ret = 0;
1975 out:
1976         btrfs_release_path(path);
1977         return ret;
1978 }
1979
1980 /*
1981  * this looks for a given directory item in the log.  If the directory
1982  * item is not in the log, the item is removed and the inode it points
1983  * to is unlinked
1984  */
1985 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1986                                       struct btrfs_root *root,
1987                                       struct btrfs_root *log,
1988                                       struct btrfs_path *path,
1989                                       struct btrfs_path *log_path,
1990                                       struct inode *dir,
1991                                       struct btrfs_key *dir_key)
1992 {
1993         struct btrfs_fs_info *fs_info = root->fs_info;
1994         int ret;
1995         struct extent_buffer *eb;
1996         int slot;
1997         u32 item_size;
1998         struct btrfs_dir_item *di;
1999         struct btrfs_dir_item *log_di;
2000         int name_len;
2001         unsigned long ptr;
2002         unsigned long ptr_end;
2003         char *name;
2004         struct inode *inode;
2005         struct btrfs_key location;
2006
2007 again:
2008         eb = path->nodes[0];
2009         slot = path->slots[0];
2010         item_size = btrfs_item_size_nr(eb, slot);
2011         ptr = btrfs_item_ptr_offset(eb, slot);
2012         ptr_end = ptr + item_size;
2013         while (ptr < ptr_end) {
2014                 di = (struct btrfs_dir_item *)ptr;
2015                 name_len = btrfs_dir_name_len(eb, di);
2016                 name = kmalloc(name_len, GFP_NOFS);
2017                 if (!name) {
2018                         ret = -ENOMEM;
2019                         goto out;
2020                 }
2021                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
2022                                   name_len);
2023                 log_di = NULL;
2024                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2025                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
2026                                                        dir_key->objectid,
2027                                                        name, name_len, 0);
2028                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2029                         log_di = btrfs_lookup_dir_index_item(trans, log,
2030                                                      log_path,
2031                                                      dir_key->objectid,
2032                                                      dir_key->offset,
2033                                                      name, name_len, 0);
2034                 }
2035                 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2036                         btrfs_dir_item_key_to_cpu(eb, di, &location);
2037                         btrfs_release_path(path);
2038                         btrfs_release_path(log_path);
2039                         inode = read_one_inode(root, location.objectid);
2040                         if (!inode) {
2041                                 kfree(name);
2042                                 return -EIO;
2043                         }
2044
2045                         ret = link_to_fixup_dir(trans, root,
2046                                                 path, location.objectid);
2047                         if (ret) {
2048                                 kfree(name);
2049                                 iput(inode);
2050                                 goto out;
2051                         }
2052
2053                         inc_nlink(inode);
2054                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2055                                         BTRFS_I(inode), name, name_len);
2056                         if (!ret)
2057                                 ret = btrfs_run_delayed_items(trans, fs_info);
2058                         kfree(name);
2059                         iput(inode);
2060                         if (ret)
2061                                 goto out;
2062
2063                         /* there might still be more names under this key
2064                          * check and repeat if required
2065                          */
2066                         ret = btrfs_search_slot(NULL, root, dir_key, path,
2067                                                 0, 0);
2068                         if (ret == 0)
2069                                 goto again;
2070                         ret = 0;
2071                         goto out;
2072                 } else if (IS_ERR(log_di)) {
2073                         kfree(name);
2074                         return PTR_ERR(log_di);
2075                 }
2076                 btrfs_release_path(log_path);
2077                 kfree(name);
2078
2079                 ptr = (unsigned long)(di + 1);
2080                 ptr += name_len;
2081         }
2082         ret = 0;
2083 out:
2084         btrfs_release_path(path);
2085         btrfs_release_path(log_path);
2086         return ret;
2087 }
2088
2089 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2090                               struct btrfs_root *root,
2091                               struct btrfs_root *log,
2092                               struct btrfs_path *path,
2093                               const u64 ino)
2094 {
2095         struct btrfs_key search_key;
2096         struct btrfs_path *log_path;
2097         int i;
2098         int nritems;
2099         int ret;
2100
2101         log_path = btrfs_alloc_path();
2102         if (!log_path)
2103                 return -ENOMEM;
2104
2105         search_key.objectid = ino;
2106         search_key.type = BTRFS_XATTR_ITEM_KEY;
2107         search_key.offset = 0;
2108 again:
2109         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2110         if (ret < 0)
2111                 goto out;
2112 process_leaf:
2113         nritems = btrfs_header_nritems(path->nodes[0]);
2114         for (i = path->slots[0]; i < nritems; i++) {
2115                 struct btrfs_key key;
2116                 struct btrfs_dir_item *di;
2117                 struct btrfs_dir_item *log_di;
2118                 u32 total_size;
2119                 u32 cur;
2120
2121                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2122                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2123                         ret = 0;
2124                         goto out;
2125                 }
2126
2127                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2128                 total_size = btrfs_item_size_nr(path->nodes[0], i);
2129                 cur = 0;
2130                 while (cur < total_size) {
2131                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2132                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2133                         u32 this_len = sizeof(*di) + name_len + data_len;
2134                         char *name;
2135
2136                         name = kmalloc(name_len, GFP_NOFS);
2137                         if (!name) {
2138                                 ret = -ENOMEM;
2139                                 goto out;
2140                         }
2141                         read_extent_buffer(path->nodes[0], name,
2142                                            (unsigned long)(di + 1), name_len);
2143
2144                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2145                                                     name, name_len, 0);
2146                         btrfs_release_path(log_path);
2147                         if (!log_di) {
2148                                 /* Doesn't exist in log tree, so delete it. */
2149                                 btrfs_release_path(path);
2150                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2151                                                         name, name_len, -1);
2152                                 kfree(name);
2153                                 if (IS_ERR(di)) {
2154                                         ret = PTR_ERR(di);
2155                                         goto out;
2156                                 }
2157                                 ASSERT(di);
2158                                 ret = btrfs_delete_one_dir_name(trans, root,
2159                                                                 path, di);
2160                                 if (ret)
2161                                         goto out;
2162                                 btrfs_release_path(path);
2163                                 search_key = key;
2164                                 goto again;
2165                         }
2166                         kfree(name);
2167                         if (IS_ERR(log_di)) {
2168                                 ret = PTR_ERR(log_di);
2169                                 goto out;
2170                         }
2171                         cur += this_len;
2172                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2173                 }
2174         }
2175         ret = btrfs_next_leaf(root, path);
2176         if (ret > 0)
2177                 ret = 0;
2178         else if (ret == 0)
2179                 goto process_leaf;
2180 out:
2181         btrfs_free_path(log_path);
2182         btrfs_release_path(path);
2183         return ret;
2184 }
2185
2186
2187 /*
2188  * deletion replay happens before we copy any new directory items
2189  * out of the log or out of backreferences from inodes.  It
2190  * scans the log to find ranges of keys that log is authoritative for,
2191  * and then scans the directory to find items in those ranges that are
2192  * not present in the log.
2193  *
2194  * Anything we don't find in the log is unlinked and removed from the
2195  * directory.
2196  */
2197 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2198                                        struct btrfs_root *root,
2199                                        struct btrfs_root *log,
2200                                        struct btrfs_path *path,
2201                                        u64 dirid, int del_all)
2202 {
2203         u64 range_start;
2204         u64 range_end;
2205         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2206         int ret = 0;
2207         struct btrfs_key dir_key;
2208         struct btrfs_key found_key;
2209         struct btrfs_path *log_path;
2210         struct inode *dir;
2211
2212         dir_key.objectid = dirid;
2213         dir_key.type = BTRFS_DIR_ITEM_KEY;
2214         log_path = btrfs_alloc_path();
2215         if (!log_path)
2216                 return -ENOMEM;
2217
2218         dir = read_one_inode(root, dirid);
2219         /* it isn't an error if the inode isn't there, that can happen
2220          * because we replay the deletes before we copy in the inode item
2221          * from the log
2222          */
2223         if (!dir) {
2224                 btrfs_free_path(log_path);
2225                 return 0;
2226         }
2227 again:
2228         range_start = 0;
2229         range_end = 0;
2230         while (1) {
2231                 if (del_all)
2232                         range_end = (u64)-1;
2233                 else {
2234                         ret = find_dir_range(log, path, dirid, key_type,
2235                                              &range_start, &range_end);
2236                         if (ret != 0)
2237                                 break;
2238                 }
2239
2240                 dir_key.offset = range_start;
2241                 while (1) {
2242                         int nritems;
2243                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2244                                                 0, 0);
2245                         if (ret < 0)
2246                                 goto out;
2247
2248                         nritems = btrfs_header_nritems(path->nodes[0]);
2249                         if (path->slots[0] >= nritems) {
2250                                 ret = btrfs_next_leaf(root, path);
2251                                 if (ret)
2252                                         break;
2253                         }
2254                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2255                                               path->slots[0]);
2256                         if (found_key.objectid != dirid ||
2257                             found_key.type != dir_key.type)
2258                                 goto next_type;
2259
2260                         if (found_key.offset > range_end)
2261                                 break;
2262
2263                         ret = check_item_in_log(trans, root, log, path,
2264                                                 log_path, dir,
2265                                                 &found_key);
2266                         if (ret)
2267                                 goto out;
2268                         if (found_key.offset == (u64)-1)
2269                                 break;
2270                         dir_key.offset = found_key.offset + 1;
2271                 }
2272                 btrfs_release_path(path);
2273                 if (range_end == (u64)-1)
2274                         break;
2275                 range_start = range_end + 1;
2276         }
2277
2278 next_type:
2279         ret = 0;
2280         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2281                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
2282                 dir_key.type = BTRFS_DIR_INDEX_KEY;
2283                 btrfs_release_path(path);
2284                 goto again;
2285         }
2286 out:
2287         btrfs_release_path(path);
2288         btrfs_free_path(log_path);
2289         iput(dir);
2290         return ret;
2291 }
2292
2293 /*
2294  * the process_func used to replay items from the log tree.  This
2295  * gets called in two different stages.  The first stage just looks
2296  * for inodes and makes sure they are all copied into the subvolume.
2297  *
2298  * The second stage copies all the other item types from the log into
2299  * the subvolume.  The two stage approach is slower, but gets rid of
2300  * lots of complexity around inodes referencing other inodes that exist
2301  * only in the log (references come from either directory items or inode
2302  * back refs).
2303  */
2304 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2305                              struct walk_control *wc, u64 gen)
2306 {
2307         int nritems;
2308         struct btrfs_path *path;
2309         struct btrfs_root *root = wc->replay_dest;
2310         struct btrfs_key key;
2311         int level;
2312         int i;
2313         int ret;
2314
2315         ret = btrfs_read_buffer(eb, gen);
2316         if (ret)
2317                 return ret;
2318
2319         level = btrfs_header_level(eb);
2320
2321         if (level != 0)
2322                 return 0;
2323
2324         path = btrfs_alloc_path();
2325         if (!path)
2326                 return -ENOMEM;
2327
2328         nritems = btrfs_header_nritems(eb);
2329         for (i = 0; i < nritems; i++) {
2330                 btrfs_item_key_to_cpu(eb, &key, i);
2331
2332                 /* inode keys are done during the first stage */
2333                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2334                     wc->stage == LOG_WALK_REPLAY_INODES) {
2335                         struct btrfs_inode_item *inode_item;
2336                         u32 mode;
2337
2338                         inode_item = btrfs_item_ptr(eb, i,
2339                                             struct btrfs_inode_item);
2340                         ret = replay_xattr_deletes(wc->trans, root, log,
2341                                                    path, key.objectid);
2342                         if (ret)
2343                                 break;
2344                         mode = btrfs_inode_mode(eb, inode_item);
2345                         if (S_ISDIR(mode)) {
2346                                 ret = replay_dir_deletes(wc->trans,
2347                                          root, log, path, key.objectid, 0);
2348                                 if (ret)
2349                                         break;
2350                         }
2351                         ret = overwrite_item(wc->trans, root, path,
2352                                              eb, i, &key);
2353                         if (ret)
2354                                 break;
2355
2356                         /* for regular files, make sure corresponding
2357                          * orphan item exist. extents past the new EOF
2358                          * will be truncated later by orphan cleanup.
2359                          */
2360                         if (S_ISREG(mode)) {
2361                                 ret = insert_orphan_item(wc->trans, root,
2362                                                          key.objectid);
2363                                 if (ret)
2364                                         break;
2365                         }
2366
2367                         ret = link_to_fixup_dir(wc->trans, root,
2368                                                 path, key.objectid);
2369                         if (ret)
2370                                 break;
2371                 }
2372
2373                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2374                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2375                         ret = replay_one_dir_item(wc->trans, root, path,
2376                                                   eb, i, &key);
2377                         if (ret)
2378                                 break;
2379                 }
2380
2381                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2382                         continue;
2383
2384                 /* these keys are simply copied */
2385                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2386                         ret = overwrite_item(wc->trans, root, path,
2387                                              eb, i, &key);
2388                         if (ret)
2389                                 break;
2390                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2391                            key.type == BTRFS_INODE_EXTREF_KEY) {
2392                         ret = add_inode_ref(wc->trans, root, log, path,
2393                                             eb, i, &key);
2394                         if (ret && ret != -ENOENT)
2395                                 break;
2396                         ret = 0;
2397                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2398                         ret = replay_one_extent(wc->trans, root, path,
2399                                                 eb, i, &key);
2400                         if (ret)
2401                                 break;
2402                 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2403                         ret = replay_one_dir_item(wc->trans, root, path,
2404                                                   eb, i, &key);
2405                         if (ret)
2406                                 break;
2407                 }
2408         }
2409         btrfs_free_path(path);
2410         return ret;
2411 }
2412
2413 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2414                                    struct btrfs_root *root,
2415                                    struct btrfs_path *path, int *level,
2416                                    struct walk_control *wc)
2417 {
2418         struct btrfs_fs_info *fs_info = root->fs_info;
2419         u64 root_owner;
2420         u64 bytenr;
2421         u64 ptr_gen;
2422         struct extent_buffer *next;
2423         struct extent_buffer *cur;
2424         struct extent_buffer *parent;
2425         u32 blocksize;
2426         int ret = 0;
2427
2428         WARN_ON(*level < 0);
2429         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2430
2431         while (*level > 0) {
2432                 WARN_ON(*level < 0);
2433                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2434                 cur = path->nodes[*level];
2435
2436                 WARN_ON(btrfs_header_level(cur) != *level);
2437
2438                 if (path->slots[*level] >=
2439                     btrfs_header_nritems(cur))
2440                         break;
2441
2442                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2443                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2444                 blocksize = fs_info->nodesize;
2445
2446                 parent = path->nodes[*level];
2447                 root_owner = btrfs_header_owner(parent);
2448
2449                 next = btrfs_find_create_tree_block(fs_info, bytenr);
2450                 if (IS_ERR(next))
2451                         return PTR_ERR(next);
2452
2453                 if (*level == 1) {
2454                         ret = wc->process_func(root, next, wc, ptr_gen);
2455                         if (ret) {
2456                                 free_extent_buffer(next);
2457                                 return ret;
2458                         }
2459
2460                         path->slots[*level]++;
2461                         if (wc->free) {
2462                                 ret = btrfs_read_buffer(next, ptr_gen);
2463                                 if (ret) {
2464                                         free_extent_buffer(next);
2465                                         return ret;
2466                                 }
2467
2468                                 if (trans) {
2469                                         btrfs_tree_lock(next);
2470                                         btrfs_set_lock_blocking(next);
2471                                         clean_tree_block(fs_info, next);
2472                                         btrfs_wait_tree_block_writeback(next);
2473                                         btrfs_tree_unlock(next);
2474                                 } else {
2475                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2476                                                 clear_extent_buffer_dirty(next);
2477                                 }
2478
2479                                 WARN_ON(root_owner !=
2480                                         BTRFS_TREE_LOG_OBJECTID);
2481                                 ret = btrfs_free_and_pin_reserved_extent(
2482                                                         fs_info, bytenr,
2483                                                         blocksize);
2484                                 if (ret) {
2485                                         free_extent_buffer(next);
2486                                         return ret;
2487                                 }
2488                         }
2489                         free_extent_buffer(next);
2490                         continue;
2491                 }
2492                 ret = btrfs_read_buffer(next, ptr_gen);
2493                 if (ret) {
2494                         free_extent_buffer(next);
2495                         return ret;
2496                 }
2497
2498                 WARN_ON(*level <= 0);
2499                 if (path->nodes[*level-1])
2500                         free_extent_buffer(path->nodes[*level-1]);
2501                 path->nodes[*level-1] = next;
2502                 *level = btrfs_header_level(next);
2503                 path->slots[*level] = 0;
2504                 cond_resched();
2505         }
2506         WARN_ON(*level < 0);
2507         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2508
2509         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2510
2511         cond_resched();
2512         return 0;
2513 }
2514
2515 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2516                                  struct btrfs_root *root,
2517                                  struct btrfs_path *path, int *level,
2518                                  struct walk_control *wc)
2519 {
2520         struct btrfs_fs_info *fs_info = root->fs_info;
2521         u64 root_owner;
2522         int i;
2523         int slot;
2524         int ret;
2525
2526         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2527                 slot = path->slots[i];
2528                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2529                         path->slots[i]++;
2530                         *level = i;
2531                         WARN_ON(*level == 0);
2532                         return 0;
2533                 } else {
2534                         struct extent_buffer *parent;
2535                         if (path->nodes[*level] == root->node)
2536                                 parent = path->nodes[*level];
2537                         else
2538                                 parent = path->nodes[*level + 1];
2539
2540                         root_owner = btrfs_header_owner(parent);
2541                         ret = wc->process_func(root, path->nodes[*level], wc,
2542                                  btrfs_header_generation(path->nodes[*level]));
2543                         if (ret)
2544                                 return ret;
2545
2546                         if (wc->free) {
2547                                 struct extent_buffer *next;
2548
2549                                 next = path->nodes[*level];
2550
2551                                 if (trans) {
2552                                         btrfs_tree_lock(next);
2553                                         btrfs_set_lock_blocking(next);
2554                                         clean_tree_block(fs_info, next);
2555                                         btrfs_wait_tree_block_writeback(next);
2556                                         btrfs_tree_unlock(next);
2557                                 } else {
2558                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2559                                                 clear_extent_buffer_dirty(next);
2560                                 }
2561
2562                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2563                                 ret = btrfs_free_and_pin_reserved_extent(
2564                                                 fs_info,
2565                                                 path->nodes[*level]->start,
2566                                                 path->nodes[*level]->len);
2567                                 if (ret)
2568                                         return ret;
2569                         }
2570                         free_extent_buffer(path->nodes[*level]);
2571                         path->nodes[*level] = NULL;
2572                         *level = i + 1;
2573                 }
2574         }
2575         return 1;
2576 }
2577
2578 /*
2579  * drop the reference count on the tree rooted at 'snap'.  This traverses
2580  * the tree freeing any blocks that have a ref count of zero after being
2581  * decremented.
2582  */
2583 static int walk_log_tree(struct btrfs_trans_handle *trans,
2584                          struct btrfs_root *log, struct walk_control *wc)
2585 {
2586         struct btrfs_fs_info *fs_info = log->fs_info;
2587         int ret = 0;
2588         int wret;
2589         int level;
2590         struct btrfs_path *path;
2591         int orig_level;
2592
2593         path = btrfs_alloc_path();
2594         if (!path)
2595                 return -ENOMEM;
2596
2597         level = btrfs_header_level(log->node);
2598         orig_level = level;
2599         path->nodes[level] = log->node;
2600         extent_buffer_get(log->node);
2601         path->slots[level] = 0;
2602
2603         while (1) {
2604                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2605                 if (wret > 0)
2606                         break;
2607                 if (wret < 0) {
2608                         ret = wret;
2609                         goto out;
2610                 }
2611
2612                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2613                 if (wret > 0)
2614                         break;
2615                 if (wret < 0) {
2616                         ret = wret;
2617                         goto out;
2618                 }
2619         }
2620
2621         /* was the root node processed? if not, catch it here */
2622         if (path->nodes[orig_level]) {
2623                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2624                          btrfs_header_generation(path->nodes[orig_level]));
2625                 if (ret)
2626                         goto out;
2627                 if (wc->free) {
2628                         struct extent_buffer *next;
2629
2630                         next = path->nodes[orig_level];
2631
2632                         if (trans) {
2633                                 btrfs_tree_lock(next);
2634                                 btrfs_set_lock_blocking(next);
2635                                 clean_tree_block(fs_info, next);
2636                                 btrfs_wait_tree_block_writeback(next);
2637                                 btrfs_tree_unlock(next);
2638                         } else {
2639                                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2640                                         clear_extent_buffer_dirty(next);
2641                         }
2642
2643                         WARN_ON(log->root_key.objectid !=
2644                                 BTRFS_TREE_LOG_OBJECTID);
2645                         ret = btrfs_free_and_pin_reserved_extent(fs_info,
2646                                                         next->start, next->len);
2647                         if (ret)
2648                                 goto out;
2649                 }
2650         }
2651
2652 out:
2653         btrfs_free_path(path);
2654         return ret;
2655 }
2656
2657 /*
2658  * helper function to update the item for a given subvolumes log root
2659  * in the tree of log roots
2660  */
2661 static int update_log_root(struct btrfs_trans_handle *trans,
2662                            struct btrfs_root *log)
2663 {
2664         struct btrfs_fs_info *fs_info = log->fs_info;
2665         int ret;
2666
2667         if (log->log_transid == 1) {
2668                 /* insert root item on the first sync */
2669                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2670                                 &log->root_key, &log->root_item);
2671         } else {
2672                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2673                                 &log->root_key, &log->root_item);
2674         }
2675         return ret;
2676 }
2677
2678 static void wait_log_commit(struct btrfs_root *root, int transid)
2679 {
2680         DEFINE_WAIT(wait);
2681         int index = transid % 2;
2682
2683         /*
2684          * we only allow two pending log transactions at a time,
2685          * so we know that if ours is more than 2 older than the
2686          * current transaction, we're done
2687          */
2688         for (;;) {
2689                 prepare_to_wait(&root->log_commit_wait[index],
2690                                 &wait, TASK_UNINTERRUPTIBLE);
2691
2692                 if (!(root->log_transid_committed < transid &&
2693                       atomic_read(&root->log_commit[index])))
2694                         break;
2695
2696                 mutex_unlock(&root->log_mutex);
2697                 schedule();
2698                 mutex_lock(&root->log_mutex);
2699         }
2700         finish_wait(&root->log_commit_wait[index], &wait);
2701 }
2702
2703 static void wait_for_writer(struct btrfs_root *root)
2704 {
2705         DEFINE_WAIT(wait);
2706
2707         for (;;) {
2708                 prepare_to_wait(&root->log_writer_wait, &wait,
2709                                 TASK_UNINTERRUPTIBLE);
2710                 if (!atomic_read(&root->log_writers))
2711                         break;
2712
2713                 mutex_unlock(&root->log_mutex);
2714                 schedule();
2715                 mutex_lock(&root->log_mutex);
2716         }
2717         finish_wait(&root->log_writer_wait, &wait);
2718 }
2719
2720 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2721                                         struct btrfs_log_ctx *ctx)
2722 {
2723         if (!ctx)
2724                 return;
2725
2726         mutex_lock(&root->log_mutex);
2727         list_del_init(&ctx->list);
2728         mutex_unlock(&root->log_mutex);
2729 }
2730
2731 /* 
2732  * Invoked in log mutex context, or be sure there is no other task which
2733  * can access the list.
2734  */
2735 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2736                                              int index, int error)
2737 {
2738         struct btrfs_log_ctx *ctx;
2739         struct btrfs_log_ctx *safe;
2740
2741         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2742                 list_del_init(&ctx->list);
2743                 ctx->log_ret = error;
2744         }
2745
2746         INIT_LIST_HEAD(&root->log_ctxs[index]);
2747 }
2748
2749 /*
2750  * btrfs_sync_log does sends a given tree log down to the disk and
2751  * updates the super blocks to record it.  When this call is done,
2752  * you know that any inodes previously logged are safely on disk only
2753  * if it returns 0.
2754  *
2755  * Any other return value means you need to call btrfs_commit_transaction.
2756  * Some of the edge cases for fsyncing directories that have had unlinks
2757  * or renames done in the past mean that sometimes the only safe
2758  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2759  * that has happened.
2760  */
2761 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2762                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2763 {
2764         int index1;
2765         int index2;
2766         int mark;
2767         int ret;
2768         struct btrfs_fs_info *fs_info = root->fs_info;
2769         struct btrfs_root *log = root->log_root;
2770         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2771         int log_transid = 0;
2772         struct btrfs_log_ctx root_log_ctx;
2773         struct blk_plug plug;
2774
2775         mutex_lock(&root->log_mutex);
2776         log_transid = ctx->log_transid;
2777         if (root->log_transid_committed >= log_transid) {
2778                 mutex_unlock(&root->log_mutex);
2779                 return ctx->log_ret;
2780         }
2781
2782         index1 = log_transid % 2;
2783         if (atomic_read(&root->log_commit[index1])) {
2784                 wait_log_commit(root, log_transid);
2785                 mutex_unlock(&root->log_mutex);
2786                 return ctx->log_ret;
2787         }
2788         ASSERT(log_transid == root->log_transid);
2789         atomic_set(&root->log_commit[index1], 1);
2790
2791         /* wait for previous tree log sync to complete */
2792         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2793                 wait_log_commit(root, log_transid - 1);
2794
2795         while (1) {
2796                 int batch = atomic_read(&root->log_batch);
2797                 /* when we're on an ssd, just kick the log commit out */
2798                 if (!btrfs_test_opt(fs_info, SSD) &&
2799                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2800                         mutex_unlock(&root->log_mutex);
2801                         schedule_timeout_uninterruptible(1);
2802                         mutex_lock(&root->log_mutex);
2803                 }
2804                 wait_for_writer(root);
2805                 if (batch == atomic_read(&root->log_batch))
2806                         break;
2807         }
2808
2809         /* bail out if we need to do a full commit */
2810         if (btrfs_need_log_full_commit(fs_info, trans)) {
2811                 ret = -EAGAIN;
2812                 btrfs_free_logged_extents(log, log_transid);
2813                 mutex_unlock(&root->log_mutex);
2814                 goto out;
2815         }
2816
2817         if (log_transid % 2 == 0)
2818                 mark = EXTENT_DIRTY;
2819         else
2820                 mark = EXTENT_NEW;
2821
2822         /* we start IO on  all the marked extents here, but we don't actually
2823          * wait for them until later.
2824          */
2825         blk_start_plug(&plug);
2826         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2827         if (ret) {
2828                 blk_finish_plug(&plug);
2829                 btrfs_abort_transaction(trans, ret);
2830                 btrfs_free_logged_extents(log, log_transid);
2831                 btrfs_set_log_full_commit(fs_info, trans);
2832                 mutex_unlock(&root->log_mutex);
2833                 goto out;
2834         }
2835
2836         btrfs_set_root_node(&log->root_item, log->node);
2837
2838         root->log_transid++;
2839         log->log_transid = root->log_transid;
2840         root->log_start_pid = 0;
2841         /*
2842          * IO has been started, blocks of the log tree have WRITTEN flag set
2843          * in their headers. new modifications of the log will be written to
2844          * new positions. so it's safe to allow log writers to go in.
2845          */
2846         mutex_unlock(&root->log_mutex);
2847
2848         btrfs_init_log_ctx(&root_log_ctx, NULL);
2849
2850         mutex_lock(&log_root_tree->log_mutex);
2851         atomic_inc(&log_root_tree->log_batch);
2852         atomic_inc(&log_root_tree->log_writers);
2853
2854         index2 = log_root_tree->log_transid % 2;
2855         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2856         root_log_ctx.log_transid = log_root_tree->log_transid;
2857
2858         mutex_unlock(&log_root_tree->log_mutex);
2859
2860         ret = update_log_root(trans, log);
2861
2862         mutex_lock(&log_root_tree->log_mutex);
2863         if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2864                 /*
2865                  * Implicit memory barrier after atomic_dec_and_test
2866                  */
2867                 if (waitqueue_active(&log_root_tree->log_writer_wait))
2868                         wake_up(&log_root_tree->log_writer_wait);
2869         }
2870
2871         if (ret) {
2872                 if (!list_empty(&root_log_ctx.list))
2873                         list_del_init(&root_log_ctx.list);
2874
2875                 blk_finish_plug(&plug);
2876                 btrfs_set_log_full_commit(fs_info, trans);
2877
2878                 if (ret != -ENOSPC) {
2879                         btrfs_abort_transaction(trans, ret);
2880                         mutex_unlock(&log_root_tree->log_mutex);
2881                         goto out;
2882                 }
2883                 btrfs_wait_tree_log_extents(log, mark);
2884                 btrfs_free_logged_extents(log, log_transid);
2885                 mutex_unlock(&log_root_tree->log_mutex);
2886                 ret = -EAGAIN;
2887                 goto out;
2888         }
2889
2890         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2891                 blk_finish_plug(&plug);
2892                 list_del_init(&root_log_ctx.list);
2893                 mutex_unlock(&log_root_tree->log_mutex);
2894                 ret = root_log_ctx.log_ret;
2895                 goto out;
2896         }
2897
2898         index2 = root_log_ctx.log_transid % 2;
2899         if (atomic_read(&log_root_tree->log_commit[index2])) {
2900                 blk_finish_plug(&plug);
2901                 ret = btrfs_wait_tree_log_extents(log, mark);
2902                 btrfs_wait_logged_extents(trans, log, log_transid);
2903                 wait_log_commit(log_root_tree,
2904                                 root_log_ctx.log_transid);
2905                 mutex_unlock(&log_root_tree->log_mutex);
2906                 if (!ret)
2907                         ret = root_log_ctx.log_ret;
2908                 goto out;
2909         }
2910         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2911         atomic_set(&log_root_tree->log_commit[index2], 1);
2912
2913         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2914                 wait_log_commit(log_root_tree,
2915                                 root_log_ctx.log_transid - 1);
2916         }
2917
2918         wait_for_writer(log_root_tree);
2919
2920         /*
2921          * now that we've moved on to the tree of log tree roots,
2922          * check the full commit flag again
2923          */
2924         if (btrfs_need_log_full_commit(fs_info, trans)) {
2925                 blk_finish_plug(&plug);
2926                 btrfs_wait_tree_log_extents(log, mark);
2927                 btrfs_free_logged_extents(log, log_transid);
2928                 mutex_unlock(&log_root_tree->log_mutex);
2929                 ret = -EAGAIN;
2930                 goto out_wake_log_root;
2931         }
2932
2933         ret = btrfs_write_marked_extents(fs_info,
2934                                          &log_root_tree->dirty_log_pages,
2935                                          EXTENT_DIRTY | EXTENT_NEW);
2936         blk_finish_plug(&plug);
2937         if (ret) {
2938                 btrfs_set_log_full_commit(fs_info, trans);
2939                 btrfs_abort_transaction(trans, ret);
2940                 btrfs_free_logged_extents(log, log_transid);
2941                 mutex_unlock(&log_root_tree->log_mutex);
2942                 goto out_wake_log_root;
2943         }
2944         ret = btrfs_wait_tree_log_extents(log, mark);
2945         if (!ret)
2946                 ret = btrfs_wait_tree_log_extents(log_root_tree,
2947                                                   EXTENT_NEW | EXTENT_DIRTY);
2948         if (ret) {
2949                 btrfs_set_log_full_commit(fs_info, trans);
2950                 btrfs_free_logged_extents(log, log_transid);
2951                 mutex_unlock(&log_root_tree->log_mutex);
2952                 goto out_wake_log_root;
2953         }
2954         btrfs_wait_logged_extents(trans, log, log_transid);
2955
2956         btrfs_set_super_log_root(fs_info->super_for_commit,
2957                                  log_root_tree->node->start);
2958         btrfs_set_super_log_root_level(fs_info->super_for_commit,
2959                                        btrfs_header_level(log_root_tree->node));
2960
2961         log_root_tree->log_transid++;
2962         mutex_unlock(&log_root_tree->log_mutex);
2963
2964         /*
2965          * nobody else is going to jump in and write the the ctree
2966          * super here because the log_commit atomic below is protecting
2967          * us.  We must be called with a transaction handle pinning
2968          * the running transaction open, so a full commit can't hop
2969          * in and cause problems either.
2970          */
2971         ret = write_all_supers(fs_info, 1);
2972         if (ret) {
2973                 btrfs_set_log_full_commit(fs_info, trans);
2974                 btrfs_abort_transaction(trans, ret);
2975                 goto out_wake_log_root;
2976         }
2977
2978         mutex_lock(&root->log_mutex);
2979         if (root->last_log_commit < log_transid)
2980                 root->last_log_commit = log_transid;
2981         mutex_unlock(&root->log_mutex);
2982
2983 out_wake_log_root:
2984         mutex_lock(&log_root_tree->log_mutex);
2985         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2986
2987         log_root_tree->log_transid_committed++;
2988         atomic_set(&log_root_tree->log_commit[index2], 0);
2989         mutex_unlock(&log_root_tree->log_mutex);
2990
2991         /*
2992          * The barrier before waitqueue_active is implied by mutex_unlock
2993          */
2994         if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2995                 wake_up(&log_root_tree->log_commit_wait[index2]);
2996 out:
2997         mutex_lock(&root->log_mutex);
2998         btrfs_remove_all_log_ctxs(root, index1, ret);
2999         root->log_transid_committed++;
3000         atomic_set(&root->log_commit[index1], 0);
3001         mutex_unlock(&root->log_mutex);
3002
3003         /*
3004          * The barrier before waitqueue_active is implied by mutex_unlock
3005          */
3006         if (waitqueue_active(&root->log_commit_wait[index1]))
3007                 wake_up(&root->log_commit_wait[index1]);
3008         return ret;
3009 }
3010
3011 static void free_log_tree(struct btrfs_trans_handle *trans,
3012                           struct btrfs_root *log)
3013 {
3014         int ret;
3015         u64 start;
3016         u64 end;
3017         struct walk_control wc = {
3018                 .free = 1,
3019                 .process_func = process_one_buffer
3020         };
3021
3022         ret = walk_log_tree(trans, log, &wc);
3023         /* I don't think this can happen but just in case */
3024         if (ret)
3025                 btrfs_abort_transaction(trans, ret);
3026
3027         while (1) {
3028                 ret = find_first_extent_bit(&log->dirty_log_pages,
3029                                 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
3030                                 NULL);
3031                 if (ret)
3032                         break;
3033
3034                 clear_extent_bits(&log->dirty_log_pages, start, end,
3035                                   EXTENT_DIRTY | EXTENT_NEW);
3036         }
3037
3038         /*
3039          * We may have short-circuited the log tree with the full commit logic
3040          * and left ordered extents on our list, so clear these out to keep us
3041          * from leaking inodes and memory.
3042          */
3043         btrfs_free_logged_extents(log, 0);
3044         btrfs_free_logged_extents(log, 1);
3045
3046         free_extent_buffer(log->node);
3047         kfree(log);
3048 }
3049
3050 /*
3051  * free all the extents used by the tree log.  This should be called
3052  * at commit time of the full transaction
3053  */
3054 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3055 {
3056         if (root->log_root) {
3057                 free_log_tree(trans, root->log_root);
3058                 root->log_root = NULL;
3059         }
3060         return 0;
3061 }
3062
3063 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3064                              struct btrfs_fs_info *fs_info)
3065 {
3066         if (fs_info->log_root_tree) {
3067                 free_log_tree(trans, fs_info->log_root_tree);
3068                 fs_info->log_root_tree = NULL;
3069         }
3070         return 0;
3071 }
3072
3073 /*
3074  * If both a file and directory are logged, and unlinks or renames are
3075  * mixed in, we have a few interesting corners:
3076  *
3077  * create file X in dir Y
3078  * link file X to X.link in dir Y
3079  * fsync file X
3080  * unlink file X but leave X.link
3081  * fsync dir Y
3082  *
3083  * After a crash we would expect only X.link to exist.  But file X
3084  * didn't get fsync'd again so the log has back refs for X and X.link.
3085  *
3086  * We solve this by removing directory entries and inode backrefs from the
3087  * log when a file that was logged in the current transaction is
3088  * unlinked.  Any later fsync will include the updated log entries, and
3089  * we'll be able to reconstruct the proper directory items from backrefs.
3090  *
3091  * This optimizations allows us to avoid relogging the entire inode
3092  * or the entire directory.
3093  */
3094 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3095                                  struct btrfs_root *root,
3096                                  const char *name, int name_len,
3097                                  struct btrfs_inode *dir, u64 index)
3098 {
3099         struct btrfs_root *log;
3100         struct btrfs_dir_item *di;
3101         struct btrfs_path *path;
3102         int ret;
3103         int err = 0;
3104         int bytes_del = 0;
3105         u64 dir_ino = btrfs_ino(dir);
3106
3107         if (dir->logged_trans < trans->transid)
3108                 return 0;
3109
3110         ret = join_running_log_trans(root);
3111         if (ret)
3112                 return 0;
3113
3114         mutex_lock(&dir->log_mutex);
3115
3116         log = root->log_root;
3117         path = btrfs_alloc_path();
3118         if (!path) {
3119                 err = -ENOMEM;
3120                 goto out_unlock;
3121         }
3122
3123         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3124                                    name, name_len, -1);
3125         if (IS_ERR(di)) {
3126                 err = PTR_ERR(di);
3127                 goto fail;
3128         }
3129         if (di) {
3130                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3131                 bytes_del += name_len;
3132                 if (ret) {
3133                         err = ret;
3134                         goto fail;
3135                 }
3136         }
3137         btrfs_release_path(path);
3138         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3139                                          index, name, name_len, -1);
3140         if (IS_ERR(di)) {
3141                 err = PTR_ERR(di);
3142                 goto fail;
3143         }
3144         if (di) {
3145                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3146                 bytes_del += name_len;
3147                 if (ret) {
3148                         err = ret;
3149                         goto fail;
3150                 }
3151         }
3152
3153         /* update the directory size in the log to reflect the names
3154          * we have removed
3155          */
3156         if (bytes_del) {
3157                 struct btrfs_key key;
3158
3159                 key.objectid = dir_ino;
3160                 key.offset = 0;
3161                 key.type = BTRFS_INODE_ITEM_KEY;
3162                 btrfs_release_path(path);
3163
3164                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3165                 if (ret < 0) {
3166                         err = ret;
3167                         goto fail;
3168                 }
3169                 if (ret == 0) {
3170                         struct btrfs_inode_item *item;
3171                         u64 i_size;
3172
3173                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3174                                               struct btrfs_inode_item);
3175                         i_size = btrfs_inode_size(path->nodes[0], item);
3176                         if (i_size > bytes_del)
3177                                 i_size -= bytes_del;
3178                         else
3179                                 i_size = 0;
3180                         btrfs_set_inode_size(path->nodes[0], item, i_size);
3181                         btrfs_mark_buffer_dirty(path->nodes[0]);
3182                 } else
3183                         ret = 0;
3184                 btrfs_release_path(path);
3185         }
3186 fail:
3187         btrfs_free_path(path);
3188 out_unlock:
3189         mutex_unlock(&dir->log_mutex);
3190         if (ret == -ENOSPC) {
3191                 btrfs_set_log_full_commit(root->fs_info, trans);
3192                 ret = 0;
3193         } else if (ret < 0)
3194                 btrfs_abort_transaction(trans, ret);
3195
3196         btrfs_end_log_trans(root);
3197
3198         return err;
3199 }
3200
3201 /* see comments for btrfs_del_dir_entries_in_log */
3202 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3203                                struct btrfs_root *root,
3204                                const char *name, int name_len,
3205                                struct btrfs_inode *inode, u64 dirid)
3206 {
3207         struct btrfs_fs_info *fs_info = root->fs_info;
3208         struct btrfs_root *log;
3209         u64 index;
3210         int ret;
3211
3212         if (inode->logged_trans < trans->transid)
3213                 return 0;
3214
3215         ret = join_running_log_trans(root);
3216         if (ret)
3217                 return 0;
3218         log = root->log_root;
3219         mutex_lock(&inode->log_mutex);
3220
3221         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3222                                   dirid, &index);
3223         mutex_unlock(&inode->log_mutex);
3224         if (ret == -ENOSPC) {
3225                 btrfs_set_log_full_commit(fs_info, trans);
3226                 ret = 0;
3227         } else if (ret < 0 && ret != -ENOENT)
3228                 btrfs_abort_transaction(trans, ret);
3229         btrfs_end_log_trans(root);
3230
3231         return ret;
3232 }
3233
3234 /*
3235  * creates a range item in the log for 'dirid'.  first_offset and
3236  * last_offset tell us which parts of the key space the log should
3237  * be considered authoritative for.
3238  */
3239 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3240                                        struct btrfs_root *log,
3241                                        struct btrfs_path *path,
3242                                        int key_type, u64 dirid,
3243                                        u64 first_offset, u64 last_offset)
3244 {
3245         int ret;
3246         struct btrfs_key key;
3247         struct btrfs_dir_log_item *item;
3248
3249         key.objectid = dirid;
3250         key.offset = first_offset;
3251         if (key_type == BTRFS_DIR_ITEM_KEY)
3252                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
3253         else
3254                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3255         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3256         if (ret)
3257                 return ret;
3258
3259         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3260                               struct btrfs_dir_log_item);
3261         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3262         btrfs_mark_buffer_dirty(path->nodes[0]);
3263         btrfs_release_path(path);
3264         return 0;
3265 }
3266
3267 /*
3268  * log all the items included in the current transaction for a given
3269  * directory.  This also creates the range items in the log tree required
3270  * to replay anything deleted before the fsync
3271  */
3272 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3273                           struct btrfs_root *root, struct btrfs_inode *inode,
3274                           struct btrfs_path *path,
3275                           struct btrfs_path *dst_path, int key_type,
3276                           struct btrfs_log_ctx *ctx,
3277                           u64 min_offset, u64 *last_offset_ret)
3278 {
3279         struct btrfs_key min_key;
3280         struct btrfs_root *log = root->log_root;
3281         struct extent_buffer *src;
3282         int err = 0;
3283         int ret;
3284         int i;
3285         int nritems;
3286         u64 first_offset = min_offset;
3287         u64 last_offset = (u64)-1;
3288         u64 ino = btrfs_ino(inode);
3289
3290         log = root->log_root;
3291
3292         min_key.objectid = ino;
3293         min_key.type = key_type;
3294         min_key.offset = min_offset;
3295
3296         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3297
3298         /*
3299          * we didn't find anything from this transaction, see if there
3300          * is anything at all
3301          */
3302         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3303                 min_key.objectid = ino;
3304                 min_key.type = key_type;
3305                 min_key.offset = (u64)-1;
3306                 btrfs_release_path(path);
3307                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3308                 if (ret < 0) {
3309                         btrfs_release_path(path);
3310                         return ret;
3311                 }
3312                 ret = btrfs_previous_item(root, path, ino, key_type);
3313
3314                 /* if ret == 0 there are items for this type,
3315                  * create a range to tell us the last key of this type.
3316                  * otherwise, there are no items in this directory after
3317                  * *min_offset, and we create a range to indicate that.
3318                  */
3319                 if (ret == 0) {
3320                         struct btrfs_key tmp;
3321                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3322                                               path->slots[0]);
3323                         if (key_type == tmp.type)
3324                                 first_offset = max(min_offset, tmp.offset) + 1;
3325                 }
3326                 goto done;
3327         }
3328
3329         /* go backward to find any previous key */
3330         ret = btrfs_previous_item(root, path, ino, key_type);
3331         if (ret == 0) {
3332                 struct btrfs_key tmp;
3333                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3334                 if (key_type == tmp.type) {
3335                         first_offset = tmp.offset;
3336                         ret = overwrite_item(trans, log, dst_path,
3337                                              path->nodes[0], path->slots[0],
3338                                              &tmp);
3339                         if (ret) {
3340                                 err = ret;
3341                                 goto done;
3342                         }
3343                 }
3344         }
3345         btrfs_release_path(path);
3346
3347         /* find the first key from this transaction again */
3348         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3349         if (WARN_ON(ret != 0))
3350                 goto done;
3351
3352         /*
3353          * we have a block from this transaction, log every item in it
3354          * from our directory
3355          */
3356         while (1) {
3357                 struct btrfs_key tmp;
3358                 src = path->nodes[0];
3359                 nritems = btrfs_header_nritems(src);
3360                 for (i = path->slots[0]; i < nritems; i++) {
3361                         struct btrfs_dir_item *di;
3362
3363                         btrfs_item_key_to_cpu(src, &min_key, i);
3364
3365                         if (min_key.objectid != ino || min_key.type != key_type)
3366                                 goto done;
3367                         ret = overwrite_item(trans, log, dst_path, src, i,
3368                                              &min_key);
3369                         if (ret) {
3370                                 err = ret;
3371                                 goto done;
3372                         }
3373
3374                         /*
3375                          * We must make sure that when we log a directory entry,
3376                          * the corresponding inode, after log replay, has a
3377                          * matching link count. For example:
3378                          *
3379                          * touch foo
3380                          * mkdir mydir
3381                          * sync
3382                          * ln foo mydir/bar
3383                          * xfs_io -c "fsync" mydir
3384                          * <crash>
3385                          * <mount fs and log replay>
3386                          *
3387                          * Would result in a fsync log that when replayed, our
3388                          * file inode would have a link count of 1, but we get
3389                          * two directory entries pointing to the same inode.
3390                          * After removing one of the names, it would not be
3391                          * possible to remove the other name, which resulted
3392                          * always in stale file handle errors, and would not
3393                          * be possible to rmdir the parent directory, since
3394                          * its i_size could never decrement to the value
3395                          * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3396                          */
3397                         di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3398                         btrfs_dir_item_key_to_cpu(src, di, &tmp);
3399                         if (ctx &&
3400                             (btrfs_dir_transid(src, di) == trans->transid ||
3401                              btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3402                             tmp.type != BTRFS_ROOT_ITEM_KEY)
3403                                 ctx->log_new_dentries = true;
3404                 }
3405                 path->slots[0] = nritems;
3406
3407                 /*
3408                  * look ahead to the next item and see if it is also
3409                  * from this directory and from this transaction
3410                  */
3411                 ret = btrfs_next_leaf(root, path);
3412                 if (ret == 1) {
3413                         last_offset = (u64)-1;
3414                         goto done;
3415                 }
3416                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3417                 if (tmp.objectid != ino || tmp.type != key_type) {
3418                         last_offset = (u64)-1;
3419                         goto done;
3420                 }
3421                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3422                         ret = overwrite_item(trans, log, dst_path,
3423                                              path->nodes[0], path->slots[0],
3424                                              &tmp);
3425                         if (ret)
3426                                 err = ret;
3427                         else
3428                                 last_offset = tmp.offset;
3429                         goto done;
3430                 }
3431         }
3432 done:
3433         btrfs_release_path(path);
3434         btrfs_release_path(dst_path);
3435
3436         if (err == 0) {
3437                 *last_offset_ret = last_offset;
3438                 /*
3439                  * insert the log range keys to indicate where the log
3440                  * is valid
3441                  */
3442                 ret = insert_dir_log_key(trans, log, path, key_type,
3443                                          ino, first_offset, last_offset);
3444                 if (ret)
3445                         err = ret;
3446         }
3447         return err;
3448 }
3449
3450 /*
3451  * logging directories is very similar to logging inodes, We find all the items
3452  * from the current transaction and write them to the log.
3453  *
3454  * The recovery code scans the directory in the subvolume, and if it finds a
3455  * key in the range logged that is not present in the log tree, then it means
3456  * that dir entry was unlinked during the transaction.
3457  *
3458  * In order for that scan to work, we must include one key smaller than
3459  * the smallest logged by this transaction and one key larger than the largest
3460  * key logged by this transaction.
3461  */
3462 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3463                           struct btrfs_root *root, struct btrfs_inode *inode,
3464                           struct btrfs_path *path,
3465                           struct btrfs_path *dst_path,
3466                           struct btrfs_log_ctx *ctx)
3467 {
3468         u64 min_key;
3469         u64 max_key;
3470         int ret;
3471         int key_type = BTRFS_DIR_ITEM_KEY;
3472
3473 again:
3474         min_key = 0;
3475         max_key = 0;