fs/btrfs/tree-log.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2008 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/slab.h>
   8 #include <linux/blkdev.h>
   9 #include <linux/list_sort.h>
  10 #include <linux/iversion.h>
  11 #include "ctree.h"
  12 #include "tree-log.h"
  13 #include "disk-io.h"
  14 #include "locking.h"
  15 #include "print-tree.h"
  16 #include "backref.h"
  17 #include "compression.h"
  18 #include "qgroup.h"
  19 #include "inode-map.h"
  20
  21 /* magic values for the inode_only field in btrfs_log_inode:
  22  *
  23  * LOG_INODE_ALL means to log everything
  24  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  25  * during log replay
  26  */
  27 #define LOG_INODE_ALL 0
  28 #define LOG_INODE_EXISTS 1
  29 #define LOG_OTHER_INODE 2
  30
  31 /*
  32  * directory trouble cases
  33  *
  34  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  35  * log, we must force a full commit before doing an fsync of the directory
  36  * where the unlink was done.
  37  * ---> record transid of last unlink/rename per directory
  38  *
  39  * mkdir foo/some_dir
  40  * normal commit
  41  * rename foo/some_dir foo2/some_dir
  42  * mkdir foo/some_dir
  43  * fsync foo/some_dir/some_file
  44  *
  45  * The fsync above will unlink the original some_dir without recording
  46  * it in its new location (foo2).  After a crash, some_dir will be gone
  47  * unless the fsync of some_file forces a full commit
  48  *
  49  * 2) we must log any new names for any file or dir that is in the fsync
  50  * log. ---> check inode while renaming/linking.
  51  *
  52  * 2a) we must log any new names for any file or dir during rename
  53  * when the directory they are being removed from was logged.
  54  * ---> check inode and old parent dir during rename
  55  *
  56  *  2a is actually the more important variant.  With the extra logging
  57  *  a crash might unlink the old name without recreating the new one
  58  *
  59  * 3) after a crash, we must go through any directories with a link count
  60  * of zero and redo the rm -rf
  61  *
  62  * mkdir f1/foo
  63  * normal commit
  64  * rm -rf f1/foo
  65  * fsync(f1)
  66  *
  67  * The directory f1 was fully removed from the FS, but fsync was never
  68  * called on f1, only its parent dir.  After a crash the rm -rf must
  69  * be replayed.  This must be able to recurse down the entire
  70  * directory tree.  The inode link count fixup code takes care of the
  71  * ugly details.
  72  */
  73
  74 /*
  75  * stages for the tree walking.  The first
  76  * stage (0) is to only pin down the blocks we find
  77  * the second stage (1) is to make sure that all the inodes
  78  * we find in the log are created in the subvolume.
  79  *
  80  * The last stage is to deal with directories and links and extents
  81  * and all the other fun semantics
  82  */
  83 #define LOG_WALK_PIN_ONLY 0
  84 #define LOG_WALK_REPLAY_INODES 1
  85 #define LOG_WALK_REPLAY_DIR_INDEX 2
  86 #define LOG_WALK_REPLAY_ALL 3
  87
  88 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  89                            struct btrfs_root *root, struct btrfs_inode *inode,
  90                            int inode_only,
  91                            const loff_t start,
  92                            const loff_t end,
  93                            struct btrfs_log_ctx *ctx);
  94 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  95                              struct btrfs_root *root,
  96                              struct btrfs_path *path, u64 objectid);
  97 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
  98                                        struct btrfs_root *root,
  99                                        struct btrfs_root *log,
 100                                        struct btrfs_path *path,
 101                                        u64 dirid, int del_all);
 102
 103 /*
 104  * tree logging is a special write ahead log used to make sure that
 105  * fsyncs and O_SYNCs can happen without doing full tree commits.
 106  *
 107  * Full tree commits are expensive because they require commonly
 108  * modified blocks to be recowed, creating many dirty pages in the
 109  * extent tree an 4x-6x higher write load than ext3.
 110  *
 111  * Instead of doing a tree commit on every fsync, we use the
 112  * key ranges and transaction ids to find items for a given file or directory
 113  * that have changed in this transaction.  Those items are copied into
 114  * a special tree (one per subvolume root), that tree is written to disk
 115  * and then the fsync is considered complete.
 116  *
 117  * After a crash, items are copied out of the log-tree back into the
 118  * subvolume tree.  Any file data extents found are recorded in the extent
 119  * allocation tree, and the log-tree freed.
 120  *
 121  * The log tree is read three times, once to pin down all the extents it is
 122  * using in ram and once, once to create all the inodes logged in the tree
 123  * and once to do all the other items.
 124  */
 125
 126 /*
 127  * start a sub transaction and setup the log tree
 128  * this increments the log tree writer count to make the people
 129  * syncing the tree wait for us to finish
 130  */
 131 static int start_log_trans(struct btrfs_trans_handle *trans,
 132                            struct btrfs_root *root,
 133                            struct btrfs_log_ctx *ctx)
 134 {
 135         struct btrfs_fs_info *fs_info = root->fs_info;
 136         int ret = 0;
 137
 138         mutex_lock(&root->log_mutex);
 139
 140         if (root->log_root) {
 141                 if (btrfs_need_log_full_commit(fs_info, trans)) {
 142                         ret = -EAGAIN;
 143                         goto out;
 144                 }
 145
 146                 if (!root->log_start_pid) {
 147                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 148                         root->log_start_pid = current->pid;
 149                 } else if (root->log_start_pid != current->pid) {
 150                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 151                 }
 152         } else {
 153                 mutex_lock(&fs_info->tree_log_mutex);
 154                 if (!fs_info->log_root_tree)
 155                         ret = btrfs_init_log_root_tree(trans, fs_info);
 156                 mutex_unlock(&fs_info->tree_log_mutex);
 157                 if (ret)
 158                         goto out;
 159
 160                 ret = btrfs_add_log_tree(trans, root);
 161                 if (ret)
 162                         goto out;
 163
 164                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 165                 root->log_start_pid = current->pid;
 166         }
 167
 168         atomic_inc(&root->log_batch);
 169         atomic_inc(&root->log_writers);
 170         if (ctx) {
 171                 int index = root->log_transid % 2;
 172                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
 173                 ctx->log_transid = root->log_transid;
 174         }
 175
 176 out:
 177         mutex_unlock(&root->log_mutex);
 178         return ret;
 179 }
 180
 181 /*
 182  * returns 0 if there was a log transaction running and we were able
 183  * to join, or returns -ENOENT if there were not transactions
 184  * in progress
 185  */
 186 static int join_running_log_trans(struct btrfs_root *root)
 187 {
 188         int ret = -ENOENT;
 189
 190         smp_mb();
 191         if (!root->log_root)
 192                 return -ENOENT;
 193
 194         mutex_lock(&root->log_mutex);
 195         if (root->log_root) {
 196                 ret = 0;
 197                 atomic_inc(&root->log_writers);
 198         }
 199         mutex_unlock(&root->log_mutex);
 200         return ret;
 201 }
 202
 203 /*
 204  * This either makes the current running log transaction wait
 205  * until you call btrfs_end_log_trans() or it makes any future
 206  * log transactions wait until you call btrfs_end_log_trans()
 207  */
 208 int btrfs_pin_log_trans(struct btrfs_root *root)
 209 {
 210         int ret = -ENOENT;
 211
 212         mutex_lock(&root->log_mutex);
 213         atomic_inc(&root->log_writers);
 214         mutex_unlock(&root->log_mutex);
 215         return ret;
 216 }
 217
 218 /*
 219  * indicate we're done making changes to the log tree
 220  * and wake up anyone waiting to do a sync
 221  */
 222 void btrfs_end_log_trans(struct btrfs_root *root)
 223 {
 224         if (atomic_dec_and_test(&root->log_writers)) {
 225                 /*
 226                  * Implicit memory barrier after atomic_dec_and_test
 227                  */
 228                 if (waitqueue_active(&root->log_writer_wait))
 229                         wake_up(&root->log_writer_wait);
 230         }
 231 }
 232
 233
 234 /*
 235  * the walk control struct is used to pass state down the chain when
 236  * processing the log tree.  The stage field tells us which part
 237  * of the log tree processing we are currently doing.  The others
 238  * are state fields used for that specific part
 239  */
 240 struct walk_control {
 241         /* should we free the extent on disk when done?  This is used
 242          * at transaction commit time while freeing a log tree
 243          */
 244         int free;
 245
 246         /* should we write out the extent buffer?  This is used
 247          * while flushing the log tree to disk during a sync
 248          */
 249         int write;
 250
 251         /* should we wait for the extent buffer io to finish?  Also used
 252          * while flushing the log tree to disk for a sync
 253          */
 254         int wait;
 255
 256         /* pin only walk, we record which extents on disk belong to the
 257          * log trees
 258          */
 259         int pin;
 260
 261         /* what stage of the replay code we're currently in */
 262         int stage;
 263
 264         /* the root we are currently replaying */
 265         struct btrfs_root *replay_dest;
 266
 267         /* the trans handle for the current replay */
 268         struct btrfs_trans_handle *trans;
 269
 270         /* the function that gets used to process blocks we find in the
 271          * tree.  Note the extent_buffer might not be up to date when it is
 272          * passed in, and it must be checked or read if you need the data
 273          * inside it
 274          */
 275         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 276                             struct walk_control *wc, u64 gen, int level);
 277 };
 278
 279 /*
 280  * process_func used to pin down extents, write them or wait on them
 281  */
 282 static int process_one_buffer(struct btrfs_root *log,
 283                               struct extent_buffer *eb,
 284                               struct walk_control *wc, u64 gen, int level)
 285 {
 286         struct btrfs_fs_info *fs_info = log->fs_info;
 287         int ret = 0;
 288
 289         /*
 290          * If this fs is mixed then we need to be able to process the leaves to
 291          * pin down any logged extents, so we have to read the block.
 292          */
 293         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 294                 ret = btrfs_read_buffer(eb, gen, level, NULL);
 295                 if (ret)
 296                         return ret;
 297         }
 298
 299         if (wc->pin)
 300                 ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
 301                                                       eb->len);
 302
 303         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 304                 if (wc->pin && btrfs_header_level(eb) == 0)
 305                         ret = btrfs_exclude_logged_extents(fs_info, eb);
 306                 if (wc->write)
 307                         btrfs_write_tree_block(eb);
 308                 if (wc->wait)
 309                         btrfs_wait_tree_block_writeback(eb);
 310         }
 311         return ret;
 312 }
 313
 314 /*
 315  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 316  * to the src data we are copying out.
 317  *
 318  * root is the tree we are copying into, and path is a scratch
 319  * path for use in this function (it should be released on entry and
 320  * will be released on exit).
 321  *
 322  * If the key is already in the destination tree the existing item is
 323  * overwritten.  If the existing item isn't big enough, it is extended.
 324  * If it is too large, it is truncated.
 325  *
 326  * If the key isn't in the destination yet, a new item is inserted.
 327  */
 328 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 329                                    struct btrfs_root *root,
 330                                    struct btrfs_path *path,
 331                                    struct extent_buffer *eb, int slot,
 332                                    struct btrfs_key *key)
 333 {
 334         struct btrfs_fs_info *fs_info = root->fs_info;
 335         int ret;
 336         u32 item_size;
 337         u64 saved_i_size = 0;
 338         int save_old_i_size = 0;
 339         unsigned long src_ptr;
 340         unsigned long dst_ptr;
 341         int overwrite_root = 0;
 342         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 343
 344         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 345                 overwrite_root = 1;
 346
 347         item_size = btrfs_item_size_nr(eb, slot);
 348         src_ptr = btrfs_item_ptr_offset(eb, slot);
 349
 350         /* look for the key in the destination tree */
 351         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 352         if (ret < 0)
 353                 return ret;
 354
 355         if (ret == 0) {
 356                 char *src_copy;
 357                 char *dst_copy;
 358                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 359                                                   path->slots[0]);
 360                 if (dst_size != item_size)
 361                         goto insert;
 362
 363                 if (item_size == 0) {
 364                         btrfs_release_path(path);
 365                         return 0;
 366                 }
 367                 dst_copy = kmalloc(item_size, GFP_NOFS);
 368                 src_copy = kmalloc(item_size, GFP_NOFS);
 369                 if (!dst_copy || !src_copy) {
 370                         btrfs_release_path(path);
 371                         kfree(dst_copy);
 372                         kfree(src_copy);
 373                         return -ENOMEM;
 374                 }
 375
 376                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 377
 378                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 379                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 380                                    item_size);
 381                 ret = memcmp(dst_copy, src_copy, item_size);
 382
 383                 kfree(dst_copy);
 384                 kfree(src_copy);
 385                 /*
 386                  * they have the same contents, just return, this saves
 387                  * us from cowing blocks in the destination tree and doing
 388                  * extra writes that may not have been done by a previous
 389                  * sync
 390                  */
 391                 if (ret == 0) {
 392                         btrfs_release_path(path);
 393                         return 0;
 394                 }
 395
 396                 /*
 397                  * We need to load the old nbytes into the inode so when we
 398                  * replay the extents we've logged we get the right nbytes.
 399                  */
 400                 if (inode_item) {
 401                         struct btrfs_inode_item *item;
 402                         u64 nbytes;
 403                         u32 mode;
 404
 405                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 406                                               struct btrfs_inode_item);
 407                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 408                         item = btrfs_item_ptr(eb, slot,
 409                                               struct btrfs_inode_item);
 410                         btrfs_set_inode_nbytes(eb, item, nbytes);
 411
 412                         /*
 413                          * If this is a directory we need to reset the i_size to
 414                          * 0 so that we can set it up properly when replaying
 415                          * the rest of the items in this log.
 416                          */
 417                         mode = btrfs_inode_mode(eb, item);
 418                         if (S_ISDIR(mode))
 419                                 btrfs_set_inode_size(eb, item, 0);
 420                 }
 421         } else if (inode_item) {
 422                 struct btrfs_inode_item *item;
 423                 u32 mode;
 424
 425                 /*
 426                  * New inode, set nbytes to 0 so that the nbytes comes out
 427                  * properly when we replay the extents.
 428                  */
 429                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 430                 btrfs_set_inode_nbytes(eb, item, 0);
 431
 432                 /*
 433                  * If this is a directory we need to reset the i_size to 0 so
 434                  * that we can set it up properly when replaying the rest of
 435                  * the items in this log.
 436                  */
 437                 mode = btrfs_inode_mode(eb, item);
 438                 if (S_ISDIR(mode))
 439                         btrfs_set_inode_size(eb, item, 0);
 440         }
 441 insert:
 442         btrfs_release_path(path);
 443         /* try to insert the key into the destination tree */
 444         path->skip_release_on_error = 1;
 445         ret = btrfs_insert_empty_item(trans, root, path,
 446                                       key, item_size);
 447         path->skip_release_on_error = 0;
 448
 449         /* make sure any existing item is the correct size */
 450         if (ret == -EEXIST || ret == -EOVERFLOW) {
 451                 u32 found_size;
 452                 found_size = btrfs_item_size_nr(path->nodes[0],
 453                                                 path->slots[0]);
 454                 if (found_size > item_size)
 455                         btrfs_truncate_item(fs_info, path, item_size, 1);
 456                 else if (found_size < item_size)
 457                         btrfs_extend_item(fs_info, path,
 458                                           item_size - found_size);
 459         } else if (ret) {
 460                 return ret;
 461         }
 462         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 463                                         path->slots[0]);
 464
 465         /* don't overwrite an existing inode if the generation number
 466          * was logged as zero.  This is done when the tree logging code
 467          * is just logging an inode to make sure it exists after recovery.
 468          *
 469          * Also, don't overwrite i_size on directories during replay.
 470          * log replay inserts and removes directory items based on the
 471          * state of the tree found in the subvolume, and i_size is modified
 472          * as it goes
 473          */
 474         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 475                 struct btrfs_inode_item *src_item;
 476                 struct btrfs_inode_item *dst_item;
 477
 478                 src_item = (struct btrfs_inode_item *)src_ptr;
 479                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 480
 481                 if (btrfs_inode_generation(eb, src_item) == 0) {
 482                         struct extent_buffer *dst_eb = path->nodes[0];
 483                         const u64 ino_size = btrfs_inode_size(eb, src_item);
 484
 485                         /*
 486                          * For regular files an ino_size == 0 is used only when
 487                          * logging that an inode exists, as part of a directory
 488                          * fsync, and the inode wasn't fsynced before. In this
 489                          * case don't set the size of the inode in the fs/subvol
 490                          * tree, otherwise we would be throwing valid data away.
 491                          */
 492                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 493                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 494                             ino_size != 0) {
 495                                 struct btrfs_map_token token;
 496
 497                                 btrfs_init_map_token(&token);
 498                                 btrfs_set_token_inode_size(dst_eb, dst_item,
 499                                                            ino_size, &token);
 500                         }
 501                         goto no_copy;
 502                 }
 503
 504                 if (overwrite_root &&
 505                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 506                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 507                         save_old_i_size = 1;
 508                         saved_i_size = btrfs_inode_size(path->nodes[0],
 509                                                         dst_item);
 510                 }
 511         }
 512
 513         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 514                            src_ptr, item_size);
 515
 516         if (save_old_i_size) {
 517                 struct btrfs_inode_item *dst_item;
 518                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 519                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 520         }
 521
 522         /* make sure the generation is filled in */
 523         if (key->type == BTRFS_INODE_ITEM_KEY) {
 524                 struct btrfs_inode_item *dst_item;
 525                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 526                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 527                         btrfs_set_inode_generation(path->nodes[0], dst_item,
 528                                                    trans->transid);
 529                 }
 530         }
 531 no_copy:
 532         btrfs_mark_buffer_dirty(path->nodes[0]);
 533         btrfs_release_path(path);
 534         return 0;
 535 }
 536
 537 /*
 538  * simple helper to read an inode off the disk from a given root
 539  * This can only be called for subvolume roots and not for the log
 540  */
 541 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 542                                              u64 objectid)
 543 {
 544         struct btrfs_key key;
 545         struct inode *inode;
 546
 547         key.objectid = objectid;
 548         key.type = BTRFS_INODE_ITEM_KEY;
 549         key.offset = 0;
 550         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 551         if (IS_ERR(inode)) {
 552                 inode = NULL;
 553         } else if (is_bad_inode(inode)) {
 554                 iput(inode);
 555                 inode = NULL;
 556         }
 557         return inode;
 558 }
 559
 560 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 561  * subvolume 'root'.  path is released on entry and should be released
 562  * on exit.
 563  *
 564  * extents in the log tree have not been allocated out of the extent
 565  * tree yet.  So, this completes the allocation, taking a reference
 566  * as required if the extent already exists or creating a new extent
 567  * if it isn't in the extent allocation tree yet.
 568  *
 569  * The extent is inserted into the file, dropping any existing extents
 570  * from the file that overlap the new one.
 571  */
 572 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 573                                       struct btrfs_root *root,
 574                                       struct btrfs_path *path,
 575                                       struct extent_buffer *eb, int slot,
 576                                       struct btrfs_key *key)
 577 {
 578         struct btrfs_fs_info *fs_info = root->fs_info;
 579         int found_type;
 580         u64 extent_end;
 581         u64 start = key->offset;
 582         u64 nbytes = 0;
 583         struct btrfs_file_extent_item *item;
 584         struct inode *inode = NULL;
 585         unsigned long size;
 586         int ret = 0;
 587
 588         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 589         found_type = btrfs_file_extent_type(eb, item);
 590
 591         if (found_type == BTRFS_FILE_EXTENT_REG ||
 592             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 593                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 594                 extent_end = start + nbytes;
 595
 596                 /*
 597                  * We don't add to the inodes nbytes if we are prealloc or a
 598                  * hole.
 599                  */
 600                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 601                         nbytes = 0;
 602         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 603                 size = btrfs_file_extent_inline_len(eb, slot, item);
 604                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 605                 extent_end = ALIGN(start + size,
 606                                    fs_info->sectorsize);
 607         } else {
 608                 ret = 0;
 609                 goto out;
 610         }
 611
 612         inode = read_one_inode(root, key->objectid);
 613         if (!inode) {
 614                 ret = -EIO;
 615                 goto out;
 616         }
 617
 618         /*
 619          * first check to see if we already have this extent in the
 620          * file.  This must be done before the btrfs_drop_extents run
 621          * so we don't try to drop this extent.
 622          */
 623         ret = btrfs_lookup_file_extent(trans, root, path,
 624                         btrfs_ino(BTRFS_I(inode)), start, 0);
 625
 626         if (ret == 0 &&
 627             (found_type == BTRFS_FILE_EXTENT_REG ||
 628              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 629                 struct btrfs_file_extent_item cmp1;
 630                 struct btrfs_file_extent_item cmp2;
 631                 struct btrfs_file_extent_item *existing;
 632                 struct extent_buffer *leaf;
 633
 634                 leaf = path->nodes[0];
 635                 existing = btrfs_item_ptr(leaf, path->slots[0],
 636                                           struct btrfs_file_extent_item);
 637
 638                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
 639                                    sizeof(cmp1));
 640                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 641                                    sizeof(cmp2));
 642
 643                 /*
 644                  * we already have a pointer to this exact extent,
 645                  * we don't have to do anything
 646                  */
 647                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 648                         btrfs_release_path(path);
 649                         goto out;
 650                 }
 651         }
 652         btrfs_release_path(path);
 653
 654         /* drop any overlapping extents */
 655         ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 656         if (ret)
 657                 goto out;
 658
 659         if (found_type == BTRFS_FILE_EXTENT_REG ||
 660             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 661                 u64 offset;
 662                 unsigned long dest_offset;
 663                 struct btrfs_key ins;
 664
 665                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
 666                     btrfs_fs_incompat(fs_info, NO_HOLES))
 667                         goto update_inode;
 668
 669                 ret = btrfs_insert_empty_item(trans, root, path, key,
 670                                               sizeof(*item));
 671                 if (ret)
 672                         goto out;
 673                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 674                                                     path->slots[0]);
 675                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 676                                 (unsigned long)item,  sizeof(*item));
 677
 678                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 679                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 680                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 681                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 682
 683                 /*
 684                  * Manually record dirty extent, as here we did a shallow
 685                  * file extent item copy and skip normal backref update,
 686                  * but modifying extent tree all by ourselves.
 687                  * So need to manually record dirty extent for qgroup,
 688                  * as the owner of the file extent changed from log tree
 689                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
 690                  */
 691                 ret = btrfs_qgroup_trace_extent(trans, fs_info,
 692                                 btrfs_file_extent_disk_bytenr(eb, item),
 693                                 btrfs_file_extent_disk_num_bytes(eb, item),
 694                                 GFP_NOFS);
 695                 if (ret < 0)
 696                         goto out;
 697
 698                 if (ins.objectid > 0) {
 699                         u64 csum_start;
 700                         u64 csum_end;
 701                         LIST_HEAD(ordered_sums);
 702                         /*
 703                          * is this extent already allocated in the extent
 704                          * allocation tree?  If so, just add a reference
 705                          */
 706                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 707                                                 ins.offset);
 708                         if (ret == 0) {
 709                                 ret = btrfs_inc_extent_ref(trans, root,
 710                                                 ins.objectid, ins.offset,
 711                                                 0, root->root_key.objectid,
 712                                                 key->objectid, offset);
 713                                 if (ret)
 714                                         goto out;
 715                         } else {
 716                                 /*
 717                                  * insert the extent pointer in the extent
 718                                  * allocation tree
 719                                  */
 720                                 ret = btrfs_alloc_logged_file_extent(trans,
 721                                                 fs_info,
 722                                                 root->root_key.objectid,
 723                                                 key->objectid, offset, &ins);
 724                                 if (ret)
 725                                         goto out;
 726                         }
 727                         btrfs_release_path(path);
 728
 729                         if (btrfs_file_extent_compression(eb, item)) {
 730                                 csum_start = ins.objectid;
 731                                 csum_end = csum_start + ins.offset;
 732                         } else {
 733                                 csum_start = ins.objectid +
 734                                         btrfs_file_extent_offset(eb, item);
 735                                 csum_end = csum_start +
 736                                         btrfs_file_extent_num_bytes(eb, item);
 737                         }
 738
 739                         ret = btrfs_lookup_csums_range(root->log_root,
 740                                                 csum_start, csum_end - 1,
 741                                                 &ordered_sums, 0);
 742                         if (ret)
 743                                 goto out;
 744                         /*
 745                          * Now delete all existing cums in the csum root that
 746                          * cover our range. We do this because we can have an
 747                          * extent that is completely referenced by one file
 748                          * extent item and partially referenced by another
 749                          * file extent item (like after using the clone or
 750                          * extent_same ioctls). In this case if we end up doing
 751                          * the replay of the one that partially references the
 752                          * extent first, and we do not do the csum deletion
 753                          * below, we can get 2 csum items in the csum tree that
 754                          * overlap each other. For example, imagine our log has
 755                          * the two following file extent items:
 756                          *
 757                          * key (257 EXTENT_DATA 409600)
 758                          *     extent data disk byte 12845056 nr 102400
 759                          *     extent data offset 20480 nr 20480 ram 102400
 760                          *
 761                          * key (257 EXTENT_DATA 819200)
 762                          *     extent data disk byte 12845056 nr 102400
 763                          *     extent data offset 0 nr 102400 ram 102400
 764                          *
 765                          * Where the second one fully references the 100K extent
 766                          * that starts at disk byte 12845056, and the log tree
 767                          * has a single csum item that covers the entire range
 768                          * of the extent:
 769                          *
 770                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 771                          *
 772                          * After the first file extent item is replayed, the
 773                          * csum tree gets the following csum item:
 774                          *
 775                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 776                          *
 777                          * Which covers the 20K sub-range starting at offset 20K
 778                          * of our extent. Now when we replay the second file
 779                          * extent item, if we do not delete existing csum items
 780                          * that cover any of its blocks, we end up getting two
 781                          * csum items in our csum tree that overlap each other:
 782                          *
 783                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 784                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 785                          *
 786                          * Which is a problem, because after this anyone trying
 787                          * to lookup up for the checksum of any block of our
 788                          * extent starting at an offset of 40K or higher, will
 789                          * end up looking at the second csum item only, which
 790                          * does not contain the checksum for any block starting
 791                          * at offset 40K or higher of our extent.
 792                          */
 793                         while (!list_empty(&ordered_sums)) {
 794                                 struct btrfs_ordered_sum *sums;
 795                                 sums = list_entry(ordered_sums.next,
 796                                                 struct btrfs_ordered_sum,
 797                                                 list);
 798                                 if (!ret)
 799                                         ret = btrfs_del_csums(trans, fs_info,
 800                                                               sums->bytenr,
 801                                                               sums->len);
 802                                 if (!ret)
 803                                         ret = btrfs_csum_file_blocks(trans,
 804                                                 fs_info->csum_root, sums);
 805                                 list_del(&sums->list);
 806                                 kfree(sums);
 807                         }
 808                         if (ret)
 809                                 goto out;
 810                 } else {
 811                         btrfs_release_path(path);
 812                 }
 813         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 814                 /* inline extents are easy, we just overwrite them */
 815                 ret = overwrite_item(trans, root, path, eb, slot, key);
 816                 if (ret)
 817                         goto out;
 818         }
 819
 820         inode_add_bytes(inode, nbytes);
 821 update_inode:
 822         ret = btrfs_update_inode(trans, root, inode);
 823 out:
 824         if (inode)
 825                 iput(inode);
 826         return ret;
 827 }
 828
 829 /*
 830  * when cleaning up conflicts between the directory names in the
 831  * subvolume, directory names in the log and directory names in the
 832  * inode back references, we may have to unlink inodes from directories.
 833  *
 834  * This is a helper function to do the unlink of a specific directory
 835  * item
 836  */
 837 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 838                                       struct btrfs_root *root,
 839                                       struct btrfs_path *path,
 840                                       struct btrfs_inode *dir,
 841                                       struct btrfs_dir_item *di)
 842 {
 843         struct inode *inode;
 844         char *name;
 845         int name_len;
 846         struct extent_buffer *leaf;
 847         struct btrfs_key location;
 848         int ret;
 849
 850         leaf = path->nodes[0];
 851
 852         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 853         name_len = btrfs_dir_name_len(leaf, di);
 854         name = kmalloc(name_len, GFP_NOFS);
 855         if (!name)
 856                 return -ENOMEM;
 857
 858         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 859         btrfs_release_path(path);
 860
 861         inode = read_one_inode(root, location.objectid);
 862         if (!inode) {
 863                 ret = -EIO;
 864                 goto out;
 865         }
 866
 867         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 868         if (ret)
 869                 goto out;
 870
 871         ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
 872                         name_len);
 873         if (ret)
 874                 goto out;
 875         else
 876                 ret = btrfs_run_delayed_items(trans);
 877 out:
 878         kfree(name);
 879         iput(inode);
 880         return ret;
 881 }
 882
 883 /*
 884  * helper function to see if a given name and sequence number found
 885  * in an inode back reference are already in a directory and correctly
 886  * point to this inode
 887  */
 888 static noinline int inode_in_dir(struct btrfs_root *root,
 889                                  struct btrfs_path *path,
 890                                  u64 dirid, u64 objectid, u64 index,
 891                                  const char *name, int name_len)
 892 {
 893         struct btrfs_dir_item *di;
 894         struct btrfs_key location;
 895         int match = 0;
 896
 897         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 898                                          index, name, name_len, 0);
 899         if (di && !IS_ERR(di)) {
 900                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 901                 if (location.objectid != objectid)
 902                         goto out;
 903         } else
 904                 goto out;
 905         btrfs_release_path(path);
 906
 907         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 908         if (di && !IS_ERR(di)) {
 909                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 910                 if (location.objectid != objectid)
 911                         goto out;
 912         } else
 913                 goto out;
 914         match = 1;
 915 out:
 916         btrfs_release_path(path);
 917         return match;
 918 }
 919
 920 /*
 921  * helper function to check a log tree for a named back reference in
 922  * an inode.  This is used to decide if a back reference that is
 923  * found in the subvolume conflicts with what we find in the log.
 924  *
 925  * inode backreferences may have multiple refs in a single item,
 926  * during replay we process one reference at a time, and we don't
 927  * want to delete valid links to a file from the subvolume if that
 928  * link is also in the log.
 929  */
 930 static noinline int backref_in_log(struct btrfs_root *log,
 931                                    struct btrfs_key *key,
 932                                    u64 ref_objectid,
 933                                    const char *name, int namelen)
 934 {
 935         struct btrfs_path *path;
 936         struct btrfs_inode_ref *ref;
 937         unsigned long ptr;
 938         unsigned long ptr_end;
 939         unsigned long name_ptr;
 940         int found_name_len;
 941         int item_size;
 942         int ret;
 943         int match = 0;
 944
 945         path = btrfs_alloc_path();
 946         if (!path)
 947                 return -ENOMEM;
 948
 949         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 950         if (ret != 0)
 951                 goto out;
 952
 953         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 954
 955         if (key->type == BTRFS_INODE_EXTREF_KEY) {
 956                 if (btrfs_find_name_in_ext_backref(path->nodes[0],
 957                                                    path->slots[0],
 958                                                    ref_objectid,
 959                                                    name, namelen, NULL))
 960                         match = 1;
 961
 962                 goto out;
 963         }
 964
 965         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 966         ptr_end = ptr + item_size;
 967         while (ptr < ptr_end) {
 968                 ref = (struct btrfs_inode_ref *)ptr;
 969                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 970                 if (found_name_len == namelen) {
 971                         name_ptr = (unsigned long)(ref + 1);
 972                         ret = memcmp_extent_buffer(path->nodes[0], name,
 973                                                    name_ptr, namelen);
 974                         if (ret == 0) {
 975                                 match = 1;
 976                                 goto out;
 977                         }
 978                 }
 979                 ptr = (unsigned long)(ref + 1) + found_name_len;
 980         }
 981 out:
 982         btrfs_free_path(path);
 983         return match;
 984 }
 985
 986 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 987                                   struct btrfs_root *root,
 988                                   struct btrfs_path *path,
 989                                   struct btrfs_root *log_root,
 990                                   struct btrfs_inode *dir,
 991                                   struct btrfs_inode *inode,
 992                                   u64 inode_objectid, u64 parent_objectid,
 993                                   u64 ref_index, char *name, int namelen,
 994                                   int *search_done)
 995 {
 996         int ret;
 997         char *victim_name;
 998         int victim_name_len;
 999         struct extent_buffer *leaf;
1000         struct btrfs_dir_item *di;
1001         struct btrfs_key search_key;
1002         struct btrfs_inode_extref *extref;
1003
1004 again:
1005         /* Search old style refs */
1006         search_key.objectid = inode_objectid;
1007         search_key.type = BTRFS_INODE_REF_KEY;
1008         search_key.offset = parent_objectid;
1009         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1010         if (ret == 0) {
1011                 struct btrfs_inode_ref *victim_ref;
1012                 unsigned long ptr;
1013                 unsigned long ptr_end;
1014
1015                 leaf = path->nodes[0];
1016
1017                 /* are we trying to overwrite a back ref for the root directory
1018                  * if so, just jump out, we're done
1019                  */
1020                 if (search_key.objectid == search_key.offset)
1021                         return 1;
1022
1023                 /* check all the names in this back reference to see
1024                  * if they are in the log.  if so, we allow them to stay
1025                  * otherwise they must be unlinked as a conflict
1026                  */
1027                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1028                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1029                 while (ptr < ptr_end) {
1030                         victim_ref = (struct btrfs_inode_ref *)ptr;
1031                         victim_name_len = btrfs_inode_ref_name_len(leaf,
1032                                                                    victim_ref);
1033                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1034                         if (!victim_name)
1035                                 return -ENOMEM;
1036
1037                         read_extent_buffer(leaf, victim_name,
1038                                            (unsigned long)(victim_ref + 1),
1039                                            victim_name_len);
1040
1041                         if (!backref_in_log(log_root, &search_key,
1042                                             parent_objectid,
1043                                             victim_name,
1044                                             victim_name_len)) {
1045                                 inc_nlink(&inode->vfs_inode);
1046                                 btrfs_release_path(path);
1047
1048                                 ret = btrfs_unlink_inode(trans, root, dir, inode,
1049                                                 victim_name, victim_name_len);
1050                                 kfree(victim_name);
1051                                 if (ret)
1052                                         return ret;
1053                                 ret = btrfs_run_delayed_items(trans);
1054                                 if (ret)
1055                                         return ret;
1056                                 *search_done = 1;
1057                                 goto again;
1058                         }
1059                         kfree(victim_name);
1060
1061                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1062                 }
1063
1064                 /*
1065                  * NOTE: we have searched root tree and checked the
1066                  * corresponding ref, it does not need to check again.
1067                  */
1068                 *search_done = 1;
1069         }
1070         btrfs_release_path(path);
1071
1072         /* Same search but for extended refs */
1073         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1074                                            inode_objectid, parent_objectid, 0,
1075                                            0);
1076         if (!IS_ERR_OR_NULL(extref)) {
1077                 u32 item_size;
1078                 u32 cur_offset = 0;
1079                 unsigned long base;
1080                 struct inode *victim_parent;
1081
1082                 leaf = path->nodes[0];
1083
1084                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1085                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1086
1087                 while (cur_offset < item_size) {
1088                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1089
1090                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1091
1092                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1093                                 goto next;
1094
1095                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1096                         if (!victim_name)
1097                                 return -ENOMEM;
1098                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1099                                            victim_name_len);
1100
1101                         search_key.objectid = inode_objectid;
1102                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1103                         search_key.offset = btrfs_extref_hash(parent_objectid,
1104                                                               victim_name,
1105                                                               victim_name_len);
1106                         ret = 0;
1107                         if (!backref_in_log(log_root, &search_key,
1108                                             parent_objectid, victim_name,
1109                                             victim_name_len)) {
1110                                 ret = -ENOENT;
1111                                 victim_parent = read_one_inode(root,
1112                                                 parent_objectid);
1113                                 if (victim_parent) {
1114                                         inc_nlink(&inode->vfs_inode);
1115                                         btrfs_release_path(path);
1116
1117                                         ret = btrfs_unlink_inode(trans, root,
1118                                                         BTRFS_I(victim_parent),
1119                                                         inode,
1120                                                         victim_name,
1121                                                         victim_name_len);
1122                                         if (!ret)
1123                                                 ret = btrfs_run_delayed_items(
1124                                                                   trans);
1125                                 }
1126                                 iput(victim_parent);
1127                                 kfree(victim_name);
1128                                 if (ret)
1129                                         return ret;
1130                                 *search_done = 1;
1131                                 goto again;
1132                         }
1133                         kfree(victim_name);
1134 next:
1135                         cur_offset += victim_name_len + sizeof(*extref);
1136                 }
1137                 *search_done = 1;
1138         }
1139         btrfs_release_path(path);
1140
1141         /* look for a conflicting sequence number */
1142         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1143                                          ref_index, name, namelen, 0);
1144         if (di && !IS_ERR(di)) {
1145                 ret = drop_one_dir_item(trans, root, path, dir, di);
1146                 if (ret)
1147                         return ret;
1148         }
1149         btrfs_release_path(path);
1150
1151         /* look for a conflicing name */
1152         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1153                                    name, namelen, 0);
1154         if (di && !IS_ERR(di)) {
1155                 ret = drop_one_dir_item(trans, root, path, dir, di);
1156                 if (ret)
1157                         return ret;
1158         }
1159         btrfs_release_path(path);
1160
1161         return 0;
1162 }
1163
1164 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1165                              u32 *namelen, char **name, u64 *index,
1166                              u64 *parent_objectid)
1167 {
1168         struct btrfs_inode_extref *extref;
1169
1170         extref = (struct btrfs_inode_extref *)ref_ptr;
1171
1172         *namelen = btrfs_inode_extref_name_len(eb, extref);
1173         *name = kmalloc(*namelen, GFP_NOFS);
1174         if (*name == NULL)
1175                 return -ENOMEM;
1176
1177         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1178                            *namelen);
1179
1180         if (index)
1181                 *index = btrfs_inode_extref_index(eb, extref);
1182         if (parent_objectid)
1183                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1184
1185         return 0;
1186 }
1187
1188 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1189                           u32 *namelen, char **name, u64 *index)
1190 {
1191         struct btrfs_inode_ref *ref;
1192
1193         ref = (struct btrfs_inode_ref *)ref_ptr;
1194
1195         *namelen = btrfs_inode_ref_name_len(eb, ref);
1196         *name = kmalloc(*namelen, GFP_NOFS);
1197         if (*name == NULL)
1198                 return -ENOMEM;
1199
1200         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1201
1202         if (index)
1203                 *index = btrfs_inode_ref_index(eb, ref);
1204
1205         return 0;
1206 }
1207
1208 /*
1209  * Take an inode reference item from the log tree and iterate all names from the
1210  * inode reference item in the subvolume tree with the same key (if it exists).
1211  * For any name that is not in the inode reference item from the log tree, do a
1212  * proper unlink of that name (that is, remove its entry from the inode
1213  * reference item and both dir index keys).
1214  */
1215 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1216                                  struct btrfs_root *root,
1217                                  struct btrfs_path *path,
1218                                  struct btrfs_inode *inode,
1219                                  struct extent_buffer *log_eb,
1220                                  int log_slot,
1221                                  struct btrfs_key *key)
1222 {
1223         int ret;
1224         unsigned long ref_ptr;
1225         unsigned long ref_end;
1226         struct extent_buffer *eb;
1227
1228 again:
1229         btrfs_release_path(path);
1230         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1231         if (ret > 0) {
1232                 ret = 0;
1233                 goto out;
1234         }
1235         if (ret < 0)
1236                 goto out;
1237
1238         eb = path->nodes[0];
1239         ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1240         ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
1241         while (ref_ptr < ref_end) {
1242                 char *name = NULL;
1243                 int namelen;
1244                 u64 parent_id;
1245
1246                 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1247                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1248                                                 NULL, &parent_id);
1249                 } else {
1250                         parent_id = key->offset;
1251                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1252                                              NULL);
1253                 }
1254                 if (ret)
1255                         goto out;
1256
1257                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1258                         ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
1259                                                              parent_id, name,
1260                                                              namelen, NULL);
1261                 else
1262                         ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
1263                                                          namelen, NULL);
1264
1265                 if (!ret) {
1266                         struct inode *dir;
1267
1268                         btrfs_release_path(path);
1269                         dir = read_one_inode(root, parent_id);
1270                         if (!dir) {
1271                                 ret = -ENOENT;
1272                                 kfree(name);
1273                                 goto out;
1274                         }
1275                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
1276                                                  inode, name, namelen);
1277                         kfree(name);
1278                         iput(dir);
1279                         if (ret)
1280                                 goto out;
1281                         goto again;
1282                 }
1283
1284                 kfree(name);
1285                 ref_ptr += namelen;
1286                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1287                         ref_ptr += sizeof(struct btrfs_inode_extref);
1288                 else
1289                         ref_ptr += sizeof(struct btrfs_inode_ref);
1290         }
1291         ret = 0;
1292  out:
1293         btrfs_release_path(path);
1294         return ret;
1295 }
1296
1297 /*
1298  * replay one inode back reference item found in the log tree.
1299  * eb, slot and key refer to the buffer and key found in the log tree.
1300  * root is the destination we are replaying into, and path is for temp
1301  * use by this function.  (it should be released on return).
1302  */
1303 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1304                                   struct btrfs_root *root,
1305                                   struct btrfs_root *log,
1306                                   struct btrfs_path *path,
1307                                   struct extent_buffer *eb, int slot,
1308                                   struct btrfs_key *key)
1309 {
1310         struct inode *dir = NULL;
1311         struct inode *inode = NULL;
1312         unsigned long ref_ptr;
1313         unsigned long ref_end;
1314         char *name = NULL;
1315         int namelen;
1316         int ret;
1317         int search_done = 0;
1318         int log_ref_ver = 0;
1319         u64 parent_objectid;
1320         u64 inode_objectid;
1321         u64 ref_index = 0;
1322         int ref_struct_size;
1323
1324         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1325         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1326
1327         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1328                 struct btrfs_inode_extref *r;
1329
1330                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1331                 log_ref_ver = 1;
1332                 r = (struct btrfs_inode_extref *)ref_ptr;
1333                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1334         } else {
1335                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1336                 parent_objectid = key->offset;
1337         }
1338         inode_objectid = key->objectid;
1339
1340         /*
1341          * it is possible that we didn't log all the parent directories
1342          * for a given inode.  If we don't find the dir, just don't
1343          * copy the back ref in.  The link count fixup code will take
1344          * care of the rest
1345          */
1346         dir = read_one_inode(root, parent_objectid);
1347         if (!dir) {
1348                 ret = -ENOENT;
1349                 goto out;
1350         }
1351
1352         inode = read_one_inode(root, inode_objectid);
1353         if (!inode) {
1354                 ret = -EIO;
1355                 goto out;
1356         }
1357
1358         while (ref_ptr < ref_end) {
1359                 if (log_ref_ver) {
1360                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1361                                                 &ref_index, &parent_objectid);
1362                         /*
1363                          * parent object can change from one array
1364                          * item to another.
1365                          */
1366                         if (!dir)
1367                                 dir = read_one_inode(root, parent_objectid);
1368                         if (!dir) {
1369                                 ret = -ENOENT;
1370                                 goto out;
1371                         }
1372                 } else {
1373                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1374                                              &ref_index);
1375                 }
1376                 if (ret)
1377                         goto out;
1378
1379                 /* if we already have a perfect match, we're done */
1380                 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1381                                         btrfs_ino(BTRFS_I(inode)), ref_index,
1382                                         name, namelen)) {
1383                         /*
1384                          * look for a conflicting back reference in the
1385                          * metadata. if we find one we have to unlink that name
1386                          * of the file before we add our new link.  Later on, we
1387                          * overwrite any existing back reference, and we don't
1388                          * want to create dangling pointers in the directory.
1389                          */
1390
1391                         if (!search_done) {
1392                                 ret = __add_inode_ref(trans, root, path, log,
1393                                                       BTRFS_I(dir),
1394                                                       BTRFS_I(inode),
1395                                                       inode_objectid,
1396                                                       parent_objectid,
1397                                                       ref_index, name, namelen,
1398                                                       &search_done);
1399                                 if (ret) {
1400                                         if (ret == 1)
1401                                                 ret = 0;
1402                                         goto out;
1403                                 }
1404                         }
1405
1406                         /* insert our name */
1407                         ret = btrfs_add_link(trans, BTRFS_I(dir),
1408                                         BTRFS_I(inode),
1409                                         name, namelen, 0, ref_index);
1410                         if (ret)
1411                                 goto out;
1412
1413                         btrfs_update_inode(trans, root, inode);
1414                 }
1415
1416                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1417                 kfree(name);
1418                 name = NULL;
1419                 if (log_ref_ver) {
1420                         iput(dir);
1421                         dir = NULL;
1422                 }
1423         }
1424
1425         /*
1426          * Before we overwrite the inode reference item in the subvolume tree
1427          * with the item from the log tree, we must unlink all names from the
1428          * parent directory that are in the subvolume's tree inode reference
1429          * item, otherwise we end up with an inconsistent subvolume tree where
1430          * dir index entries exist for a name but there is no inode reference
1431          * item with the same name.
1432          */
1433         ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1434                                     key);
1435         if (ret)
1436                 goto out;
1437
1438         /* finally write the back reference in the inode */
1439         ret = overwrite_item(trans, root, path, eb, slot, key);
1440 out:
1441         btrfs_release_path(path);
1442         kfree(name);
1443         iput(dir);
1444         iput(inode);
1445         return ret;
1446 }
1447
1448 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1449                               struct btrfs_root *root, u64 ino)
1450 {
1451         int ret;
1452
1453         ret = btrfs_insert_orphan_item(trans, root, ino);
1454         if (ret == -EEXIST)
1455                 ret = 0;
1456
1457         return ret;
1458 }
1459
1460 static int count_inode_extrefs(struct btrfs_root *root,
1461                 struct btrfs_inode *inode, struct btrfs_path *path)
1462 {
1463         int ret = 0;
1464         int name_len;
1465         unsigned int nlink = 0;
1466         u32 item_size;
1467         u32 cur_offset = 0;
1468         u64 inode_objectid = btrfs_ino(inode);
1469         u64 offset = 0;
1470         unsigned long ptr;
1471         struct btrfs_inode_extref *extref;
1472         struct extent_buffer *leaf;
1473
1474         while (1) {
1475                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1476                                             &extref, &offset);
1477                 if (ret)
1478                         break;
1479
1480                 leaf = path->nodes[0];
1481                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1482                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1483                 cur_offset = 0;
1484
1485                 while (cur_offset < item_size) {
1486                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1487                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1488
1489                         nlink++;
1490
1491                         cur_offset += name_len + sizeof(*extref);
1492                 }
1493
1494                 offset++;
1495                 btrfs_release_path(path);
1496         }
1497         btrfs_release_path(path);
1498
1499         if (ret < 0 && ret != -ENOENT)
1500                 return ret;
1501         return nlink;
1502 }
1503
1504 static int count_inode_refs(struct btrfs_root *root,
1505                         struct btrfs_inode *inode, struct btrfs_path *path)
1506 {
1507         int ret;
1508         struct btrfs_key key;
1509         unsigned int nlink = 0;
1510         unsigned long ptr;
1511         unsigned long ptr_end;
1512         int name_len;
1513         u64 ino = btrfs_ino(inode);
1514
1515         key.objectid = ino;
1516         key.type = BTRFS_INODE_REF_KEY;
1517         key.offset = (u64)-1;
1518
1519         while (1) {
1520                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1521                 if (ret < 0)
1522                         break;
1523                 if (ret > 0) {
1524                         if (path->slots[0] == 0)
1525                                 break;
1526                         path->slots[0]--;
1527                 }
1528 process_slot:
1529                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1530                                       path->slots[0]);
1531                 if (key.objectid != ino ||
1532                     key.type != BTRFS_INODE_REF_KEY)
1533                         break;
1534                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1535                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1536                                                    path->slots[0]);
1537                 while (ptr < ptr_end) {
1538                         struct btrfs_inode_ref *ref;
1539
1540                         ref = (struct btrfs_inode_ref *)ptr;
1541                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1542                                                             ref);
1543                         ptr = (unsigned long)(ref + 1) + name_len;
1544                         nlink++;
1545                 }
1546
1547                 if (key.offset == 0)
1548                         break;
1549                 if (path->slots[0] > 0) {
1550                         path->slots[0]--;
1551                         goto process_slot;
1552                 }
1553                 key.offset--;
1554                 btrfs_release_path(path);
1555         }
1556         btrfs_release_path(path);
1557
1558         return nlink;
1559 }
1560
1561 /*
1562  * There are a few corners where the link count of the file can't
1563  * be properly maintained during replay.  So, instead of adding
1564  * lots of complexity to the log code, we just scan the backrefs
1565  * for any file that has been through replay.
1566  *
1567  * The scan will update the link count on the inode to reflect the
1568  * number of back refs found.  If it goes down to zero, the iput
1569  * will free the inode.
1570  */
1571 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1572                                            struct btrfs_root *root,
1573                                            struct inode *inode)
1574 {
1575         struct btrfs_path *path;
1576         int ret;
1577         u64 nlink = 0;
1578         u64 ino = btrfs_ino(BTRFS_I(inode));
1579
1580         path = btrfs_alloc_path();
1581         if (!path)
1582                 return -ENOMEM;
1583
1584         ret = count_inode_refs(root, BTRFS_I(inode), path);
1585         if (ret < 0)
1586                 goto out;
1587
1588         nlink = ret;
1589
1590         ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1591         if (ret < 0)
1592                 goto out;
1593
1594         nlink += ret;
1595
1596         ret = 0;
1597
1598         if (nlink != inode->i_nlink) {
1599                 set_nlink(inode, nlink);
1600                 btrfs_update_inode(trans, root, inode);
1601         }
1602         BTRFS_I(inode)->index_cnt = (u64)-1;
1603
1604         if (inode->i_nlink == 0) {
1605                 if (S_ISDIR(inode->i_mode)) {
1606                         ret = replay_dir_deletes(trans, root, NULL, path,
1607                                                  ino, 1);
1608                         if (ret)
1609                                 goto out;
1610                 }
1611                 ret = insert_orphan_item(trans, root, ino);
1612         }
1613
1614 out:
1615         btrfs_free_path(path);
1616         return ret;
1617 }
1618
1619 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1620                                             struct btrfs_root *root,
1621                                             struct btrfs_path *path)
1622 {
1623         int ret;
1624         struct btrfs_key key;
1625         struct inode *inode;
1626
1627         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1628         key.type = BTRFS_ORPHAN_ITEM_KEY;
1629         key.offset = (u64)-1;
1630         while (1) {
1631                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1632                 if (ret < 0)
1633                         break;
1634
1635                 if (ret == 1) {
1636                         if (path->slots[0] == 0)
1637                                 break;
1638                         path->slots[0]--;
1639                 }
1640
1641                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1642                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1643                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1644                         break;
1645
1646                 ret = btrfs_del_item(trans, root, path);
1647                 if (ret)
1648                         goto out;
1649
1650                 btrfs_release_path(path);
1651                 inode = read_one_inode(root, key.offset);
1652                 if (!inode)
1653                         return -EIO;
1654
1655                 ret = fixup_inode_link_count(trans, root, inode);
1656                 iput(inode);
1657                 if (ret)
1658                         goto out;
1659
1660                 /*
1661                  * fixup on a directory may create new entries,
1662                  * make sure we always look for the highset possible
1663                  * offset
1664                  */
1665                 key.offset = (u64)-1;
1666         }
1667         ret = 0;
1668 out:
1669         btrfs_release_path(path);
1670         return ret;
1671 }
1672
1673
1674 /*
1675  * record a given inode in the fixup dir so we can check its link
1676  * count when replay is done.  The link count is incremented here
1677  * so the inode won't go away until we check it
1678  */
1679 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1680                                       struct btrfs_root *root,
1681                                       struct btrfs_path *path,
1682                                       u64 objectid)
1683 {
1684         struct btrfs_key key;
1685         int ret = 0;
1686         struct inode *inode;
1687
1688         inode = read_one_inode(root, objectid);
1689         if (!inode)
1690                 return -EIO;
1691
1692         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1693         key.type = BTRFS_ORPHAN_ITEM_KEY;
1694         key.offset = objectid;
1695
1696         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1697
1698         btrfs_release_path(path);
1699         if (ret == 0) {
1700                 if (!inode->i_nlink)
1701                         set_nlink(inode, 1);
1702                 else
1703                         inc_nlink(inode);
1704                 ret = btrfs_update_inode(trans, root, inode);
1705         } else if (ret == -EEXIST) {
1706                 ret = 0;
1707         } else {
1708                 BUG(); /* Logic Error */
1709         }
1710         iput(inode);
1711
1712         return ret;
1713 }
1714
1715 /*
1716  * when replaying the log for a directory, we only insert names
1717  * for inodes that actually exist.  This means an fsync on a directory
1718  * does not implicitly fsync all the new files in it
1719  */
1720 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1721                                     struct btrfs_root *root,
1722                                     u64 dirid, u64 index,
1723                                     char *name, int name_len,
1724                                     struct btrfs_key *location)
1725 {
1726         struct inode *inode;
1727         struct inode *dir;
1728         int ret;
1729
1730         inode = read_one_inode(root, location->objectid);
1731         if (!inode)
1732                 return -ENOENT;
1733
1734         dir = read_one_inode(root, dirid);
1735         if (!dir) {
1736                 iput(inode);
1737                 return -EIO;
1738         }
1739
1740         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1741                         name_len, 1, index);
1742
1743         /* FIXME, put inode into FIXUP list */
1744
1745         iput(inode);
1746         iput(dir);
1747         return ret;
1748 }
1749
1750 /*
1751  * Return true if an inode reference exists in the log for the given name,
1752  * inode and parent inode.
1753  */
1754 static bool name_in_log_ref(struct btrfs_root *log_root,
1755                             const char *name, const int name_len,
1756                             const u64 dirid, const u64 ino)
1757 {
1758         struct btrfs_key search_key;
1759
1760         search_key.objectid = ino;
1761         search_key.type = BTRFS_INODE_REF_KEY;
1762         search_key.offset = dirid;
1763         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1764                 return true;
1765
1766         search_key.type = BTRFS_INODE_EXTREF_KEY;
1767         search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1768         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1769                 return true;
1770
1771         return false;
1772 }
1773
1774 /*
1775  * take a single entry in a log directory item and replay it into
1776  * the subvolume.
1777  *
1778  * if a conflicting item exists in the subdirectory already,
1779  * the inode it points to is unlinked and put into the link count
1780  * fix up tree.
1781  *
1782  * If a name from the log points to a file or directory that does
1783  * not exist in the FS, it is skipped.  fsyncs on directories
1784  * do not force down inodes inside that directory, just changes to the
1785  * names or unlinks in a directory.
1786  *
1787  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1788  * non-existing inode) and 1 if the name was replayed.
1789  */
1790 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1791                                     struct btrfs_root *root,
1792                                     struct btrfs_path *path,
1793                                     struct extent_buffer *eb,
1794                                     struct btrfs_dir_item *di,
1795                                     struct btrfs_key *key)
1796 {
1797         char *name;
1798         int name_len;
1799         struct btrfs_dir_item *dst_di;
1800         struct btrfs_key found_key;
1801         struct btrfs_key log_key;
1802         struct inode *dir;
1803         u8 log_type;
1804         int exists;
1805         int ret = 0;
1806         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1807         bool name_added = false;
1808
1809         dir = read_one_inode(root, key->objectid);
1810         if (!dir)
1811                 return -EIO;
1812
1813         name_len = btrfs_dir_name_len(eb, di);
1814         name = kmalloc(name_len, GFP_NOFS);
1815         if (!name) {
1816                 ret = -ENOMEM;
1817                 goto out;
1818         }
1819
1820         log_type = btrfs_dir_type(eb, di);
1821         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1822                    name_len);
1823
1824         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1825         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1826         if (exists == 0)
1827                 exists = 1;
1828         else
1829                 exists = 0;
1830         btrfs_release_path(path);
1831
1832         if (key->type == BTRFS_DIR_ITEM_KEY) {
1833                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1834                                        name, name_len, 1);
1835         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1836                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1837                                                      key->objectid,
1838                                                      key->offset, name,
1839                                                      name_len, 1);
1840         } else {
1841                 /* Corruption */
1842                 ret = -EINVAL;
1843                 goto out;
1844         }
1845         if (IS_ERR_OR_NULL(dst_di)) {
1846                 /* we need a sequence number to insert, so we only
1847                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1848                  */
1849                 if (key->type != BTRFS_DIR_INDEX_KEY)
1850                         goto out;
1851                 goto insert;
1852         }
1853
1854         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1855         /* the existing item matches the logged item */
1856         if (found_key.objectid == log_key.objectid &&
1857             found_key.type == log_key.type &&
1858             found_key.offset == log_key.offset &&
1859             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1860                 update_size = false;
1861                 goto out;
1862         }
1863
1864         /*
1865          * don't drop the conflicting directory entry if the inode
1866          * for the new entry doesn't exist
1867          */
1868         if (!exists)
1869                 goto out;
1870
1871         ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
1872         if (ret)
1873                 goto out;
1874
1875         if (key->type == BTRFS_DIR_INDEX_KEY)
1876                 goto insert;
1877 out:
1878         btrfs_release_path(path);
1879         if (!ret && update_size) {
1880                 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
1881                 ret = btrfs_update_inode(trans, root, dir);
1882         }
1883         kfree(name);
1884         iput(dir);
1885         if (!ret && name_added)
1886                 ret = 1;
1887         return ret;
1888
1889 insert:
1890         if (name_in_log_ref(root->log_root, name, name_len,
1891                             key->objectid, log_key.objectid)) {
1892                 /* The dentry will be added later. */
1893                 ret = 0;
1894                 update_size = false;
1895                 goto out;
1896         }
1897         btrfs_release_path(path);
1898         ret = insert_one_name(trans, root, key->objectid, key->offset,
1899                               name, name_len, &log_key);
1900         if (ret && ret != -ENOENT && ret != -EEXIST)
1901                 goto out;
1902         if (!ret)
1903                 name_added = true;
1904         update_size = false;
1905         ret = 0;
1906         goto out;
1907 }
1908
1909 /*
1910  * find all the names in a directory item and reconcile them into
1911  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1912  * one name in a directory item, but the same code gets used for
1913  * both directory index types
1914  */
1915 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1916                                         struct btrfs_root *root,
1917                                         struct btrfs_path *path,
1918                                         struct extent_buffer *eb, int slot,
1919                                         struct btrfs_key *key)
1920 {
1921         int ret = 0;
1922         u32 item_size = btrfs_item_size_nr(eb, slot);
1923         struct btrfs_dir_item *di;
1924         int name_len;
1925         unsigned long ptr;
1926         unsigned long ptr_end;
1927         struct btrfs_path *fixup_path = NULL;
1928
1929         ptr = btrfs_item_ptr_offset(eb, slot);
1930         ptr_end = ptr + item_size;
1931         while (ptr < ptr_end) {
1932                 di = (struct btrfs_dir_item *)ptr;
1933                 name_len = btrfs_dir_name_len(eb, di);
1934                 ret = replay_one_name(trans, root, path, eb, di, key);
1935                 if (ret < 0)
1936                         break;
1937                 ptr = (unsigned long)(di + 1);
1938                 ptr += name_len;
1939
1940                 /*
1941                  * If this entry refers to a non-directory (directories can not
1942                  * have a link count > 1) and it was added in the transaction
1943                  * that was not committed, make sure we fixup the link count of
1944                  * the inode it the entry points to. Otherwise something like
1945                  * the following would result in a directory pointing to an
1946                  * inode with a wrong link that does not account for this dir
1947                  * entry:
1948                  *
1949                  * mkdir testdir
1950                  * touch testdir/foo
1951                  * touch testdir/bar
1952                  * sync
1953                  *
1954                  * ln testdir/bar testdir/bar_link
1955                  * ln testdir/foo testdir/foo_link
1956                  * xfs_io -c "fsync" testdir/bar
1957                  *
1958                  * <power failure>
1959                  *
1960                  * mount fs, log replay happens
1961                  *
1962                  * File foo would remain with a link count of 1 when it has two
1963                  * entries pointing to it in the directory testdir. This would
1964                  * make it impossible to ever delete the parent directory has
1965                  * it would result in stale dentries that can never be deleted.
1966                  */
1967                 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1968                         struct btrfs_key di_key;
1969
1970                         if (!fixup_path) {
1971                                 fixup_path = btrfs_alloc_path();
1972                                 if (!fixup_path) {
1973                                         ret = -ENOMEM;
1974                                         break;
1975                                 }
1976                         }
1977
1978                         btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1979                         ret = link_to_fixup_dir(trans, root, fixup_path,
1980                                                 di_key.objectid);
1981                         if (ret)
1982                                 break;
1983                 }
1984                 ret = 0;
1985         }
1986         btrfs_free_path(fixup_path);
1987         return ret;
1988 }
1989
1990 /*
1991  * directory replay has two parts.  There are the standard directory
1992  * items in the log copied from the subvolume, and range items
1993  * created in the log while the subvolume was logged.
1994  *
1995  * The range items tell us which parts of the key space the log
1996  * is authoritative for.  During replay, if a key in the subvolume
1997  * directory is in a logged range item, but not actually in the log
1998  * that means it was deleted from the directory before the fsync
1999  * and should be removed.
2000  */
2001 static noinline int find_dir_range(struct btrfs_root *root,
2002                                    struct btrfs_path *path,
2003                                    u64 dirid, int key_type,
2004                                    u64 *start_ret, u64 *end_ret)
2005 {
2006         struct btrfs_key key;
2007         u64 found_end;
2008         struct btrfs_dir_log_item *item;
2009         int ret;
2010         int nritems;
2011
2012         if (*start_ret == (u64)-1)
2013                 return 1;
2014
2015         key.objectid = dirid;
2016         key.type = key_type;
2017         key.offset = *start_ret;
2018
2019         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2020         if (ret < 0)
2021                 goto out;
2022         if (ret > 0) {
2023                 if (path->slots[0] == 0)
2024                         goto out;
2025                 path->slots[0]--;
2026         }
2027         if (ret != 0)
2028                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2029
2030         if (key.type != key_type || key.objectid != dirid) {
2031                 ret = 1;
2032                 goto next;
2033         }
2034         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2035                               struct btrfs_dir_log_item);
2036         found_end = btrfs_dir_log_end(path->nodes[0], item);
2037
2038         if (*start_ret >= key.offset && *start_ret <= found_end) {
2039                 ret = 0;
2040                 *start_ret = key.offset;
2041                 *end_ret = found_end;
2042                 goto out;
2043         }
2044         ret = 1;
2045 next:
2046         /* check the next slot in the tree to see if it is a valid item */
2047         nritems = btrfs_header_nritems(path->nodes[0]);
2048         path->slots[0]++;
2049         if (path->slots[0] >= nritems) {
2050                 ret = btrfs_next_leaf(root, path);
2051                 if (ret)
2052                         goto out;
2053         }
2054
2055         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2056
2057         if (key.type != key_type || key.objectid != dirid) {
2058                 ret = 1;
2059                 goto out;
2060         }
2061         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2062                               struct btrfs_dir_log_item);
2063         found_end = btrfs_dir_log_end(path->nodes[0], item);
2064         *start_ret = key.offset;
2065         *end_ret = found_end;
2066         ret = 0;
2067 out:
2068         btrfs_release_path(path);
2069         return ret;
2070 }
2071
2072 /*
2073  * this looks for a given directory item in the log.  If the directory
2074  * item is not in the log, the item is removed and the inode it points
2075  * to is unlinked
2076  */
2077 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2078                                       struct btrfs_root *root,
2079                                       struct btrfs_root *log,
2080                                       struct btrfs_path *path,
2081                                       struct btrfs_path *log_path,
2082                                       struct inode *dir,
2083                                       struct btrfs_key *dir_key)
2084 {
2085         int ret;
2086         struct extent_buffer *eb;
2087         int slot;
2088         u32 item_size;
2089         struct btrfs_dir_item *di;
2090         struct btrfs_dir_item *log_di;
2091         int name_len;
2092         unsigned long ptr;
2093         unsigned long ptr_end;
2094         char *name;
2095         struct inode *inode;
2096         struct btrfs_key location;
2097
2098 again:
2099         eb = path->nodes[0];
2100         slot = path->slots[0];
2101         item_size = btrfs_item_size_nr(eb, slot);
2102         ptr = btrfs_item_ptr_offset(eb, slot);
2103         ptr_end = ptr + item_size;
2104         while (ptr < ptr_end) {
2105                 di = (struct btrfs_dir_item *)ptr;
2106                 name_len = btrfs_dir_name_len(eb, di);
2107                 name = kmalloc(name_len, GFP_NOFS);
2108                 if (!name) {
2109                         ret = -ENOMEM;
2110                         goto out;
2111                 }
2112                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
2113                                   name_len);
2114                 log_di = NULL;
2115                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2116                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
2117                                                        dir_key->objectid,
2118                                                        name, name_len, 0);
2119                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2120                         log_di = btrfs_lookup_dir_index_item(trans, log,
2121                                                      log_path,
2122                                                      dir_key->objectid,
2123                                                      dir_key->offset,
2124                                                      name, name_len, 0);
2125                 }
2126                 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2127                         btrfs_dir_item_key_to_cpu(eb, di, &location);
2128                         btrfs_release_path(path);
2129                         btrfs_release_path(log_path);
2130                         inode = read_one_inode(root, location.objectid);
2131                         if (!inode) {
2132                                 kfree(name);
2133                                 return -EIO;
2134                         }
2135
2136                         ret = link_to_fixup_dir(trans, root,
2137                                                 path, location.objectid);
2138                         if (ret) {
2139                                 kfree(name);
2140                                 iput(inode);
2141                                 goto out;
2142                         }
2143
2144                         inc_nlink(inode);
2145                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2146                                         BTRFS_I(inode), name, name_len);
2147                         if (!ret)
2148                                 ret = btrfs_run_delayed_items(trans);
2149                         kfree(name);
2150                         iput(inode);
2151                         if (ret)
2152                                 goto out;
2153
2154                         /* there might still be more names under this key
2155                          * check and repeat if required
2156                          */
2157                         ret = btrfs_search_slot(NULL, root, dir_key, path,
2158                                                 0, 0);
2159                         if (ret == 0)
2160                                 goto again;
2161                         ret = 0;
2162                         goto out;
2163                 } else if (IS_ERR(log_di)) {
2164                         kfree(name);
2165                         return PTR_ERR(log_di);
2166                 }
2167                 btrfs_release_path(log_path);
2168                 kfree(name);
2169
2170                 ptr = (unsigned long)(di + 1);
2171                 ptr += name_len;
2172         }
2173         ret = 0;
2174 out:
2175         btrfs_release_path(path);
2176         btrfs_release_path(log_path);
2177         return ret;
2178 }
2179
2180 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2181                               struct btrfs_root *root,
2182                               struct btrfs_root *log,
2183                               struct btrfs_path *path,
2184                               const u64 ino)
2185 {
2186         struct btrfs_key search_key;
2187         struct btrfs_path *log_path;
2188         int i;
2189         int nritems;
2190         int ret;
2191
2192         log_path = btrfs_alloc_path();
2193         if (!log_path)
2194                 return -ENOMEM;
2195
2196         search_key.objectid = ino;
2197         search_key.type = BTRFS_XATTR_ITEM_KEY;
2198         search_key.offset = 0;
2199 again:
2200         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2201         if (ret < 0)
2202                 goto out;
2203 process_leaf:
2204         nritems = btrfs_header_nritems(path->nodes[0]);
2205         for (i = path->slots[0]; i < nritems; i++) {
2206                 struct btrfs_key key;
2207                 struct btrfs_dir_item *di;
2208                 struct btrfs_dir_item *log_di;
2209                 u32 total_size;
2210                 u32 cur;
2211
2212                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2213                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2214                         ret = 0;
2215                         goto out;
2216                 }
2217
2218                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2219                 total_size = btrfs_item_size_nr(path->nodes[0], i);
2220                 cur = 0;
2221                 while (cur < total_size) {
2222                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2223                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2224                         u32 this_len = sizeof(*di) + name_len + data_len;
2225                         char *name;
2226
2227                         name = kmalloc(name_len, GFP_NOFS);
2228                         if (!name) {
2229                                 ret = -ENOMEM;
2230                                 goto out;
2231                         }
2232                         read_extent_buffer(path->nodes[0], name,
2233                                            (unsigned long)(di + 1), name_len);
2234
2235                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2236                                                     name, name_len, 0);
2237                         btrfs_release_path(log_path);
2238                         if (!log_di) {
2239                                 /* Doesn't exist in log tree, so delete it. */
2240                                 btrfs_release_path(path);
2241                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2242                                                         name, name_len, -1);
2243                                 kfree(name);
2244                                 if (IS_ERR(di)) {
2245                                         ret = PTR_ERR(di);
2246                                         goto out;
2247                                 }
2248                                 ASSERT(di);
2249                                 ret = btrfs_delete_one_dir_name(trans, root,
2250                                                                 path, di);
2251                                 if (ret)
2252                                         goto out;
2253                                 btrfs_release_path(path);
2254                                 search_key = key;
2255                                 goto again;
2256                         }
2257                         kfree(name);
2258                         if (IS_ERR(log_di)) {
2259                                 ret = PTR_ERR(log_di);
2260                                 goto out;
2261                         }
2262                         cur += this_len;
2263                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2264                 }
2265         }
2266         ret = btrfs_next_leaf(root, path);
2267         if (ret > 0)
2268                 ret = 0;
2269         else if (ret == 0)
2270                 goto process_leaf;
2271 out:
2272         btrfs_free_path(log_path);
2273         btrfs_release_path(path);
2274         return ret;
2275 }
2276
2277
2278 /*
2279  * deletion replay happens before we copy any new directory items
2280  * out of the log or out of backreferences from inodes.  It
2281  * scans the log to find ranges of keys that log is authoritative for,
2282  * and then scans the directory to find items in those ranges that are
2283  * not present in the log.
2284  *
2285  * Anything we don't find in the log is unlinked and removed from the
2286  * directory.
2287  */
2288 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2289                                        struct btrfs_root *root,
2290                                        struct btrfs_root *log,
2291                                        struct btrfs_path *path,
2292                                        u64 dirid, int del_all)
2293 {
2294         u64 range_start;
2295         u64 range_end;
2296         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2297         int ret = 0;
2298         struct btrfs_key dir_key;
2299         struct btrfs_key found_key;
2300         struct btrfs_path *log_path;
2301         struct inode *dir;
2302
2303         dir_key.objectid = dirid;
2304         dir_key.type = BTRFS_DIR_ITEM_KEY;
2305         log_path = btrfs_alloc_path();
2306         if (!log_path)
2307                 return -ENOMEM;
2308
2309         dir = read_one_inode(root, dirid);
2310         /* it isn't an error if the inode isn't there, that can happen
2311          * because we replay the deletes before we copy in the inode item
2312          * from the log
2313          */
2314         if (!dir) {
2315                 btrfs_free_path(log_path);
2316                 return 0;
2317         }
2318 again:
2319         range_start = 0;
2320         range_end = 0;
2321         while (1) {
2322                 if (del_all)
2323                         range_end = (u64)-1;
2324                 else {
2325                         ret = find_dir_range(log, path, dirid, key_type,
2326                                              &range_start, &range_end);
2327                         if (ret != 0)
2328                                 break;
2329                 }
2330
2331                 dir_key.offset = range_start;
2332                 while (1) {
2333                         int nritems;
2334                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2335                                                 0, 0);
2336                         if (ret < 0)
2337                                 goto out;
2338
2339                         nritems = btrfs_header_nritems(path->nodes[0]);
2340                         if (path->slots[0] >= nritems) {
2341                                 ret = btrfs_next_leaf(root, path);
2342                                 if (ret == 1)
2343                                         break;
2344                                 else if (ret < 0)
2345                                         goto out;
2346                         }
2347                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2348                                               path->slots[0]);
2349                         if (found_key.objectid != dirid ||
2350                             found_key.type != dir_key.type)
2351                                 goto next_type;
2352
2353                         if (found_key.offset > range_end)
2354                                 break;
2355
2356                         ret = check_item_in_log(trans, root, log, path,
2357                                                 log_path, dir,
2358                                                 &found_key);
2359                         if (ret)
2360                                 goto out;
2361                         if (found_key.offset == (u64)-1)
2362                                 break;
2363                         dir_key.offset = found_key.offset + 1;
2364                 }
2365                 btrfs_release_path(path);
2366                 if (range_end == (u64)-1)
2367                         break;
2368                 range_start = range_end + 1;
2369         }
2370
2371 next_type:
2372         ret = 0;
2373         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2374                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
2375                 dir_key.type = BTRFS_DIR_INDEX_KEY;
2376                 btrfs_release_path(path);
2377                 goto again;
2378         }
2379 out:
2380         btrfs_release_path(path);
2381         btrfs_free_path(log_path);
2382         iput(dir);
2383         return ret;
2384 }
2385
2386 /*
2387  * the process_func used to replay items from the log tree.  This
2388  * gets called in two different stages.  The first stage just looks
2389  * for inodes and makes sure they are all copied into the subvolume.
2390  *
2391  * The second stage copies all the other item types from the log into
2392  * the subvolume.  The two stage approach is slower, but gets rid of
2393  * lots of complexity around inodes referencing other inodes that exist
2394  * only in the log (references come from either directory items or inode
2395  * back refs).
2396  */
2397 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2398                              struct walk_control *wc, u64 gen, int level)
2399 {
2400         int nritems;
2401         struct btrfs_path *path;
2402         struct btrfs_root *root = wc->replay_dest;
2403         struct btrfs_key key;
2404         int i;
2405         int ret;
2406
2407         ret = btrfs_read_buffer(eb, gen, level, NULL);
2408         if (ret)
2409                 return ret;
2410
2411         level = btrfs_header_level(eb);
2412
2413         if (level != 0)
2414                 return 0;
2415
2416         path = btrfs_alloc_path();
2417         if (!path)
2418                 return -ENOMEM;
2419
2420         nritems = btrfs_header_nritems(eb);
2421         for (i = 0; i < nritems; i++) {
2422                 btrfs_item_key_to_cpu(eb, &key, i);
2423
2424                 /* inode keys are done during the first stage */
2425                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2426                     wc->stage == LOG_WALK_REPLAY_INODES) {
2427                         struct btrfs_inode_item *inode_item;
2428                         u32 mode;
2429
2430                         inode_item = btrfs_item_ptr(eb, i,
2431                                             struct btrfs_inode_item);
2432                         ret = replay_xattr_deletes(wc->trans, root, log,
2433                                                    path, key.objectid);
2434                         if (ret)
2435                                 break;
2436                         mode = btrfs_inode_mode(eb, inode_item);
2437                         if (S_ISDIR(mode)) {
2438                                 ret = replay_dir_deletes(wc->trans,
2439                                          root, log, path, key.objectid, 0);
2440                                 if (ret)
2441                                         break;
2442                         }
2443                         ret = overwrite_item(wc->trans, root, path,
2444                                              eb, i, &key);
2445                         if (ret)
2446                                 break;
2447
2448                         /*
2449                          * Before replaying extents, truncate the inode to its
2450                          * size. We need to do it now and not after log replay
2451                          * because before an fsync we can have prealloc extents
2452                          * added beyond the inode's i_size. If we did it after,
2453                          * through orphan cleanup for example, we would drop
2454                          * those prealloc extents just after replaying them.
2455                          */
2456                         if (S_ISREG(mode)) {
2457                                 struct inode *inode;
2458                                 u64 from;
2459
2460                                 inode = read_one_inode(root, key.objectid);
2461                                 if (!inode) {
2462                                         ret = -EIO;
2463                                         break;
2464                                 }
2465                                 from = ALIGN(i_size_read(inode),
2466                                              root->fs_info->sectorsize);
2467                                 ret = btrfs_drop_extents(wc->trans, root, inode,
2468                                                          from, (u64)-1, 1);
2469                                 /*
2470                                  * If the nlink count is zero here, the iput
2471                                  * will free the inode.  We bump it to make
2472                                  * sure it doesn't get freed until the link
2473                                  * count fixup is done.
2474                                  */
2475                                 if (!ret) {
2476                                         if (inode->i_nlink == 0)
2477                                                 inc_nlink(inode);
2478                                         /* Update link count and nbytes. */
2479                                         ret = btrfs_update_inode(wc->trans,
2480                                                                  root, inode);
2481                                 }
2482                                 iput(inode);
2483                                 if (ret)
2484                                         break;
2485                         }
2486
2487                         ret = link_to_fixup_dir(wc->trans, root,
2488                                                 path, key.objectid);
2489                         if (ret)
2490                                 break;
2491                 }
2492
2493                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2494                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2495                         ret = replay_one_dir_item(wc->trans, root, path,
2496                                                   eb, i, &key);
2497                         if (ret)
2498                                 break;
2499                 }
2500
2501                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2502                         continue;
2503
2504                 /* these keys are simply copied */
2505                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2506                         ret = overwrite_item(wc->trans, root, path,
2507                                              eb, i, &key);
2508                         if (ret)
2509                                 break;
2510                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2511                            key.type == BTRFS_INODE_EXTREF_KEY) {
2512                         ret = add_inode_ref(wc->trans, root, log, path,
2513                                             eb, i, &key);
2514                         if (ret && ret != -ENOENT)
2515                                 break;
2516                         ret = 0;
2517                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2518                         ret = replay_one_extent(wc->trans, root, path,
2519                                                 eb, i, &key);
2520                         if (ret)
2521                                 break;
2522                 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2523                         ret = replay_one_dir_item(wc->trans, root, path,
2524                                                   eb, i, &key);
2525                         if (ret)
2526                                 break;
2527                 }
2528         }
2529         btrfs_free_path(path);
2530         return ret;
2531 }
2532
2533 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2534                                    struct btrfs_root *root,
2535                                    struct btrfs_path *path, int *level,
2536                                    struct walk_control *wc)
2537 {
2538         struct btrfs_fs_info *fs_info = root->fs_info;
2539         u64 root_owner;
2540         u64 bytenr;
2541         u64 ptr_gen;
2542         struct extent_buffer *next;
2543         struct extent_buffer *cur;
2544         struct extent_buffer *parent;
2545         u32 blocksize;
2546         int ret = 0;
2547
2548         WARN_ON(*level < 0);
2549         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2550
2551         while (*level > 0) {
2552                 struct btrfs_key first_key;
2553
2554                 WARN_ON(*level < 0);
2555                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2556                 cur = path->nodes[*level];
2557
2558                 WARN_ON(btrfs_header_level(cur) != *level);
2559
2560                 if (path->slots[*level] >=
2561                     btrfs_header_nritems(cur))
2562                         break;
2563
2564                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2565                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2566                 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2567                 blocksize = fs_info->nodesize;
2568
2569                 parent = path->nodes[*level];
2570                 root_owner = btrfs_header_owner(parent);
2571
2572                 next = btrfs_find_create_tree_block(fs_info, bytenr);
2573                 if (IS_ERR(next))
2574                         return PTR_ERR(next);
2575
2576                 if (*level == 1) {
2577                         ret = wc->process_func(root, next, wc, ptr_gen,
2578                                                *level - 1);
2579                         if (ret) {
2580                                 free_extent_buffer(next);
2581                                 return ret;
2582                         }
2583
2584                         path->slots[*level]++;
2585                         if (wc->free) {
2586                                 ret = btrfs_read_buffer(next, ptr_gen,
2587                                                         *level - 1, &first_key);
2588                                 if (ret) {
2589                                         free_extent_buffer(next);
2590                                         return ret;
2591                                 }
2592
2593                                 if (trans) {
2594                                         btrfs_tree_lock(next);
2595                                         btrfs_set_lock_blocking(next);
2596                                         clean_tree_block(fs_info, next);
2597                                         btrfs_wait_tree_block_writeback(next);
2598                                         btrfs_tree_unlock(next);
2599                                 } else {
2600                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2601                                                 clear_extent_buffer_dirty(next);
2602                                 }
2603
2604                                 WARN_ON(root_owner !=
2605                                         BTRFS_TREE_LOG_OBJECTID);
2606                                 ret = btrfs_free_and_pin_reserved_extent(
2607                                                         fs_info, bytenr,
2608                                                         blocksize);
2609                                 if (ret) {
2610                                         free_extent_buffer(next);
2611                                         return ret;
2612                                 }
2613                         }
2614                         free_extent_buffer(next);
2615                         continue;
2616                 }
2617                 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2618                 if (ret) {
2619                         free_extent_buffer(next);
2620                         return ret;
2621                 }
2622
2623                 WARN_ON(*level <= 0);
2624                 if (path->nodes[*level-1])
2625                         free_extent_buffer(path->nodes[*level-1]);
2626                 path->nodes[*level-1] = next;
2627                 *level = btrfs_header_level(next);
2628                 path->slots[*level] = 0;
2629                 cond_resched();
2630         }
2631         WARN_ON(*level < 0);
2632         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2633
2634         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2635
2636         cond_resched();
2637         return 0;
2638 }
2639
2640 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2641                                  struct btrfs_root *root,
2642                                  struct btrfs_path *path, int *level,
2643                                  struct walk_control *wc)
2644 {
2645         struct btrfs_fs_info *fs_info = root->fs_info;
2646         u64 root_owner;
2647         int i;
2648         int slot;
2649         int ret;
2650
2651         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2652                 slot = path->slots[i];
2653                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2654                         path->slots[i]++;
2655                         *level = i;
2656                         WARN_ON(*level == 0);
2657                         return 0;
2658                 } else {
2659                         struct extent_buffer *parent;
2660                         if (path->nodes[*level] == root->node)
2661                                 parent = path->nodes[*level];
2662                         else
2663                                 parent = path->nodes[*level + 1];
2664
2665                         root_owner = btrfs_header_owner(parent);
2666                         ret = wc->process_func(root, path->nodes[*level], wc,
2667                                  btrfs_header_generation(path->nodes[*level]),
2668                                  *level);
2669                         if (ret)
2670                                 return ret;
2671
2672                         if (wc->free) {
2673                                 struct extent_buffer *next;
2674
2675                                 next = path->nodes[*level];
2676
2677                                 if (trans) {
2678                                         btrfs_tree_lock(next);
2679                                         btrfs_set_lock_blocking(next);
2680                                         clean_tree_block(fs_info, next);
2681                                         btrfs_wait_tree_block_writeback(next);
2682                                         btrfs_tree_unlock(next);
2683                                 } else {
2684                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2685                                                 clear_extent_buffer_dirty(next);
2686                                 }
2687
2688                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2689                                 ret = btrfs_free_and_pin_reserved_extent(
2690                                                 fs_info,
2691                                                 path->nodes[*level]->start,
2692                                                 path->nodes[*level]->len);
2693                                 if (ret)
2694                                         return ret;
2695                         }
2696                         free_extent_buffer(path->nodes[*level]);
2697                         path->nodes[*level] = NULL;
2698                         *level = i + 1;
2699                 }
2700         }
2701         return 1;
2702 }
2703
2704 /*
2705  * drop the reference count on the tree rooted at 'snap'.  This traverses
2706  * the tree freeing any blocks that have a ref count of zero after being
2707  * decremented.
2708  */
2709 static int walk_log_tree(struct btrfs_trans_handle *trans,
2710                          struct btrfs_root *log, struct walk_control *wc)
2711 {
2712         struct btrfs_fs_info *fs_info = log->fs_info;
2713         int ret = 0;
2714         int wret;
2715         int level;
2716         struct btrfs_path *path;
2717         int orig_level;
2718
2719         path = btrfs_alloc_path();
2720         if (!path)
2721                 return -ENOMEM;
2722
2723         level = btrfs_header_level(log->node);
2724         orig_level = level;
2725         path->nodes[level] = log->node;
2726         extent_buffer_get(log->node);
2727         path->slots[level] = 0;
2728
2729         while (1) {
2730                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2731                 if (wret > 0)
2732                         break;
2733                 if (wret < 0) {
2734                         ret = wret;
2735                         goto out;
2736                 }
2737
2738                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2739                 if (wret > 0)
2740                         break;
2741                 if (wret < 0) {
2742                         ret = wret;
2743                         goto out;
2744                 }
2745         }
2746
2747         /* was the root node processed? if not, catch it here */
2748         if (path->nodes[orig_level]) {
2749                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2750                          btrfs_header_generation(path->nodes[orig_level]),
2751                          orig_level);
2752                 if (ret)
2753                         goto out;
2754                 if (wc->free) {
2755                         struct extent_buffer *next;
2756
2757                         next = path->nodes[orig_level];
2758
2759                         if (trans) {
2760                                 btrfs_tree_lock(next);
2761                                 btrfs_set_lock_blocking(next);
2762                                 clean_tree_block(fs_info, next);
2763                                 btrfs_wait_tree_block_writeback(next);
2764                                 btrfs_tree_unlock(next);
2765                         } else {
2766                                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2767                                         clear_extent_buffer_dirty(next);
2768                         }
2769
2770                         WARN_ON(log->root_key.objectid !=
2771                                 BTRFS_TREE_LOG_OBJECTID);
2772                         ret = btrfs_free_and_pin_reserved_extent(fs_info,
2773                                                         next->start, next->len);
2774                         if (ret)
2775                                 goto out;
2776                 }
2777         }
2778
2779 out:
2780         btrfs_free_path(path);
2781         return ret;
2782 }
2783
2784 /*
2785  * helper function to update the item for a given subvolumes log root
2786  * in the tree of log roots
2787  */
2788 static int update_log_root(struct btrfs_trans_handle *trans,
2789                            struct btrfs_root *log)
2790 {
2791         struct btrfs_fs_info *fs_info = log->fs_info;
2792         int ret;
2793
2794         if (log->log_transid == 1) {
2795                 /* insert root item on the first sync */
2796                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2797                                 &log->root_key, &log->root_item);
2798         } else {
2799                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2800                                 &log->root_key, &log->root_item);
2801         }
2802         return ret;
2803 }
2804
2805 static void wait_log_commit(struct btrfs_root *root, int transid)
2806 {
2807         DEFINE_WAIT(wait);
2808         int index = transid % 2;
2809
2810         /*
2811          * we only allow two pending log transactions at a time,
2812          * so we know that if ours is more than 2 older than the
2813          * current transaction, we're done
2814          */
2815         for (;;) {
2816                 prepare_to_wait(&root->log_commit_wait[index],
2817                                 &wait, TASK_UNINTERRUPTIBLE);
2818
2819                 if (!(root->log_transid_committed < transid &&
2820                       atomic_read(&root->log_commit[index])))
2821                         break;
2822
2823                 mutex_unlock(&root->log_mutex);
2824                 schedule();
2825                 mutex_lock(&root->log_mutex);
2826         }
2827         finish_wait(&root->log_commit_wait[index], &wait);
2828 }
2829
2830 static void wait_for_writer(struct btrfs_root *root)
2831 {
2832         DEFINE_WAIT(wait);
2833
2834         for (;;) {
2835                 prepare_to_wait(&root->log_writer_wait, &wait,
2836                                 TASK_UNINTERRUPTIBLE);
2837                 if (!atomic_read(&root->log_writers))
2838                         break;
2839
2840                 mutex_unlock(&root->log_mutex);
2841                 schedule();
2842                 mutex_lock(&root->log_mutex);
2843         }
2844         finish_wait(&root->log_writer_wait, &wait);
2845 }
2846
2847 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2848                                         struct btrfs_log_ctx *ctx)
2849 {
2850         if (!ctx)
2851                 return;
2852
2853         mutex_lock(&root->log_mutex);
2854         list_del_init(&ctx->list);
2855         mutex_unlock(&root->log_mutex);
2856 }
2857
2858 /*
2859  * Invoked in log mutex context, or be sure there is no other task which
2860  * can access the list.
2861  */
2862 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2863                                              int index, int error)
2864 {
2865         struct btrfs_log_ctx *ctx;
2866         struct btrfs_log_ctx *safe;
2867
2868         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2869                 list_del_init(&ctx->list);
2870                 ctx->log_ret = error;
2871         }
2872
2873         INIT_LIST_HEAD(&root->log_ctxs[index]);
2874 }
2875
2876 /*
2877  * btrfs_sync_log does sends a given tree log down to the disk and
2878  * updates the super blocks to record it.  When this call is done,
2879  * you know that any inodes previously logged are safely on disk only
2880  * if it returns 0.
2881  *
2882  * Any other return value means you need to call btrfs_commit_transaction.
2883  * Some of the edge cases for fsyncing directories that have had unlinks
2884  * or renames done in the past mean that sometimes the only safe
2885  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2886  * that has happened.
2887  */
2888 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2889                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2890 {
2891         int index1;
2892         int index2;
2893         int mark;
2894         int ret;
2895         struct btrfs_fs_info *fs_info = root->fs_info;
2896         struct btrfs_root *log = root->log_root;
2897         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2898         int log_transid = 0;
2899         struct btrfs_log_ctx root_log_ctx;
2900         struct blk_plug plug;
2901
2902         mutex_lock(&root->log_mutex);
2903         log_transid = ctx->log_transid;
2904         if (root->log_transid_committed >= log_transid) {
2905                 mutex_unlock(&root->log_mutex);
2906                 return ctx->log_ret;
2907         }
2908
2909         index1 = log_transid % 2;
2910         if (atomic_read(&root->log_commit[index1])) {
2911                 wait_log_commit(root, log_transid);
2912                 mutex_unlock(&root->log_mutex);
2913                 return ctx->log_ret;
2914         }
2915         ASSERT(log_transid == root->log_transid);
2916         atomic_set(&root->log_commit[index1], 1);
2917
2918         /* wait for previous tree log sync to complete */
2919         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2920                 wait_log_commit(root, log_transid - 1);
2921
2922         while (1) {
2923                 int batch = atomic_read(&root->log_batch);
2924                 /* when we're on an ssd, just kick the log commit out */
2925                 if (!btrfs_test_opt(fs_info, SSD) &&
2926                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2927                         mutex_unlock(&root->log_mutex);
2928                         schedule_timeout_uninterruptible(1);
2929                         mutex_lock(&root->log_mutex);
2930                 }
2931                 wait_for_writer(root);
2932                 if (batch == atomic_read(&root->log_batch))
2933                         break;
2934         }
2935
2936         /* bail out if we need to do a full commit */
2937         if (btrfs_need_log_full_commit(fs_info, trans)) {
2938                 ret = -EAGAIN;
2939                 btrfs_free_logged_extents(log, log_transid);
2940                 mutex_unlock(&root->log_mutex);
2941                 goto out;
2942         }
2943
2944         if (log_transid % 2 == 0)
2945                 mark = EXTENT_DIRTY;
2946         else
2947                 mark = EXTENT_NEW;
2948
2949         /* we start IO on  all the marked extents here, but we don't actually
2950          * wait for them until later.
2951          */
2952         blk_start_plug(&plug);
2953         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2954         if (ret) {
2955                 blk_finish_plug(&plug);
2956                 btrfs_abort_transaction(trans, ret);
2957                 btrfs_free_logged_extents(log, log_transid);
2958                 btrfs_set_log_full_commit(fs_info, trans);
2959                 mutex_unlock(&root->log_mutex);
2960                 goto out;
2961         }
2962
2963         btrfs_set_root_node(&log->root_item, log->node);
2964
2965         root->log_transid++;
2966         log->log_transid = root->log_transid;
2967         root->log_start_pid = 0;
2968         /*
2969          * IO has been started, blocks of the log tree have WRITTEN flag set
2970          * in their headers. new modifications of the log will be written to
2971          * new positions. so it's safe to allow log writers to go in.
2972          */
2973         mutex_unlock(&root->log_mutex);
2974
2975         btrfs_init_log_ctx(&root_log_ctx, NULL);
2976
2977         mutex_lock(&log_root_tree->log_mutex);
2978         atomic_inc(&log_root_tree->log_batch);
2979         atomic_inc(&log_root_tree->log_writers);
2980
2981         index2 = log_root_tree->log_transid % 2;
2982         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2983         root_log_ctx.log_transid = log_root_tree->log_transid;
2984
2985         mutex_unlock(&log_root_tree->log_mutex);
2986
2987         ret = update_log_root(trans, log);
2988
2989         mutex_lock(&log_root_tree->log_mutex);
2990         if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2991                 /*
2992                  * Implicit memory barrier after atomic_dec_and_test
2993                  */
2994                 if (waitqueue_active(&log_root_tree->log_writer_wait))
2995                         wake_up(&log_root_tree->log_writer_wait);
2996         }
2997
2998         if (ret) {
2999                 if (!list_empty(&root_log_ctx.list))
3000                         list_del_init(&root_log_ctx.list);
3001
3002                 blk_finish_plug(&plug);
3003                 btrfs_set_log_full_commit(fs_info, trans);
3004
3005                 if (ret != -ENOSPC) {
3006                         btrfs_abort_transaction(trans, ret);
3007                         mutex_unlock(&log_root_tree->log_mutex);
3008                         goto out;
3009                 }
3010                 btrfs_wait_tree_log_extents(log, mark);
3011                 btrfs_free_logged_extents(log, log_transid);
3012                 mutex_unlock(&log_root_tree->log_mutex);
3013                 ret = -EAGAIN;
3014                 goto out;
3015         }
3016
3017         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3018                 blk_finish_plug(&plug);
3019                 list_del_init(&root_log_ctx.list);
3020                 mutex_unlock(&log_root_tree->log_mutex);
3021                 ret = root_log_ctx.log_ret;
3022                 goto out;
3023         }
3024
3025         index2 = root_log_ctx.log_transid % 2;
3026         if (atomic_read(&log_root_tree->log_commit[index2])) {
3027                 blk_finish_plug(&plug);
3028                 ret = btrfs_wait_tree_log_extents(log, mark);
3029                 btrfs_wait_logged_extents(trans, log, log_transid);
3030                 wait_log_commit(log_root_tree,
3031                                 root_log_ctx.log_transid);
3032                 mutex_unlock(&log_root_tree->log_mutex);
3033                 if (!ret)
3034                         ret = root_log_ctx.log_ret;
3035                 goto out;
3036         }
3037         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3038         atomic_set(&log_root_tree->log_commit[index2], 1);
3039
3040         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3041                 wait_log_commit(log_root_tree,
3042                                 root_log_ctx.log_transid - 1);
3043         }
3044
3045         wait_for_writer(log_root_tree);
3046
3047         /*
3048          * now that we've moved on to the tree of log tree roots,
3049          * check the full commit flag again
3050          */
3051         if (btrfs_need_log_full_commit(fs_info, trans)) {
3052                 blk_finish_plug(&plug);
3053                 btrfs_wait_tree_log_extents(log, mark);
3054                 btrfs_free_logged_extents(log, log_transid);
3055                 mutex_unlock(&log_root_tree->log_mutex);
3056                 ret = -EAGAIN;
3057                 goto out_wake_log_root;
3058         }
3059
3060         ret = btrfs_write_marked_extents(fs_info,
3061                                          &log_root_tree->dirty_log_pages,
3062                                          EXTENT_DIRTY | EXTENT_NEW);
3063         blk_finish_plug(&plug);
3064         if (ret) {
3065                 btrfs_set_log_full_commit(fs_info, trans);
3066                 btrfs_abort_transaction(trans, ret);
3067                 btrfs_free_logged_extents(log, log_transid);
3068                 mutex_unlock(&log_root_tree->log_mutex);
3069                 goto out_wake_log_root;
3070         }
3071         ret = btrfs_wait_tree_log_extents(log, mark);
3072         if (!ret)
3073                 ret = btrfs_wait_tree_log_extents(log_root_tree,
3074                                                   EXTENT_NEW | EXTENT_DIRTY);
3075         if (ret) {
3076                 btrfs_set_log_full_commit(fs_info, trans);
3077                 btrfs_free_logged_extents(log, log_transid);
3078                 mutex_unlock(&log_root_tree->log_mutex);
3079                 goto out_wake_log_root;
3080         }
3081         btrfs_wait_logged_extents(trans, log, log_transid);
3082
3083         btrfs_set_super_log_root(fs_info->super_for_commit,
3084                                  log_root_tree->node->start);
3085         btrfs_set_super_log_root_level(fs_info->super_for_commit,
3086                                        btrfs_header_level(log_root_tree->node));
3087
3088         log_root_tree->log_transid++;
3089         mutex_unlock(&log_root_tree->log_mutex);
3090
3091         /*
3092          * nobody else is going to jump in and write the the ctree
3093          * super here because the log_commit atomic below is protecting
3094          * us.  We must be called with a transaction handle pinning
3095          * the running transaction open, so a full commit can't hop
3096          * in and cause problems either.
3097          */
3098         ret = write_all_supers(fs_info, 1);
3099         if (ret) {
3100                 btrfs_set_log_full_commit(fs_info, trans);
3101                 btrfs_abort_transaction(trans, ret);
3102                 goto out_wake_log_root;
3103         }
3104
3105         mutex_lock(&root->log_mutex);
3106         if (root->last_log_commit < log_transid)
3107                 root->last_log_commit = log_transid;
3108         mutex_unlock(&root->log_mutex);
3109
3110 out_wake_log_root:
3111         mutex_lock(&log_root_tree->log_mutex);
3112         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3113
3114         log_root_tree->log_transid_committed++;
3115         atomic_set(&log_root_tree->log_commit[index2], 0);
3116         mutex_unlock(&log_root_tree->log_mutex);
3117
3118         /*
3119          * The barrier before waitqueue_active is implied by mutex_unlock
3120          */
3121         if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
3122                 wake_up(&log_root_tree->log_commit_wait[index2]);
3123 out:
3124         mutex_lock(&root->log_mutex);
3125         btrfs_remove_all_log_ctxs(root, index1, ret);
3126         root->log_transid_committed++;
3127         atomic_set(&root->log_commit[index1], 0);
3128         mutex_unlock(&root->log_mutex);
3129
3130         /*
3131          * The barrier before waitqueue_active is implied by mutex_unlock
3132          */
3133         if (waitqueue_active(&root->log_commit_wait[index1]))
3134                 wake_up(&root->log_commit_wait[index1]);
3135         return ret;
3136 }
3137
3138 static void free_log_tree(struct btrfs_trans_handle *trans,
3139                           struct btrfs_root *log)
3140 {
3141         int ret;
3142         u64 start;
3143         u64 end;
3144         struct walk_control wc = {
3145                 .free = 1,
3146                 .process_func = process_one_buffer
3147         };
3148
3149         ret = walk_log_tree(trans, log, &wc);
3150         /* I don't think this can happen but just in case */
3151         if (ret)
3152                 btrfs_abort_transaction(trans, ret);
3153
3154         while (1) {
3155                 ret = find_first_extent_bit(&log->dirty_log_pages,
3156                                 0, &start, &end,
3157                                 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
3158                                 NULL);
3159                 if (ret)
3160                         break;
3161
3162                 clear_extent_bits(&log->dirty_log_pages, start, end,
3163                                   EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3164         }
3165
3166         /*
3167          * We may have short-circuited the log tree with the full commit logic
3168          * and left ordered extents on our list, so clear these out to keep us
3169          * from leaking inodes and memory.
3170          */
3171         btrfs_free_logged_extents(log, 0);
3172         btrfs_free_logged_extents(log, 1);
3173
3174         free_extent_buffer(log->node);
3175         kfree(log);
3176 }
3177
3178 /*
3179  * free all the extents used by the tree log.  This should be called
3180  * at commit time of the full transaction
3181  */
3182 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3183 {
3184         if (root->log_root) {
3185                 free_log_tree(trans, root->log_root);
3186                 root->log_root = NULL;
3187         }
3188         return 0;
3189 }
3190
3191 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3192                              struct btrfs_fs_info *fs_info)
3193 {
3194         if (fs_info->log_root_tree) {
3195                 free_log_tree(trans, fs_info->log_root_tree);
3196                 fs_info->log_root_tree = NULL;
3197         }
3198         return 0;
3199 }
3200
3201 /*
3202  * If both a file and directory are logged, and unlinks or renames are
3203  * mixed in, we have a few interesting corners:
3204  *
3205  * create file X in dir Y
3206  * link file X to X.link in dir Y
3207  * fsync file X
3208  * unlink file X but leave X.link
3209  * fsync dir Y
3210  *
3211  * After a crash we would expect only X.link to exist.  But file X
3212  * didn't get fsync'd again so the log has back refs for X and X.link.
3213  *
3214  * We solve this by removing directory entries and inode backrefs from the
3215  * log when a file that was logged in the current transaction is
3216  * unlinked.  Any later fsync will include the updated log entries, and
3217  * we'll be able to reconstruct the proper directory items from backrefs.
3218  *
3219  * This optimizations allows us to avoid relogging the entire inode
3220  * or the entire directory.
3221  */
3222 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3223                                  struct btrfs_root *root,
3224                                  const char *name, int name_len,
3225                                  struct btrfs_inode *dir, u64 index)
3226 {
3227         struct btrfs_root *log;
3228         struct btrfs_dir_item *di;
3229         struct btrfs_path *path;
3230         int ret;
3231         int err = 0;
3232         int bytes_del = 0;
3233         u64 dir_ino = btrfs_ino(dir);
3234
3235         if (dir->logged_trans < trans->transid)
3236                 return 0;
3237
3238         ret = join_running_log_trans(root);
3239         if (ret)
3240                 return 0;
3241
3242         mutex_lock(&dir->log_mutex);
3243
3244         log = root->log_root;
3245         path = btrfs_alloc_path();
3246         if (!path) {
3247                 err = -ENOMEM;
3248                 goto out_unlock;
3249         }
3250
3251         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3252                                    name, name_len, -1);
3253         if (IS_ERR(di)) {
3254                 err = PTR_ERR(di);
3255                 goto fail;
3256         }
3257         if (di) {
3258                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3259                 bytes_del += name_len;
3260                 if (ret) {
3261                         err = ret;
3262                         goto fail;
3263                 }
3264         }
3265         btrfs_release_path(path);
3266         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3267                                          index, name, name_len, -1);
3268         if (IS_ERR(di)) {
3269                 err = PTR_ERR(di);
3270                 goto fail;
3271         }
3272         if (di) {
3273                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3274                 bytes_del += name_len;
3275                 if (ret) {
3276                         err = ret;
3277                         goto fail;
3278                 }
3279         }
3280
3281         /* update the directory size in the log to reflect the names
3282          * we have removed
3283          */
3284         if (bytes_del) {
3285                 struct btrfs_key key;
3286
3287                 key.objectid = dir_ino;
3288                 key.offset = 0;
3289                 key.type = BTRFS_INODE_ITEM_KEY;
3290                 btrfs_release_path(path);
3291
3292                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3293                 if (ret < 0) {
3294                         err = ret;
3295                         goto fail;
3296                 }
3297                 if (ret == 0) {
3298                         struct btrfs_inode_item *item;
3299                         u64 i_size;
3300
3301                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3302                                               struct btrfs_inode_item);
3303                         i_size = btrfs_inode_size(path->nodes[0], item);
3304                         if (i_size > bytes_del)
3305                                 i_size -= bytes_del;
3306                         else
3307                                 i_size = 0;
3308                         btrfs_set_inode_size(path->nodes[0], item, i_size);
3309                         btrfs_mark_buffer_dirty(path->nodes[0]);
3310                 } else
3311                         ret = 0;
3312                 btrfs_release_path(path);
3313         }
3314 fail:
3315         btrfs_free_path(path);
3316 out_unlock:
3317         mutex_unlock(&dir->log_mutex);
3318         if (ret == -ENOSPC) {
3319                 btrfs_set_log_full_commit(root->fs_info, trans);
3320                 ret = 0;
3321         } else if (ret < 0)
3322                 btrfs_abort_transaction(trans, ret);
3323
3324         btrfs_end_log_trans(root);
3325
3326         return err;
3327 }
3328
3329 /* see comments for btrfs_del_dir_entries_in_log */
3330 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3331                                struct btrfs_root *root,
3332                                const char *name, int name_len,
3333                                struct btrfs_inode *inode, u64 dirid)
3334 {
3335         struct btrfs_fs_info *fs_info = root->fs_info;
3336         struct btrfs_root *log;
3337         u64 index;
3338         int ret;
3339
3340         if (inode->logged_trans < trans->transid)
3341                 return 0;
3342
3343         ret = join_running_log_trans(root);
3344         if (ret)
3345                 return 0;
3346         log = root->log_root;
3347         mutex_lock(&inode->log_mutex);
3348
3349         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3350                                   dirid, &index);
3351         mutex_unlock(&inode->log_mutex);
3352         if (ret == -ENOSPC) {
3353                 btrfs_set_log_full_commit(fs_info, trans);
3354                 ret = 0;
3355         } else if (ret < 0 && ret != -ENOENT)
3356                 btrfs_abort_transaction(trans, ret);
3357         btrfs_end_log_trans(root);
3358
3359         return ret;
3360 }
3361
3362 /*
3363  * creates a range item in the log for 'dirid'.  first_offset and
3364  * last_offset tell us which parts of the key space the log should
3365  * be considered authoritative for.
3366  */
3367 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3368                                        struct btrfs_root *log,
3369                                        struct btrfs_path *path,
3370                                        int key_type, u64 dirid,
3371                                        u64 first_offset, u64 last_offset)
3372 {
3373         int ret;
3374         struct btrfs_key key;
3375         struct btrfs_dir_log_item *item;
3376
3377         key.objectid = dirid;
3378         key.offset = first_offset;
3379         if (key_type == BTRFS_DIR_ITEM_KEY)
3380                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
3381         else
3382                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3383         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3384         if (ret)
3385                 return ret;
3386
3387         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3388                               struct btrfs_dir_log_item);
3389         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3390         btrfs_mark_buffer_dirty(path->nodes[0]);
3391         btrfs_release_path(path);
3392         return 0;
3393 }
3394
3395 /*
3396  * log all the items included in the current transaction for a given
3397  * directory.  This also creates the range items in the log tree required
3398  * to replay anything deleted before the fsync
3399  */
3400 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3401                           struct btrfs_root *root, struct btrfs_inode *inode,
3402                           struct btrfs_path *path,
3403                           struct btrfs_path *dst_path, int key_type,
3404                           struct btrfs_log_ctx *ctx,
3405                           u64 min_offset, u64 *last_offset_ret)
3406 {
3407         struct btrfs_key min_key;
3408         struct btrfs_root *log = root->log_root;
3409         struct extent_buffer *src;
3410         int err = 0;
3411         int ret;
3412         int i;
3413         int nritems;
3414         u64 first_offset = min_offset;
3415         u64 last_offset = (u64)-1;
3416         u64 ino = btrfs_ino(inode);
3417
3418         log = root->log_root;
3419
3420         min_key.objectid = ino;
3421         min_key.type = key_type;
3422         min_key.offset = min_offset;
3423
3424         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3425
3426         /*
3427          * we didn't find anything from this transaction, see if there
3428          * is anything at all
3429          */
3430         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3431                 min_key.objectid = ino;
3432                 min_key.type = key_type;
3433                 min_key.offset = (u64)-1;
3434                 btrfs_release_path(path);
3435                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3436                 if (ret < 0) {
3437                         btrfs_release_path(path);
3438                         return ret;
3439                 }
3440                 ret = btrfs_previous_item(root, path, ino, key_type);
3441
3442                 /* if ret == 0 there are items for this type,
3443                  * create a range to tell us the last key of this type.
3444                  * otherwise, there are no items in this directory after
3445                  * *min_offset, and we create a range to indicate that.
3446                  */
3447                 if (ret == 0) {
3448                         struct btrfs_key tmp;
3449                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3450                                               path->slots[0]);
3451                         if (key_type == tmp.type)
3452                                 first_offset = max(min_offset, tmp.offset) + 1;
3453                 }
3454                 goto done;
3455         }
3456
3457         /* go backward to find any previous key */
3458         ret = btrfs_previous_item(root, path, ino, key_type);
3459         if (ret == 0) {
3460                 struct btrfs_key tmp;
3461                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3462                 if (key_type == tmp.type) {
3463                         first_offset = tmp.offset;
3464                         ret = overwrite_item(trans, log, dst_path,
3465                                              path->nodes[0], path->slots[0],
3466                                              &tmp);
3467                         if (ret) {
3468                                 err = ret;
3469                                 goto done;
3470                         }
3471                 }
3472         }
3473         btrfs_release_path(path);
3474
3475         /* find the first key from this transaction again */
3476         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3477         if (WARN_ON(ret != 0))
3478                 goto done;
3479
3480         /*
3481          * we have a block from this transaction, log every item in it
3482          * from our directory
3483          */
3484         while (1) {
3485                 struct btrfs_key tmp;
3486                 src = path->nodes[0];
3487                 nritems = btrfs_header_nritems(src);
3488                 for (i = path->slots[0]; i < nritems; i++) {
3489                         struct btrfs_dir_item *di;
3490
3491                         btrfs_item_key_to_cpu(src, &min_key, i);
3492
3493                         if (min_key.objectid != ino || min_key.type != key_type)
3494                                 goto done;
3495                         ret = overwrite_item(trans, log, dst_path, src, i,
3496                                              &min_key);
3497                         if (ret) {
3498                                 err = ret;
3499                                 goto done;
3500                         }
3501
3502                         /*
3503                          * We must make sure that when we log a directory entry,
3504                          * the corresponding inode, after log replay, has a
3505                          * matching link count. For example:
3506                          *
3507                          * touch foo
3508                          * mkdir mydir
3509                          * sync
3510                          * ln foo mydir/bar
3511                          * xfs_io -c "fsync" mydir
3512                          * <crash>
3513                          * <mount fs and log replay>
3514                          *
3515                          * Would result in a fsync log that when replayed, our
3516                          * file inode would have a link count of 1, but we get
3517                          * two directory entries pointing to the same inode.
3518                          * After removing one of the names, it would not be
3519                          * possible to remove the other name, which resulted
3520                          * always in stale file handle errors, and would not
3521                          * be possible to rmdir the parent directory, since
3522                          * its i_size could never decrement to the value
3523                          * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3524                          */
3525                         di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3526                         btrfs_dir_item_key_to_cpu(src, di, &tmp);
3527                         if (ctx &&
3528                             (btrfs_dir_transid(src, di) == trans->transid ||
3529                              btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3530                             tmp.type != BTRFS_ROOT_ITEM_KEY)
3531                                 ctx->log_new_dentries = true;
3532                 }
3533                 path->slots[0] = nritems;
3534
3535                 /*
3536                  * look ahead to the next item and see if it is also
3537                  * from this directory and from this transaction
3538                  */
3539                 ret = btrfs_next_leaf(root, path);
3540                 if (ret) {
3541                         if (ret == 1)
3542                                 last_offset = (u64)-1;
3543                         else
3544                                 err = ret;
3545                         goto done;
3546                 }
3547                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3548                 if (tmp.objectid != ino || tmp.type != key_type) {
3549                         last_offset = (u64)-1;
3550                         goto done;
3551                 }
3552                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3553                         ret = overwrite_item(trans, log, dst_path,
3554                                              path->nodes[0], path->slots[0],
3555                                              &tmp);
3556                         if (ret)
3557                                 err = ret;
3558                         else
3559                                 last_offset = tmp.offset;
3560                         goto done;
3561                 }
3562         }
3563 done:
3564         btrfs_release_path(path);
3565         btrfs_release_path(dst_path);
3566
3567         if (err == 0) {
3568                 *last_offset_ret = last_offset;
3569                 /*
3570                  * insert the log range keys to indicate where the log
3571                  * is valid
3572                  */
3573                 ret = insert_dir_log_key(trans, log, path, key_type,
3574                                          ino, first_offset, last_offset);
3575                 if (ret)
3576                         err = ret;
3577         }
3578         return err;
3579 }
3580
3581 /*
3582  * logging directories is very similar to logging inodes, We find all the items
3583  * from the current transaction and write them to the log.
3584  *
3585  * The recovery code scans the directory in the subvolume, and if it finds a
3586  * key in the range logged that is not present in the log tree, then it means
3587  * that dir entry was unlinked during the transaction.
3588  *
3589  * In order for that scan to work, we must include one key smaller than
3590  * the smallest logged by this transaction and one key larger than the largest
3591  * key logged by this transaction.
3592  */
3593 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3594                           struct btrfs_root *root, struct btrfs_inode *inode,
3595                           struct btrfs_path *path,
3596                           struct btrfs_path *dst_path,
3597                           struct btrfs_log_ctx *ctx)
3598 {
3599         u64 min_key;
3600         u64 max_key;
3601         int ret;
3602         int key_type = BTRFS_DIR_ITEM_KEY;
3603
3604 again:
3605         min_key = 0;
3606         max_key = 0;
3607         while (1) {
3608                 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3609                                 ctx, min_key, &max_key);
3610                 if (ret)
3611                         return ret;
3612                 if (max_key == (u64)-1)
3613                         break;
3614                 min_key = max_key + 1;
3615         }
3616
3617         if (key_type == BTRFS_DIR_ITEM_KEY) {
3618                 key_type = BTRFS_DIR_INDEX_KEY;
3619                 goto again;
3620         }
3621         return 0;
3622 }
3623
3624 /*
3625  * a helper function to drop items from the log before we relog an
3626  * inode.  max_key_type indicates the highest item type to remove.
3627  * This cannot be run for file data extents because it does not
3628  * free the extents they point to.
3629  */
3630 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3631                                   struct btrfs_root *log,
3632                                   struct btrfs_path *path,
3633                                   u64 objectid, int max_key_type)
3634 {
3635         int ret;
3636         struct btrfs_key key;
3637         struct btrfs_key found_key;
3638         int start_slot;
3639
3640         key.objectid = objectid;
3641         key.type = max_key_type;
3642         key.offset = (u64)-1;
3643
3644         while (1) {
3645                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3646                 BUG_ON(ret == 0); /* Logic error */
3647                 if (ret < 0)
3648                         break;
3649
3650                 if (path->slots[0] == 0)
3651                         break;
3652
3653                 path->slots[0]--;
3654                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3655                                       path->slots[0]);
3656
3657                 if (found_key.objectid != objectid)
3658                         break;
3659
3660                 found_key.offset = 0;
3661                 found_key.type = 0;
3662                 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3663                                        &start_slot);
3664
3665                 ret = btrfs_del_items(trans, log, path, start_slot,
3666                                       path->slots[0] - start_slot + 1);
3667                 /*
3668                  * If start slot isn't 0 then we don't need to re-search, we've
3669                  * found the last guy with the objectid in this tree.
3670                  */
3671                 if (ret || start_slot != 0)
3672                         break;
3673                 btrfs_release_path(path);
3674         }
3675         btrfs_release_path(path);
3676         if (ret > 0)
3677                 ret = 0;
3678         return ret;
3679 }
3680
3681 static void fill_inode_item(struct btrfs_trans_handle *trans,
3682                             struct extent_buffer *leaf,
3683                             struct btrfs_inode_item *item,
3684                             struct inode *inode, int log_inode_only,
3685                             u64 logged_isize)
3686 {
3687         struct btrfs_map_token token;
3688
3689         btrfs_init_map_token(&token);
3690
3691         if (log_inode_only) {
3692                 /* set the generation to zero so the recover code
3693                  * can tell the difference between an logging
3694                  * just to say 'this inode exists' and a logging
3695                  * to say 'update this inode with these values'
3696                  */
3697                 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3698                 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3699         } else {
3700                 btrfs_set_token_inode_generation(leaf, item,
3701                                                  BTRFS_I(inode)->generation,
3702                                                  &token);
3703                 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3704         }
3705
3706         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3707         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3708         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3709         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3710
3711         btrfs_set_token_timespec_sec(leaf, &item->atime,
3712                                      inode->i_atime.tv_sec, &token);
3713         btrfs_set_token_timespec_nsec(leaf, &item->atime,
3714                                       inode->i_atime.tv_nsec, &token);
3715
3716         btrfs_set_token_timespec_sec(leaf, &item->mtime,
3717                                      inode->i_mtime.tv_sec, &token);
3718         btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3719                                       inode->i_mtime.tv_nsec, &token);
3720
3721         btrfs_set_token_timespec_sec(leaf, &item->ctime,
3722                                      inode->i_ctime.tv_sec, &token);
3723         btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3724                                       inode->i_ctime.tv_nsec, &token);
3725
3726         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3727                                      &token);
3728
3729         btrfs_set_token_inode_sequence(leaf, item,
3730                                        inode_peek_iversion(inode), &token);
3731         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3732         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3733         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3734         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3735 }
3736
3737 static int log_inode_item(struct btrfs_trans_handle *trans,
3738                           struct btrfs_root *log, struct btrfs_path *path,
3739                           struct btrfs_inode *inode)
3740 {
3741         struct btrfs_inode_item *inode_item;
3742         int ret;
3743
3744         ret = btrfs_insert_empty_item(trans, log, path,
3745                                       &inode->location, sizeof(*inode_item));
3746         if (ret && ret != -EEXIST)
3747                 return ret;
3748         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3749                                     struct btrfs_inode_item);
3750         fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3751                         0, 0);
3752         btrfs_release_path(path);
3753         return 0;
3754 }
3755
3756 static noinline int copy_items(struct btrfs_trans_handle *trans,
3757                                struct btrfs_inode *inode,
3758                                struct btrfs_path *dst_path,
3759                                struct btrfs_path *src_path, u64 *last_extent,
3760                                int start_slot, int nr, int inode_only,
3761                                u64 logged_isize)
3762 {
3763         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3764         unsigned long src_offset;
3765         unsigned long dst_offset;
3766         struct btrfs_root *log = inode->root->log_root;
3767         struct btrfs_file_extent_item *extent;
3768         struct btrfs_inode_item *inode_item;
3769         struct extent_buffer *src = src_path->nodes[0];
3770         struct btrfs_key first_key, last_key, key;
3771         int ret;
3772         struct btrfs_key *ins_keys;
3773         u32 *ins_sizes;
3774         char *ins_data;
3775         int i;
3776         struct list_head ordered_sums;
3777         int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
3778         bool has_extents = false;
3779         bool need_find_last_extent = true;
3780         bool done = false;
3781
3782         INIT_LIST_HEAD(&ordered_sums);
3783
3784         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3785                            nr * sizeof(u32), GFP_NOFS);
3786         if (!ins_data)
3787                 return -ENOMEM;
3788
3789         first_key.objectid = (u64)-1;
3790
3791         ins_sizes = (u32 *)ins_data;
3792         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3793
3794         for (i = 0; i < nr; i++) {
3795                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3796                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3797         }
3798         ret = btrfs_insert_empty_items(trans, log, dst_path,
3799                                        ins_keys, ins_sizes, nr);
3800         if (ret) {
3801                 kfree(ins_data);
3802                 return ret;
3803         }
3804
3805         for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3806                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3807                                                    dst_path->slots[0]);
3808
3809                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3810
3811                 if (i == nr - 1)
3812                         last_key = ins_keys[i];
3813
3814                 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3815                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
3816                                                     dst_path->slots[0],
3817                                                     struct btrfs_inode_item);
3818                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
3819                                         &inode->vfs_inode,
3820                                         inode_only == LOG_INODE_EXISTS,
3821                                         logged_isize);
3822                 } else {
3823                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3824                                            src_offset, ins_sizes[i]);
3825                 }
3826
3827                 /*
3828                  * We set need_find_last_extent here in case we know we were
3829                  * processing other items and then walk into the first extent in
3830                  * the inode.  If we don't hit an extent then nothing changes,
3831                  * we'll do the last search the next time around.
3832                  */
3833                 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3834                         has_extents = true;
3835                         if (first_key.objectid == (u64)-1)
3836                                 first_key = ins_keys[i];
3837                 } else {
3838                         need_find_last_extent = false;
3839                 }
3840
3841                 /* take a reference on file data extents so that truncates
3842                  * or deletes of this inode don't have to relog the inode
3843                  * again
3844                  */
3845                 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3846                     !skip_csum) {
3847                         int found_type;
3848                         extent = btrfs_item_ptr(src, start_slot + i,
3849                                                 struct btrfs_file_extent_item);
3850
3851                         if (btrfs_file_extent_generation(src, extent) < trans->transid)
3852                                 continue;
3853
3854                         found_type = btrfs_file_extent_type(src, extent);
3855                         if (found_type == BTRFS_FILE_EXTENT_REG) {
3856                                 u64 ds, dl, cs, cl;
3857                                 ds = btrfs_file_extent_disk_bytenr(src,
3858                                                                 extent);
3859                                 /* ds == 0 is a hole */
3860                                 if (ds == 0)
3861                                         continue;
3862
3863                                 dl = btrfs_file_extent_disk_num_bytes(src,
3864                                                                 extent);
3865                                 cs = btrfs_file_extent_offset(src, extent);
3866                                 cl = btrfs_file_extent_num_bytes(src,
3867                                                                 extent);
3868                                 if (btrfs_file_extent_compression(src,
3869                                                                   extent)) {
3870                                         cs = 0;
3871                                         cl = dl;
3872                                 }
3873
3874                                 ret = btrfs_lookup_csums_range(
3875                                                 fs_info->csum_root,
3876                                                 ds + cs, ds + cs + cl - 1,
3877                                                 &ordered_sums, 0);
3878                                 if (ret) {
3879                                         btrfs_release_path(dst_path);
3880                                         kfree(ins_data);
3881                                         return ret;
3882                                 }
3883                         }
3884                 }
3885         }
3886
3887         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3888         btrfs_release_path(dst_path);
3889         kfree(ins_data);
3890
3891         /*
3892          * we have to do this after the loop above to avoid changing the
3893          * log tree while trying to change the log tree.
3894          */
3895         ret = 0;
3896         while (!list_empty(&ordered_sums)) {
3897                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3898                                                    struct btrfs_ordered_sum,
3899                                                    list);
3900                 if (!ret)
3901                         ret = btrfs_csum_file_blocks(trans, log, sums);
3902                 list_del(&sums->list);
3903                 kfree(sums);
3904         }
3905
3906         if (!has_extents)
3907                 return ret;
3908
3909         if (need_find_last_extent && *last_extent == first_key.offset) {
3910                 /*
3911                  * We don't have any leafs between our current one and the one
3912                  * we processed before that can have file extent items for our
3913                  * inode (and have a generation number smaller than our current
3914                  * transaction id).
3915                  */
3916                 need_find_last_extent = false;
3917         }
3918
3919         /*
3920          * Because we use btrfs_search_forward we could skip leaves that were
3921          * not modified and then assume *last_extent is valid when it really
3922          * isn't.  So back up to the previous leaf and read the end of the last
3923          * extent before we go and fill in holes.
3924          */
3925         if (need_find_last_extent) {
3926                 u64 len;
3927
3928                 ret = btrfs_prev_leaf(inode->root, src_path);
3929                 if (ret < 0)
3930                         return ret;
3931                 if (ret)
3932                         goto fill_holes;
3933                 if (src_path->slots[0])
3934                         src_path->slots[0]--;
3935                 src = src_path->nodes[0];
3936                 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3937                 if (key.objectid != btrfs_ino(inode) ||
3938                     key.type != BTRFS_EXTENT_DATA_KEY)
3939                         goto fill_holes;
3940                 extent = btrfs_item_ptr(src, src_path->slots[0],
3941                                         struct btrfs_file_extent_item);
3942                 if (btrfs_file_extent_type(src, extent) ==
3943                     BTRFS_FILE_EXTENT_INLINE) {
3944                         len = btrfs_file_extent_inline_len(src,
3945                                                            src_path->slots[0],
3946                                                            extent);
3947                         *last_extent = ALIGN(key.offset + len,
3948                                              fs_info->sectorsize);
3949                 } else {
3950                         len = btrfs_file_extent_num_bytes(src, extent);
3951                         *last_extent = key.offset + len;
3952                 }
3953         }
3954 fill_holes:
3955         /* So we did prev_leaf, now we need to move to the next leaf, but a few
3956          * things could have happened
3957          *
3958          * 1) A merge could have happened, so we could currently be on a leaf
3959          * that holds what we were copying in the first place.
3960          * 2) A split could have happened, and now not all of the items we want
3961          * are on the same leaf.
3962          *
3963          * So we need to adjust how we search for holes, we need to drop the
3964          * path and re-search for the first extent key we found, and then walk
3965          * forward until we hit the last one we copied.
3966          */
3967         if (need_find_last_extent) {
3968                 /* btrfs_prev_leaf could return 1 without releasing the path */
3969                 btrfs_release_path(src_path);
3970                 ret = btrfs_search_slot(NULL, inode->root, &first_key,
3971                                 src_path, 0, 0);
3972                 if (ret < 0)
3973                         return ret;
3974                 ASSERT(ret == 0);
3975                 src = src_path->nodes[0];
3976                 i = src_path->slots[0];
3977         } else {
3978                 i = start_slot;
3979         }
3980
3981         /*
3982          * Ok so here we need to go through and fill in any holes we may have
3983          * to make sure that holes are punched for those areas in case they had
3984          * extents previously.
3985          */
3986         while (!done) {
3987                 u64 offset, len;
3988                 u64 extent_end;
3989
3990                 if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3991                         ret = btrfs_next_leaf(inode->root, src_path);
3992                         if (ret < 0)
3993                                 return ret;
3994                         ASSERT(ret == 0);
3995                         src = src_path->nodes[0];
3996                         i = 0;
3997                         need_find_last_extent = true;
3998                 }
3999
4000                 btrfs_item_key_to_cpu(src, &key, i);
4001                 if (!btrfs_comp_cpu_keys(&key, &last_key))
4002                         done = true;
4003                 if (key.objectid != btrfs_ino(inode) ||
4004                     key.type != BTRFS_EXTENT_DATA_KEY) {
4005                         i++;
4006                         continue;
4007                 }
4008                 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
4009                 if (btrfs_file_extent_type(src, extent) ==
4010                     BTRFS_FILE_EXTENT_INLINE) {
4011                         len = btrfs_file_extent_inline_len(src, i, extent);
4012                         extent_end = ALIGN(key.offset + len,
4013                                            fs_info->sectorsize);
4014                 } else {
4015                         len = btrfs_file_extent_num_bytes(src, extent);
4016                         extent_end = key.offset + len;
4017                 }
4018                 i++;
4019
4020                 if (*last_extent == key.offset) {
4021                         *last_extent = extent_end;
4022                         continue;
4023                 }
4024                 offset = *last_extent;
4025                 len = key.offset - *last_extent;
4026                 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
4027                                 offset, 0, 0, len, 0, len, 0, 0, 0);
4028                 if (ret)
4029                         break;
4030                 *last_extent = extent_end;
4031         }
4032
4033         /*
4034          * Check if there is a hole between the last extent found in our leaf
4035          * and the first extent in the next leaf. If there is one, we need to
4036          * log an explicit hole so that at replay time we can punch the hole.
4037          */
4038         if (ret == 0 &&
4039             key.objectid == btrfs_ino(inode) &&
4040             key.type == BTRFS_EXTENT_DATA_KEY &&
4041             i == btrfs_header_nritems(src_path->nodes[0])) {
4042                 ret = btrfs_next_leaf(inode->root, src_path);
4043                 need_find_last_extent = true;
4044                 if (ret > 0) {
4045                         ret = 0;
4046                 } else if (ret == 0) {
4047                         btrfs_item_key_to_cpu(src_path->nodes[0], &key,
4048                                               src_path->slots[0]);
4049                         if (key.objectid == btrfs_ino(inode) &&
4050                             key.type == BTRFS_EXTENT_DATA_KEY &&
4051                             *last_extent < key.offset) {
4052                                 const u64 len = key.offset - *last_extent;
4053
4054                                 ret = btrfs_insert_file_extent(trans, log,
4055                                                                btrfs_ino(inode),
4056                                                                *last_extent, 0,
4057                                                                0, len, 0, len,
4058                                                                0, 0, 0);
4059                         }
4060                 }
4061         }
4062         /*
4063          * Need to let the callers know we dropped the path so they should
4064          * re-search.
4065          */
4066         if (!ret && need_find_last_extent)
4067                 ret = 1;
4068         return ret;
4069 }
4070
4071 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
4072 {
4073         struct extent_map *em1, *em2;
4074
4075         em1 = list_entry(a, struct extent_map, list);
4076         em2 = list_entry(b, struct extent_map, list);
4077
4078         if (em1->start < em2->start)
4079                 return -1;
4080         else if (em1->start > em2->start)
4081                 return 1;
4082         return 0;
4083 }
4084
4085 static int wait_ordered_extents(struct btrfs_trans_handle *trans,
4086                                 struct inode *inode,
4087                                 struct btrfs_root *root,
4088                                 const struct extent_map *em,
4089                                 const struct list_head *logged_list,
4090                                 bool *ordered_io_error)
4091 {
4092         struct btrfs_fs_info *fs_info = root->fs_info;
4093         struct btrfs_ordered_extent *ordered;
4094         struct btrfs_root *log = root->log_root;
4095         u64 mod_start = em->mod_start;
4096         u64 mod_len = em->mod_len;
4097         const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
4098         u64 csum_offset;
4099         u64 csum_len;
4100         LIST_HEAD(ordered_sums);
4101         int ret = 0;
4102
4103         *ordered_io_error = false;
4104
4105         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4106             em->block_start == EXTENT_MAP_HOLE)
4107                 return 0;
4108
4109         /*
4110          * Wait far any ordered extent that covers our extent map. If it
4111          * finishes without an error, first check and see if our csums are on
4112          * our outstanding ordered extents.
4113          */
4114         list_for_each_entry(ordered, logged_list, log_list) {
4115                 struct btrfs_ordered_sum *sum;
4116
4117                 if (!mod_len)
4118                         break;
4119
4120                 if (ordered->file_offset + ordered->len <= mod_start ||
4121                     mod_start + mod_len <= ordered->file_offset)
4122                         continue;
4123
4124                 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
4125                     !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
4126                     !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
4127                         const u64 start = ordered->file_offset;
4128                         const u64 end = ordered->file_offset + ordered->len - 1;
4129
4130                         WARN_ON(ordered->inode != inode);
4131                         filemap_fdatawrite_range(inode->i_mapping, start, end);
4132                 }
4133
4134                 wait_event(ordered->wait,
4135                            (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
4136                             test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
4137
4138                 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
4139                         /*
4140                          * Clear the AS_EIO/AS_ENOSPC flags from the inode's
4141                          * i_mapping flags, so that the next fsync won't get
4142                          * an outdated io error too.
4143                          */
4144                         filemap_check_errors(inode->i_mapping);
4145                         *ordered_io_error = true;
4146                         break;
4147                 }
4148                 /*
4149                  * We are going to copy all the csums on this ordered extent, so
4150                  * go ahead and adjust mod_start and mod_len in case this
4151                  * ordered extent has already been logged.
4152                  */
4153                 if (ordered->file_offset > mod_start) {
4154                         if (ordered->file_offset + ordered->len >=
4155                             mod_start + mod_len)
4156                                 mod_len = ordered->file_offset - mod_start;
4157                         /*
4158                          * If we have this case
4159                          *
4160                          * |--------- logged extent ---------|
4161                          *       |----- ordered extent ----|
4162                          *
4163                          * Just don't mess with mod_start and mod_len, we'll
4164                          * just end up logging more csums than we need and it
4165                          * will be ok.
4166                          */
4167                 } else {
4168                         if (ordered->file_offset + ordered->len <
4169                             mod_start + mod_len) {
4170                                 mod_len = (mod_start + mod_len) -
4171                                         (ordered->file_offset + ordered->len);
4172                                 mod_start = ordered->file_offset +
4173                                         ordered->len;
4174                         } else {
4175                                 mod_len = 0;
4176                         }
4177                 }
4178
4179                 if (skip_csum)
4180                         continue;
4181
4182                 /*
4183                  * To keep us from looping for the above case of an ordered
4184                  * extent that falls inside of the logged extent.
4185                  */
4186                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
4187                                      &ordered->flags))
4188                         continue;
4189
4190                 list_for_each_entry(sum, &ordered->list, list) {
4191                         ret = btrfs_csum_file_blocks(trans, log, sum);
4192                         if (ret)
4193                                 break;
4194                 }
4195         }
4196
4197         if (*ordered_io_error || !mod_len || ret || skip_csum)
4198                 return ret;
4199
4200         if (em->compress_type) {
4201                 csum_offset = 0;
4202                 csum_len = max(em->block_len, em->orig_block_len);
4203         } else {
4204                 csum_offset = mod_start - em->start;
4205                 csum_len = mod_len;
4206         }
4207
4208         /* block start is already adjusted for the file extent offset. */
4209         ret = btrfs_lookup_csums_range(fs_info->csum_root,
4210                                        em->block_start + csum_offset,
4211                                        em->block_start + csum_offset +
4212                                        csum_len - 1, &ordered_sums, 0);
4213         if (ret)
4214                 return ret;
4215
4216         while (!list_empty(&ordered_sums)) {
4217                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4218                                                    struct btrfs_ordered_sum,
4219                                                    list);
4220                 if (!ret)
4221                         ret = btrfs_csum_file_blocks(trans, log, sums);
4222                 list_del(&sums->list);
4223                 kfree(sums);
4224         }
4225
4226         return ret;
4227 }
4228
4229 static int log_one_extent(struct btrfs_trans_handle *trans,
4230                           struct btrfs_inode *inode, struct btrfs_root *root,
4231                           const struct extent_map *em,
4232                           struct btrfs_path *path,
4233                           const struct list_head *logged_list,
4234                           struct btrfs_log_ctx *ctx)
4235 {
4236         struct btrfs_root *log = root->log_root;
4237         struct btrfs_file_extent_item *fi;
4238         struct extent_buffer *leaf;
4239         struct btrfs_map_token token;
4240         struct btrfs_key key;
4241         u64 extent_offset = em->start - em->orig_start;
4242         u64 block_len;
4243         int ret;
4244         int extent_inserted = 0;
4245         bool ordered_io_err = false;
4246
4247         ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em,
4248                         logged_list, &ordered_io_err);
4249         if (ret)
4250                 return ret;
4251
4252         if (ordered_io_err) {
4253                 ctx->io_err = -EIO;
4254                 return ctx->io_err;
4255         }
4256
4257         btrfs_init_map_token(&token);
4258
4259         ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
4260                                    em->start + em->len, NULL, 0, 1,
4261                                    sizeof(*fi), &extent_inserted);
4262         if (ret)
4263                 return ret;
4264
4265         if (!extent_inserted) {
4266                 key.objectid = btrfs_ino(inode);
4267                 key.type = BTRFS_EXTENT_DATA_KEY;
4268                 key.offset = em->start;
4269
4270                 ret = btrfs_insert_empty_item(trans, log, path, &key,
4271                                               sizeof(*fi));
4272                 if (ret)
4273                         return ret;
4274         }
4275         leaf = path->nodes[0];
4276         fi = btrfs_item_ptr(leaf, path->slots[0],
4277                             struct btrfs_file_extent_item);
4278
4279         btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
4280                                                &token);
4281         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4282                 btrfs_set_token_file_extent_type(leaf, fi,
4283                                                  BTRFS_FILE_EXTENT_PREALLOC,
4284                                                  &token);
4285         else
4286                 btrfs_set_token_file_extent_type(leaf, fi,
4287                                                  BTRFS_FILE_EXTENT_REG,
4288                                                  &token);
4289
4290         block_len = max(em->block_len, em->orig_block_len);
4291         if (em->compress_type != BTRFS_COMPRESS_NONE) {
4292                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4293                                                         em->block_start,
4294                                                         &token);
4295                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4296                                                            &token);
4297         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4298                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4299                                                         em->block_start -
4300                                                         extent_offset, &token);
4301                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4302                                                            &token);
4303         } else {
4304                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
4305                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4306                                                            &token);
4307         }
4308
4309         btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4310         btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4311         btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4312         btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4313                                                 &token);
4314         btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4315         btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4316         btrfs_mark_buffer_dirty(leaf);
4317
4318         btrfs_release_path(path);
4319
4320         return ret;
4321 }
4322
4323 /*
4324  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4325  * lose them after doing a fast fsync and replaying the log. We scan the
4326  * subvolume's root instead of iterating the inode's extent map tree because
4327  * otherwise we can log incorrect extent items based on extent map conversion.
4328  * That can happen due to the fact that extent maps are merged when they
4329  * are not in the extent map tree's list of modified extents.
4330  */
4331 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4332                                       struct btrfs_inode *inode,
4333                                       struct btrfs_path *path)
4334 {
4335         struct btrfs_root *root = inode->root;
4336         struct btrfs_key key;
4337         const u64 i_size = i_size_read(&inode->vfs_inode);
4338         const u64 ino = btrfs_ino(inode);
4339         struct btrfs_path *dst_path = NULL;
4340         u64 last_extent = (u64)-1;
4341         int ins_nr = 0;
4342         int start_slot;
4343         int ret;
4344
4345         if (!(inode->flags & BTRFS_INODE_PREALLOC))
4346                 return 0;
4347
4348         key.objectid = ino;
4349         key.type = BTRFS_EXTENT_DATA_KEY;
4350         key.offset = i_size;
4351         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4352         if (ret < 0)
4353                 goto out;
4354
4355         while (true) {
4356                 struct extent_buffer *leaf = path->nodes[0];
4357                 int slot = path->slots[0];
4358
4359                 if (slot >= btrfs_header_nritems(leaf)) {
4360                         if (ins_nr > 0) {
4361                                 ret = copy_items(trans, inode, dst_path, path,
4362                                                  &last_extent, start_slot,
4363                                                  ins_nr, 1, 0);
4364                                 if (ret < 0)
4365                                         goto out;
4366                                 ins_nr = 0;
4367                         }
4368                         ret = btrfs_next_leaf(root, path);
4369                         if (ret < 0)
4370                                 goto out;
4371                         if (ret > 0) {
4372                                 ret = 0;
4373                                 break;
4374                         }
4375                         continue;
4376                 }
4377
4378                 btrfs_item_key_to_cpu(leaf, &key, slot);
4379                 if (key.objectid > ino)
4380                         break;
4381                 if (WARN_ON_ONCE(key.objectid < ino) ||
4382                     key.type < BTRFS_EXTENT_DATA_KEY ||
4383                     key.offset < i_size) {
4384                         path->slots[0]++;
4385                         continue;
4386                 }
4387                 if (last_extent == (u64)-1) {
4388                         last_extent = key.offset;
4389                         /*
4390                          * Avoid logging extent items logged in past fsync calls
4391                          * and leading to duplicate keys in the log tree.
4392                          */
4393                         do {
4394                                 ret = btrfs_truncate_inode_items(trans,
4395                                                          root->log_root,
4396                                                          &inode->vfs_inode,
4397                                                          i_size,
4398                                                          BTRFS_EXTENT_DATA_KEY);
4399                         } while (ret == -EAGAIN);
4400                         if (ret)
4401                                 goto out;
4402                 }
4403                 if (ins_nr == 0)
4404                         start_slot = slot;
4405                 ins_nr++;
4406                 path->slots[0]++;
4407                 if (!dst_path) {
4408                         dst_path = btrfs_alloc_path();
4409                         if (!dst_path) {
4410                                 ret = -ENOMEM;
4411                                 goto out;
4412                         }
4413                 }
4414         }
4415         if (ins_nr > 0) {
4416                 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4417                                  start_slot, ins_nr, 1, 0);
4418                 if (ret > 0)
4419                         ret = 0;
4420         }
4421 out:
4422         btrfs_release_path(path);
4423         btrfs_free_path(dst_path);
4424         return ret;
4425 }
4426
4427 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4428                                      struct btrfs_root *root,
4429                                      struct btrfs_inode *inode,
4430                                      struct btrfs_path *path,
4431                                      struct list_head *logged_list,
4432                                      struct btrfs_log_ctx *ctx,
4433                                      const u64 start,
4434                                      const u64 end)
4435 {
4436         struct extent_map *em, *n;
4437         struct list_head extents;
4438         struct extent_map_tree *tree = &inode->extent_tree;
4439         u64 logged_start, logged_end;
4440         u64 test_gen;
4441         int ret = 0;
4442         int num = 0;
4443
4444         INIT_LIST_HEAD(&extents);
4445
4446         down_write(&inode->dio_sem);
4447         write_lock(&tree->lock);
4448         test_gen = root->fs_info->last_trans_committed;
4449         logged_start = start;
4450         logged_end = end;
4451
4452         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4453                 list_del_init(&em->list);
4454                 /*
4455                  * Just an arbitrary number, this can be really CPU intensive
4456                  * once we start getting a lot of extents, and really once we
4457                  * have a bunch of extents we just want to commit since it will
4458                  * be faster.
4459                  */
4460                 if (++num > 32768) {
4461                         list_del_init(&tree->modified_extents);
4462                         ret = -EFBIG;
4463                         goto process;
4464                 }
4465
4466                 if (em->generation <= test_gen)
4467                         continue;
4468
4469                 /* We log prealloc extents beyond eof later. */
4470                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4471                     em->start >= i_size_read(&inode->vfs_inode))
4472                         continue;
4473
4474                 if (em->start < logged_start)
4475                         logged_start = em->start;
4476                 if ((em->start + em->len - 1) > logged_end)
4477                         logged_end = em->start + em->len - 1;
4478
4479                 /* Need a ref to keep it from getting evicted from cache */
4480                 refcount_inc(&em->refs);
4481                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4482                 list_add_tail(&em->list, &extents);
4483                 num++;
4484         }
4485
4486         list_sort(NULL, &extents, extent_cmp);
4487         btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
4488         /*
4489          * Some ordered extents started by fsync might have completed
4490          * before we could collect them into the list logged_list, which
4491          * means they're gone, not in our logged_list nor in the inode's
4492          * ordered tree. We want the application/user space to know an
4493          * error happened while attempting to persist file data so that
4494          * it can take proper action. If such error happened, we leave
4495          * without writing to the log tree and the fsync must report the
4496          * file data write error and not commit the current transaction.
4497          */
4498         ret = filemap_check_errors(inode->vfs_inode.i_mapping);
4499         if (ret)
4500                 ctx->io_err = ret;
4501 process:
4502         while (!list_empty(&extents)) {
4503                 em = list_entry(extents.next, struct extent_map, list);
4504
4505                 list_del_init(&em->list);
4506
4507                 /*
4508                  * If we had an error we just need to delete everybody from our
4509                  * private list.
4510                  */
4511                 if (ret) {
4512                         clear_em_logging(tree, em);
4513                         free_extent_map(em);
4514                         continue;
4515                 }
4516
4517                 write_unlock(&tree->lock);
4518
4519                 ret = log_one_extent(trans, inode, root, em, path, logged_list,
4520                                      ctx);
4521                 write_lock(&tree->lock);
4522                 clear_em_logging(tree, em);
4523                 free_extent_map(em);
4524         }
4525         WARN_ON(!list_empty(&extents));
4526         write_unlock(&tree->lock);
4527         up_write(&inode->dio_sem);
4528
4529         btrfs_release_path(path);
4530         if (!ret)
4531                 ret = btrfs_log_prealloc_extents(trans, inode, path);
4532
4533         return ret;
4534 }
4535
4536 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4537                              struct btrfs_path *path, u64 *size_ret)
4538 {
4539         struct btrfs_key key;
4540         int ret;
4541
4542         key.objectid = btrfs_ino(inode);
4543         key.type = BTRFS_INODE_ITEM_KEY;
4544         key.offset = 0;
4545
4546         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4547         if (ret < 0) {
4548                 return ret;
4549         } else if (ret > 0) {
4550                 *size_ret = 0;
4551         } else {
4552                 struct btrfs_inode_item *item;
4553
4554                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4555                                       struct btrfs_inode_item);
4556                 *size_ret = btrfs_inode_size(path->nodes[0], item);
4557         }
4558
4559         btrfs_release_path(path);
4560         return 0;
4561 }
4562
4563 /*
4564  * At the moment we always log all xattrs. This is to figure out at log replay
4565  * time which xattrs must have their deletion replayed. If a xattr is missing
4566  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4567  * because if a xattr is deleted, the inode is fsynced and a power failure
4568  * happens, causing the log to be replayed the next time the fs is mounted,
4569  * we want the xattr to not exist anymore (same behaviour as other filesystems
4570  * with a journal, ext3/4, xfs, f2fs, etc).
4571  */
4572 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4573                                 struct btrfs_root *root,
4574                                 struct btrfs_inode *inode,
4575                                 struct btrfs_path *path,
4576                                 struct btrfs_path *dst_path)
4577 {
4578         int ret;
4579         struct btrfs_key key;
4580         const u64 ino = btrfs_ino(inode);
4581         int ins_nr = 0;
4582         int start_slot = 0;
4583
4584         key.objectid = ino;
4585         key.type = BTRFS_XATTR_ITEM_KEY;
4586         key.offset = 0;
4587
4588         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4589         if (ret < 0)
4590                 return ret;
4591
4592         while (true) {
4593                 int slot = path->slots[0];
4594                 struct extent_buffer *leaf = path->nodes[0];
4595                 int nritems = btrfs_header_nritems(leaf);
4596
4597                 if (slot >= nritems) {
4598                         if (ins_nr > 0) {
4599                                 u64 last_extent = 0;
4600
4601                                 ret = copy_items(trans, inode, dst_path, path,
4602                                                  &last_extent, start_slot,
4603                                                  ins_nr, 1, 0);
4604                                 /* can't be 1, extent items aren't processed */
4605                                 ASSERT(ret <= 0);
4606                                 if (ret < 0)
4607                                         return ret;
4608                                 ins_nr = 0;
4609                         }
4610                         ret = btrfs_next_leaf(root, path);
4611                         if (ret < 0)
4612                                 return ret;
4613                         else if (ret > 0)
4614                                 break;
4615                         continue;
4616                 }
4617
4618                 btrfs_item_key_to_cpu(leaf, &key, slot);
4619                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4620                         break;
4621
4622                 if (ins_nr == 0)
4623                         start_slot = slot;
4624                 ins_nr++;
4625                 path->slots[0]++;
4626                 cond_resched();
4627         }
4628         if (ins_nr > 0) {
4629                 u64 last_extent = 0;
4630
4631                 ret = copy_items(trans, inode, dst_path, path,
4632                                  &last_extent, start_slot,
4633                                  ins_nr, 1, 0);
4634                 /* can't be 1, extent items aren't processed */
4635                 ASSERT(ret <= 0);
4636                 if (ret < 0)
4637                         return ret;
4638         }
4639
4640         return 0;
4641 }
4642
4643 /*
4644  * If the no holes feature is enabled we need to make sure any hole between the
4645  * last extent and the i_size of our inode is explicitly marked in the log. This
4646  * is to make sure that doing something like:
4647  *
4648  *      1) create file with 128Kb of data
4649  *      2) truncate file to 64Kb
4650  *      3) truncate file to 256Kb
4651  *      4) fsync file
4652  *      5) <crash/power failure>
4653  *      6) mount fs and trigger log replay
4654  *
4655  * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4656  * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4657  * file correspond to a hole. The presence of explicit holes in a log tree is
4658  * what guarantees that log replay will remove/adjust file extent items in the
4659  * fs/subvol tree.
4660  *
4661  * Here we do not need to care about holes between extents, that is already done
4662  * by copy_items(). We also only need to do this in the full sync path, where we
4663  * lookup for extents from the fs/subvol tree only. In the fast path case, we
4664  * lookup the list of modified extent maps and if any represents a hole, we
4665  * insert a corresponding extent representing a hole in the log tree.
4666  */
4667 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4668                                    struct btrfs_root *root,
4669                                    struct btrfs_inode *inode,
4670                                    struct btrfs_path *path)
4671 {
4672         struct btrfs_fs_info *fs_info = root->fs_info;
4673         int ret;
4674         struct btrfs_key key;
4675         u64 hole_start;
4676         u64 hole_size;
4677         struct extent_buffer *leaf;
4678         struct btrfs_root *log = root->log_root;
4679         const u64 ino = btrfs_ino(inode);
4680         const u64 i_size = i_size_read(&inode->vfs_inode);
4681
4682         if (!btrfs_fs_incompat(fs_info, NO_HOLES))
4683                 return 0;
4684
4685         key.objectid = ino;
4686         key.type = BTRFS_EXTENT_DATA_KEY;
4687         key.offset = (u64)-1;
4688
4689         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4690         ASSERT(ret != 0);
4691         if (ret < 0)
4692                 return ret;
4693
4694         ASSERT(path->slots[0] > 0);
4695         path->slots[0]--;
4696         leaf = path->nodes[0];
4697         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4698
4699         if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4700                 /* inode does not have any extents */
4701                 hole_start = 0;
4702                 hole_size = i_size;
4703         } else {
4704                 struct btrfs_file_extent_item *extent;
4705                 u64 len;
4706
4707                 /*
4708                  * If there's an extent beyond i_size, an explicit hole was
4709                  * already inserted by copy_items().
4710                  */
4711                 if (key.offset >= i_size)
4712                         return 0;
4713
4714                 extent = btrfs_item_ptr(leaf, path->slots[0],
4715                                         struct btrfs_file_extent_item);
4716
4717                 if (btrfs_file_extent_type(leaf, extent) ==
4718                     BTRFS_FILE_EXTENT_INLINE) {
4719                         len = btrfs_file_extent_inline_len(leaf,
4720                                                            path->slots[0],
4721                                                            extent);
4722                         ASSERT(len == i_size ||
4723                                (len == fs_info->sectorsize &&
4724                                 btrfs_file_extent_compression(leaf, extent) !=
4725                                 BTRFS_COMPRESS_NONE));
4726                         return 0;
4727                 }
4728
4729                 len = btrfs_file_extent_num_bytes(leaf, extent);
4730                 /* Last extent goes beyond i_size, no need to log a hole. */
4731                 if (key.offset + len > i_size)
4732                         return 0;
4733                 hole_start = key.offset + len;
4734                 hole_size = i_size - hole_start;
4735         }
4736         btrfs_release_path(path);
4737
4738         /* Last extent ends at i_size. */
4739         if (hole_size == 0)
4740                 return 0;
4741
4742         hole_size = ALIGN(hole_size, fs_info->sectorsize);
4743         ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4744                                        hole_size, 0, hole_size, 0, 0, 0);
4745         return ret;
4746 }
4747
4748 /*
4749  * When we are logging a new inode X, check if it doesn't have a reference that
4750  * matches the reference from some other inode Y created in a past transaction
4751  * and that was renamed in the current transaction. If we don't do this, then at
4752  * log replay time we can lose inode Y (and all its files if it's a directory):
4753  *
4754  * mkdir /mnt/x
4755  * echo "hello world" > /mnt/x/foobar
4756  * sync
4757  * mv /mnt/x /mnt/y
4758  * mkdir /mnt/x                 # or touch /mnt/x
4759  * xfs_io -c fsync /mnt/x
4760  * <power fail>
4761  * mount fs, trigger log replay
4762  *
4763  * After the log replay procedure, we would lose the first directory and all its
4764  * files (file foobar).
4765  * For the case where inode Y is not a directory we simply end up losing it:
4766  *
4767  * echo "123" > /mnt/foo
4768  * sync
4769  * mv /mnt/foo /mnt/bar
4770  * echo "abc" > /mnt/foo
4771  * xfs_io -c fsync /mnt/foo
4772  * <power fail>
4773  *
4774  * We also need this for cases where a snapshot entry is replaced by some other
4775  * entry (file or directory) otherwise we end up with an unreplayable log due to
4776  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4777  * if it were a regular entry:
4778  *
4779  * mkdir /mnt/x
4780  * btrfs subvolume snapshot /mnt /mnt/x/snap
4781  * btrfs subvolume delete /mnt/x/snap
4782  * rmdir /mnt/x
4783  * mkdir /mnt/x
4784  * fsync /mnt/x or fsync some new file inside it
4785  * <power fail>
4786  *
4787  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4788  * the same transaction.
4789  */
4790 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4791                                          const int slot,
4792                                          const struct btrfs_key *key,
4793                                          struct btrfs_inode *inode,
4794                                          u64 *other_ino)
4795 {
4796         int ret;
4797         struct btrfs_path *search_path;
4798         char *name = NULL;
4799         u32 name_len = 0;
4800         u32 item_size = btrfs_item_size_nr(eb, slot);
4801         u32 cur_offset = 0;
4802         unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4803
4804         search_path = btrfs_alloc_path();
4805         if (!search_path)
4806                 return -ENOMEM;
4807         search_path->search_commit_root = 1;
4808         search_path->skip_locking = 1;
4809
4810         while (cur_offset < item_size) {
4811                 u64 parent;
4812                 u32 this_name_len;
4813                 u32 this_len;
4814                 unsigned long name_ptr;
4815                 struct btrfs_dir_item *di;
4816
4817                 if (key->type == BTRFS_INODE_REF_KEY) {
4818                         struct btrfs_inode_ref *iref;
4819
4820                         iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4821                         parent = key->offset;
4822                         this_name_len = btrfs_inode_ref_name_len(eb, iref);
4823                         name_ptr = (unsigned long)(iref + 1);
4824                         this_len = sizeof(*iref) + this_name_len;
4825                 } else {
4826                         struct btrfs_inode_extref *extref;
4827
4828                         extref = (struct btrfs_inode_extref *)(ptr +
4829                                                                cur_offset);
4830                         parent = btrfs_inode_extref_parent(eb, extref);
4831                         this_name_len = btrfs_inode_extref_name_len(eb, extref);
4832                         name_ptr = (unsigned long)&extref->name;
4833                         this_len = sizeof(*extref) + this_name_len;
4834                 }
4835
4836                 if (this_name_len > name_len) {
4837                         char *new_name;
4838
4839                         new_name = krealloc(name, this_name_len, GFP_NOFS);
4840                         if (!new_name) {
4841                                 ret = -ENOMEM;
4842                                 goto out;
4843                         }
4844                         name_len = this_name_len;
4845                         name = new_name;
4846                 }
4847
4848                 read_extent_buffer(eb, name, name_ptr, this_name_len);
4849                 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4850                                 parent, name, this_name_len, 0);
4851                 if (di && !IS_ERR(di)) {
4852                         struct btrfs_key di_key;
4853
4854                         btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4855                                                   di, &di_key);
4856                         if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4857                                 ret = 1;
4858                                 *other_ino = di_key.objectid;
4859                         } else {
4860                                 ret = -EAGAIN;
4861                         }
4862                         goto out;
4863                 } else if (IS_ERR(di)) {
4864                         ret = PTR_ERR(di);
4865                         goto out;
4866                 }
4867                 btrfs_release_path(search_path);
4868
4869                 cur_offset += this_len;
4870         }
4871         ret = 0;
4872 out:
4873         btrfs_free_path(search_path);
4874         kfree(name);
4875         return ret;
4876 }
4877
4878 /* log a single inode in the tree log.
4879  * At least one parent directory for this inode must exist in the tree
4880  * or be logged already.
4881  *
4882  * Any items from this inode changed by the current transaction are copied
4883  * to the log tree.  An extra reference is taken on any extents in this
4884  * file, allowing us to avoid a whole pile of corner cases around logging
4885  * blocks that have been removed from the tree.
4886  *
4887  * See LOG_INODE_ALL and related defines for a description of what inode_only
4888  * does.
4889  *
4890  * This handles both files and directories.
4891  */
4892 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4893                            struct btrfs_root *root, struct btrfs_inode *inode,
4894                            int inode_only,
4895                            const loff_t start,
4896                            const loff_t end,
4897                            struct btrfs_log_ctx *ctx)
4898 {
4899         struct btrfs_fs_info *fs_info = root->fs_info;
4900         struct btrfs_path *path;
4901         struct btrfs_path *dst_path;
4902         struct btrfs_key min_key;
4903         struct btrfs_key max_key;
4904         struct btrfs_root *log = root->log_root;
4905         LIST_HEAD(logged_list);
4906         u64 last_extent = 0;
4907         int err = 0;
4908         int ret;
4909         int nritems;
4910         int ins_start_slot = 0;
4911         int ins_nr;
4912         bool fast_search = false;
4913         u64 ino = btrfs_ino(inode);
4914         struct extent_map_tree *em_tree = &inode->extent_tree;
4915         u64 logged_isize = 0;
4916         bool need_log_inode_item = true;
4917         bool xattrs_logged = false;
4918
4919         path = btrfs_alloc_path();
4920         if (!path)
4921                 return -ENOMEM;
4922         dst_path = btrfs_alloc_path();
4923         if (!dst_path) {
4924                 btrfs_free_path(path);
4925                 return -ENOMEM;
4926         }
4927
4928         min_key.objectid = ino;
4929         min_key.type = BTRFS_INODE_ITEM_KEY;
4930         min_key.offset = 0;
4931
4932         max_key.objectid = ino;
4933
4934
4935         /* today the code can only do partial logging of directories */
4936         if (S_ISDIR(inode->vfs_inode.i_mode) ||
4937             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4938                        &inode->runtime_flags) &&
4939              inode_only >= LOG_INODE_EXISTS))
4940                 max_key.type = BTRFS_XATTR_ITEM_KEY;
4941         else
4942                 max_key.type = (u8)-1;
4943         max_key.offset = (u64)-1;
4944
4945         /*
4946          * Only run delayed items if we are a dir or a new file.
4947          * Otherwise commit the delayed inode only, which is needed in
4948          * order for the log replay code to mark inodes for link count
4949          * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
4950          */
4951         if (S_ISDIR(inode->vfs_inode.i_mode) ||
4952             inode->generation > fs_info->last_trans_committed)
4953                 ret = btrfs_commit_inode_delayed_items(trans, inode);
4954         else
4955                 ret = btrfs_commit_inode_delayed_inode(inode);
4956
4957         if (ret) {
4958                 btrfs_free_path(path);
4959                 btrfs_free_path(dst_path);
4960                 return ret;
4961         }
4962
4963         if (inode_only == LOG_OTHER_INODE) {
4964                 inode_only = LOG_INODE_EXISTS;
4965                 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
4966         } else {
4967                 mutex_lock(&inode->log_mutex);
4968         }
4969
4970         /*
4971          * a brute force approach to making sure we get the most uptodate
4972          * copies of everything.
4973          */
4974         if (S_ISDIR(inode->vfs_inode.i_mode)) {
4975                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4976
4977                 if (inode_only == LOG_INODE_EXISTS)
4978                         max_key_type = BTRFS_XATTR_ITEM_KEY;
4979                 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4980         } else {
4981                 if (inode_only == LOG_INODE_EXISTS) {
4982                         /*
4983                          * Make sure the new inode item we write to the log has
4984                          * the same isize as the current one (if it exists).
4985                          * This is necessary to prevent data loss after log
4986                          * replay, and also to prevent doing a wrong expanding
4987                          * truncate - for e.g. create file, write 4K into offset
4988                          * 0, fsync, write 4K into offset 4096, add hard link,
4989                          * fsync some other file (to sync log), power fail - if
4990                          * we use the inode's current i_size, after log replay
4991                          * we get a 8Kb file, with the last 4Kb extent as a hole
4992                          * (zeroes), as if an expanding truncate happened,
4993                          * instead of getting a file of 4Kb only.
4994                          */
4995                         err = logged_inode_size(log, inode, path, &logged_isize);
4996                         if (err)
4997                                 goto out_unlock;
4998                 }
4999                 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5000                              &inode->runtime_flags)) {
5001                         if (inode_only == LOG_INODE_EXISTS) {
5002                                 max_key.type = BTRFS_XATTR_ITEM_KEY;
5003                                 ret = drop_objectid_items(trans, log, path, ino,
5004                                                           max_key.type);
5005                         } else {
5006                                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5007                                           &inode->runtime_flags);
5008                                 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5009                                           &inode->runtime_flags);
5010                                 while(1) {
5011                                         ret = btrfs_truncate_inode_items(trans,
5012                                                 log, &inode->vfs_inode, 0, 0);
5013                                         if (ret != -EAGAIN)
5014                                                 break;
5015                                 }
5016                         }
5017                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5018                                               &inode->runtime_flags) ||
5019                            inode_only == LOG_INODE_EXISTS) {
5020                         if (inode_only == LOG_INODE_ALL)
5021                                 fast_search = true;
5022                         max_key.type = BTRFS_XATTR_ITEM_KEY;
5023                         ret = drop_objectid_items(trans, log, path, ino,
5024                                                   max_key.type);
5025                 } else {
5026                         if (inode_only == LOG_INODE_ALL)
5027                                 fast_search = true;
5028                         goto log_extents;
5029                 }
5030
5031         }
5032         if (ret) {
5033                 err = ret;
5034                 goto out_unlock;
5035         }
5036
5037         while (1) {
5038                 ins_nr = 0;
5039                 ret = btrfs_search_forward(root, &min_key,
5040                                            path, trans->transid);
5041                 if (ret < 0) {
5042                         err = ret;
5043                         goto out_unlock;
5044                 }
5045                 if (ret != 0)
5046                         break;
5047 again:
5048                 /* note, ins_nr might be > 0 here, cleanup outside the loop */
5049                 if (min_key.objectid != ino)
5050                         break;
5051                 if (min_key.type > max_key.type)
5052                         break;
5053
5054                 if (min_key.type == BTRFS_INODE_ITEM_KEY)
5055                         need_log_inode_item = false;
5056
5057                 if ((min_key.type == BTRFS_INODE_REF_KEY ||
5058                      min_key.type == BTRFS_INODE_EXTREF_KEY) &&
5059                     inode->generation == trans->transid) {
5060                         u64 other_ino = 0;
5061
5062                         ret = btrfs_check_ref_name_override(path->nodes[0],
5063                                         path->slots[0], &min_key, inode,
5064                                         &other_ino);
5065                         if (ret < 0) {
5066                                 err = ret;
5067                                 goto out_unlock;
5068                         } else if (ret > 0 && ctx &&
5069                                    other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5070                                 struct btrfs_key inode_key;
5071                                 struct inode *other_inode;
5072
5073                                 if (ins_nr > 0) {
5074                                         ins_nr++;
5075                                 } else {
5076                                         ins_nr = 1;
5077                                         ins_start_slot = path->slots[0];
5078                                 }
5079                                 ret = copy_items(trans, inode, dst_path, path,
5080                                                  &last_extent, ins_start_slot,
5081                                                  ins_nr, inode_only,
5082                                                  logged_isize);
5083                                 if (ret < 0) {
5084                                         err = ret;
5085                                         goto out_unlock;
5086                                 }
5087                                 ins_nr = 0;
5088                                 btrfs_release_path(path);
5089                                 inode_key.objectid = other_ino;
5090                                 inode_key.type = BTRFS_INODE_ITEM_KEY;
5091                                 inode_key.offset = 0;
5092                                 other_inode = btrfs_iget(fs_info->sb,
5093                                                          &inode_key, root,
5094                                                          NULL);
5095                                 /*
5096                                  * If the other inode that had a conflicting dir
5097                                  * entry was deleted in the current transaction,
5098                                  * we don't need to do more work nor fallback to
5099                                  * a transaction commit.
5100                                  */
5101                                 if (IS_ERR(other_inode) &&
5102                                     PTR_ERR(other_inode) == -ENOENT) {
5103                                         goto next_key;
5104                                 } else if (IS_ERR(other_inode)) {
5105                                         err = PTR_ERR(other_inode);
5106                                         goto out_unlock;
5107                                 }
5108                                 /*
5109                                  * We are safe logging the other inode without
5110                                  * acquiring its i_mutex as long as we log with
5111                                  * the LOG_INODE_EXISTS mode. We're safe against
5112                                  * concurrent renames of the other inode as well
5113                                  * because during a rename we pin the log and
5114                                  * update the log with the new name before we
5115                                  * unpin it.
5116                                  */
5117                                 err = btrfs_log_inode(trans, root,
5118                                                 BTRFS_I(other_inode),
5119                                                 LOG_OTHER_INODE, 0, LLONG_MAX,
5120                                                 ctx);
5121                                 iput(other_inode);
5122                                 if (err)
5123                                         goto out_unlock;
5124                                 else
5125                                         goto next_key;
5126                         }
5127                 }
5128
5129                 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5130                 if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
5131                         if (ins_nr == 0)
5132                                 goto next_slot;
5133                         ret = copy_items(trans, inode, dst_path, path,
5134                                          &last_extent, ins_start_slot,
5135                                          ins_nr, inode_only, logged_isize);
5136                         if (ret < 0) {
5137                                 err = ret;
5138                                 goto out_unlock;
5139                         }
5140                         ins_nr = 0;
5141                         if (ret) {
5142                                 btrfs_release_path(path);
5143                                 continue;
5144                         }
5145                         goto next_slot;
5146                 }
5147
5148                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5149                         ins_nr++;
5150                         goto next_slot;
5151                 } else if (!ins_nr) {
5152                         ins_start_slot = path->slots[0];
5153                         ins_nr = 1;
5154                         goto next_slot;
5155                 }
5156
5157                 ret = copy_items(trans, inode, dst_path, path, &last_extent,
5158                                  ins_start_slot, ins_nr, inode_only,
5159                                  logged_isize);
5160                 if (ret < 0) {
5161                         err = ret;
5162                         goto out_unlock;
5163                 }
5164                 if (ret) {
5165                         ins_nr = 0;
5166                         btrfs_release_path(path);
5167                         continue;
5168                 }
5169                 ins_nr = 1;
5170                 ins_start_slot = path->slots[0];
5171 next_slot:
5172
5173                 nritems = btrfs_header_nritems(path->nodes[0]);
5174                 path->slots[0]++;
5175                 if (path->slots[0] < nritems) {
5176                         btrfs_item_key_to_cpu(path->nodes[0], &min_key,
5177                                               path->slots[0]);
5178                         goto again;
5179                 }
5180                 if (ins_nr) {
5181                         ret = copy_items(trans, inode, dst_path, path,
5182                                          &last_extent, ins_start_slot,
5183                                          ins_nr, inode_only, logged_isize);
5184                         if (ret < 0) {
5185                                 err = ret;
5186                                 goto out_unlock;
5187                         }
5188                         ret = 0;
5189                         ins_nr = 0;
5190                 }
5191                 btrfs_release_path(path);
5192 next_key:
5193                 if (min_key.offset < (u64)-1) {
5194                         min_key.offset++;
5195                 } else if (min_key.type < max_key.type) {
5196                         min_key.type++;
5197                         min_key.offset = 0;
5198                 } else {
5199                         break;
5200                 }
5201         }
5202         if (ins_nr) {
5203                 ret = copy_items(trans, inode, dst_path, path, &last_extent,
5204                                  ins_start_slot, ins_nr, inode_only,
5205                                  logged_isize);
5206                 if (ret < 0) {
5207                         err = ret;
5208                         goto out_unlock;
5209                 }
5210                 ret = 0;
5211                 ins_nr = 0;
5212         }
5213
5214         btrfs_release_path(path);
5215         btrfs_release_path(dst_path);
5216         err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
5217         if (err)
5218                 goto out_unlock;
5219         xattrs_logged = true;
5220         if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5221                 btrfs_release_path(path);
5222                 btrfs_release_path(dst_path);
5223                 err = btrfs_log_trailing_hole(trans, root, inode, path);
5224                 if (err)
5225                         goto out_unlock;
5226         }
5227 log_extents:
5228         btrfs_release_path(path);
5229         btrfs_release_path(dst_path);
5230         if (need_log_inode_item) {
5231                 err = log_inode_item(trans, log, dst_path, inode);
5232                 if (!err && !xattrs_logged) {
5233                         err = btrfs_log_all_xattrs(trans, root, inode, path,
5234                                                    dst_path);
5235                         btrfs_release_path(path);
5236                 }
5237                 if (err)
5238                         goto out_unlock;
5239         }
5240         if (fast_search) {
5241                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5242                                                 &logged_list, ctx, start, end);
5243                 if (ret) {
5244                         err = ret;
5245                         goto out_unlock;
5246                 }
5247         } else if (inode_only == LOG_INODE_ALL) {
5248                 struct extent_map *em, *n;
5249
5250                 write_lock(&em_tree->lock);
5251                 /*
5252                  * We can't just remove every em if we're called for a ranged
5253                  * fsync - that is, one that doesn't cover the whole possible
5254                  * file range (0 to LLONG_MAX). This is because we can have
5255                  * em's that fall outside the range we're logging and therefore
5256                  * their ordered operations haven't completed yet
5257                  * (btrfs_finish_ordered_io() not invoked yet). This means we
5258                  * didn't get their respective file extent item in the fs/subvol
5259                  * tree yet, and need to let the next fast fsync (one which
5260                  * consults the list of modified extent maps) find the em so
5261                  * that it logs a matching file extent item and waits for the
5262                  * respective ordered operation to complete (if it's still
5263                  * running).
5264                  *
5265                  * Removing every em outside the range we're logging would make
5266                  * the next fast fsync not log their matching file extent items,
5267                  * therefore making us lose data after a log replay.
5268                  */
5269                 list_for_each_entry_safe(em, n, &em_tree->modified_extents,
5270                                          list) {
5271                         const u64 mod_end = em->mod_start + em->mod_len - 1;
5272
5273                         if (em->mod_start >= start && mod_end <= end)
5274                                 list_del_init(&em->list);
5275                 }
5276                 write_unlock(&em_tree->lock);
5277         }
5278
5279         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5280                 ret = log_directory_changes(trans, root, inode, path, dst_path,
5281                                         ctx);
5282                 if (ret) {
5283                         err = ret;
5284                         goto out_unlock;
5285                 }
5286         }
5287
5288         spin_lock(&inode->lock);
5289         inode->logged_trans = trans->transid;
5290         inode->last_log_commit = inode->last_sub_trans;
5291         spin_unlock(&inode->lock);
5292 out_unlock:
5293         if (unlikely(err))
5294                 btrfs_put_logged_extents(&logged_list);
5295         else
5296                 btrfs_submit_logged_extents(&logged_list, log);
5297         mutex_unlock(&inode->log_mutex);
5298
5299         btrfs_free_path(path);
5300         btrfs_free_path(dst_path);
5301         return err;
5302 }
5303
5304 /*
5305  * Check if we must fallback to a transaction commit when logging an inode.
5306  * This must be called after logging the inode and is used only in the context
5307  * when fsyncing an inode requires the need to log some other inode - in which
5308  * case we can't lock the i_mutex of each other inode we need to log as that
5309  * can lead to deadlocks with concurrent fsync against other inodes (as we can
5310  * log inodes up or down in the hierarchy) or rename operations for example. So
5311  * we take the log_mutex of the inode after we have logged it and then check for
5312  * its last_unlink_trans value - this is safe because any task setting
5313  * last_unlink_trans must take the log_mutex and it must do this before it does
5314  * the actual unlink operation, so if we do this check before a concurrent task
5315  * sets last_unlink_trans it means we've logged a consistent version/state of
5316  * all the inode items, otherwise we are not sure and must do a transaction
5317  * commit (the concurrent task might have only updated last_unlink_trans before
5318  * we logged the inode or it might have also done the unlink).
5319  */
5320 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5321                                           struct btrfs_inode *inode)
5322 {
5323         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5324         bool ret = false;
5325
5326         mutex_lock(&inode->log_mutex);
5327         if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5328                 /*
5329                  * Make sure any commits to the log are forced to be full
5330                  * commits.
5331                  */
5332                 btrfs_set_log_full_commit(fs_info, trans);
5333                 ret = true;
5334         }
5335         mutex_unlock(&inode->log_mutex);
5336
5337         return ret;
5338 }
5339
5340 /*
5341  * follow the dentry parent pointers up the chain and see if any
5342  * of the directories in it require a full commit before they can
5343  * be logged.  Returns zero if nothing special needs to be done or 1 if
5344  * a full commit is required.
5345  */
5346 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5347                                                struct btrfs_inode *inode,
5348                                                struct dentry *parent,
5349                                                struct super_block *sb,
5350                                                u64 last_committed)
5351 {
5352         int ret = 0;
5353         struct dentry *old_parent = NULL;
5354         struct btrfs_inode *orig_inode = inode;
5355
5356         /*
5357          * for regular files, if its inode is already on disk, we don't
5358          * have to worry about the parents at all.  This is because
5359          * we can use the last_unlink_trans field to record renames
5360          * and other fun in this file.
5361          */
5362         if (S_ISREG(inode->vfs_inode.i_mode) &&
5363             inode->generation <= last_committed &&
5364             inode->last_unlink_trans <= last_committed)
5365                 goto out;
5366
5367         if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5368                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5369                         goto out;
5370                 inode = BTRFS_I(d_inode(parent));
5371         }
5372
5373         while (1) {
5374                 /*
5375                  * If we are logging a directory then we start with our inode,
5376                  * not our parent's inode, so we need to skip setting the
5377                  * logged_trans so that further down in the log code we don't
5378                  * think this inode has already been logged.
5379                  */
5380                 if (inode != orig_inode)
5381                         inode->logged_trans = trans->transid;
5382                 smp_mb();
5383
5384                 if (btrfs_must_commit_transaction(trans, inode)) {
5385                         ret = 1;
5386                         break;
5387                 }
5388
5389                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5390                         break;
5391
5392                 if (IS_ROOT(parent)) {
5393                         inode = BTRFS_I(d_inode(parent));
5394                         if (btrfs_must_commit_transaction(trans, inode))
5395                                 ret = 1;
5396                         break;
5397                 }
5398
5399                 parent = dget_parent(parent);
5400                 dput(old_parent);
5401                 old_parent = parent;
5402                 inode = BTRFS_I(d_inode(parent));
5403
5404         }
5405         dput(old_parent);
5406 out:
5407         return ret;
5408 }
5409
5410 struct btrfs_dir_list {
5411         u64 ino;
5412         struct list_head list;
5413 };
5414
5415 /*
5416  * Log the inodes of the new dentries of a directory. See log_dir_items() for
5417  * details about the why it is needed.
5418  * This is a recursive operation - if an existing dentry corresponds to a
5419  * directory, that directory's new entries are logged too (same behaviour as
5420  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5421  * the dentries point to we do not lock their i_mutex, otherwise lockdep
5422  * complains about the following circular lock dependency / possible deadlock:
5423  *
5424  *        CPU0                                        CPU1
5425  *        ----                                        ----
5426  * lock(&type->i_mutex_dir_key#3/2);
5427  *                                            lock(sb_internal#2);
5428  *                                            lock(&type->i_mutex_dir_key#3/2);
5429  * lock(&sb->s_type->i_mutex_key#14);
5430  *
5431  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5432  * sb_start_intwrite() in btrfs_start_transaction().
5433  * Not locking i_mutex of the inodes is still safe because:
5434  *
5435  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5436  *    that while logging the inode new references (names) are added or removed
5437  *    from the inode, leaving the logged inode item with a link count that does
5438  *    not match the number of logged inode reference items. This is fine because
5439  *    at log replay time we compute the real number of links and correct the
5440  *    link count in the inode item (see replay_one_buffer() and
5441  *    link_to_fixup_dir());
5442  *
5443  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5444  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5445  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5446  *    has a size that doesn't match the sum of the lengths of all the logged
5447  *    names. This does not result in a problem because if a dir_item key is
5448  *    logged but its matching dir_index key is not logged, at log replay time we
5449  *    don't use it to replay the respective name (see replay_one_name()). On the
5450  *    other hand if only the dir_index key ends up being logged, the respective
5451  *    name is added to the fs/subvol tree with both the dir_item and dir_index
5452  *    keys created (see replay_one_name()).
5453  *    The directory's inode item with a wrong i_size is not a problem as well,
5454  *    since we don't use it at log replay time to set the i_size in the inode
5455  *    item of the fs/subvol tree (see overwrite_item()).
5456  */
5457 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5458                                 struct btrfs_root *root,
5459                                 struct btrfs_inode *start_inode,
5460                                 struct btrfs_log_ctx *ctx)
5461 {
5462         struct btrfs_fs_info *fs_info = root->fs_info;
5463         struct btrfs_root *log = root->log_root;
5464         struct btrfs_path *path;
5465         LIST_HEAD(dir_list);
5466         struct btrfs_dir_list *dir_elem;
5467         int ret = 0;
5468
5469         path = btrfs_alloc_path();
5470         if (!path)
5471                 return -ENOMEM;
5472
5473         dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5474         if (!dir_elem) {
5475                 btrfs_free_path(path);
5476                 return -ENOMEM;
5477         }
5478         dir_elem->ino = btrfs_ino(start_inode);
5479         list_add_tail(&dir_elem->list, &dir_list);
5480
5481         while (!list_empty(&dir_list)) {
5482                 struct extent_buffer *leaf;
5483                 struct btrfs_key min_key;
5484                 int nritems;
5485                 int i;
5486
5487                 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5488                                             list);
5489                 if (ret)
5490                         goto next_dir_inode;
5491
5492                 min_key.objectid = dir_elem->ino;
5493                 min_key.type = BTRFS_DIR_ITEM_KEY;
5494                 min_key.offset = 0;
5495 again:
5496                 btrfs_release_path(path);
5497                 ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5498                 if (ret < 0) {
5499                         goto next_dir_inode;
5500                 } else if (ret > 0) {
5501                         ret = 0;
5502                         goto next_dir_inode;
5503                 }
5504
5505 process_leaf:
5506                 leaf = path->nodes[0];
5507                 nritems = btrfs_header_nritems(leaf);
5508                 for (i = path->slots[0]; i < nritems; i++) {
5509                         struct btrfs_dir_item *di;
5510                         struct btrfs_key di_key;
5511                         struct inode *di_inode;
5512                         struct btrfs_dir_list *new_dir_elem;
5513                         int log_mode = LOG_INODE_EXISTS;
5514                         int type;
5515
5516                         btrfs_item_key_to_cpu(leaf, &min_key, i);
5517                         if (min_key.objectid != dir_elem->ino ||
5518                             min_key.type != BTRFS_DIR_ITEM_KEY)
5519                                 goto next_dir_inode;
5520
5521                         di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5522                         type = btrfs_dir_type(leaf, di);
5523                         if (btrfs_dir_transid(leaf, di) < trans->transid &&
5524                             type != BTRFS_FT_DIR)
5525                                 continue;
5526                         btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5527                         if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5528                                 continue;
5529
5530                         btrfs_release_path(path);
5531                         di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
5532                         if (IS_ERR(di_inode)) {
5533                                 ret = PTR_ERR(di_inode);
5534                                 goto next_dir_inode;
5535                         }
5536
5537                         if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
5538                                 iput(di_inode);
5539                                 break;
5540                         }
5541
5542                         ctx->log_new_dentries = false;
5543                         if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5544                                 log_mode = LOG_INODE_ALL;
5545                         ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5546                                               log_mode, 0, LLONG_MAX, ctx);
5547                         if (!ret &&
5548                             btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
5549                                 ret = 1;
5550                         iput(di_inode);
5551                         if (ret)
5552                                 goto next_dir_inode;
5553                         if (ctx->log_new_dentries) {
5554                                 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5555                                                        GFP_NOFS);
5556                                 if (!new_dir_elem) {
5557                                         ret = -ENOMEM;
5558                                         goto next_dir_inode;
5559                                 }
5560                                 new_dir_elem->ino = di_key.objectid;
5561                                 list_add_tail(&new_dir_elem->list, &dir_list);
5562                         }
5563                         break;
5564                 }
5565                 if (i == nritems) {
5566                         ret = btrfs_next_leaf(log, path);
5567                         if (ret < 0) {
5568                                 goto next_dir_inode;
5569                         } else if (ret > 0) {
5570                                 ret = 0;
5571                                 goto next_dir_inode;
5572                         }
5573                         goto process_leaf;
5574                 }
5575                 if (min_key.offset < (u64)-1) {
5576                         min_key.offset++;
5577                         goto again;
5578                 }
5579 next_dir_inode:
5580                 list_del(&dir_elem->list);
5581                 kfree(dir_elem);
5582         }
5583
5584         btrfs_free_path(path);
5585         return ret;
5586 }
5587
5588 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5589                                  struct btrfs_inode *inode,
5590                                  struct btrfs_log_ctx *ctx)
5591 {
5592         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5593         int ret;
5594         struct btrfs_path *path;
5595         struct btrfs_key key;
5596         struct btrfs_root *root = inode->root;
5597         const u64 ino = btrfs_ino(inode);
5598
5599         path = btrfs_alloc_path();
5600         if (!path)
5601                 return -ENOMEM;
5602         path->skip_locking = 1;
5603         path->search_commit_root = 1;
5604
5605         key.objectid = ino;
5606         key.type = BTRFS_INODE_REF_KEY;
5607         key.offset = 0;
5608         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5609         if (ret < 0)
5610                 goto out;
5611
5612         while (true) {
5613                 struct extent_buffer *leaf = path->nodes[0];
5614                 int slot = path->slots[0];
5615                 u32 cur_offset = 0;
5616                 u32 item_size;
5617                 unsigned long ptr;
5618
5619                 if (slot >= btrfs_header_nritems(leaf)) {
5620                         ret = btrfs_next_leaf(root, path);
5621                         if (ret < 0)
5622                                 goto out;
5623                         else if (ret > 0)
5624                                 break;
5625                         continue;
5626                 }
5627
5628                 btrfs_item_key_to_cpu(leaf, &key, slot);
5629                 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5630                 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5631                         break;
5632
5633                 item_size = btrfs_item_size_nr(leaf, slot);
5634                 ptr = btrfs_item_ptr_offset(leaf, slot);
5635                 while (cur_offset < item_size) {
5636                         struct btrfs_key inode_key;
5637                         struct inode *dir_inode;
5638
5639                         inode_key.type = BTRFS_INODE_ITEM_KEY;
5640                         inode_key.offset = 0;
5641
5642                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
5643                                 struct btrfs_inode_extref *extref;
5644
5645                                 extref = (struct btrfs_inode_extref *)
5646                                         (ptr + cur_offset);
5647                                 inode_key.objectid = btrfs_inode_extref_parent(
5648                                         leaf, extref);
5649                                 cur_offset += sizeof(*extref);
5650                                 cur_offset += btrfs_inode_extref_name_len(leaf,
5651                                         extref);
5652                         } else {
5653                                 inode_key.objectid = key.offset;
5654                                 cur_offset = item_size;
5655                         }
5656
5657                         dir_inode = btrfs_iget(fs_info->sb, &inode_key,
5658                                                root, NULL);
5659                         /* If parent inode was deleted, skip it. */
5660                         if (IS_ERR(dir_inode))
5661                                 continue;
5662
5663                         if (ctx)
5664                                 ctx->log_new_dentries = false;
5665                         ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5666                                               LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5667                         if (!ret &&
5668                             btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
5669                                 ret = 1;
5670                         if (!ret && ctx && ctx->log_new_dentries)
5671                                 ret = log_new_dir_dentries(trans, root,
5672                                                    BTRFS_I(dir_inode), ctx);
5673                         iput(dir_inode);
5674                         if (ret)
5675                                 goto out;
5676                 }
5677                 path->slots[0]++;
5678         }
5679         ret = 0;
5680 out:
5681         btrfs_free_path(path);
5682         return ret;
5683 }
5684
5685 /*
5686  * helper function around btrfs_log_inode to make sure newly created
5687  * parent directories also end up in the log.  A minimal inode and backref
5688  * only logging is done of any parent directories that are older than
5689  * the last committed transaction
5690  */
5691 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5692                                   struct btrfs_inode *inode,
5693                                   struct dentry *parent,
5694                                   const loff_t start,
5695                                   const loff_t end,
5696                                   int inode_only,
5697                                   struct btrfs_log_ctx *ctx)
5698 {
5699         struct btrfs_root *root = inode->root;
5700         struct btrfs_fs_info *fs_info = root->fs_info;
5701         struct super_block *sb;
5702         struct dentry *old_parent = NULL;
5703         int ret = 0;
5704         u64 last_committed = fs_info->last_trans_committed;
5705         bool log_dentries = false;
5706         struct btrfs_inode *orig_inode = inode;
5707
5708         sb = inode->vfs_inode.i_sb;
5709
5710         if (btrfs_test_opt(fs_info, NOTREELOG)) {
5711                 ret = 1;
5712                 goto end_no_trans;
5713         }
5714
5715         /*
5716          * The prev transaction commit doesn't complete, we need do
5717          * full commit by ourselves.
5718          */
5719         if (fs_info->last_trans_log_full_commit >
5720             fs_info->last_trans_committed) {
5721                 ret = 1;
5722                 goto end_no_trans;
5723         }
5724
5725         if (btrfs_root_refs(&root->root_item) == 0) {
5726                 ret = 1;
5727                 goto end_no_trans;
5728         }
5729
5730         ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
5731                         last_committed);
5732         if (ret)
5733                 goto end_no_trans;
5734
5735         if (btrfs_inode_in_log(inode, trans->transid)) {
5736                 ret = BTRFS_NO_LOG_SYNC;
5737                 goto end_no_trans;
5738         }
5739
5740         ret = start_log_trans(trans, root, ctx);
5741         if (ret)
5742                 goto end_no_trans;
5743
5744         ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
5745         if (ret)
5746                 goto end_trans;
5747
5748         /*
5749          * for regular files, if its inode is already on disk, we don't
5750          * have to worry about the parents at all.  This is because
5751          * we can use the last_unlink_trans field to record renames
5752          * and other fun in this file.
5753          */
5754         if (S_ISREG(inode->vfs_inode.i_mode) &&
5755             inode->generation <= last_committed &&
5756             inode->last_unlink_trans <= last_committed) {
5757                 ret = 0;
5758                 goto end_trans;
5759         }
5760
5761         if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
5762                 log_dentries = true;
5763
5764         /*
5765          * On unlink we must make sure all our current and old parent directory
5766          * inodes are fully logged. This is to prevent leaving dangling
5767          * directory index entries in directories that were our parents but are
5768          * not anymore. Not doing this results in old parent directory being
5769          * impossible to delete after log replay (rmdir will always fail with
5770          * error -ENOTEMPTY).
5771          *
5772          * Example 1:
5773          *
5774          * mkdir testdir
5775          * touch testdir/foo
5776          * ln testdir/foo testdir/bar
5777          * sync
5778          * unlink testdir/bar
5779          * xfs_io -c fsync testdir/foo
5780          * <power failure>
5781          * mount fs, triggers log replay
5782          *
5783          * If we don't log the parent directory (testdir), after log replay the
5784          * directory still has an entry pointing to the file inode using the bar
5785          * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
5786          * the file inode has a link count of 1.
5787          *
5788          * Example 2:
5789          *
5790          * mkdir testdir
5791          * touch foo
5792          * ln foo testdir/foo2
5793          * ln foo testdir/foo3
5794          * sync
5795          * unlink testdir/foo3
5796          * xfs_io -c fsync foo
5797          * <power failure>
5798          * mount fs, triggers log replay
5799          *
5800          * Similar as the first example, after log replay the parent directory
5801          * testdir still has an entry pointing to the inode file with name foo3
5802          * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
5803          * and has a link count of 2.
5804          */
5805         if (inode->last_unlink_trans > last_committed) {
5806                 ret = btrfs_log_all_parents(trans, orig_inode, ctx);
5807                 if (ret)
5808                         goto end_trans;
5809         }
5810
5811         while (1) {
5812                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5813                         break;
5814
5815                 inode = BTRFS_I(d_inode(parent));
5816                 if (root != inode->root)
5817                         break;
5818
5819                 if (inode->generation > last_committed) {
5820                         ret = btrfs_log_inode(trans, root, inode,
5821                                         LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5822                         if (ret)
5823                                 goto end_trans;
5824                 }
5825                 if (IS_ROOT(parent))
5826                         break;
5827
5828                 parent = dget_parent(parent);
5829                 dput(old_parent);
5830                 old_parent = parent;
5831         }
5832         if (log_dentries)
5833                 ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
5834         else
5835                 ret = 0;
5836 end_trans:
5837         dput(old_parent);
5838         if (ret < 0) {
5839                 btrfs_set_log_full_commit(fs_info, trans);
5840                 ret = 1;
5841         }
5842
5843         if (ret)
5844                 btrfs_remove_log_ctx(root, ctx);
5845         btrfs_end_log_trans(root);
5846 end_no_trans:
5847         return ret;
5848 }
5849
5850 /*
5851  * it is not safe to log dentry if the chunk root has added new
5852  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5853  * If this returns 1, you must commit the transaction to safely get your
5854  * data on disk.
5855  */
5856 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
5857                           struct dentry *dentry,
5858                           const loff_t start,
5859                           const loff_t end,
5860                           struct btrfs_log_ctx *ctx)
5861 {
5862         struct dentry *parent = dget_parent(dentry);
5863         int ret;
5864
5865         ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
5866                                      start, end, LOG_INODE_ALL, ctx);
5867         dput(parent);
5868
5869         return ret;
5870 }
5871
5872 /*
5873  * should be called during mount to recover any replay any log trees
5874  * from the FS
5875  */
5876 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5877 {
5878         int ret;
5879         struct btrfs_path *path;
5880         struct btrfs_trans_handle *trans;
5881         struct btrfs_key key;
5882         struct btrfs_key found_key;
5883         struct btrfs_key tmp_key;
5884         struct btrfs_root *log;
5885         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5886         struct walk_control wc = {
5887                 .process_func = process_one_buffer,
5888                 .stage = 0,
5889         };
5890
5891         path = btrfs_alloc_path();
5892         if (!path)
5893                 return -ENOMEM;
5894
5895         set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5896
5897         trans = btrfs_start_transaction(fs_info->tree_root, 0);
5898         if (IS_ERR(trans)) {
5899                 ret = PTR_ERR(trans);
5900                 goto error;
5901         }
5902
5903         wc.trans = trans;
5904         wc.pin = 1;
5905
5906         ret = walk_log_tree(trans, log_root_tree, &wc);
5907         if (ret) {
5908                 btrfs_handle_fs_error(fs_info, ret,
5909                         "Failed to pin buffers while recovering log root tree.");
5910                 goto error;
5911         }
5912
5913 again:
5914         key.objectid = BTRFS_TREE_LOG_OBJECTID;
5915         key.offset = (u64)-1;
5916         key.type = BTRFS_ROOT_ITEM_KEY;
5917
5918         while (1) {
5919                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
5920
5921                 if (ret < 0) {
5922                         btrfs_handle_fs_error(fs_info, ret,
5923                                     "Couldn't find tree log root.");
5924                         goto error;
5925                 }
5926                 if (ret > 0) {
5927                         if (path->slots[0] == 0)
5928                                 break;
5929                         path->slots[0]--;
5930                 }
5931                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5932                                       path->slots[0]);
5933                 btrfs_release_path(path);
5934                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5935                         break;
5936
5937                 log = btrfs_read_fs_root(log_root_tree, &found_key);
5938                 if (IS_ERR(log)) {
5939                         ret = PTR_ERR(log);
5940                         btrfs_handle_fs_error(fs_info, ret,
5941                                     "Couldn't read tree log root.");
5942                         goto error;
5943                 }
5944
5945                 tmp_key.objectid = found_key.offset;
5946                 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5947                 tmp_key.offset = (u64)-1;
5948
5949                 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
5950                 if (IS_ERR(wc.replay_dest)) {
5951                         ret = PTR_ERR(wc.replay_dest);
5952                         free_extent_buffer(log->node);
5953                         free_extent_buffer(log->commit_root);
5954                         kfree(log);
5955                         btrfs_handle_fs_error(fs_info, ret,
5956                                 "Couldn't read target root for tree log recovery.");
5957                         goto error;
5958                 }
5959
5960                 wc.replay_dest->log_root = log;
5961                 btrfs_record_root_in_trans(trans, wc.replay_dest);
5962                 ret = walk_log_tree(trans, log, &wc);
5963
5964                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5965                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
5966                                                       path);
5967                 }
5968
5969                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5970                         struct btrfs_root *root = wc.replay_dest;
5971
5972                         btrfs_release_path(path);
5973
5974                         /*
5975                          * We have just replayed everything, and the highest
5976                          * objectid of fs roots probably has changed in case
5977                          * some inode_item's got replayed.
5978                          *
5979                          * root->objectid_mutex is not acquired as log replay
5980                          * could only happen during mount.
5981                          */
5982                         ret = btrfs_find_highest_objectid(root,
5983                                                   &root->highest_objectid);
5984                 }
5985
5986                 key.offset = found_key.offset - 1;
5987                 wc.replay_dest->log_root = NULL;
5988                 free_extent_buffer(log->node);
5989                 free_extent_buffer(log->commit_root);
5990                 kfree(log);
5991
5992                 if (ret)
5993                         goto error;
5994
5995                 if (found_key.offset == 0)
5996                         break;
5997         }
5998         btrfs_release_path(path);
5999
6000         /* step one is to pin it all, step two is to replay just inodes */
6001         if (wc.pin) {
6002                 wc.pin = 0;
6003                 wc.process_func = replay_one_buffer;
6004                 wc.stage = LOG_WALK_REPLAY_INODES;
6005                 goto again;
6006         }
6007         /* step three is to replay everything */
6008         if (wc.stage < LOG_WALK_REPLAY_ALL) {
6009                 wc.stage++;
6010                 goto again;
6011         }
6012
6013         btrfs_free_path(path);
6014
6015         /* step 4: commit the transaction, which also unpins the blocks */
6016         ret = btrfs_commit_transaction(trans);
6017         if (ret)
6018                 return ret;
6019
6020         free_extent_buffer(log_root_tree->node);
6021         log_root_tree->log_root = NULL;
6022         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6023         kfree(log_root_tree);
6024
6025         return 0;
6026 error:
6027         if (wc.trans)
6028                 btrfs_end_transaction(wc.trans);
6029         btrfs_free_path(path);
6030         return ret;
6031 }
6032
6033 /*
6034  * there are some corner cases where we want to force a full
6035  * commit instead of allowing a directory to be logged.
6036  *
6037  * They revolve around files there were unlinked from the directory, and
6038  * this function updates the parent directory so that a full commit is
6039  * properly done if it is fsync'd later after the unlinks are done.
6040  *
6041  * Must be called before the unlink operations (updates to the subvolume tree,
6042  * inodes, etc) are done.
6043  */
6044 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6045                              struct btrfs_inode *dir, struct btrfs_inode *inode,
6046                              int for_rename)
6047 {
6048         /*
6049          * when we're logging a file, if it hasn't been renamed
6050          * or unlinked, and its inode is fully committed on disk,
6051          * we don't have to worry about walking up the directory chain
6052          * to log its parents.
6053          *
6054          * So, we use the last_unlink_trans field to put this transid
6055          * into the file.  When the file is logged we check it and
6056          * don't log the parents if the file is fully on disk.
6057          */
6058         mutex_lock(&inode->log_mutex);
6059         inode->last_unlink_trans = trans->transid;
6060         mutex_unlock(&inode->log_mutex);
6061
6062         /*
6063          * if this directory was already logged any new
6064          * names for this file/dir will get recorded
6065          */
6066         smp_mb();
6067         if (dir->logged_trans == trans->transid)
6068                 return;
6069
6070         /*
6071          * if the inode we're about to unlink was logged,
6072          * the log will be properly updated for any new names
6073          */
6074         if (inode->logged_trans == trans->transid)
6075                 return;
6076
6077         /*
6078          * when renaming files across directories, if the directory
6079          * there we're unlinking from gets fsync'd later on, there's
6080          * no way to find the destination directory later and fsync it
6081          * properly.  So, we have to be conservative and force commits
6082          * so the new name gets discovered.
6083          */
6084         if (for_rename)
6085                 goto record;
6086
6087         /* we can safely do the unlink without any special recording */
6088         return;
6089
6090 record:
6091         mutex_lock(&dir->log_mutex);
6092         dir->last_unlink_trans = trans->transid;
6093         mutex_unlock(&dir->log_mutex);
6094 }
6095
6096 /*
6097  * Make sure that if someone attempts to fsync the parent directory of a deleted
6098  * snapshot, it ends up triggering a transaction commit. This is to guarantee
6099  * that after replaying the log tree of the parent directory's root we will not
6100  * see the snapshot anymore and at log replay time we will not see any log tree
6101  * corresponding to the deleted snapshot's root, which could lead to replaying
6102  * it after replaying the log tree of the parent directory (which would replay
6103  * the snapshot delete operation).
6104  *
6105  * Must be called before the actual snapshot destroy operation (updates to the
6106  * parent root and tree of tree roots trees, etc) are done.
6107  */
6108 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6109                                    struct btrfs_inode *dir)
6110 {
6111         mutex_lock(&dir->log_mutex);
6112         dir->last_unlink_trans = trans->transid;
6113         mutex_unlock(&dir->log_mutex);
6114 }
6115
6116 /*
6117  * Call this after adding a new name for a file and it will properly
6118  * update the log to reflect the new name.
6119  *
6120  * It will return zero if all goes well, and it will return 1 if a
6121  * full transaction commit is required.
6122  */
6123 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
6124                         struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6125                         struct dentry *parent)
6126 {
6127         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6128
6129         /*
6130          * this will force the logging code to walk the dentry chain
6131          * up for the file
6132          */
6133         if (!S_ISDIR(inode->vfs_inode.i_mode))
6134                 inode->last_unlink_trans = trans->transid;
6135
6136         /*
6137          * if this inode hasn't been logged and directory we're renaming it
6138          * from hasn't been logged, we don't need to log it
6139          */
6140         if (inode->logged_trans <= fs_info->last_trans_committed &&
6141             (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
6142                 return 0;
6143
6144         return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
6145                                       LOG_INODE_EXISTS, NULL);
6146 }
6147