fs/btrfs/tree-log.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2008 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/slab.h>
   8 #include <linux/blkdev.h>
   9 #include <linux/list_sort.h>
  10 #include <linux/iversion.h>
  11 #include "misc.h"
  12 #include "ctree.h"
  13 #include "tree-log.h"
  14 #include "disk-io.h"
  15 #include "locking.h"
  16 #include "print-tree.h"
  17 #include "backref.h"
  18 #include "compression.h"
  19 #include "qgroup.h"
  20 #include "block-group.h"
  21 #include "space-info.h"
  22 #include "zoned.h"
  23 #include "inode-item.h"
  24 #include "fs.h"
  25 #include "accessors.h"
  26
/*
 * Limit related to "conflict inodes"; the code that uses it is not in this
 * part of the file.
 * NOTE(review): presumably caps how many conflicting inodes are handled
 * before falling back to a heavier path - confirm at the use site.
 */
#define MAX_CONFLICT_INODES 10
  28
/*
 * Magic values for the inode_only argument of btrfs_log_inode().
 */
enum {
        /* Log everything. */
        LOG_INODE_ALL,
        /* Log just enough to recreate the inode during log replay. */
        LOG_INODE_EXISTS,
};
  39
  40 /*
  41  * directory trouble cases
  42  *
  43  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  44  * log, we must force a full commit before doing an fsync of the directory
  45  * where the unlink was done.
  46  * ---> record transid of last unlink/rename per directory
  47  *
  48  * mkdir foo/some_dir
  49  * normal commit
  50  * rename foo/some_dir foo2/some_dir
  51  * mkdir foo/some_dir
  52  * fsync foo/some_dir/some_file
  53  *
  54  * The fsync above will unlink the original some_dir without recording
  55  * it in its new location (foo2).  After a crash, some_dir will be gone
  56  * unless the fsync of some_file forces a full commit
  57  *
  58  * 2) we must log any new names for any file or dir that is in the fsync
  59  * log. ---> check inode while renaming/linking.
  60  *
  61  * 2a) we must log any new names for any file or dir during rename
  62  * when the directory they are being removed from was logged.
  63  * ---> check inode and old parent dir during rename
  64  *
  65  *  2a is actually the more important variant.  Without the extra logging
  66  *  a crash might unlink the old name without recreating the new one
  67  *
  68  * 3) after a crash, we must go through any directories with a link count
  69  * of zero and redo the rm -rf
  70  *
  71  * mkdir f1/foo
  72  * normal commit
  73  * rm -rf f1/foo
  74  * fsync(f1)
  75  *
  76  * The directory f1/foo was fully removed from the FS, but fsync was never
  77  * called on f1/foo, only its parent dir.  After a crash the rm -rf must
  78  * be replayed.  This must be able to recurse down the entire
  79  * directory tree.  The inode link count fixup code takes care of the
  80  * ugly details.
  81  */
  82
/*
 * Stages for the tree walking.  The first stage has value 0, the second
 * value 1, and so on in declaration order.
 */
enum {
        /* Only pin down the blocks we find. */
        LOG_WALK_PIN_ONLY,
        /*
         * Make sure that all the inodes we find in the log are created in
         * the subvolume.
         */
        LOG_WALK_REPLAY_INODES,
        /*
         * NOTE(review): this stage was not described by the original
         * comment; going by the name it replays directory index items -
         * confirm against the replay code.
         */
        LOG_WALK_REPLAY_DIR_INDEX,
        /*
         * The last stage: deal with directories and links and extents and
         * all the other fun semantics.
         */
        LOG_WALK_REPLAY_ALL,
};
  98
/*
 * Prototypes for static functions that are defined further down, so that
 * code placed above their definitions can call them.
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode,
                           int inode_only,
                           struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct btrfs_root *log,
                                       struct btrfs_path *path,
                                       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
 112
 113 /*
 114  * tree logging is a special write ahead log used to make sure that
 115  * fsyncs and O_SYNCs can happen without doing full tree commits.
 116  *
 117  * Full tree commits are expensive because they require commonly
 118  * modified blocks to be recowed, creating many dirty pages in the
 119  * extent tree and a 4x-6x higher write load than ext3.
 120  *
 121  * Instead of doing a tree commit on every fsync, we use the
 122  * key ranges and transaction ids to find items for a given file or directory
 123  * that have changed in this transaction.  Those items are copied into
 124  * a special tree (one per subvolume root), that tree is written to disk
 125  * and then the fsync is considered complete.
 126  *
 127  * After a crash, items are copied out of the log-tree back into the
 128  * subvolume tree.  Any file data extents found are recorded in the extent
 129  * allocation tree, and the log-tree freed.
 130  *
 131  * The log tree is read three times: once to pin down all the extents it is
 132  * using in ram, once to create all the inodes logged in the tree,
 133  * and once to do all the other items.
 134  */
 135
 136 /*
 137  * start a sub transaction and setup the log tree
 138  * this increments the log tree writer count to make the people
 139  * syncing the tree wait for us to finish
 140  */
 141 static int start_log_trans(struct btrfs_trans_handle *trans,
 142                            struct btrfs_root *root,
 143                            struct btrfs_log_ctx *ctx)
 144 {
 145         struct btrfs_fs_info *fs_info = root->fs_info;
 146         struct btrfs_root *tree_root = fs_info->tree_root;
 147         const bool zoned = btrfs_is_zoned(fs_info);
 148         int ret = 0;
 149         bool created = false;
 150
 151         /*
 152          * First check if the log root tree was already created. If not, create
 153          * it before locking the root's log_mutex, just to keep lockdep happy.
 154          */
 155         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
 156                 mutex_lock(&tree_root->log_mutex);
 157                 if (!fs_info->log_root_tree) {
 158                         ret = btrfs_init_log_root_tree(trans, fs_info);
 159                         if (!ret) {
 160                                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
 161                                 created = true;
 162                         }
 163                 }
 164                 mutex_unlock(&tree_root->log_mutex);
 165                 if (ret)
 166                         return ret;
 167         }
 168
 169         mutex_lock(&root->log_mutex);
 170
 171 again:
 172         if (root->log_root) {
 173                 int index = (root->log_transid + 1) % 2;
 174
 175                 if (btrfs_need_log_full_commit(trans)) {
 176                         ret = BTRFS_LOG_FORCE_COMMIT;
 177                         goto out;
 178                 }
 179
 180                 if (zoned && atomic_read(&root->log_commit[index])) {
 181                         wait_log_commit(root, root->log_transid - 1);
 182                         goto again;
 183                 }
 184
 185                 if (!root->log_start_pid) {
 186                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 187                         root->log_start_pid = current->pid;
 188                 } else if (root->log_start_pid != current->pid) {
 189                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 190                 }
 191         } else {
 192                 /*
 193                  * This means fs_info->log_root_tree was already created
 194                  * for some other FS trees. Do the full commit not to mix
 195                  * nodes from multiple log transactions to do sequential
 196                  * writing.
 197                  */
 198                 if (zoned && !created) {
 199                         ret = BTRFS_LOG_FORCE_COMMIT;
 200                         goto out;
 201                 }
 202
 203                 ret = btrfs_add_log_tree(trans, root);
 204                 if (ret)
 205                         goto out;
 206
 207                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
 208                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 209                 root->log_start_pid = current->pid;
 210         }
 211
 212         atomic_inc(&root->log_writers);
 213         if (!ctx->logging_new_name) {
 214                 int index = root->log_transid % 2;
 215                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
 216                 ctx->log_transid = root->log_transid;
 217         }
 218
 219 out:
 220         mutex_unlock(&root->log_mutex);
 221         return ret;
 222 }
 223
 224 /*
 225  * returns 0 if there was a log transaction running and we were able
 226  * to join, or returns -ENOENT if there were not transactions
 227  * in progress
 228  */
 229 static int join_running_log_trans(struct btrfs_root *root)
 230 {
 231         const bool zoned = btrfs_is_zoned(root->fs_info);
 232         int ret = -ENOENT;
 233
 234         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
 235                 return ret;
 236
 237         mutex_lock(&root->log_mutex);
 238 again:
 239         if (root->log_root) {
 240                 int index = (root->log_transid + 1) % 2;
 241
 242                 ret = 0;
 243                 if (zoned && atomic_read(&root->log_commit[index])) {
 244                         wait_log_commit(root, root->log_transid - 1);
 245                         goto again;
 246                 }
 247                 atomic_inc(&root->log_writers);
 248         }
 249         mutex_unlock(&root->log_mutex);
 250         return ret;
 251 }
 252
/*
 * Pin the log of @root by incrementing its log writer count.
 *
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
        /* Decremented again by btrfs_end_log_trans(). */
        atomic_inc(&root->log_writers);
}
 262
 263 /*
 264  * indicate we're done making changes to the log tree
 265  * and wake up anyone waiting to do a sync
 266  */
 267 void btrfs_end_log_trans(struct btrfs_root *root)
 268 {
 269         if (atomic_dec_and_test(&root->log_writers)) {
 270                 /* atomic_dec_and_test implies a barrier */
 271                 cond_wake_up_nomb(&root->log_writer_wait);
 272         }
 273 }
 274
/*
 * Wait on the byte range covered by the tree block @buf, from buf->start up
 * to and including its last byte, in the mapping that the buffer's first page
 * belongs to.  The result of filemap_fdatawait_range() is not looked at.
 */
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
        /* The end of the range is the last byte of the buffer, hence the - 1. */
        filemap_fdatawait_range(buf->pages[0]->mapping,
                                buf->start, buf->start + buf->len - 1);
}
 280
/*
 * The walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part of the log
 * tree processing we are currently doing.  The others are state fields used
 * for that specific part.
 */
struct walk_control {
        /*
         * Should we free the extent on disk when done?  This is used at
         * transaction commit time while freeing a log tree.
         */
        int free;

        /*
         * Pin only walk: we record which extents on disk belong to the log
         * trees.
         */
        int pin;

        /*
         * What stage of the replay code we're currently in.
         * NOTE(review): presumably one of the LOG_WALK_* values - confirm.
         */
        int stage;

        /*
         * Ignore any items from the inode currently being processed. Needs
         * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
         * the LOG_WALK_REPLAY_INODES stage.
         */
        bool ignore_cur_inode;

        /* The root we are currently replaying. */
        struct btrfs_root *replay_dest;

        /* The trans handle for the current replay. */
        struct btrfs_trans_handle *trans;

        /*
         * The function that gets used to process blocks we find in the
         * tree.  Note the extent_buffer might not be up to date when it is
         * passed in, and it must be checked or read if you need the data
         * inside it.  process_one_buffer() has this signature.
         */
        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
                            struct walk_control *wc, u64 gen, int level);
};
 322
 323 /*
 324  * process_func used to pin down extents, write them or wait on them
 325  */
 326 static int process_one_buffer(struct btrfs_root *log,
 327                               struct extent_buffer *eb,
 328                               struct walk_control *wc, u64 gen, int level)
 329 {
 330         struct btrfs_fs_info *fs_info = log->fs_info;
 331         int ret = 0;
 332
 333         /*
 334          * If this fs is mixed then we need to be able to process the leaves to
 335          * pin down any logged extents, so we have to read the block.
 336          */
 337         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 338                 ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
 339                 if (ret)
 340                         return ret;
 341         }
 342
 343         if (wc->pin) {
 344                 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
 345                                                       eb->len);
 346                 if (ret)
 347                         return ret;
 348
 349                 if (btrfs_buffer_uptodate(eb, gen, 0) &&
 350                     btrfs_header_level(eb) == 0)
 351                         ret = btrfs_exclude_logged_extents(eb);
 352         }
 353         return ret;
 354 }
 355
 356 static int do_overwrite_item(struct btrfs_trans_handle *trans,
 357                              struct btrfs_root *root,
 358                              struct btrfs_path *path,
 359                              struct extent_buffer *eb, int slot,
 360                              struct btrfs_key *key)
 361 {
 362         int ret;
 363         u32 item_size;
 364         u64 saved_i_size = 0;
 365         int save_old_i_size = 0;
 366         unsigned long src_ptr;
 367         unsigned long dst_ptr;
 368         int overwrite_root = 0;
 369         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 370
 371         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 372                 overwrite_root = 1;
 373
 374         item_size = btrfs_item_size(eb, slot);
 375         src_ptr = btrfs_item_ptr_offset(eb, slot);
 376
 377         /* Our caller must have done a search for the key for us. */
 378         ASSERT(path->nodes[0] != NULL);
 379
 380         /*
 381          * And the slot must point to the exact key or the slot where the key
 382          * should be at (the first item with a key greater than 'key')
 383          */
 384         if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
 385                 struct btrfs_key found_key;
 386
 387                 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
 388                 ret = btrfs_comp_cpu_keys(&found_key, key);
 389                 ASSERT(ret >= 0);
 390         } else {
 391                 ret = 1;
 392         }
 393
 394         if (ret == 0) {
 395                 char *src_copy;
 396                 char *dst_copy;
 397                 u32 dst_size = btrfs_item_size(path->nodes[0],
 398                                                   path->slots[0]);
 399                 if (dst_size != item_size)
 400                         goto insert;
 401
 402                 if (item_size == 0) {
 403                         btrfs_release_path(path);
 404                         return 0;
 405                 }
 406                 dst_copy = kmalloc(item_size, GFP_NOFS);
 407                 src_copy = kmalloc(item_size, GFP_NOFS);
 408                 if (!dst_copy || !src_copy) {
 409                         btrfs_release_path(path);
 410                         kfree(dst_copy);
 411                         kfree(src_copy);
 412                         return -ENOMEM;
 413                 }
 414
 415                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 416
 417                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 418                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 419                                    item_size);
 420                 ret = memcmp(dst_copy, src_copy, item_size);
 421
 422                 kfree(dst_copy);
 423                 kfree(src_copy);
 424                 /*
 425                  * they have the same contents, just return, this saves
 426                  * us from cowing blocks in the destination tree and doing
 427                  * extra writes that may not have been done by a previous
 428                  * sync
 429                  */
 430                 if (ret == 0) {
 431                         btrfs_release_path(path);
 432                         return 0;
 433                 }
 434
 435                 /*
 436                  * We need to load the old nbytes into the inode so when we
 437                  * replay the extents we've logged we get the right nbytes.
 438                  */
 439                 if (inode_item) {
 440                         struct btrfs_inode_item *item;
 441                         u64 nbytes;
 442                         u32 mode;
 443
 444                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 445                                               struct btrfs_inode_item);
 446                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 447                         item = btrfs_item_ptr(eb, slot,
 448                                               struct btrfs_inode_item);
 449                         btrfs_set_inode_nbytes(eb, item, nbytes);
 450
 451                         /*
 452                          * If this is a directory we need to reset the i_size to
 453                          * 0 so that we can set it up properly when replaying
 454                          * the rest of the items in this log.
 455                          */
 456                         mode = btrfs_inode_mode(eb, item);
 457                         if (S_ISDIR(mode))
 458                                 btrfs_set_inode_size(eb, item, 0);
 459                 }
 460         } else if (inode_item) {
 461                 struct btrfs_inode_item *item;
 462                 u32 mode;
 463
 464                 /*
 465                  * New inode, set nbytes to 0 so that the nbytes comes out
 466                  * properly when we replay the extents.
 467                  */
 468                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 469                 btrfs_set_inode_nbytes(eb, item, 0);
 470
 471                 /*
 472                  * If this is a directory we need to reset the i_size to 0 so
 473                  * that we can set it up properly when replaying the rest of
 474                  * the items in this log.
 475                  */
 476                 mode = btrfs_inode_mode(eb, item);
 477                 if (S_ISDIR(mode))
 478                         btrfs_set_inode_size(eb, item, 0);
 479         }
 480 insert:
 481         btrfs_release_path(path);
 482         /* try to insert the key into the destination tree */
 483         path->skip_release_on_error = 1;
 484         ret = btrfs_insert_empty_item(trans, root, path,
 485                                       key, item_size);
 486         path->skip_release_on_error = 0;
 487
 488         /* make sure any existing item is the correct size */
 489         if (ret == -EEXIST || ret == -EOVERFLOW) {
 490                 u32 found_size;
 491                 found_size = btrfs_item_size(path->nodes[0],
 492                                                 path->slots[0]);
 493                 if (found_size > item_size)
 494                         btrfs_truncate_item(path, item_size, 1);
 495                 else if (found_size < item_size)
 496                         btrfs_extend_item(path, item_size - found_size);
 497         } else if (ret) {
 498                 return ret;
 499         }
 500         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 501                                         path->slots[0]);
 502
 503         /* don't overwrite an existing inode if the generation number
 504          * was logged as zero.  This is done when the tree logging code
 505          * is just logging an inode to make sure it exists after recovery.
 506          *
 507          * Also, don't overwrite i_size on directories during replay.
 508          * log replay inserts and removes directory items based on the
 509          * state of the tree found in the subvolume, and i_size is modified
 510          * as it goes
 511          */
 512         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 513                 struct btrfs_inode_item *src_item;
 514                 struct btrfs_inode_item *dst_item;
 515
 516                 src_item = (struct btrfs_inode_item *)src_ptr;
 517                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 518
 519                 if (btrfs_inode_generation(eb, src_item) == 0) {
 520                         struct extent_buffer *dst_eb = path->nodes[0];
 521                         const u64 ino_size = btrfs_inode_size(eb, src_item);
 522
 523                         /*
 524                          * For regular files an ino_size == 0 is used only when
 525                          * logging that an inode exists, as part of a directory
 526                          * fsync, and the inode wasn't fsynced before. In this
 527                          * case don't set the size of the inode in the fs/subvol
 528                          * tree, otherwise we would be throwing valid data away.
 529                          */
 530                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 531                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 532                             ino_size != 0)
 533                                 btrfs_set_inode_size(dst_eb, dst_item, ino_size);
 534                         goto no_copy;
 535                 }
 536
 537                 if (overwrite_root &&
 538                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 539                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 540                         save_old_i_size = 1;
 541                         saved_i_size = btrfs_inode_size(path->nodes[0],
 542                                                         dst_item);
 543                 }
 544         }
 545
 546         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 547                            src_ptr, item_size);
 548
 549         if (save_old_i_size) {
 550                 struct btrfs_inode_item *dst_item;
 551                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 552                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 553         }
 554
 555         /* make sure the generation is filled in */
 556         if (key->type == BTRFS_INODE_ITEM_KEY) {
 557                 struct btrfs_inode_item *dst_item;
 558                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 559                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 560                         btrfs_set_inode_generation(path->nodes[0], dst_item,
 561                                                    trans->transid);
 562                 }
 563         }
 564 no_copy:
 565         btrfs_mark_buffer_dirty(path->nodes[0]);
 566         btrfs_release_path(path);
 567         return 0;
 568 }
 569
 570 /*
 571  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 572  * to the src data we are copying out.
 573  *
 574  * root is the tree we are copying into, and path is a scratch
 575  * path for use in this function (it should be released on entry and
 576  * will be released on exit).
 577  *
 578  * If the key is already in the destination tree the existing item is
 579  * overwritten.  If the existing item isn't big enough, it is extended.
 580  * If it is too large, it is truncated.
 581  *
 582  * If the key isn't in the destination yet, a new item is inserted.
 583  */
 584 static int overwrite_item(struct btrfs_trans_handle *trans,
 585                           struct btrfs_root *root,
 586                           struct btrfs_path *path,
 587                           struct extent_buffer *eb, int slot,
 588                           struct btrfs_key *key)
 589 {
 590         int ret;
 591
 592         /* Look for the key in the destination tree. */
 593         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 594         if (ret < 0)
 595                 return ret;
 596
 597         return do_overwrite_item(trans, root, path, eb, slot, key);
 598 }
 599
 600 /*
 601  * simple helper to read an inode off the disk from a given root
 602  * This can only be called for subvolume roots and not for the log
 603  */
 604 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 605                                              u64 objectid)
 606 {
 607         struct inode *inode;
 608
 609         inode = btrfs_iget(root->fs_info->sb, objectid, root);
 610         if (IS_ERR(inode))
 611                 inode = NULL;
 612         return inode;
 613 }
 614
 615 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 616  * subvolume 'root'.  path is released on entry and should be released
 617  * on exit.
 618  *
 619  * extents in the log tree have not been allocated out of the extent
 620  * tree yet.  So, this completes the allocation, taking a reference
 621  * as required if the extent already exists or creating a new extent
 622  * if it isn't in the extent allocation tree yet.
 623  *
 624  * The extent is inserted into the file, dropping any existing extents
 625  * from the file that overlap the new one.
 626  */
 627 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 628                                       struct btrfs_root *root,
 629                                       struct btrfs_path *path,
 630                                       struct extent_buffer *eb, int slot,
 631                                       struct btrfs_key *key)
 632 {
 633         struct btrfs_drop_extents_args drop_args = { 0 };
 634         struct btrfs_fs_info *fs_info = root->fs_info;
 635         int found_type;
 636         u64 extent_end;
 637         u64 start = key->offset;
 638         u64 nbytes = 0;
 639         struct btrfs_file_extent_item *item;
 640         struct inode *inode = NULL;
 641         unsigned long size;
 642         int ret = 0;
 643
 644         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 645         found_type = btrfs_file_extent_type(eb, item);
 646
 647         if (found_type == BTRFS_FILE_EXTENT_REG ||
 648             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 649                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 650                 extent_end = start + nbytes;
 651
 652                 /*
 653                  * We don't add to the inodes nbytes if we are prealloc or a
 654                  * hole.
 655                  */
 656                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 657                         nbytes = 0;
 658         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 659                 size = btrfs_file_extent_ram_bytes(eb, item);
 660                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 661                 extent_end = ALIGN(start + size,
 662                                    fs_info->sectorsize);
 663         } else {
 664                 ret = 0;
 665                 goto out;
 666         }
 667
 668         inode = read_one_inode(root, key->objectid);
 669         if (!inode) {
 670                 ret = -EIO;
 671                 goto out;
 672         }
 673
 674         /*
 675          * first check to see if we already have this extent in the
 676          * file.  This must be done before the btrfs_drop_extents run
 677          * so we don't try to drop this extent.
 678          */
 679         ret = btrfs_lookup_file_extent(trans, root, path,
 680                         btrfs_ino(BTRFS_I(inode)), start, 0);
 681
 682         if (ret == 0 &&
 683             (found_type == BTRFS_FILE_EXTENT_REG ||
 684              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 685                 struct btrfs_file_extent_item cmp1;
 686                 struct btrfs_file_extent_item cmp2;
 687                 struct btrfs_file_extent_item *existing;
 688                 struct extent_buffer *leaf;
 689
 690                 leaf = path->nodes[0];
 691                 existing = btrfs_item_ptr(leaf, path->slots[0],
 692                                           struct btrfs_file_extent_item);
 693
 694                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
 695                                    sizeof(cmp1));
 696                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 697                                    sizeof(cmp2));
 698
 699                 /*
 700                  * we already have a pointer to this exact extent,
 701                  * we don't have to do anything
 702                  */
 703                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 704                         btrfs_release_path(path);
 705                         goto out;
 706                 }
 707         }
 708         btrfs_release_path(path);
 709
 710         /* drop any overlapping extents */
 711         drop_args.start = start;
 712         drop_args.end = extent_end;
 713         drop_args.drop_cache = true;
 714         ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
 715         if (ret)
 716                 goto out;
 717
 718         if (found_type == BTRFS_FILE_EXTENT_REG ||
 719             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 720                 u64 offset;
 721                 unsigned long dest_offset;
 722                 struct btrfs_key ins;
 723
 724                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
 725                     btrfs_fs_incompat(fs_info, NO_HOLES))
 726                         goto update_inode;
 727
 728                 ret = btrfs_insert_empty_item(trans, root, path, key,
 729                                               sizeof(*item));
 730                 if (ret)
 731                         goto out;
 732                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 733                                                     path->slots[0]);
 734                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 735                                 (unsigned long)item,  sizeof(*item));
 736
 737                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 738                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 739                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 740                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 741
 742                 /*
 743                  * Manually record dirty extent, as here we did a shallow
 744                  * file extent item copy and skip normal backref update,
 745                  * but modifying extent tree all by ourselves.
 746                  * So need to manually record dirty extent for qgroup,
 747                  * as the owner of the file extent changed from log tree
 748                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
 749                  */
 750                 ret = btrfs_qgroup_trace_extent(trans,
 751                                 btrfs_file_extent_disk_bytenr(eb, item),
 752                                 btrfs_file_extent_disk_num_bytes(eb, item));
 753                 if (ret < 0)
 754                         goto out;
 755
 756                 if (ins.objectid > 0) {
 757                         struct btrfs_ref ref = { 0 };
 758                         u64 csum_start;
 759                         u64 csum_end;
 760                         LIST_HEAD(ordered_sums);
 761
 762                         /*
 763                          * is this extent already allocated in the extent
 764                          * allocation tree?  If so, just add a reference
 765                          */
 766                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 767                                                 ins.offset);
 768                         if (ret < 0) {
 769                                 goto out;
 770                         } else if (ret == 0) {
 771                                 btrfs_init_generic_ref(&ref,
 772                                                 BTRFS_ADD_DELAYED_REF,
 773                                                 ins.objectid, ins.offset, 0);
 774                                 btrfs_init_data_ref(&ref,
 775                                                 root->root_key.objectid,
 776                                                 key->objectid, offset, 0, false);
 777                                 ret = btrfs_inc_extent_ref(trans, &ref);
 778                                 if (ret)
 779                                         goto out;
 780                         } else {
 781                                 /*
 782                                  * insert the extent pointer in the extent
 783                                  * allocation tree
 784                                  */
 785                                 ret = btrfs_alloc_logged_file_extent(trans,
 786                                                 root->root_key.objectid,
 787                                                 key->objectid, offset, &ins);
 788                                 if (ret)
 789                                         goto out;
 790                         }
 791                         btrfs_release_path(path);
 792
 793                         if (btrfs_file_extent_compression(eb, item)) {
 794                                 csum_start = ins.objectid;
 795                                 csum_end = csum_start + ins.offset;
 796                         } else {
 797                                 csum_start = ins.objectid +
 798                                         btrfs_file_extent_offset(eb, item);
 799                                 csum_end = csum_start +
 800                                         btrfs_file_extent_num_bytes(eb, item);
 801                         }
 802
 803                         ret = btrfs_lookup_csums_range(root->log_root,
 804                                                 csum_start, csum_end - 1,
 805                                                 &ordered_sums, 0, false);
 806                         if (ret)
 807                                 goto out;
 808                         /*
 809                          * Now delete all existing csums in the csum root that
 810                          * cover our range. We do this because we can have an
 811                          * extent that is completely referenced by one file
 812                          * extent item and partially referenced by another
 813                          * file extent item (like after using the clone or
 814                          * extent_same ioctls). In this case if we end up doing
 815                          * the replay of the one that partially references the
 816                          * extent first, and we do not do the csum deletion
 817                          * below, we can get 2 csum items in the csum tree that
 818                          * overlap each other. For example, imagine our log has
 819                          * the two following file extent items:
 820                          *
 821                          * key (257 EXTENT_DATA 409600)
 822                          *     extent data disk byte 12845056 nr 102400
 823                          *     extent data offset 20480 nr 20480 ram 102400
 824                          *
 825                          * key (257 EXTENT_DATA 819200)
 826                          *     extent data disk byte 12845056 nr 102400
 827                          *     extent data offset 0 nr 102400 ram 102400
 828                          *
 829                          * Where the second one fully references the 100K extent
 830                          * that starts at disk byte 12845056, and the log tree
 831                          * has a single csum item that covers the entire range
 832                          * of the extent:
 833                          *
 834                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 835                          *
 836                          * After the first file extent item is replayed, the
 837                          * csum tree gets the following csum item:
 838                          *
 839                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 840                          *
 841                          * Which covers the 20K sub-range starting at offset 20K
 842                          * of our extent. Now when we replay the second file
 843                          * extent item, if we do not delete existing csum items
 844                          * that cover any of its blocks, we end up getting two
 845                          * csum items in our csum tree that overlap each other:
 846                          *
 847                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 848                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 849                          *
 850                          * Which is a problem, because after this anyone trying
 851                          * to look up the checksum of any block of our
 852                          * extent starting at an offset of 40K or higher, will
 853                          * end up looking at the second csum item only, which
 854                          * does not contain the checksum for any block starting
 855                          * at offset 40K or higher of our extent.
 856                          */
 857                         while (!list_empty(&ordered_sums)) {
 858                                 struct btrfs_ordered_sum *sums;
 859                                 struct btrfs_root *csum_root;
 860
 861                                 sums = list_entry(ordered_sums.next,
 862                                                 struct btrfs_ordered_sum,
 863                                                 list);
 864                                 csum_root = btrfs_csum_root(fs_info,
 865                                                             sums->bytenr);
 866                                 if (!ret)
 867                                         ret = btrfs_del_csums(trans, csum_root,
 868                                                               sums->bytenr,
 869                                                               sums->len);
 870                                 if (!ret)
 871                                         ret = btrfs_csum_file_blocks(trans,
 872                                                                      csum_root,
 873                                                                      sums);
 874                                 list_del(&sums->list);
 875                                 kfree(sums);
 876                         }
 877                         if (ret)
 878                                 goto out;
 879                 } else {
 880                         btrfs_release_path(path);
 881                 }
 882         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 883                 /* inline extents are easy, we just overwrite them */
 884                 ret = overwrite_item(trans, root, path, eb, slot, key);
 885                 if (ret)
 886                         goto out;
 887         }
 888
 889         ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
 890                                                 extent_end - start);
 891         if (ret)
 892                 goto out;
 893
 894 update_inode:
 895         btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
 896         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
 897 out:
 898         iput(inode);
 899         return ret;
 900 }
 901
 902 static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
 903                                        struct btrfs_inode *dir,
 904                                        struct btrfs_inode *inode,
 905                                        const char *name,
 906                                        int name_len)
 907 {
 908         int ret;
 909
 910         ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
 911         if (ret)
 912                 return ret;
 913         /*
 914          * Whenever we need to check if a name exists or not, we check the
 915          * fs/subvolume tree. So after an unlink we must run delayed items, so
 916          * that future checks for a name during log replay see that the name
 917          * does not exists anymore.
 918          */
 919         return btrfs_run_delayed_items(trans);
 920 }
 921
 922 /*
 923  * when cleaning up conflicts between the directory names in the
 924  * subvolume, directory names in the log and directory names in the
 925  * inode back references, we may have to unlink inodes from directories.
 926  *
 927  * This is a helper function to do the unlink of a specific directory
 928  * item
 929  */
 930 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 931                                       struct btrfs_path *path,
 932                                       struct btrfs_inode *dir,
 933                                       struct btrfs_dir_item *di)
 934 {
 935         struct btrfs_root *root = dir->root;
 936         struct inode *inode;
 937         char *name;
 938         int name_len;
 939         struct extent_buffer *leaf;
 940         struct btrfs_key location;
 941         int ret;
 942
 943         leaf = path->nodes[0];
 944
 945         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 946         name_len = btrfs_dir_name_len(leaf, di);
 947         name = kmalloc(name_len, GFP_NOFS);
 948         if (!name)
 949                 return -ENOMEM;
 950
 951         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 952         btrfs_release_path(path);
 953
 954         inode = read_one_inode(root, location.objectid);
 955         if (!inode) {
 956                 ret = -EIO;
 957                 goto out;
 958         }
 959
 960         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 961         if (ret)
 962                 goto out;
 963
 964         ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
 965                         name_len);
 966 out:
 967         kfree(name);
 968         iput(inode);
 969         return ret;
 970 }
 971
 972 /*
 973  * See if a given name and sequence number found in an inode back reference are
 974  * already in a directory and correctly point to this inode.
 975  *
 976  * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
 977  * exists.
 978  */
 979 static noinline int inode_in_dir(struct btrfs_root *root,
 980                                  struct btrfs_path *path,
 981                                  u64 dirid, u64 objectid, u64 index,
 982                                  const char *name, int name_len)
 983 {
 984         struct btrfs_dir_item *di;
 985         struct btrfs_key location;
 986         int ret = 0;
 987
 988         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 989                                          index, name, name_len, 0);
 990         if (IS_ERR(di)) {
 991                 ret = PTR_ERR(di);
 992                 goto out;
 993         } else if (di) {
 994                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 995                 if (location.objectid != objectid)
 996                         goto out;
 997         } else {
 998                 goto out;
 999         }
1000
1001         btrfs_release_path(path);
1002         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
1003         if (IS_ERR(di)) {
1004                 ret = PTR_ERR(di);
1005                 goto out;
1006         } else if (di) {
1007                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1008                 if (location.objectid == objectid)
1009                         ret = 1;
1010         }
1011 out:
1012         btrfs_release_path(path);
1013         return ret;
1014 }
1015
1016 /*
1017  * helper function to check a log tree for a named back reference in
1018  * an inode.  This is used to decide if a back reference that is
1019  * found in the subvolume conflicts with what we find in the log.
1020  *
1021  * inode backreferences may have multiple refs in a single item,
1022  * during replay we process one reference at a time, and we don't
1023  * want to delete valid links to a file from the subvolume if that
1024  * link is also in the log.
1025  */
1026 static noinline int backref_in_log(struct btrfs_root *log,
1027                                    struct btrfs_key *key,
1028                                    u64 ref_objectid,
1029                                    const char *name, int namelen)
1030 {
1031         struct btrfs_path *path;
1032         int ret;
1033
1034         path = btrfs_alloc_path();
1035         if (!path)
1036                 return -ENOMEM;
1037
1038         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1039         if (ret < 0) {
1040                 goto out;
1041         } else if (ret == 1) {
1042                 ret = 0;
1043                 goto out;
1044         }
1045
1046         if (key->type == BTRFS_INODE_EXTREF_KEY)
1047                 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1048                                                        path->slots[0],
1049                                                        ref_objectid,
1050                                                        name, namelen);
1051         else
1052                 ret = !!btrfs_find_name_in_backref(path->nodes[0],
1053                                                    path->slots[0],
1054                                                    name, namelen);
1055 out:
1056         btrfs_free_path(path);
1057         return ret;
1058 }
1059
1060 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1061                                   struct btrfs_root *root,
1062                                   struct btrfs_path *path,
1063                                   struct btrfs_root *log_root,
1064                                   struct btrfs_inode *dir,
1065                                   struct btrfs_inode *inode,
1066                                   u64 inode_objectid, u64 parent_objectid,
1067                                   u64 ref_index, char *name, int namelen)
1068 {
1069         int ret;
1070         char *victim_name;
1071         int victim_name_len;
1072         struct extent_buffer *leaf;
1073         struct btrfs_dir_item *di;
1074         struct btrfs_key search_key;
1075         struct btrfs_inode_extref *extref;
1076
1077 again:
1078         /* Search old style refs */
1079         search_key.objectid = inode_objectid;
1080         search_key.type = BTRFS_INODE_REF_KEY;
1081         search_key.offset = parent_objectid;
1082         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1083         if (ret == 0) {
1084                 struct btrfs_inode_ref *victim_ref;
1085                 unsigned long ptr;
1086                 unsigned long ptr_end;
1087
1088                 leaf = path->nodes[0];
1089
1090                 /* are we trying to overwrite a back ref for the root directory
1091                  * if so, just jump out, we're done
1092                  */
1093                 if (search_key.objectid == search_key.offset)
1094                         return 1;
1095
1096                 /* check all the names in this back reference to see
1097                  * if they are in the log.  if so, we allow them to stay
1098                  * otherwise they must be unlinked as a conflict
1099                  */
1100                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1101                 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
1102                 while (ptr < ptr_end) {
1103                         victim_ref = (struct btrfs_inode_ref *)ptr;
1104                         victim_name_len = btrfs_inode_ref_name_len(leaf,
1105                                                                    victim_ref);
1106                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1107                         if (!victim_name)
1108                                 return -ENOMEM;
1109
1110                         read_extent_buffer(leaf, victim_name,
1111                                            (unsigned long)(victim_ref + 1),
1112                                            victim_name_len);
1113
1114                         ret = backref_in_log(log_root, &search_key,
1115                                              parent_objectid, victim_name,
1116                                              victim_name_len);
1117                         if (ret < 0) {
1118                                 kfree(victim_name);
1119                                 return ret;
1120                         } else if (!ret) {
1121                                 inc_nlink(&inode->vfs_inode);
1122                                 btrfs_release_path(path);
1123
1124                                 ret = unlink_inode_for_log_replay(trans, dir, inode,
1125                                                 victim_name, victim_name_len);
1126                                 kfree(victim_name);
1127                                 if (ret)
1128                                         return ret;
1129                                 goto again;
1130                         }
1131                         kfree(victim_name);
1132
1133                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1134                 }
1135         }
1136         btrfs_release_path(path);
1137
1138         /* Same search but for extended refs */
1139         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1140                                            inode_objectid, parent_objectid, 0,
1141                                            0);
1142         if (IS_ERR(extref)) {
1143                 return PTR_ERR(extref);
1144         } else if (extref) {
1145                 u32 item_size;
1146                 u32 cur_offset = 0;
1147                 unsigned long base;
1148                 struct inode *victim_parent;
1149
1150                 leaf = path->nodes[0];
1151
1152                 item_size = btrfs_item_size(leaf, path->slots[0]);
1153                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1154
1155                 while (cur_offset < item_size) {
1156                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1157
1158                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1159
1160                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1161                                 goto next;
1162
1163                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1164                         if (!victim_name)
1165                                 return -ENOMEM;
1166                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1167                                            victim_name_len);
1168
1169                         search_key.objectid = inode_objectid;
1170                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1171                         search_key.offset = btrfs_extref_hash(parent_objectid,
1172                                                               victim_name,
1173                                                               victim_name_len);
1174                         ret = backref_in_log(log_root, &search_key,
1175                                              parent_objectid, victim_name,
1176                                              victim_name_len);
1177                         if (ret < 0) {
1178                                 kfree(victim_name);
1179                                 return ret;
1180                         } else if (!ret) {
1181                                 ret = -ENOENT;
1182                                 victim_parent = read_one_inode(root,
1183                                                 parent_objectid);
1184                                 if (victim_parent) {
1185                                         inc_nlink(&inode->vfs_inode);
1186                                         btrfs_release_path(path);
1187
1188                                         ret = unlink_inode_for_log_replay(trans,
1189                                                         BTRFS_I(victim_parent),
1190                                                         inode,
1191                                                         victim_name,
1192                                                         victim_name_len);
1193                                 }
1194                                 iput(victim_parent);
1195                                 kfree(victim_name);
1196                                 if (ret)
1197                                         return ret;
1198                                 goto again;
1199                         }
1200                         kfree(victim_name);
1201 next:
1202                         cur_offset += victim_name_len + sizeof(*extref);
1203                 }
1204         }
1205         btrfs_release_path(path);
1206
1207         /* look for a conflicting sequence number */
1208         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1209                                          ref_index, name, namelen, 0);
1210         if (IS_ERR(di)) {
1211                 return PTR_ERR(di);
1212         } else if (di) {
1213                 ret = drop_one_dir_item(trans, path, dir, di);
1214                 if (ret)
1215                         return ret;
1216         }
1217         btrfs_release_path(path);
1218
1219         /* look for a conflicting name */
1220         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1221                                    name, namelen, 0);
1222         if (IS_ERR(di)) {
1223                 return PTR_ERR(di);
1224         } else if (di) {
1225                 ret = drop_one_dir_item(trans, path, dir, di);
1226                 if (ret)
1227                         return ret;
1228         }
1229         btrfs_release_path(path);
1230
1231         return 0;
1232 }
1233
1234 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1235                              u32 *namelen, char **name, u64 *index,
1236                              u64 *parent_objectid)
1237 {
1238         struct btrfs_inode_extref *extref;
1239
1240         extref = (struct btrfs_inode_extref *)ref_ptr;
1241
1242         *namelen = btrfs_inode_extref_name_len(eb, extref);
1243         *name = kmalloc(*namelen, GFP_NOFS);
1244         if (*name == NULL)
1245                 return -ENOMEM;
1246
1247         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1248                            *namelen);
1249
1250         if (index)
1251                 *index = btrfs_inode_extref_index(eb, extref);
1252         if (parent_objectid)
1253                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1254
1255         return 0;
1256 }
1257
1258 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1259                           u32 *namelen, char **name, u64 *index)
1260 {
1261         struct btrfs_inode_ref *ref;
1262
1263         ref = (struct btrfs_inode_ref *)ref_ptr;
1264
1265         *namelen = btrfs_inode_ref_name_len(eb, ref);
1266         *name = kmalloc(*namelen, GFP_NOFS);
1267         if (*name == NULL)
1268                 return -ENOMEM;
1269
1270         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1271
1272         if (index)
1273                 *index = btrfs_inode_ref_index(eb, ref);
1274
1275         return 0;
1276 }
1277
1278 /*
1279  * Take an inode reference item from the log tree and iterate all names from the
1280  * inode reference item in the subvolume tree with the same key (if it exists).
1281  * For any name that is not in the inode reference item from the log tree, do a
1282  * proper unlink of that name (that is, remove its entry from the inode
1283  * reference item and both dir index keys).
1284  */
1285 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1286                                  struct btrfs_root *root,
1287                                  struct btrfs_path *path,
1288                                  struct btrfs_inode *inode,
1289                                  struct extent_buffer *log_eb,
1290                                  int log_slot,
1291                                  struct btrfs_key *key)
1292 {
1293         int ret;
1294         unsigned long ref_ptr;
1295         unsigned long ref_end;
1296         struct extent_buffer *eb;
1297
1298 again:
1299         btrfs_release_path(path);
1300         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1301         if (ret > 0) {
1302                 ret = 0;
1303                 goto out;
1304         }
1305         if (ret < 0)
1306                 goto out;
1307
1308         eb = path->nodes[0];
1309         ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1310         ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
1311         while (ref_ptr < ref_end) {
1312                 char *name = NULL;
1313                 int namelen;
1314                 u64 parent_id;
1315
1316                 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1317                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1318                                                 NULL, &parent_id);
1319                 } else {
1320                         parent_id = key->offset;
1321                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1322                                              NULL);
1323                 }
1324                 if (ret)
1325                         goto out;
1326
1327                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1328                         ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1329                                                                parent_id, name,
1330                                                                namelen);
1331                 else
1332                         ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
1333                                                            name, namelen);
1334
1335                 if (!ret) {
1336                         struct inode *dir;
1337
1338                         btrfs_release_path(path);
1339                         dir = read_one_inode(root, parent_id);
1340                         if (!dir) {
1341                                 ret = -ENOENT;
1342                                 kfree(name);
1343                                 goto out;
1344                         }
1345                         ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
1346                                                  inode, name, namelen);
1347                         kfree(name);
1348                         iput(dir);
1349                         if (ret)
1350                                 goto out;
1351                         goto again;
1352                 }
1353
1354                 kfree(name);
1355                 ref_ptr += namelen;
1356                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1357                         ref_ptr += sizeof(struct btrfs_inode_extref);
1358                 else
1359                         ref_ptr += sizeof(struct btrfs_inode_ref);
1360         }
1361         ret = 0;
1362  out:
1363         btrfs_release_path(path);
1364         return ret;
1365 }
1366
1367 /*
1368  * replay one inode back reference item found in the log tree.
1369  * eb, slot and key refer to the buffer and key found in the log tree.
1370  * root is the destination we are replaying into, and path is for temp
1371  * use by this function.  (it should be released on return).
1372  */
1373 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1374                                   struct btrfs_root *root,
1375                                   struct btrfs_root *log,
1376                                   struct btrfs_path *path,
1377                                   struct extent_buffer *eb, int slot,
1378                                   struct btrfs_key *key)
1379 {
1380         struct inode *dir = NULL;
1381         struct inode *inode = NULL;
1382         unsigned long ref_ptr;
1383         unsigned long ref_end;
1384         char *name = NULL;
1385         int namelen;
1386         int ret;
1387         int log_ref_ver = 0;
1388         u64 parent_objectid;
1389         u64 inode_objectid;
1390         u64 ref_index = 0;
1391         int ref_struct_size;
1392
1393         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1394         ref_end = ref_ptr + btrfs_item_size(eb, slot);
1395
1396         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1397                 struct btrfs_inode_extref *r;
1398
1399                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1400                 log_ref_ver = 1;
1401                 r = (struct btrfs_inode_extref *)ref_ptr;
1402                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1403         } else {
1404                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1405                 parent_objectid = key->offset;
1406         }
1407         inode_objectid = key->objectid;
1408
1409         /*
1410          * it is possible that we didn't log all the parent directories
1411          * for a given inode.  If we don't find the dir, just don't
1412          * copy the back ref in.  The link count fixup code will take
1413          * care of the rest
1414          */
1415         dir = read_one_inode(root, parent_objectid);
1416         if (!dir) {
1417                 ret = -ENOENT;
1418                 goto out;
1419         }
1420
1421         inode = read_one_inode(root, inode_objectid);
1422         if (!inode) {
1423                 ret = -EIO;
1424                 goto out;
1425         }
1426
1427         while (ref_ptr < ref_end) {
1428                 if (log_ref_ver) {
1429                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1430                                                 &ref_index, &parent_objectid);
1431                         /*
1432                          * parent object can change from one array
1433                          * item to another.
1434                          */
1435                         if (!dir)
1436                                 dir = read_one_inode(root, parent_objectid);
1437                         if (!dir) {
1438                                 ret = -ENOENT;
1439                                 goto out;
1440                         }
1441                 } else {
1442                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1443                                              &ref_index);
1444                 }
1445                 if (ret)
1446                         goto out;
1447
1448                 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1449                                    btrfs_ino(BTRFS_I(inode)), ref_index,
1450                                    name, namelen);
1451                 if (ret < 0) {
1452                         goto out;
1453                 } else if (ret == 0) {
1454                         /*
1455                          * look for a conflicting back reference in the
1456                          * metadata. if we find one we have to unlink that name
1457                          * of the file before we add our new link.  Later on, we
1458                          * overwrite any existing back reference, and we don't
1459                          * want to create dangling pointers in the directory.
1460                          */
1461                         ret = __add_inode_ref(trans, root, path, log,
1462                                               BTRFS_I(dir), BTRFS_I(inode),
1463                                               inode_objectid, parent_objectid,
1464                                               ref_index, name, namelen);
1465                         if (ret) {
1466                                 if (ret == 1)
1467                                         ret = 0;
1468                                 goto out;
1469                         }
1470
1471                         /* insert our name */
1472                         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1473                                              name, namelen, 0, ref_index);
1474                         if (ret)
1475                                 goto out;
1476
1477                         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1478                         if (ret)
1479                                 goto out;
1480                 }
1481                 /* Else, ret == 1, we already have a perfect match, we're done. */
1482
1483                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1484                 kfree(name);
1485                 name = NULL;
1486                 if (log_ref_ver) {
1487                         iput(dir);
1488                         dir = NULL;
1489                 }
1490         }
1491
1492         /*
1493          * Before we overwrite the inode reference item in the subvolume tree
1494          * with the item from the log tree, we must unlink all names from the
1495          * parent directory that are in the subvolume's tree inode reference
1496          * item, otherwise we end up with an inconsistent subvolume tree where
1497          * dir index entries exist for a name but there is no inode reference
1498          * item with the same name.
1499          */
1500         ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1501                                     key);
1502         if (ret)
1503                 goto out;
1504
1505         /* finally write the back reference in the inode */
1506         ret = overwrite_item(trans, root, path, eb, slot, key);
1507 out:
1508         btrfs_release_path(path);
1509         kfree(name);
1510         iput(dir);
1511         iput(inode);
1512         return ret;
1513 }
1514
1515 static int count_inode_extrefs(struct btrfs_root *root,
1516                 struct btrfs_inode *inode, struct btrfs_path *path)
1517 {
1518         int ret = 0;
1519         int name_len;
1520         unsigned int nlink = 0;
1521         u32 item_size;
1522         u32 cur_offset = 0;
1523         u64 inode_objectid = btrfs_ino(inode);
1524         u64 offset = 0;
1525         unsigned long ptr;
1526         struct btrfs_inode_extref *extref;
1527         struct extent_buffer *leaf;
1528
1529         while (1) {
1530                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1531                                             &extref, &offset);
1532                 if (ret)
1533                         break;
1534
1535                 leaf = path->nodes[0];
1536                 item_size = btrfs_item_size(leaf, path->slots[0]);
1537                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1538                 cur_offset = 0;
1539
1540                 while (cur_offset < item_size) {
1541                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1542                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1543
1544                         nlink++;
1545
1546                         cur_offset += name_len + sizeof(*extref);
1547                 }
1548
1549                 offset++;
1550                 btrfs_release_path(path);
1551         }
1552         btrfs_release_path(path);
1553
1554         if (ret < 0 && ret != -ENOENT)
1555                 return ret;
1556         return nlink;
1557 }
1558
1559 static int count_inode_refs(struct btrfs_root *root,
1560                         struct btrfs_inode *inode, struct btrfs_path *path)
1561 {
1562         int ret;
1563         struct btrfs_key key;
1564         unsigned int nlink = 0;
1565         unsigned long ptr;
1566         unsigned long ptr_end;
1567         int name_len;
1568         u64 ino = btrfs_ino(inode);
1569
1570         key.objectid = ino;
1571         key.type = BTRFS_INODE_REF_KEY;
1572         key.offset = (u64)-1;
1573
1574         while (1) {
1575                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1576                 if (ret < 0)
1577                         break;
1578                 if (ret > 0) {
1579                         if (path->slots[0] == 0)
1580                                 break;
1581                         path->slots[0]--;
1582                 }
1583 process_slot:
1584                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1585                                       path->slots[0]);
1586                 if (key.objectid != ino ||
1587                     key.type != BTRFS_INODE_REF_KEY)
1588                         break;
1589                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1590                 ptr_end = ptr + btrfs_item_size(path->nodes[0],
1591                                                    path->slots[0]);
1592                 while (ptr < ptr_end) {
1593                         struct btrfs_inode_ref *ref;
1594
1595                         ref = (struct btrfs_inode_ref *)ptr;
1596                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1597                                                             ref);
1598                         ptr = (unsigned long)(ref + 1) + name_len;
1599                         nlink++;
1600                 }
1601
1602                 if (key.offset == 0)
1603                         break;
1604                 if (path->slots[0] > 0) {
1605                         path->slots[0]--;
1606                         goto process_slot;
1607                 }
1608                 key.offset--;
1609                 btrfs_release_path(path);
1610         }
1611         btrfs_release_path(path);
1612
1613         return nlink;
1614 }
1615
1616 /*
1617  * There are a few corners where the link count of the file can't
1618  * be properly maintained during replay.  So, instead of adding
1619  * lots of complexity to the log code, we just scan the backrefs
1620  * for any file that has been through replay.
1621  *
1622  * The scan will update the link count on the inode to reflect the
1623  * number of back refs found.  If it goes down to zero, the iput
1624  * will free the inode.
1625  */
1626 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1627                                            struct btrfs_root *root,
1628                                            struct inode *inode)
1629 {
1630         struct btrfs_path *path;
1631         int ret;
1632         u64 nlink = 0;
1633         u64 ino = btrfs_ino(BTRFS_I(inode));
1634
1635         path = btrfs_alloc_path();
1636         if (!path)
1637                 return -ENOMEM;
1638
1639         ret = count_inode_refs(root, BTRFS_I(inode), path);
1640         if (ret < 0)
1641                 goto out;
1642
1643         nlink = ret;
1644
1645         ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1646         if (ret < 0)
1647                 goto out;
1648
1649         nlink += ret;
1650
1651         ret = 0;
1652
1653         if (nlink != inode->i_nlink) {
1654                 set_nlink(inode, nlink);
1655                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1656                 if (ret)
1657                         goto out;
1658         }
1659         BTRFS_I(inode)->index_cnt = (u64)-1;
1660
1661         if (inode->i_nlink == 0) {
1662                 if (S_ISDIR(inode->i_mode)) {
1663                         ret = replay_dir_deletes(trans, root, NULL, path,
1664                                                  ino, 1);
1665                         if (ret)
1666                                 goto out;
1667                 }
1668                 ret = btrfs_insert_orphan_item(trans, root, ino);
1669                 if (ret == -EEXIST)
1670                         ret = 0;
1671         }
1672
1673 out:
1674         btrfs_free_path(path);
1675         return ret;
1676 }
1677
1678 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1679                                             struct btrfs_root *root,
1680                                             struct btrfs_path *path)
1681 {
1682         int ret;
1683         struct btrfs_key key;
1684         struct inode *inode;
1685
1686         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1687         key.type = BTRFS_ORPHAN_ITEM_KEY;
1688         key.offset = (u64)-1;
1689         while (1) {
1690                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1691                 if (ret < 0)
1692                         break;
1693
1694                 if (ret == 1) {
1695                         ret = 0;
1696                         if (path->slots[0] == 0)
1697                                 break;
1698                         path->slots[0]--;
1699                 }
1700
1701                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1702                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1703                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1704                         break;
1705
1706                 ret = btrfs_del_item(trans, root, path);
1707                 if (ret)
1708                         break;
1709
1710                 btrfs_release_path(path);
1711                 inode = read_one_inode(root, key.offset);
1712                 if (!inode) {
1713                         ret = -EIO;
1714                         break;
1715                 }
1716
1717                 ret = fixup_inode_link_count(trans, root, inode);
1718                 iput(inode);
1719                 if (ret)
1720                         break;
1721
1722                 /*
1723                  * fixup on a directory may create new entries,
1724                  * make sure we always look for the highset possible
1725                  * offset
1726                  */
1727                 key.offset = (u64)-1;
1728         }
1729         btrfs_release_path(path);
1730         return ret;
1731 }
1732
1733
1734 /*
1735  * record a given inode in the fixup dir so we can check its link
1736  * count when replay is done.  The link count is incremented here
1737  * so the inode won't go away until we check it
1738  */
1739 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1740                                       struct btrfs_root *root,
1741                                       struct btrfs_path *path,
1742                                       u64 objectid)
1743 {
1744         struct btrfs_key key;
1745         int ret = 0;
1746         struct inode *inode;
1747
1748         inode = read_one_inode(root, objectid);
1749         if (!inode)
1750                 return -EIO;
1751
1752         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1753         key.type = BTRFS_ORPHAN_ITEM_KEY;
1754         key.offset = objectid;
1755
1756         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1757
1758         btrfs_release_path(path);
1759         if (ret == 0) {
1760                 if (!inode->i_nlink)
1761                         set_nlink(inode, 1);
1762                 else
1763                         inc_nlink(inode);
1764                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1765         } else if (ret == -EEXIST) {
1766                 ret = 0;
1767         }
1768         iput(inode);
1769
1770         return ret;
1771 }
1772
1773 /*
1774  * when replaying the log for a directory, we only insert names
1775  * for inodes that actually exist.  This means an fsync on a directory
1776  * does not implicitly fsync all the new files in it
1777  */
1778 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1779                                     struct btrfs_root *root,
1780                                     u64 dirid, u64 index,
1781                                     char *name, int name_len,
1782                                     struct btrfs_key *location)
1783 {
1784         struct inode *inode;
1785         struct inode *dir;
1786         int ret;
1787
1788         inode = read_one_inode(root, location->objectid);
1789         if (!inode)
1790                 return -ENOENT;
1791
1792         dir = read_one_inode(root, dirid);
1793         if (!dir) {
1794                 iput(inode);
1795                 return -EIO;
1796         }
1797
1798         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1799                         name_len, 1, index);
1800
1801         /* FIXME, put inode into FIXUP list */
1802
1803         iput(inode);
1804         iput(dir);
1805         return ret;
1806 }
1807
1808 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1809                                         struct btrfs_inode *dir,
1810                                         struct btrfs_path *path,
1811                                         struct btrfs_dir_item *dst_di,
1812                                         const struct btrfs_key *log_key,
1813                                         u8 log_type,
1814                                         bool exists)
1815 {
1816         struct btrfs_key found_key;
1817
1818         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1819         /* The existing dentry points to the same inode, don't delete it. */
1820         if (found_key.objectid == log_key->objectid &&
1821             found_key.type == log_key->type &&
1822             found_key.offset == log_key->offset &&
1823             btrfs_dir_type(path->nodes[0], dst_di) == log_type)
1824                 return 1;
1825
1826         /*
1827          * Don't drop the conflicting directory entry if the inode for the new
1828          * entry doesn't exist.
1829          */
1830         if (!exists)
1831                 return 0;
1832
1833         return drop_one_dir_item(trans, path, dir, dst_di);
1834 }
1835
1836 /*
1837  * take a single entry in a log directory item and replay it into
1838  * the subvolume.
1839  *
1840  * if a conflicting item exists in the subdirectory already,
1841  * the inode it points to is unlinked and put into the link count
1842  * fix up tree.
1843  *
1844  * If a name from the log points to a file or directory that does
1845  * not exist in the FS, it is skipped.  fsyncs on directories
1846  * do not force down inodes inside that directory, just changes to the
1847  * names or unlinks in a directory.
1848  *
1849  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1850  * non-existing inode) and 1 if the name was replayed.
1851  */
1852 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1853                                     struct btrfs_root *root,
1854                                     struct btrfs_path *path,
1855                                     struct extent_buffer *eb,
1856                                     struct btrfs_dir_item *di,
1857                                     struct btrfs_key *key)
1858 {
1859         char *name;
1860         int name_len;
1861         struct btrfs_dir_item *dir_dst_di;
1862         struct btrfs_dir_item *index_dst_di;
1863         bool dir_dst_matches = false;
1864         bool index_dst_matches = false;
1865         struct btrfs_key log_key;
1866         struct btrfs_key search_key;
1867         struct inode *dir;
1868         u8 log_type;
1869         bool exists;
1870         int ret;
1871         bool update_size = true;
1872         bool name_added = false;
1873
1874         dir = read_one_inode(root, key->objectid);
1875         if (!dir)
1876                 return -EIO;
1877
1878         name_len = btrfs_dir_name_len(eb, di);
1879         name = kmalloc(name_len, GFP_NOFS);
1880         if (!name) {
1881                 ret = -ENOMEM;
1882                 goto out;
1883         }
1884
1885         log_type = btrfs_dir_type(eb, di);
1886         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1887                    name_len);
1888
1889         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1890         ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1891         btrfs_release_path(path);
1892         if (ret < 0)
1893                 goto out;
1894         exists = (ret == 0);
1895         ret = 0;
1896
1897         dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1898                                            name, name_len, 1);
1899         if (IS_ERR(dir_dst_di)) {
1900                 ret = PTR_ERR(dir_dst_di);
1901                 goto out;
1902         } else if (dir_dst_di) {
1903                 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1904                                                    dir_dst_di, &log_key, log_type,
1905                                                    exists);
1906                 if (ret < 0)
1907                         goto out;
1908                 dir_dst_matches = (ret == 1);
1909         }
1910
1911         btrfs_release_path(path);
1912
1913         index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1914                                                    key->objectid, key->offset,
1915                                                    name, name_len, 1);
1916         if (IS_ERR(index_dst_di)) {
1917                 ret = PTR_ERR(index_dst_di);
1918                 goto out;
1919         } else if (index_dst_di) {
1920                 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1921                                                    index_dst_di, &log_key,
1922                                                    log_type, exists);
1923                 if (ret < 0)
1924                         goto out;
1925                 index_dst_matches = (ret == 1);
1926         }
1927
1928         btrfs_release_path(path);
1929
1930         if (dir_dst_matches && index_dst_matches) {
1931                 ret = 0;
1932                 update_size = false;
1933                 goto out;
1934         }
1935
1936         /*
1937          * Check if the inode reference exists in the log for the given name,
1938          * inode and parent inode
1939          */
1940         search_key.objectid = log_key.objectid;
1941         search_key.type = BTRFS_INODE_REF_KEY;
1942         search_key.offset = key->objectid;
1943         ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
1944         if (ret < 0) {
1945                 goto out;
1946         } else if (ret) {
1947                 /* The dentry will be added later. */
1948                 ret = 0;
1949                 update_size = false;
1950                 goto out;
1951         }
1952
1953         search_key.objectid = log_key.objectid;
1954         search_key.type = BTRFS_INODE_EXTREF_KEY;
1955         search_key.offset = key->objectid;
1956         ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
1957                              name_len);
1958         if (ret < 0) {
1959                 goto out;
1960         } else if (ret) {
1961                 /* The dentry will be added later. */
1962                 ret = 0;
1963                 update_size = false;
1964                 goto out;
1965         }
1966         btrfs_release_path(path);
1967         ret = insert_one_name(trans, root, key->objectid, key->offset,
1968                               name, name_len, &log_key);
1969         if (ret && ret != -ENOENT && ret != -EEXIST)
1970                 goto out;
1971         if (!ret)
1972                 name_added = true;
1973         update_size = false;
1974         ret = 0;
1975
1976 out:
1977         if (!ret && update_size) {
1978                 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
1979                 ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
1980         }
1981         kfree(name);
1982         iput(dir);
1983         if (!ret && name_added)
1984                 ret = 1;
1985         return ret;
1986 }
1987
1988 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
1989 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1990                                         struct btrfs_root *root,
1991                                         struct btrfs_path *path,
1992                                         struct extent_buffer *eb, int slot,
1993                                         struct btrfs_key *key)
1994 {
1995         int ret;
1996         struct btrfs_dir_item *di;
1997
1998         /* We only log dir index keys, which only contain a single dir item. */
1999         ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
2000
2001         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2002         ret = replay_one_name(trans, root, path, eb, di, key);
2003         if (ret < 0)
2004                 return ret;
2005
2006         /*
2007          * If this entry refers to a non-directory (directories can not have a
2008          * link count > 1) and it was added in the transaction that was not
2009          * committed, make sure we fixup the link count of the inode the entry
2010          * points to. Otherwise something like the following would result in a
2011          * directory pointing to an inode with a wrong link that does not account
2012          * for this dir entry:
2013          *
2014          * mkdir testdir
2015          * touch testdir/foo
2016          * touch testdir/bar
2017          * sync
2018          *
2019          * ln testdir/bar testdir/bar_link
2020          * ln testdir/foo testdir/foo_link
2021          * xfs_io -c "fsync" testdir/bar
2022          *
2023          * <power failure>
2024          *
2025          * mount fs, log replay happens
2026          *
2027          * File foo would remain with a link count of 1 when it has two entries
2028          * pointing to it in the directory testdir. This would make it impossible
2029          * to ever delete the parent directory has it would result in stale
2030          * dentries that can never be deleted.
2031          */
2032         if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2033                 struct btrfs_path *fixup_path;
2034                 struct btrfs_key di_key;
2035
2036                 fixup_path = btrfs_alloc_path();
2037                 if (!fixup_path)
2038                         return -ENOMEM;
2039
2040                 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2041                 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2042                 btrfs_free_path(fixup_path);
2043         }
2044
2045         return ret;
2046 }
2047
2048 /*
2049  * directory replay has two parts.  There are the standard directory
2050  * items in the log copied from the subvolume, and range items
2051  * created in the log while the subvolume was logged.
2052  *
2053  * The range items tell us which parts of the key space the log
2054  * is authoritative for.  During replay, if a key in the subvolume
2055  * directory is in a logged range item, but not actually in the log
2056  * that means it was deleted from the directory before the fsync
2057  * and should be removed.
2058  */
2059 static noinline int find_dir_range(struct btrfs_root *root,
2060                                    struct btrfs_path *path,
2061                                    u64 dirid,
2062                                    u64 *start_ret, u64 *end_ret)
2063 {
2064         struct btrfs_key key;
2065         u64 found_end;
2066         struct btrfs_dir_log_item *item;
2067         int ret;
2068         int nritems;
2069
2070         if (*start_ret == (u64)-1)
2071                 return 1;
2072
2073         key.objectid = dirid;
2074         key.type = BTRFS_DIR_LOG_INDEX_KEY;
2075         key.offset = *start_ret;
2076
2077         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2078         if (ret < 0)
2079                 goto out;
2080         if (ret > 0) {
2081                 if (path->slots[0] == 0)
2082                         goto out;
2083                 path->slots[0]--;
2084         }
2085         if (ret != 0)
2086                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2087
2088         if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2089                 ret = 1;
2090                 goto next;
2091         }
2092         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2093                               struct btrfs_dir_log_item);
2094         found_end = btrfs_dir_log_end(path->nodes[0], item);
2095
2096         if (*start_ret >= key.offset && *start_ret <= found_end) {
2097                 ret = 0;
2098                 *start_ret = key.offset;
2099                 *end_ret = found_end;
2100                 goto out;
2101         }
2102         ret = 1;
2103 next:
2104         /* check the next slot in the tree to see if it is a valid item */
2105         nritems = btrfs_header_nritems(path->nodes[0]);
2106         path->slots[0]++;
2107         if (path->slots[0] >= nritems) {
2108                 ret = btrfs_next_leaf(root, path);
2109                 if (ret)
2110                         goto out;
2111         }
2112
2113         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2114
2115         if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2116                 ret = 1;
2117                 goto out;
2118         }
2119         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2120                               struct btrfs_dir_log_item);
2121         found_end = btrfs_dir_log_end(path->nodes[0], item);
2122         *start_ret = key.offset;
2123         *end_ret = found_end;
2124         ret = 0;
2125 out:
2126         btrfs_release_path(path);
2127         return ret;
2128 }
2129
2130 /*
2131  * this looks for a given directory item in the log.  If the directory
2132  * item is not in the log, the item is removed and the inode it points
2133  * to is unlinked
2134  */
2135 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2136                                       struct btrfs_root *log,
2137                                       struct btrfs_path *path,
2138                                       struct btrfs_path *log_path,
2139                                       struct inode *dir,
2140                                       struct btrfs_key *dir_key)
2141 {
2142         struct btrfs_root *root = BTRFS_I(dir)->root;
2143         int ret;
2144         struct extent_buffer *eb;
2145         int slot;
2146         struct btrfs_dir_item *di;
2147         int name_len;
2148         char *name;
2149         struct inode *inode = NULL;
2150         struct btrfs_key location;
2151
2152         /*
2153          * Currently we only log dir index keys. Even if we replay a log created
2154          * by an older kernel that logged both dir index and dir item keys, all
2155          * we need to do is process the dir index keys, we (and our caller) can
2156          * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2157          */
2158         ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2159
2160         eb = path->nodes[0];
2161         slot = path->slots[0];
2162         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2163         name_len = btrfs_dir_name_len(eb, di);
2164         name = kmalloc(name_len, GFP_NOFS);
2165         if (!name) {
2166                 ret = -ENOMEM;
2167                 goto out;
2168         }
2169
2170         read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
2171
2172         if (log) {
2173                 struct btrfs_dir_item *log_di;
2174
2175                 log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2176                                                      dir_key->objectid,
2177                                                      dir_key->offset,
2178                                                      name, name_len, 0);
2179                 if (IS_ERR(log_di)) {
2180                         ret = PTR_ERR(log_di);
2181                         goto out;
2182                 } else if (log_di) {
2183                         /* The dentry exists in the log, we have nothing to do. */
2184                         ret = 0;
2185                         goto out;
2186                 }
2187         }
2188
2189         btrfs_dir_item_key_to_cpu(eb, di, &location);
2190         btrfs_release_path(path);
2191         btrfs_release_path(log_path);
2192         inode = read_one_inode(root, location.objectid);
2193         if (!inode) {
2194                 ret = -EIO;
2195                 goto out;
2196         }
2197
2198         ret = link_to_fixup_dir(trans, root, path, location.objectid);
2199         if (ret)
2200                 goto out;
2201
2202         inc_nlink(inode);
2203         ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
2204                                           name, name_len);
2205         /*
2206          * Unlike dir item keys, dir index keys can only have one name (entry) in
2207          * them, as there are no key collisions since each key has a unique offset
2208          * (an index number), so we're done.
2209          */
2210 out:
2211         btrfs_release_path(path);
2212         btrfs_release_path(log_path);
2213         kfree(name);
2214         iput(inode);
2215         return ret;
2216 }
2217
2218 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2219                               struct btrfs_root *root,
2220                               struct btrfs_root *log,
2221                               struct btrfs_path *path,
2222                               const u64 ino)
2223 {
2224         struct btrfs_key search_key;
2225         struct btrfs_path *log_path;
2226         int i;
2227         int nritems;
2228         int ret;
2229
2230         log_path = btrfs_alloc_path();
2231         if (!log_path)
2232                 return -ENOMEM;
2233
2234         search_key.objectid = ino;
2235         search_key.type = BTRFS_XATTR_ITEM_KEY;
2236         search_key.offset = 0;
2237 again:
2238         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2239         if (ret < 0)
2240                 goto out;
2241 process_leaf:
2242         nritems = btrfs_header_nritems(path->nodes[0]);
2243         for (i = path->slots[0]; i < nritems; i++) {
2244                 struct btrfs_key key;
2245                 struct btrfs_dir_item *di;
2246                 struct btrfs_dir_item *log_di;
2247                 u32 total_size;
2248                 u32 cur;
2249
2250                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2251                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2252                         ret = 0;
2253                         goto out;
2254                 }
2255
2256                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2257                 total_size = btrfs_item_size(path->nodes[0], i);
2258                 cur = 0;
2259                 while (cur < total_size) {
2260                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2261                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2262                         u32 this_len = sizeof(*di) + name_len + data_len;
2263                         char *name;
2264
2265                         name = kmalloc(name_len, GFP_NOFS);
2266                         if (!name) {
2267                                 ret = -ENOMEM;
2268                                 goto out;
2269                         }
2270                         read_extent_buffer(path->nodes[0], name,
2271                                            (unsigned long)(di + 1), name_len);
2272
2273                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2274                                                     name, name_len, 0);
2275                         btrfs_release_path(log_path);
2276                         if (!log_di) {
2277                                 /* Doesn't exist in log tree, so delete it. */
2278                                 btrfs_release_path(path);
2279                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2280                                                         name, name_len, -1);
2281                                 kfree(name);
2282                                 if (IS_ERR(di)) {
2283                                         ret = PTR_ERR(di);
2284                                         goto out;
2285                                 }
2286                                 ASSERT(di);
2287                                 ret = btrfs_delete_one_dir_name(trans, root,
2288                                                                 path, di);
2289                                 if (ret)
2290                                         goto out;
2291                                 btrfs_release_path(path);
2292                                 search_key = key;
2293                                 goto again;
2294                         }
2295                         kfree(name);
2296                         if (IS_ERR(log_di)) {
2297                                 ret = PTR_ERR(log_di);
2298                                 goto out;
2299                         }
2300                         cur += this_len;
2301                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2302                 }
2303         }
2304         ret = btrfs_next_leaf(root, path);
2305         if (ret > 0)
2306                 ret = 0;
2307         else if (ret == 0)
2308                 goto process_leaf;
2309 out:
2310         btrfs_free_path(log_path);
2311         btrfs_release_path(path);
2312         return ret;
2313 }
2314
2315
2316 /*
2317  * deletion replay happens before we copy any new directory items
2318  * out of the log or out of backreferences from inodes.  It
2319  * scans the log to find ranges of keys that log is authoritative for,
2320  * and then scans the directory to find items in those ranges that are
2321  * not present in the log.
2322  *
2323  * Anything we don't find in the log is unlinked and removed from the
2324  * directory.
2325  */
2326 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2327                                        struct btrfs_root *root,
2328                                        struct btrfs_root *log,
2329                                        struct btrfs_path *path,
2330                                        u64 dirid, int del_all)
2331 {
2332         u64 range_start;
2333         u64 range_end;
2334         int ret = 0;
2335         struct btrfs_key dir_key;
2336         struct btrfs_key found_key;
2337         struct btrfs_path *log_path;
2338         struct inode *dir;
2339
2340         dir_key.objectid = dirid;
2341         dir_key.type = BTRFS_DIR_INDEX_KEY;
2342         log_path = btrfs_alloc_path();
2343         if (!log_path)
2344                 return -ENOMEM;
2345
2346         dir = read_one_inode(root, dirid);
2347         /* it isn't an error if the inode isn't there, that can happen
2348          * because we replay the deletes before we copy in the inode item
2349          * from the log
2350          */
2351         if (!dir) {
2352                 btrfs_free_path(log_path);
2353                 return 0;
2354         }
2355
2356         range_start = 0;
2357         range_end = 0;
2358         while (1) {
2359                 if (del_all)
2360                         range_end = (u64)-1;
2361                 else {
2362                         ret = find_dir_range(log, path, dirid,
2363                                              &range_start, &range_end);
2364                         if (ret < 0)
2365                                 goto out;
2366                         else if (ret > 0)
2367                                 break;
2368                 }
2369
2370                 dir_key.offset = range_start;
2371                 while (1) {
2372                         int nritems;
2373                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2374                                                 0, 0);
2375                         if (ret < 0)
2376                                 goto out;
2377
2378                         nritems = btrfs_header_nritems(path->nodes[0]);
2379                         if (path->slots[0] >= nritems) {
2380                                 ret = btrfs_next_leaf(root, path);
2381                                 if (ret == 1)
2382                                         break;
2383                                 else if (ret < 0)
2384                                         goto out;
2385                         }
2386                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2387                                               path->slots[0]);
2388                         if (found_key.objectid != dirid ||
2389                             found_key.type != dir_key.type) {
2390                                 ret = 0;
2391                                 goto out;
2392                         }
2393
2394                         if (found_key.offset > range_end)
2395                                 break;
2396
2397                         ret = check_item_in_log(trans, log, path,
2398                                                 log_path, dir,
2399                                                 &found_key);
2400                         if (ret)
2401                                 goto out;
2402                         if (found_key.offset == (u64)-1)
2403                                 break;
2404                         dir_key.offset = found_key.offset + 1;
2405                 }
2406                 btrfs_release_path(path);
2407                 if (range_end == (u64)-1)
2408                         break;
2409                 range_start = range_end + 1;
2410         }
2411         ret = 0;
2412 out:
2413         btrfs_release_path(path);
2414         btrfs_free_path(log_path);
2415         iput(dir);
2416         return ret;
2417 }
2418
2419 /*
2420  * the process_func used to replay items from the log tree.  This
2421  * gets called in two different stages.  The first stage just looks
2422  * for inodes and makes sure they are all copied into the subvolume.
2423  *
2424  * The second stage copies all the other item types from the log into
2425  * the subvolume.  The two stage approach is slower, but gets rid of
2426  * lots of complexity around inodes referencing other inodes that exist
2427  * only in the log (references come from either directory items or inode
2428  * back refs).
2429  */
2430 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2431                              struct walk_control *wc, u64 gen, int level)
2432 {
2433         int nritems;
2434         struct btrfs_path *path;
2435         struct btrfs_root *root = wc->replay_dest;
2436         struct btrfs_key key;
2437         int i;
2438         int ret;
2439
2440         ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
2441         if (ret)
2442                 return ret;
2443
2444         level = btrfs_header_level(eb);
2445
2446         if (level != 0)
2447                 return 0;
2448
2449         path = btrfs_alloc_path();
2450         if (!path)
2451                 return -ENOMEM;
2452
2453         nritems = btrfs_header_nritems(eb);
2454         for (i = 0; i < nritems; i++) {
2455                 btrfs_item_key_to_cpu(eb, &key, i);
2456
2457                 /* inode keys are done during the first stage */
2458                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2459                     wc->stage == LOG_WALK_REPLAY_INODES) {
2460                         struct btrfs_inode_item *inode_item;
2461                         u32 mode;
2462
2463                         inode_item = btrfs_item_ptr(eb, i,
2464                                             struct btrfs_inode_item);
2465                         /*
2466                          * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2467                          * and never got linked before the fsync, skip it, as
2468                          * replaying it is pointless since it would be deleted
2469                          * later. We skip logging tmpfiles, but it's always
2470                          * possible we are replaying a log created with a kernel
2471                          * that used to log tmpfiles.
2472                          */
2473                         if (btrfs_inode_nlink(eb, inode_item) == 0) {
2474                                 wc->ignore_cur_inode = true;
2475                                 continue;
2476                         } else {
2477                                 wc->ignore_cur_inode = false;
2478                         }
2479                         ret = replay_xattr_deletes(wc->trans, root, log,
2480                                                    path, key.objectid);
2481                         if (ret)
2482                                 break;
2483                         mode = btrfs_inode_mode(eb, inode_item);
2484                         if (S_ISDIR(mode)) {
2485                                 ret = replay_dir_deletes(wc->trans,
2486                                          root, log, path, key.objectid, 0);
2487                                 if (ret)
2488                                         break;
2489                         }
2490                         ret = overwrite_item(wc->trans, root, path,
2491                                              eb, i, &key);
2492                         if (ret)
2493                                 break;
2494
2495                         /*
2496                          * Before replaying extents, truncate the inode to its
2497                          * size. We need to do it now and not after log replay
2498                          * because before an fsync we can have prealloc extents
2499                          * added beyond the inode's i_size. If we did it after,
2500                          * through orphan cleanup for example, we would drop
2501                          * those prealloc extents just after replaying them.
2502                          */
2503                         if (S_ISREG(mode)) {
2504                                 struct btrfs_drop_extents_args drop_args = { 0 };
2505                                 struct inode *inode;
2506                                 u64 from;
2507
2508                                 inode = read_one_inode(root, key.objectid);
2509                                 if (!inode) {
2510                                         ret = -EIO;
2511                                         break;
2512                                 }
2513                                 from = ALIGN(i_size_read(inode),
2514                                              root->fs_info->sectorsize);
2515                                 drop_args.start = from;
2516                                 drop_args.end = (u64)-1;
2517                                 drop_args.drop_cache = true;
2518                                 ret = btrfs_drop_extents(wc->trans, root,
2519                                                          BTRFS_I(inode),
2520                                                          &drop_args);
2521                                 if (!ret) {
2522                                         inode_sub_bytes(inode,
2523                                                         drop_args.bytes_found);
2524                                         /* Update the inode's nbytes. */
2525                                         ret = btrfs_update_inode(wc->trans,
2526                                                         root, BTRFS_I(inode));
2527                                 }
2528                                 iput(inode);
2529                                 if (ret)
2530                                         break;
2531                         }
2532
2533                         ret = link_to_fixup_dir(wc->trans, root,
2534                                                 path, key.objectid);
2535                         if (ret)
2536                                 break;
2537                 }
2538
2539                 if (wc->ignore_cur_inode)
2540                         continue;
2541
2542                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2543                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2544                         ret = replay_one_dir_item(wc->trans, root, path,
2545                                                   eb, i, &key);
2546                         if (ret)
2547                                 break;
2548                 }
2549
2550                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2551                         continue;
2552
2553                 /* these keys are simply copied */
2554                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2555                         ret = overwrite_item(wc->trans, root, path,
2556                                              eb, i, &key);
2557                         if (ret)
2558                                 break;
2559                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2560                            key.type == BTRFS_INODE_EXTREF_KEY) {
2561                         ret = add_inode_ref(wc->trans, root, log, path,
2562                                             eb, i, &key);
2563                         if (ret && ret != -ENOENT)
2564                                 break;
2565                         ret = 0;
2566                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2567                         ret = replay_one_extent(wc->trans, root, path,
2568                                                 eb, i, &key);
2569                         if (ret)
2570                                 break;
2571                 }
2572                 /*
2573                  * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2574                  * BTRFS_DIR_INDEX_KEY items which we use to derive the
2575                  * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2576                  * older kernel with such keys, ignore them.
2577                  */
2578         }
2579         btrfs_free_path(path);
2580         return ret;
2581 }
2582
2583 /*
2584  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2585  */
2586 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2587 {
2588         struct btrfs_block_group *cache;
2589
2590         cache = btrfs_lookup_block_group(fs_info, start);
2591         if (!cache) {
2592                 btrfs_err(fs_info, "unable to find block group for %llu", start);
2593                 return;
2594         }
2595
2596         spin_lock(&cache->space_info->lock);
2597         spin_lock(&cache->lock);
2598         cache->reserved -= fs_info->nodesize;
2599         cache->space_info->bytes_reserved -= fs_info->nodesize;
2600         spin_unlock(&cache->lock);
2601         spin_unlock(&cache->space_info->lock);
2602
2603         btrfs_put_block_group(cache);
2604 }
2605
2606 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2607                                    struct btrfs_root *root,
2608                                    struct btrfs_path *path, int *level,
2609                                    struct walk_control *wc)
2610 {
2611         struct btrfs_fs_info *fs_info = root->fs_info;
2612         u64 bytenr;
2613         u64 ptr_gen;
2614         struct extent_buffer *next;
2615         struct extent_buffer *cur;
2616         u32 blocksize;
2617         int ret = 0;
2618
2619         while (*level > 0) {
2620                 struct btrfs_key first_key;
2621
2622                 cur = path->nodes[*level];
2623
2624                 WARN_ON(btrfs_header_level(cur) != *level);
2625
2626                 if (path->slots[*level] >=
2627                     btrfs_header_nritems(cur))
2628                         break;
2629
2630                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2631                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2632                 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2633                 blocksize = fs_info->nodesize;
2634
2635                 next = btrfs_find_create_tree_block(fs_info, bytenr,
2636                                                     btrfs_header_owner(cur),
2637                                                     *level - 1);
2638                 if (IS_ERR(next))
2639                         return PTR_ERR(next);
2640
2641                 if (*level == 1) {
2642                         ret = wc->process_func(root, next, wc, ptr_gen,
2643                                                *level - 1);
2644                         if (ret) {
2645                                 free_extent_buffer(next);
2646                                 return ret;
2647                         }
2648
2649                         path->slots[*level]++;
2650                         if (wc->free) {
2651                                 ret = btrfs_read_extent_buffer(next, ptr_gen,
2652                                                         *level - 1, &first_key);
2653                                 if (ret) {
2654                                         free_extent_buffer(next);
2655                                         return ret;
2656                                 }
2657
2658                                 if (trans) {
2659                                         btrfs_tree_lock(next);
2660                                         btrfs_clean_tree_block(next);
2661                                         btrfs_wait_tree_block_writeback(next);
2662                                         btrfs_tree_unlock(next);
2663                                         ret = btrfs_pin_reserved_extent(trans,
2664                                                         bytenr, blocksize);
2665                                         if (ret) {
2666                                                 free_extent_buffer(next);
2667                                                 return ret;
2668                                         }
2669                                         btrfs_redirty_list_add(
2670                                                 trans->transaction, next);
2671                                 } else {
2672                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2673                                                 clear_extent_buffer_dirty(next);
2674                                         unaccount_log_buffer(fs_info, bytenr);
2675                                 }
2676                         }
2677                         free_extent_buffer(next);
2678                         continue;
2679                 }
2680                 ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
2681                 if (ret) {
2682                         free_extent_buffer(next);
2683                         return ret;
2684                 }
2685
2686                 if (path->nodes[*level-1])
2687                         free_extent_buffer(path->nodes[*level-1]);
2688                 path->nodes[*level-1] = next;
2689                 *level = btrfs_header_level(next);
2690                 path->slots[*level] = 0;
2691                 cond_resched();
2692         }
2693         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2694
2695         cond_resched();
2696         return 0;
2697 }
2698
2699 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2700                                  struct btrfs_root *root,
2701                                  struct btrfs_path *path, int *level,
2702                                  struct walk_control *wc)
2703 {
2704         struct btrfs_fs_info *fs_info = root->fs_info;
2705         int i;
2706         int slot;
2707         int ret;
2708
2709         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2710                 slot = path->slots[i];
2711                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2712                         path->slots[i]++;
2713                         *level = i;
2714                         WARN_ON(*level == 0);
2715                         return 0;
2716                 } else {
2717                         ret = wc->process_func(root, path->nodes[*level], wc,
2718                                  btrfs_header_generation(path->nodes[*level]),
2719                                  *level);
2720                         if (ret)
2721                                 return ret;
2722
2723                         if (wc->free) {
2724                                 struct extent_buffer *next;
2725
2726                                 next = path->nodes[*level];
2727
2728                                 if (trans) {
2729                                         btrfs_tree_lock(next);
2730                                         btrfs_clean_tree_block(next);
2731                                         btrfs_wait_tree_block_writeback(next);
2732                                         btrfs_tree_unlock(next);
2733                                         ret = btrfs_pin_reserved_extent(trans,
2734                                                      path->nodes[*level]->start,
2735                                                      path->nodes[*level]->len);
2736                                         if (ret)
2737                                                 return ret;
2738                                         btrfs_redirty_list_add(trans->transaction,
2739                                                                next);
2740                                 } else {
2741                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2742                                                 clear_extent_buffer_dirty(next);
2743
2744                                         unaccount_log_buffer(fs_info,
2745                                                 path->nodes[*level]->start);
2746                                 }
2747                         }
2748                         free_extent_buffer(path->nodes[*level]);
2749                         path->nodes[*level] = NULL;
2750                         *level = i + 1;
2751                 }
2752         }
2753         return 1;
2754 }
2755
/*
 * Walk the whole log tree 'log', passing the blocks that are visited to
 * wc->process_func.  When wc->free is set, each processed block is also
 * cleaned up: with a transaction handle its extent is pinned, without one
 * its dirty bit is cleared and its reserved space is unaccounted.
 */
2761 static int walk_log_tree(struct btrfs_trans_handle *trans,
2762                          struct btrfs_root *log, struct walk_control *wc)
2763 {
2764         struct btrfs_fs_info *fs_info = log->fs_info;
2765         int ret = 0;
2766         int wret;
2767         int level;
2768         struct btrfs_path *path;
2769         int orig_level;
2770
2771         path = btrfs_alloc_path();
2772         if (!path)
2773                 return -ENOMEM;
2774
2775         level = btrfs_header_level(log->node);
2776         orig_level = level;
2777         path->nodes[level] = log->node;
2778         atomic_inc(&log->node->refs);
2779         path->slots[level] = 0;
2780
2781         while (1) {
2782                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2783                 if (wret > 0)
2784                         break;
2785                 if (wret < 0) {
2786                         ret = wret;
2787                         goto out;
2788                 }
2789
2790                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2791                 if (wret > 0)
2792                         break;
2793                 if (wret < 0) {
2794                         ret = wret;
2795                         goto out;
2796                 }
2797         }
2798
2799         /* was the root node processed? if not, catch it here */
2800         if (path->nodes[orig_level]) {
2801                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2802                          btrfs_header_generation(path->nodes[orig_level]),
2803                          orig_level);
2804                 if (ret)
2805                         goto out;
2806                 if (wc->free) {
2807                         struct extent_buffer *next;
2808
2809                         next = path->nodes[orig_level];
2810
2811                         if (trans) {
2812                                 btrfs_tree_lock(next);
2813                                 btrfs_clean_tree_block(next);
2814                                 btrfs_wait_tree_block_writeback(next);
2815                                 btrfs_tree_unlock(next);
2816                                 ret = btrfs_pin_reserved_extent(trans,
2817                                                 next->start, next->len);
2818                                 if (ret)
2819                                         goto out;
2820                                 btrfs_redirty_list_add(trans->transaction, next);
2821                         } else {
2822                                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2823                                         clear_extent_buffer_dirty(next);
2824                                 unaccount_log_buffer(fs_info, next->start);
2825                         }
2826                 }
2827         }
2828
2829 out:
2830         btrfs_free_path(path);
2831         return ret;
2832 }
2833
2834 /*
2835  * helper function to update the item for a given subvolumes log root
2836  * in the tree of log roots
2837  */
2838 static int update_log_root(struct btrfs_trans_handle *trans,
2839                            struct btrfs_root *log,
2840                            struct btrfs_root_item *root_item)
2841 {
2842         struct btrfs_fs_info *fs_info = log->fs_info;
2843         int ret;
2844
2845         if (log->log_transid == 1) {
2846                 /* insert root item on the first sync */
2847                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2848                                 &log->root_key, root_item);
2849         } else {
2850                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2851                                 &log->root_key, root_item);
2852         }
2853         return ret;
2854 }
2855
2856 static void wait_log_commit(struct btrfs_root *root, int transid)
2857 {
2858         DEFINE_WAIT(wait);
2859         int index = transid % 2;
2860
2861         /*
2862          * we only allow two pending log transactions at a time,
2863          * so we know that if ours is more than 2 older than the
2864          * current transaction, we're done
2865          */
2866         for (;;) {
2867                 prepare_to_wait(&root->log_commit_wait[index],
2868                                 &wait, TASK_UNINTERRUPTIBLE);
2869
2870                 if (!(root->log_transid_committed < transid &&
2871                       atomic_read(&root->log_commit[index])))
2872                         break;
2873
2874                 mutex_unlock(&root->log_mutex);
2875                 schedule();
2876                 mutex_lock(&root->log_mutex);
2877         }
2878         finish_wait(&root->log_commit_wait[index], &wait);
2879 }
2880
2881 static void wait_for_writer(struct btrfs_root *root)
2882 {
2883         DEFINE_WAIT(wait);
2884
2885         for (;;) {
2886                 prepare_to_wait(&root->log_writer_wait, &wait,
2887                                 TASK_UNINTERRUPTIBLE);
2888                 if (!atomic_read(&root->log_writers))
2889                         break;
2890
2891                 mutex_unlock(&root->log_mutex);
2892                 schedule();
2893                 mutex_lock(&root->log_mutex);
2894         }
2895         finish_wait(&root->log_writer_wait, &wait);
2896 }
2897
2898 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2899                                         struct btrfs_log_ctx *ctx)
2900 {
2901         mutex_lock(&root->log_mutex);
2902         list_del_init(&ctx->list);
2903         mutex_unlock(&root->log_mutex);
2904 }
2905
2906 /*
2907  * Invoked in log mutex context, or be sure there is no other task which
2908  * can access the list.
2909  */
2910 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2911                                              int index, int error)
2912 {
2913         struct btrfs_log_ctx *ctx;
2914         struct btrfs_log_ctx *safe;
2915
2916         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2917                 list_del_init(&ctx->list);
2918                 ctx->log_ret = error;
2919         }
2920 }
2921
2922 /*
 * btrfs_sync_log sends a given tree log down to the disk and
2924  * updates the super blocks to record it.  When this call is done,
2925  * you know that any inodes previously logged are safely on disk only
2926  * if it returns 0.
2927  *
2928  * Any other return value means you need to call btrfs_commit_transaction.
2929  * Some of the edge cases for fsyncing directories that have had unlinks
2930  * or renames done in the past mean that sometimes the only safe
2931  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2932  * that has happened.
2933  */
2934 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2935                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2936 {
2937         int index1;
2938         int index2;
2939         int mark;
2940         int ret;
2941         struct btrfs_fs_info *fs_info = root->fs_info;
2942         struct btrfs_root *log = root->log_root;
2943         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2944         struct btrfs_root_item new_root_item;
2945         int log_transid = 0;
2946         struct btrfs_log_ctx root_log_ctx;
2947         struct blk_plug plug;
2948         u64 log_root_start;
2949         u64 log_root_level;
2950
2951         mutex_lock(&root->log_mutex);
2952         log_transid = ctx->log_transid;
2953         if (root->log_transid_committed >= log_transid) {
2954                 mutex_unlock(&root->log_mutex);
2955                 return ctx->log_ret;
2956         }
2957
2958         index1 = log_transid % 2;
2959         if (atomic_read(&root->log_commit[index1])) {
2960                 wait_log_commit(root, log_transid);
2961                 mutex_unlock(&root->log_mutex);
2962                 return ctx->log_ret;
2963         }
2964         ASSERT(log_transid == root->log_transid);
2965         atomic_set(&root->log_commit[index1], 1);
2966
2967         /* wait for previous tree log sync to complete */
2968         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2969                 wait_log_commit(root, log_transid - 1);
2970
2971         while (1) {
2972                 int batch = atomic_read(&root->log_batch);
2973                 /* when we're on an ssd, just kick the log commit out */
2974                 if (!btrfs_test_opt(fs_info, SSD) &&
2975                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2976                         mutex_unlock(&root->log_mutex);
2977                         schedule_timeout_uninterruptible(1);
2978                         mutex_lock(&root->log_mutex);
2979                 }
2980                 wait_for_writer(root);
2981                 if (batch == atomic_read(&root->log_batch))
2982                         break;
2983         }
2984
2985         /* bail out if we need to do a full commit */
2986         if (btrfs_need_log_full_commit(trans)) {
2987                 ret = BTRFS_LOG_FORCE_COMMIT;
2988                 mutex_unlock(&root->log_mutex);
2989                 goto out;
2990         }
2991
2992         if (log_transid % 2 == 0)
2993                 mark = EXTENT_DIRTY;
2994         else
2995                 mark = EXTENT_NEW;
2996
2997         /* we start IO on  all the marked extents here, but we don't actually
2998          * wait for them until later.
2999          */
3000         blk_start_plug(&plug);
3001         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3002         /*
3003          * -EAGAIN happens when someone, e.g., a concurrent transaction
3004          *  commit, writes a dirty extent in this tree-log commit. This
3005          *  concurrent write will create a hole writing out the extents,
3006          *  and we cannot proceed on a zoned filesystem, requiring
3007          *  sequential writing. While we can bail out to a full commit
3008          *  here, but we can continue hoping the concurrent writing fills
3009          *  the hole.
3010          */
3011         if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3012                 ret = 0;
3013         if (ret) {
3014                 blk_finish_plug(&plug);
3015                 btrfs_abort_transaction(trans, ret);
3016                 btrfs_set_log_full_commit(trans);
3017                 mutex_unlock(&root->log_mutex);
3018                 goto out;
3019         }
3020
3021         /*
3022          * We _must_ update under the root->log_mutex in order to make sure we
3023          * have a consistent view of the log root we are trying to commit at
3024          * this moment.
3025          *
3026          * We _must_ copy this into a local copy, because we are not holding the
3027          * log_root_tree->log_mutex yet.  This is important because when we
3028          * commit the log_root_tree we must have a consistent view of the
3029          * log_root_tree when we update the super block to point at the
3030          * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3031          * with the commit and possibly point at the new block which we may not
3032          * have written out.
3033          */
3034         btrfs_set_root_node(&log->root_item, log->node);
3035         memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3036
3037         root->log_transid++;
3038         log->log_transid = root->log_transid;
3039         root->log_start_pid = 0;
3040         /*
3041          * IO has been started, blocks of the log tree have WRITTEN flag set
3042          * in their headers. new modifications of the log will be written to
3043          * new positions. so it's safe to allow log writers to go in.
3044          */
3045         mutex_unlock(&root->log_mutex);
3046
3047         if (btrfs_is_zoned(fs_info)) {
3048                 mutex_lock(&fs_info->tree_root->log_mutex);
3049                 if (!log_root_tree->node) {
3050                         ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3051                         if (ret) {
3052                                 mutex_unlock(&fs_info->tree_root->log_mutex);
3053                                 blk_finish_plug(&plug);
3054                                 goto out;
3055                         }
3056                 }
3057                 mutex_unlock(&fs_info->tree_root->log_mutex);
3058         }
3059
3060         btrfs_init_log_ctx(&root_log_ctx, NULL);
3061
3062         mutex_lock(&log_root_tree->log_mutex);
3063
3064         index2 = log_root_tree->log_transid % 2;
3065         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3066         root_log_ctx.log_transid = log_root_tree->log_transid;
3067
3068         /*
3069          * Now we are safe to update the log_root_tree because we're under the
3070          * log_mutex, and we're a current writer so we're holding the commit
3071          * open until we drop the log_mutex.
3072          */
3073         ret = update_log_root(trans, log, &new_root_item);
3074         if (ret) {
3075                 if (!list_empty(&root_log_ctx.list))
3076                         list_del_init(&root_log_ctx.list);
3077
3078                 blk_finish_plug(&plug);
3079                 btrfs_set_log_full_commit(trans);
3080
3081                 if (ret != -ENOSPC) {
3082                         btrfs_abort_transaction(trans, ret);
3083                         mutex_unlock(&log_root_tree->log_mutex);
3084                         goto out;
3085                 }
3086                 btrfs_wait_tree_log_extents(log, mark);
3087                 mutex_unlock(&log_root_tree->log_mutex);
3088                 ret = BTRFS_LOG_FORCE_COMMIT;
3089                 goto out;
3090         }
3091
3092         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3093                 blk_finish_plug(&plug);
3094                 list_del_init(&root_log_ctx.list);
3095                 mutex_unlock(&log_root_tree->log_mutex);
3096                 ret = root_log_ctx.log_ret;
3097                 goto out;
3098         }
3099
3100         index2 = root_log_ctx.log_transid % 2;
3101         if (atomic_read(&log_root_tree->log_commit[index2])) {
3102                 blk_finish_plug(&plug);
3103                 ret = btrfs_wait_tree_log_extents(log, mark);
3104                 wait_log_commit(log_root_tree,
3105                                 root_log_ctx.log_transid);
3106                 mutex_unlock(&log_root_tree->log_mutex);
3107                 if (!ret)
3108                         ret = root_log_ctx.log_ret;
3109                 goto out;
3110         }
3111         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3112         atomic_set(&log_root_tree->log_commit[index2], 1);
3113
3114         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3115                 wait_log_commit(log_root_tree,
3116                                 root_log_ctx.log_transid - 1);
3117         }
3118
3119         /*
3120          * now that we've moved on to the tree of log tree roots,
3121          * check the full commit flag again
3122          */
3123         if (btrfs_need_log_full_commit(trans)) {
3124                 blk_finish_plug(&plug);
3125                 btrfs_wait_tree_log_extents(log, mark);
3126                 mutex_unlock(&log_root_tree->log_mutex);
3127                 ret = BTRFS_LOG_FORCE_COMMIT;
3128                 goto out_wake_log_root;
3129         }
3130
3131         ret = btrfs_write_marked_extents(fs_info,
3132                                          &log_root_tree->dirty_log_pages,
3133                                          EXTENT_DIRTY | EXTENT_NEW);
3134         blk_finish_plug(&plug);
3135         /*
3136          * As described above, -EAGAIN indicates a hole in the extents. We
3137          * cannot wait for these write outs since the waiting cause a
3138          * deadlock. Bail out to the full commit instead.
3139          */
3140         if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3141                 btrfs_set_log_full_commit(trans);
3142                 btrfs_wait_tree_log_extents(log, mark);
3143                 mutex_unlock(&log_root_tree->log_mutex);
3144                 goto out_wake_log_root;
3145         } else if (ret) {
3146                 btrfs_set_log_full_commit(trans);
3147                 btrfs_abort_transaction(trans, ret);
3148                 mutex_unlock(&log_root_tree->log_mutex);
3149                 goto out_wake_log_root;
3150         }
3151         ret = btrfs_wait_tree_log_extents(log, mark);
3152         if (!ret)
3153                 ret = btrfs_wait_tree_log_extents(log_root_tree,
3154                                                   EXTENT_NEW | EXTENT_DIRTY);
3155         if (ret) {
3156                 btrfs_set_log_full_commit(trans);
3157                 mutex_unlock(&log_root_tree->log_mutex);
3158                 goto out_wake_log_root;
3159         }
3160
3161         log_root_start = log_root_tree->node->start;
3162         log_root_level = btrfs_header_level(log_root_tree->node);
3163         log_root_tree->log_transid++;
3164         mutex_unlock(&log_root_tree->log_mutex);
3165
3166         /*
3167          * Here we are guaranteed that nobody is going to write the superblock
3168          * for the current transaction before us and that neither we do write
3169          * our superblock before the previous transaction finishes its commit
3170          * and writes its superblock, because:
3171          *
3172          * 1) We are holding a handle on the current transaction, so no body
3173          *    can commit it until we release the handle;
3174          *
3175          * 2) Before writing our superblock we acquire the tree_log_mutex, so
3176          *    if the previous transaction is still committing, and hasn't yet
3177          *    written its superblock, we wait for it to do it, because a
3178          *    transaction commit acquires the tree_log_mutex when the commit
3179          *    begins and releases it only after writing its superblock.
3180          */
3181         mutex_lock(&fs_info->tree_log_mutex);
3182
3183         /*
3184          * The previous transaction writeout phase could have failed, and thus
3185          * marked the fs in an error state.  We must not commit here, as we
3186          * could have updated our generation in the super_for_commit and
3187          * writing the super here would result in transid mismatches.  If there
3188          * is an error here just bail.
3189          */
3190         if (BTRFS_FS_ERROR(fs_info)) {
3191                 ret = -EIO;
3192                 btrfs_set_log_full_commit(trans);
3193                 btrfs_abort_transaction(trans, ret);
3194                 mutex_unlock(&fs_info->tree_log_mutex);
3195                 goto out_wake_log_root;
3196         }
3197
3198         btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3199         btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3200         ret = write_all_supers(fs_info, 1);
3201         mutex_unlock(&fs_info->tree_log_mutex);
3202         if (ret) {
3203                 btrfs_set_log_full_commit(trans);
3204                 btrfs_abort_transaction(trans, ret);
3205                 goto out_wake_log_root;
3206         }
3207
3208         /*
3209          * We know there can only be one task here, since we have not yet set
3210          * root->log_commit[index1] to 0 and any task attempting to sync the
3211          * log must wait for the previous log transaction to commit if it's
3212          * still in progress or wait for the current log transaction commit if
3213          * someone else already started it. We use <= and not < because the
3214          * first log transaction has an ID of 0.
3215          */
3216         ASSERT(root->last_log_commit <= log_transid);
3217         root->last_log_commit = log_transid;
3218
3219 out_wake_log_root:
3220         mutex_lock(&log_root_tree->log_mutex);
3221         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3222
3223         log_root_tree->log_transid_committed++;
3224         atomic_set(&log_root_tree->log_commit[index2], 0);
3225         mutex_unlock(&log_root_tree->log_mutex);
3226
3227         /*
3228          * The barrier before waitqueue_active (in cond_wake_up) is needed so
3229          * all the updates above are seen by the woken threads. It might not be
3230          * necessary, but proving that seems to be hard.
3231          */
3232         cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3233 out:
3234         mutex_lock(&root->log_mutex);
3235         btrfs_remove_all_log_ctxs(root, index1, ret);
3236         root->log_transid_committed++;
3237         atomic_set(&root->log_commit[index1], 0);
3238         mutex_unlock(&root->log_mutex);
3239
3240         /*
3241          * The barrier before waitqueue_active (in cond_wake_up) is needed so
3242          * all the updates above are seen by the woken threads. It might not be
3243          * necessary, but proving that seems to be hard.
3244          */
3245         cond_wake_up(&root->log_commit_wait[index1]);
3246         return ret;
3247 }
3248
3249 static void free_log_tree(struct btrfs_trans_handle *trans,
3250                           struct btrfs_root *log)
3251 {
3252         int ret;
3253         struct walk_control wc = {
3254                 .free = 1,
3255                 .process_func = process_one_buffer
3256         };
3257
3258         if (log->node) {
3259                 ret = walk_log_tree(trans, log, &wc);
3260                 if (ret) {
3261                         /*
3262                          * We weren't able to traverse the entire log tree, the
3263                          * typical scenario is getting an -EIO when reading an
3264                          * extent buffer of the tree, due to a previous writeback
3265                          * failure of it.
3266                          */
3267                         set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3268                                 &log->fs_info->fs_state);
3269
3270                         /*
3271                          * Some extent buffers of the log tree may still be dirty
3272                          * and not yet written back to storage, because we may
3273                          * have updates to a log tree without syncing a log tree,
3274                          * such as during rename and link operations. So flush
3275                          * them out and wait for their writeback to complete, so
3276                          * that we properly cleanup their state and pages.
3277                          */
3278                         btrfs_write_marked_extents(log->fs_info,
3279                                                    &log->dirty_log_pages,
3280                                                    EXTENT_DIRTY | EXTENT_NEW);
3281                         btrfs_wait_tree_log_extents(log,
3282                                                     EXTENT_DIRTY | EXTENT_NEW);
3283
3284                         if (trans)
3285                                 btrfs_abort_transaction(trans, ret);
3286                         else
3287                                 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3288                 }
3289         }
3290
3291         clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3292                           EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3293         extent_io_tree_release(&log->log_csum_range);
3294
3295         btrfs_put_root(log);
3296 }
3297
3298 /*
3299  * free all the extents used by the tree log.  This should be called
3300  * at commit time of the full transaction
3301  */
3302 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3303 {
3304         if (root->log_root) {
3305                 free_log_tree(trans, root->log_root);
3306                 root->log_root = NULL;
3307                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3308         }
3309         return 0;
3310 }
3311
3312 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3313                              struct btrfs_fs_info *fs_info)
3314 {
3315         if (fs_info->log_root_tree) {
3316                 free_log_tree(trans, fs_info->log_root_tree);
3317                 fs_info->log_root_tree = NULL;
3318                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3319         }
3320         return 0;
3321 }
3322
3323 /*
3324  * Check if an inode was logged in the current transaction. This correctly deals
3325  * with the case where the inode was logged but has a logged_trans of 0, which
3326  * happens if the inode is evicted and loaded again, as logged_trans is an in
3327  * memory only field (not persisted).
3328  *
3329  * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3330  * and < 0 on error.
3331  */
3332 static int inode_logged(struct btrfs_trans_handle *trans,
3333                         struct btrfs_inode *inode,
3334                         struct btrfs_path *path_in)
3335 {
3336         struct btrfs_path *path = path_in;
3337         struct btrfs_key key;
3338         int ret;
3339
3340         if (inode->logged_trans == trans->transid)
3341                 return 1;
3342
3343         /*
3344          * If logged_trans is not 0, then we know the inode logged was not logged
3345          * in this transaction, so we can return false right away.
3346          */
3347         if (inode->logged_trans > 0)
3348                 return 0;
3349
3350         /*
3351          * If no log tree was created for this root in this transaction, then
3352          * the inode can not have been logged in this transaction. In that case
3353          * set logged_trans to anything greater than 0 and less than the current
3354          * transaction's ID, to avoid the search below in a future call in case
3355          * a log tree gets created after this.
3356          */
3357         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3358                 inode->logged_trans = trans->transid - 1;
3359                 return 0;
3360         }
3361
3362         /*
3363          * We have a log tree and the inode's logged_trans is 0. We can't tell
3364          * for sure if the inode was logged before in this transaction by looking
3365          * only at logged_trans. We could be pessimistic and assume it was, but
3366          * that can lead to unnecessarily logging an inode during rename and link
3367          * operations, and then further updating the log in followup rename and
3368          * link operations, specially if it's a directory, which adds latency
3369          * visible to applications doing a series of rename or link operations.
3370          *
3371          * A logged_trans of 0 here can mean several things:
3372          *
3373          * 1) The inode was never logged since the filesystem was mounted, and may
3374          *    or may have not been evicted and loaded again;
3375          *
3376          * 2) The inode was logged in a previous transaction, then evicted and
3377          *    then loaded again;
3378          *
3379          * 3) The inode was logged in the current transaction, then evicted and
3380          *    then loaded again.
3381          *
3382          * For cases 1) and 2) we don't want to return true, but we need to detect
3383          * case 3) and return true. So we do a search in the log root for the inode
3384          * item.
3385          */
3386         key.objectid = btrfs_ino(inode);
3387         key.type = BTRFS_INODE_ITEM_KEY;
3388         key.offset = 0;
3389
3390         if (!path) {
3391                 path = btrfs_alloc_path();
3392                 if (!path)
3393                         return -ENOMEM;
3394         }
3395
3396         ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3397
3398         if (path_in)
3399                 btrfs_release_path(path);
3400         else
3401                 btrfs_free_path(path);
3402
3403         /*
3404          * Logging an inode always results in logging its inode item. So if we
3405          * did not find the item we know the inode was not logged for sure.
3406          */
3407         if (ret < 0) {
3408                 return ret;
3409         } else if (ret > 0) {
3410                 /*
3411                  * Set logged_trans to a value greater than 0 and less then the
3412                  * current transaction to avoid doing the search in future calls.
3413                  */
3414                 inode->logged_trans = trans->transid - 1;
3415                 return 0;
3416         }
3417
3418         /*
3419          * The inode was previously logged and then evicted, set logged_trans to
3420          * the current transacion's ID, to avoid future tree searches as long as
3421          * the inode is not evicted again.
3422          */
3423         inode->logged_trans = trans->transid;
3424
3425         /*
3426          * If it's a directory, then we must set last_dir_index_offset to the
3427          * maximum possible value, so that the next attempt to log the inode does
3428          * not skip checking if dir index keys found in modified subvolume tree
3429          * leaves have been logged before, otherwise it would result in attempts
3430          * to insert duplicate dir index keys in the log tree. This must be done
3431          * because last_dir_index_offset is an in-memory only field, not persisted
3432          * in the inode item or any other on-disk structure, so its value is lost
3433          * once the inode is evicted.
3434          */
3435         if (S_ISDIR(inode->vfs_inode.i_mode))
3436                 inode->last_dir_index_offset = (u64)-1;
3437
3438         return 1;
3439 }
3440
3441 /*
3442  * Delete a directory entry from the log if it exists.
3443  *
3444  * Returns < 0 on error
3445  *           1 if the entry does not exists
3446  *           0 if the entry existed and was successfully deleted
3447  */
3448 static int del_logged_dentry(struct btrfs_trans_handle *trans,
3449                              struct btrfs_root *log,
3450                              struct btrfs_path *path,
3451                              u64 dir_ino,
3452                              const char *name, int name_len,
3453                              u64 index)
3454 {
3455         struct btrfs_dir_item *di;
3456
3457         /*
3458          * We only log dir index items of a directory, so we don't need to look
3459          * for dir item keys.
3460          */
3461         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3462                                          index, name, name_len, -1);
3463         if (IS_ERR(di))
3464                 return PTR_ERR(di);
3465         else if (!di)
3466                 return 1;
3467
3468         /*
3469          * We do not need to update the size field of the directory's
3470          * inode item because on log replay we update the field to reflect
3471          * all existing entries in the directory (see overwrite_item()).
3472          */
3473         return btrfs_delete_one_dir_name(trans, log, path, di);
3474 }
3475
3476 /*
3477  * If both a file and directory are logged, and unlinks or renames are
3478  * mixed in, we have a few interesting corners:
3479  *
3480  * create file X in dir Y
3481  * link file X to X.link in dir Y
3482  * fsync file X
3483  * unlink file X but leave X.link
3484  * fsync dir Y
3485  *
3486  * After a crash we would expect only X.link to exist.  But file X
3487  * didn't get fsync'd again so the log has back refs for X and X.link.
3488  *
3489  * We solve this by removing directory entries and inode backrefs from the
3490  * log when a file that was logged in the current transaction is
3491  * unlinked.  Any later fsync will include the updated log entries, and
3492  * we'll be able to reconstruct the proper directory items from backrefs.
3493  *
3494  * This optimizations allows us to avoid relogging the entire inode
3495  * or the entire directory.
3496  */
3497 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3498                                   struct btrfs_root *root,
3499                                   const char *name, int name_len,
3500                                   struct btrfs_inode *dir, u64 index)
3501 {
3502         struct btrfs_path *path;
3503         int ret;
3504
3505         ret = inode_logged(trans, dir, NULL);
3506         if (ret == 0)
3507                 return;
3508         else if (ret < 0) {
3509                 btrfs_set_log_full_commit(trans);
3510                 return;
3511         }
3512
3513         ret = join_running_log_trans(root);
3514         if (ret)
3515                 return;
3516
3517         mutex_lock(&dir->log_mutex);
3518
3519         path = btrfs_alloc_path();
3520         if (!path) {
3521                 ret = -ENOMEM;
3522                 goto out_unlock;
3523         }
3524
3525         ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3526                                 name, name_len, index);
3527         btrfs_free_path(path);
3528 out_unlock:
3529         mutex_unlock(&dir->log_mutex);
3530         if (ret < 0)
3531                 btrfs_set_log_full_commit(trans);
3532         btrfs_end_log_trans(root);
3533 }
3534
3535 /* see comments for btrfs_del_dir_entries_in_log */
3536 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3537                                 struct btrfs_root *root,
3538                                 const char *name, int name_len,
3539                                 struct btrfs_inode *inode, u64 dirid)
3540 {
3541         struct btrfs_root *log;
3542         u64 index;
3543         int ret;
3544
3545         ret = inode_logged(trans, inode, NULL);
3546         if (ret == 0)
3547                 return;
3548         else if (ret < 0) {
3549                 btrfs_set_log_full_commit(trans);
3550                 return;
3551         }
3552
3553         ret = join_running_log_trans(root);
3554         if (ret)
3555                 return;
3556         log = root->log_root;
3557         mutex_lock(&inode->log_mutex);
3558
3559         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3560                                   dirid, &index);
3561         mutex_unlock(&inode->log_mutex);
3562         if (ret < 0 && ret != -ENOENT)
3563                 btrfs_set_log_full_commit(trans);
3564         btrfs_end_log_trans(root);
3565 }
3566
3567 /*
3568  * creates a range item in the log for 'dirid'.  first_offset and
3569  * last_offset tell us which parts of the key space the log should
3570  * be considered authoritative for.
3571  */
3572 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3573                                        struct btrfs_root *log,
3574                                        struct btrfs_path *path,
3575                                        u64 dirid,
3576                                        u64 first_offset, u64 last_offset)
3577 {
3578         int ret;
3579         struct btrfs_key key;
3580         struct btrfs_dir_log_item *item;
3581
3582         key.objectid = dirid;
3583         key.offset = first_offset;
3584         key.type = BTRFS_DIR_LOG_INDEX_KEY;
3585         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3586         /*
3587          * -EEXIST is fine and can happen sporadically when we are logging a
3588          * directory and have concurrent insertions in the subvolume's tree for
3589          * items from other inodes and that result in pushing off some dir items
3590          * from one leaf to another in order to accommodate for the new items.
3591          * This results in logging the same dir index range key.
3592          */
3593         if (ret && ret != -EEXIST)
3594                 return ret;
3595
3596         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3597                               struct btrfs_dir_log_item);
3598         if (ret == -EEXIST) {
3599                 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3600
3601                 /*
3602                  * btrfs_del_dir_entries_in_log() might have been called during
3603                  * an unlink between the initial insertion of this key and the
3604                  * current update, or we might be logging a single entry deletion
3605                  * during a rename, so set the new last_offset to the max value.
3606                  */
3607                 last_offset = max(last_offset, curr_end);
3608         }
3609         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3610         btrfs_mark_buffer_dirty(path->nodes[0]);
3611         btrfs_release_path(path);
3612         return 0;
3613 }
3614
3615 static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3616                                  struct btrfs_root *log,
3617                                  struct extent_buffer *src,
3618                                  struct btrfs_path *dst_path,
3619                                  int start_slot,
3620                                  int count)
3621 {
3622         char *ins_data = NULL;
3623         struct btrfs_item_batch batch;
3624         struct extent_buffer *dst;
3625         unsigned long src_offset;
3626         unsigned long dst_offset;
3627         struct btrfs_key key;
3628         u32 item_size;
3629         int ret;
3630         int i;
3631
3632         ASSERT(count > 0);
3633         batch.nr = count;
3634
3635         if (count == 1) {
3636                 btrfs_item_key_to_cpu(src, &key, start_slot);
3637                 item_size = btrfs_item_size(src, start_slot);
3638                 batch.keys = &key;
3639                 batch.data_sizes = &item_size;
3640                 batch.total_data_size = item_size;
3641         } else {
3642                 struct btrfs_key *ins_keys;
3643                 u32 *ins_sizes;
3644
3645                 ins_data = kmalloc(count * sizeof(u32) +
3646                                    count * sizeof(struct btrfs_key), GFP_NOFS);
3647                 if (!ins_data)
3648                         return -ENOMEM;
3649
3650                 ins_sizes = (u32 *)ins_data;
3651                 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3652                 batch.keys = ins_keys;
3653                 batch.data_sizes = ins_sizes;
3654                 batch.total_data_size = 0;
3655
3656                 for (i = 0; i < count; i++) {
3657                         const int slot = start_slot + i;
3658
3659                         btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3660                         ins_sizes[i] = btrfs_item_size(src, slot);
3661                         batch.total_data_size += ins_sizes[i];
3662                 }
3663         }
3664
3665         ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3666         if (ret)
3667                 goto out;
3668
3669         dst = dst_path->nodes[0];
3670         /*
3671          * Copy all the items in bulk, in a single copy operation. Item data is
3672          * organized such that it's placed at the end of a leaf and from right
3673          * to left. For example, the data for the second item ends at an offset
3674          * that matches the offset where the data for the first item starts, the
3675          * data for the third item ends at an offset that matches the offset
3676          * where the data of the second items starts, and so on.
3677          * Therefore our source and destination start offsets for copy match the
3678          * offsets of the last items (highest slots).
3679          */
3680         dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3681         src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3682         copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3683         btrfs_release_path(dst_path);
3684 out:
3685         kfree(ins_data);
3686
3687         return ret;
3688 }
3689
3690 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3691                                   struct btrfs_inode *inode,
3692                                   struct btrfs_path *path,
3693                                   struct btrfs_path *dst_path,
3694                                   struct btrfs_log_ctx *ctx,
3695                                   u64 *last_old_dentry_offset)
3696 {
3697         struct btrfs_root *log = inode->root->log_root;
3698         struct extent_buffer *src;
3699         const int nritems = btrfs_header_nritems(path->nodes[0]);
3700         const u64 ino = btrfs_ino(inode);
3701         bool last_found = false;
3702         int batch_start = 0;
3703         int batch_size = 0;
3704         int i;
3705
3706         /*
3707          * We need to clone the leaf, release the read lock on it, and use the
3708          * clone before modifying the log tree. See the comment at copy_items()
3709          * about why we need to do this.
3710          */
3711         src = btrfs_clone_extent_buffer(path->nodes[0]);
3712         if (!src)
3713                 return -ENOMEM;
3714
3715         i = path->slots[0];
3716         btrfs_release_path(path);
3717         path->nodes[0] = src;
3718         path->slots[0] = i;
3719
3720         for (; i < nritems; i++) {
3721                 struct btrfs_dir_item *di;
3722                 struct btrfs_key key;
3723                 int ret;
3724
3725                 btrfs_item_key_to_cpu(src, &key, i);
3726
3727                 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3728                         last_found = true;
3729                         break;
3730                 }
3731
3732                 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3733                 ctx->last_dir_item_offset = key.offset;
3734
3735                 /*
3736                  * Skip ranges of items that consist only of dir item keys created
3737                  * in past transactions. However if we find a gap, we must log a
3738                  * dir index range item for that gap, so that index keys in that
3739                  * gap are deleted during log replay.
3740                  */
3741                 if (btrfs_dir_transid(src, di) < trans->transid) {
3742                         if (key.offset > *last_old_dentry_offset + 1) {
3743                                 ret = insert_dir_log_key(trans, log, dst_path,
3744                                                  ino, *last_old_dentry_offset + 1,
3745                                                  key.offset - 1);
3746                                 if (ret < 0)
3747                                         return ret;
3748                         }
3749
3750                         *last_old_dentry_offset = key.offset;
3751                         continue;
3752                 }
3753
3754                 /* If we logged this dir index item before, we can skip it. */
3755                 if (key.offset <= inode->last_dir_index_offset)
3756                         continue;
3757
3758                 /*
3759                  * We must make sure that when we log a directory entry, the
3760                  * corresponding inode, after log replay, has a matching link
3761                  * count. For example:
3762                  *
3763                  * touch foo
3764                  * mkdir mydir
3765                  * sync
3766                  * ln foo mydir/bar
3767                  * xfs_io -c "fsync" mydir
3768                  * <crash>
3769                  * <mount fs and log replay>
3770                  *
3771                  * Would result in a fsync log that when replayed, our file inode
3772                  * would have a link count of 1, but we get two directory entries
3773                  * pointing to the same inode. After removing one of the names,
3774                  * it would not be possible to remove the other name, which
3775                  * resulted always in stale file handle errors, and would not be
3776                  * possible to rmdir the parent directory, since its i_size could
3777                  * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3778                  * resulting in -ENOTEMPTY errors.
3779                  */
3780                 if (!ctx->log_new_dentries) {
3781                         struct btrfs_key di_key;
3782
3783                         btrfs_dir_item_key_to_cpu(src, di, &di_key);
3784                         if (di_key.type != BTRFS_ROOT_ITEM_KEY)
3785                                 ctx->log_new_dentries = true;
3786                 }
3787
3788                 if (batch_size == 0)
3789                         batch_start = i;
3790                 batch_size++;
3791         }
3792
3793         if (batch_size > 0) {
3794                 int ret;
3795
3796                 ret = flush_dir_items_batch(trans, log, src, dst_path,
3797                                             batch_start, batch_size);
3798                 if (ret < 0)
3799                         return ret;
3800         }
3801
3802         return last_found ? 1 : 0;
3803 }
3804
3805 /*
3806  * log all the items included in the current transaction for a given
3807  * directory.  This also creates the range items in the log tree required
3808  * to replay anything deleted before the fsync
3809  */
3810 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3811                           struct btrfs_inode *inode,
3812                           struct btrfs_path *path,
3813                           struct btrfs_path *dst_path,
3814                           struct btrfs_log_ctx *ctx,
3815                           u64 min_offset, u64 *last_offset_ret)
3816 {
3817         struct btrfs_key min_key;
3818         struct btrfs_root *root = inode->root;
3819         struct btrfs_root *log = root->log_root;
3820         int err = 0;
3821         int ret;
3822         u64 last_old_dentry_offset = min_offset - 1;
3823         u64 last_offset = (u64)-1;
3824         u64 ino = btrfs_ino(inode);
3825
3826         min_key.objectid = ino;
3827         min_key.type = BTRFS_DIR_INDEX_KEY;
3828         min_key.offset = min_offset;
3829
3830         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3831
3832         /*
3833          * we didn't find anything from this transaction, see if there
3834          * is anything at all
3835          */
3836         if (ret != 0 || min_key.objectid != ino ||
3837             min_key.type != BTRFS_DIR_INDEX_KEY) {
3838                 min_key.objectid = ino;
3839                 min_key.type = BTRFS_DIR_INDEX_KEY;
3840                 min_key.offset = (u64)-1;
3841                 btrfs_release_path(path);
3842                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3843                 if (ret < 0) {
3844                         btrfs_release_path(path);
3845                         return ret;
3846                 }
3847                 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3848
3849                 /* if ret == 0 there are items for this type,
3850                  * create a range to tell us the last key of this type.
3851                  * otherwise, there are no items in this directory after
3852                  * *min_offset, and we create a range to indicate that.
3853                  */
3854                 if (ret == 0) {
3855                         struct btrfs_key tmp;
3856
3857                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3858                                               path->slots[0]);
3859                         if (tmp.type == BTRFS_DIR_INDEX_KEY)
3860                                 last_old_dentry_offset = tmp.offset;
3861                 }
3862                 goto done;
3863         }
3864
3865         /* go backward to find any previous key */
3866         ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3867         if (ret == 0) {
3868                 struct btrfs_key tmp;
3869
3870                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3871                 /*
3872                  * The dir index key before the first one we found that needs to
3873                  * be logged might be in a previous leaf, and there might be a
3874                  * gap between these keys, meaning that we had deletions that
3875                  * happened. So the key range item we log (key type
3876                  * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
3877                  * previous key's offset plus 1, so that those deletes are replayed.
3878                  */
3879                 if (tmp.type == BTRFS_DIR_INDEX_KEY)
3880                         last_old_dentry_offset = tmp.offset;
3881         }
3882         btrfs_release_path(path);
3883
3884         /*
3885          * Find the first key from this transaction again.  See the note for
3886          * log_new_dir_dentries, if we're logging a directory recursively we
3887          * won't be holding its i_mutex, which means we can modify the directory
3888          * while we're logging it.  If we remove an entry between our first
3889          * search and this search we'll not find the key again and can just
3890          * bail.
3891          */
3892 search:
3893         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3894         if (ret != 0)
3895                 goto done;
3896
3897         /*
3898          * we have a block from this transaction, log every item in it
3899          * from our directory
3900          */
3901         while (1) {
3902                 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
3903                                              &last_old_dentry_offset);
3904                 if (ret != 0) {
3905                         if (ret < 0)
3906                                 err = ret;
3907                         goto done;
3908                 }
3909                 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3910
3911                 /*
3912                  * look ahead to the next item and see if it is also
3913                  * from this directory and from this transaction
3914                  */
3915                 ret = btrfs_next_leaf(root, path);
3916                 if (ret) {
3917                         if (ret == 1)
3918                                 last_offset = (u64)-1;
3919                         else
3920                                 err = ret;
3921                         goto done;
3922                 }
3923                 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3924                 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3925                         last_offset = (u64)-1;
3926                         goto done;
3927                 }
3928                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3929                         /*
3930                          * The next leaf was not changed in the current transaction
3931                          * and has at least one dir index key.
3932                          * We check for the next key because there might have been
3933                          * one or more deletions between the last key we logged and
3934                          * that next key. So the key range item we log (key type
3935                          * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
3936                          * offset minus 1, so that those deletes are replayed.
3937                          */
3938                         last_offset = min_key.offset - 1;
3939                         goto done;
3940                 }
3941                 if (need_resched()) {
3942                         btrfs_release_path(path);
3943                         cond_resched();
3944                         goto search;
3945                 }
3946         }
3947 done:
3948         btrfs_release_path(path);
3949         btrfs_release_path(dst_path);
3950
3951         if (err == 0) {
3952                 *last_offset_ret = last_offset;
3953                 /*
3954                  * In case the leaf was changed in the current transaction but
3955                  * all its dir items are from a past transaction, the last item
3956                  * in the leaf is a dir item and there's no gap between that last
3957                  * dir item and the first one on the next leaf (which did not
3958                  * change in the current transaction), then we don't need to log
3959                  * a range, last_old_dentry_offset is == to last_offset.
3960                  */
3961                 ASSERT(last_old_dentry_offset <= last_offset);
3962                 if (last_old_dentry_offset < last_offset) {
3963                         ret = insert_dir_log_key(trans, log, path, ino,
3964                                                  last_old_dentry_offset + 1,
3965                                                  last_offset);
3966                         if (ret)
3967                                 err = ret;
3968                 }
3969         }
3970         return err;
3971 }
3972
3973 /*
3974  * If the inode was logged before and it was evicted, then its
3975  * last_dir_index_offset is (u64)-1, so we don't the value of the last index
3976  * key offset. If that's the case, search for it and update the inode. This
3977  * is to avoid lookups in the log tree every time we try to insert a dir index
3978  * key from a leaf changed in the current transaction, and to allow us to always
3979  * do batch insertions of dir index keys.
3980  */
3981 static int update_last_dir_index_offset(struct btrfs_inode *inode,
3982                                         struct btrfs_path *path,
3983                                         const struct btrfs_log_ctx *ctx)
3984 {
3985         const u64 ino = btrfs_ino(inode);
3986         struct btrfs_key key;
3987         int ret;
3988
3989         lockdep_assert_held(&inode->log_mutex);
3990
3991         if (inode->last_dir_index_offset != (u64)-1)
3992                 return 0;
3993
3994         if (!ctx->logged_before) {
3995                 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
3996                 return 0;
3997         }
3998
3999         key.objectid = ino;
4000         key.type = BTRFS_DIR_INDEX_KEY;
4001         key.offset = (u64)-1;
4002
4003         ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
4004         /*
4005          * An error happened or we actually have an index key with an offset
4006          * value of (u64)-1. Bail out, we're done.
4007          */
4008         if (ret <= 0)
4009                 goto out;
4010
4011         ret = 0;
4012         inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4013
4014         /*
4015          * No dir index items, bail out and leave last_dir_index_offset with
4016          * the value right before the first valid index value.
4017          */
4018         if (path->slots[0] == 0)
4019                 goto out;
4020
4021         /*
4022          * btrfs_search_slot() left us at one slot beyond the slot with the last
4023          * index key, or beyond the last key of the directory that is not an
4024          * index key. If we have an index key before, set last_dir_index_offset
4025          * to its offset value, otherwise leave it with a value right before the
4026          * first valid index value, as it means we have an empty directory.
4027          */
4028         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4029         if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4030                 inode->last_dir_index_offset = key.offset;
4031
4032 out:
4033         btrfs_release_path(path);
4034
4035         return ret;
4036 }
4037
4038 /*
4039  * logging directories is very similar to logging inodes, We find all the items
4040  * from the current transaction and write them to the log.
4041  *
4042  * The recovery code scans the directory in the subvolume, and if it finds a
4043  * key in the range logged that is not present in the log tree, then it means
4044  * that dir entry was unlinked during the transaction.
4045  *
4046  * In order for that scan to work, we must include one key smaller than
4047  * the smallest logged by this transaction and one key larger than the largest
4048  * key logged by this transaction.
4049  */
4050 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4051                           struct btrfs_inode *inode,
4052                           struct btrfs_path *path,
4053                           struct btrfs_path *dst_path,
4054                           struct btrfs_log_ctx *ctx)
4055 {
4056         u64 min_key;
4057         u64 max_key;
4058         int ret;
4059
4060         ret = update_last_dir_index_offset(inode, path, ctx);
4061         if (ret)
4062                 return ret;
4063
4064         min_key = BTRFS_DIR_START_INDEX;
4065         max_key = 0;
4066         ctx->last_dir_item_offset = inode->last_dir_index_offset;
4067
4068         while (1) {
4069                 ret = log_dir_items(trans, inode, path, dst_path,
4070                                 ctx, min_key, &max_key);
4071                 if (ret)
4072                         return ret;
4073                 if (max_key == (u64)-1)
4074                         break;
4075                 min_key = max_key + 1;
4076         }
4077
4078         inode->last_dir_index_offset = ctx->last_dir_item_offset;
4079
4080         return 0;
4081 }
4082
4083 /*
4084  * a helper function to drop items from the log before we relog an
4085  * inode.  max_key_type indicates the highest item type to remove.
4086  * This cannot be run for file data extents because it does not
4087  * free the extents they point to.
4088  */
4089 static int drop_inode_items(struct btrfs_trans_handle *trans,
4090                                   struct btrfs_root *log,
4091                                   struct btrfs_path *path,
4092                                   struct btrfs_inode *inode,
4093                                   int max_key_type)
4094 {
4095         int ret;
4096         struct btrfs_key key;
4097         struct btrfs_key found_key;
4098         int start_slot;
4099
4100         key.objectid = btrfs_ino(inode);
4101         key.type = max_key_type;
4102         key.offset = (u64)-1;
4103
4104         while (1) {
4105                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4106                 BUG_ON(ret == 0); /* Logic error */
4107                 if (ret < 0)
4108                         break;
4109
4110                 if (path->slots[0] == 0)
4111                         break;
4112
4113                 path->slots[0]--;
4114                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4115                                       path->slots[0]);
4116
4117                 if (found_key.objectid != key.objectid)
4118                         break;
4119
4120                 found_key.offset = 0;
4121                 found_key.type = 0;
4122                 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
4123                 if (ret < 0)
4124                         break;
4125
4126                 ret = btrfs_del_items(trans, log, path, start_slot,
4127                                       path->slots[0] - start_slot + 1);
4128                 /*
4129                  * If start slot isn't 0 then we don't need to re-search, we've
4130                  * found the last guy with the objectid in this tree.
4131                  */
4132                 if (ret || start_slot != 0)
4133                         break;
4134                 btrfs_release_path(path);
4135         }
4136         btrfs_release_path(path);
4137         if (ret > 0)
4138                 ret = 0;
4139         return ret;
4140 }
4141
4142 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4143                                 struct btrfs_root *log_root,
4144                                 struct btrfs_inode *inode,
4145                                 u64 new_size, u32 min_type)
4146 {
4147         struct btrfs_truncate_control control = {
4148                 .new_size = new_size,
4149                 .ino = btrfs_ino(inode),
4150                 .min_type = min_type,
4151                 .skip_ref_updates = true,
4152         };
4153
4154         return btrfs_truncate_inode_items(trans, log_root, &control);
4155 }
4156
4157 static void fill_inode_item(struct btrfs_trans_handle *trans,
4158                             struct extent_buffer *leaf,
4159                             struct btrfs_inode_item *item,
4160                             struct inode *inode, int log_inode_only,
4161                             u64 logged_isize)
4162 {
4163         struct btrfs_map_token token;
4164         u64 flags;
4165
4166         btrfs_init_map_token(&token, leaf);
4167
4168         if (log_inode_only) {
4169                 /* set the generation to zero so the recover code
4170                  * can tell the difference between an logging
4171                  * just to say 'this inode exists' and a logging
4172                  * to say 'update this inode with these values'
4173                  */
4174                 btrfs_set_token_inode_generation(&token, item, 0);
4175                 btrfs_set_token_inode_size(&token, item, logged_isize);
4176         } else {
4177                 btrfs_set_token_inode_generation(&token, item,
4178                                                  BTRFS_I(inode)->generation);
4179                 btrfs_set_token_inode_size(&token, item, inode->i_size);
4180         }
4181
4182         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4183         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4184         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4185         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4186
4187         btrfs_set_token_timespec_sec(&token, &item->atime,
4188                                      inode->i_atime.tv_sec);
4189         btrfs_set_token_timespec_nsec(&token, &item->atime,
4190                                       inode->i_atime.tv_nsec);
4191
4192         btrfs_set_token_timespec_sec(&token, &item->mtime,
4193                                      inode->i_mtime.tv_sec);
4194         btrfs_set_token_timespec_nsec(&token, &item->mtime,
4195                                       inode->i_mtime.tv_nsec);
4196
4197         btrfs_set_token_timespec_sec(&token, &item->ctime,
4198                                      inode->i_ctime.tv_sec);
4199         btrfs_set_token_timespec_nsec(&token, &item->ctime,
4200                                       inode->i_ctime.tv_nsec);
4201
4202         /*
4203          * We do not need to set the nbytes field, in fact during a fast fsync
4204          * its value may not even be correct, since a fast fsync does not wait
4205          * for ordered extent completion, which is where we update nbytes, it
4206          * only waits for writeback to complete. During log replay as we find
4207          * file extent items and replay them, we adjust the nbytes field of the
4208          * inode item in subvolume tree as needed (see overwrite_item()).
4209          */
4210
4211         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4212         btrfs_set_token_inode_transid(&token, item, trans->transid);
4213         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4214         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4215                                           BTRFS_I(inode)->ro_flags);
4216         btrfs_set_token_inode_flags(&token, item, flags);
4217         btrfs_set_token_inode_block_group(&token, item, 0);
4218 }
4219
4220 static int log_inode_item(struct btrfs_trans_handle *trans,
4221                           struct btrfs_root *log, struct btrfs_path *path,
4222                           struct btrfs_inode *inode, bool inode_item_dropped)
4223 {
4224         struct btrfs_inode_item *inode_item;
4225         int ret;
4226
4227         /*
4228          * If we are doing a fast fsync and the inode was logged before in the
4229          * current transaction, then we know the inode was previously logged and
4230          * it exists in the log tree. For performance reasons, in this case use
4231          * btrfs_search_slot() directly with ins_len set to 0 so that we never
4232          * attempt a write lock on the leaf's parent, which adds unnecessary lock
4233          * contention in case there are concurrent fsyncs for other inodes of the
4234          * same subvolume. Using btrfs_insert_empty_item() when the inode item
4235          * already exists can also result in unnecessarily splitting a leaf.
4236          */
4237         if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4238                 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4239                 ASSERT(ret <= 0);
4240                 if (ret > 0)
4241                         ret = -ENOENT;
4242         } else {
4243                 /*
4244                  * This means it is the first fsync in the current transaction,
4245                  * so the inode item is not in the log and we need to insert it.
4246                  * We can never get -EEXIST because we are only called for a fast
4247                  * fsync and in case an inode eviction happens after the inode was
4248                  * logged before in the current transaction, when we load again
4249                  * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4250                  * flags and set ->logged_trans to 0.
4251                  */
4252                 ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4253                                               sizeof(*inode_item));
4254                 ASSERT(ret != -EEXIST);
4255         }
4256         if (ret)
4257                 return ret;
4258         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4259                                     struct btrfs_inode_item);
4260         fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4261                         0, 0);
4262         btrfs_release_path(path);
4263         return 0;
4264 }
4265
4266 static int log_csums(struct btrfs_trans_handle *trans,
4267                      struct btrfs_inode *inode,
4268                      struct btrfs_root *log_root,
4269                      struct btrfs_ordered_sum *sums)
4270 {
4271         const u64 lock_end = sums->bytenr + sums->len - 1;
4272         struct extent_state *cached_state = NULL;
4273         int ret;
4274
4275         /*
4276          * If this inode was not used for reflink operations in the current
4277          * transaction with new extents, then do the fast path, no need to
4278          * worry about logging checksum items with overlapping ranges.
4279          */
4280         if (inode->last_reflink_trans < trans->transid)
4281                 return btrfs_csum_file_blocks(trans, log_root, sums);
4282
4283         /*
4284          * Serialize logging for checksums. This is to avoid racing with the
4285          * same checksum being logged by another task that is logging another
4286          * file which happens to refer to the same extent as well. Such races
4287          * can leave checksum items in the log with overlapping ranges.
4288          */
4289         ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
4290                           &cached_state);
4291         if (ret)
4292                 return ret;
4293         /*
4294          * Due to extent cloning, we might have logged a csum item that covers a
4295          * subrange of a cloned extent, and later we can end up logging a csum
4296          * item for a larger subrange of the same extent or the entire range.
4297          * This would leave csum items in the log tree that cover the same range
4298          * and break the searches for checksums in the log tree, resulting in
4299          * some checksums missing in the fs/subvolume tree. So just delete (or
4300          * trim and adjust) any existing csum items in the log for this range.
4301          */
4302         ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
4303         if (!ret)
4304                 ret = btrfs_csum_file_blocks(trans, log_root, sums);
4305
4306         unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
4307                       &cached_state);
4308
4309         return ret;
4310 }
4311
/*
 * Copy @nr items, starting at slot @start_slot of the leaf referenced by
 * @src_path, into the log tree of @inode's root.
 *
 * The source leaf is cloned up front; @src_path is released and then pointed
 * at the clone (same slot), so on return it no longer references the original
 * leaf.
 *
 * File extent items from generations older than the current transaction are
 * skipped, unless they start at or beyond i_size or the inode was the target
 * of a reflink in this transaction. For regular, non-hole extents created in
 * this transaction on inodes with data checksums, the matching checksums are
 * looked up and logged with log_csums() before the items are inserted.
 *
 * Inode items are not copied verbatim; they are filled in from the in-memory
 * inode through fill_inode_item(), which receives
 * (@inode_only == LOG_INODE_EXISTS) and @logged_isize.
 *
 * Returns 0 on success (also when no item needed to be logged), -ENOMEM if
 * the leaf clone or the batch arrays cannot be allocated, or the error
 * returned by the checksum lookup, checksum logging or item insertion.
 */
4312 static noinline int copy_items(struct btrfs_trans_handle *trans,
4313                                struct btrfs_inode *inode,
4314                                struct btrfs_path *dst_path,
4315                                struct btrfs_path *src_path,
4316                                int start_slot, int nr, int inode_only,
4317                                u64 logged_isize)
4318 {
4319         struct btrfs_root *log = inode->root->log_root;
4320         struct btrfs_file_extent_item *extent;
4321         struct extent_buffer *src;
4322         int ret = 0;
4323         struct btrfs_key *ins_keys;
4324         u32 *ins_sizes;
4325         struct btrfs_item_batch batch;
4326         char *ins_data;
4327         int i;
4328         int dst_index;
4329         const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4330         const u64 i_size = i_size_read(&inode->vfs_inode);
4331
4332         /*
4333          * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4334          * use the clone. This is because otherwise we would be changing the log
4335          * tree, to insert items from the subvolume tree or insert csum items,
4336          * while holding a read lock on a leaf from the subvolume tree, which
4337          * creates a nasty lock dependency when COWing log tree nodes/leaves:
4338          *
4339          * 1) Modifying the log tree triggers an extent buffer allocation while
4340          *    holding a write lock on a parent extent buffer from the log tree.
4341          *    Allocating the pages for an extent buffer, or the extent buffer
4342          *    struct, can trigger inode eviction and finally the inode eviction
4343          *    will trigger a release/remove of a delayed node, which requires
4344          *    taking the delayed node's mutex;
4345          *
4346          * 2) Allocating a metadata extent for a log tree can trigger the async
4347          *    reclaim thread and make us wait for it to release enough space and
4348          *    unblock our reservation ticket. The reclaim thread can start
4349          *    flushing delayed items, and that in turn results in the need to
4350          *    lock delayed node mutexes and in the need to write lock extent
4351          *    buffers of a subvolume tree - all this while holding a write lock
4352          *    on the parent extent buffer in the log tree.
4353          *
4354          * So one task in scenario 1) running in parallel with another task in
4355          * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4356          * node mutex while having a read lock on a leaf from the subvolume,
4357          * while the other is holding the delayed node's mutex and wants to
4358          * write lock the same subvolume leaf for flushing delayed items.
4359          */
4360         src = btrfs_clone_extent_buffer(src_path->nodes[0]);
4361         if (!src)
4362                 return -ENOMEM;
4363
             /* Swap the path over to the private clone, keeping the same slot. */
4364         i = src_path->slots[0];
4365         btrfs_release_path(src_path);
4366         src_path->nodes[0] = src;
4367         src_path->slots[0] = i;
4368
             /* Single buffer: nr u32 item sizes first, followed by nr keys. */
4369         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4370                            nr * sizeof(u32), GFP_NOFS);
             /*
              * NOTE(review): on this failure the clone is still attached to
              * src_path; presumably the caller's path release drops it - confirm.
              */
4371         if (!ins_data)
4372                 return -ENOMEM;
4373
4374         ins_sizes = (u32 *)ins_data;
4375         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4376         batch.keys = ins_keys;
4377         batch.data_sizes = ins_sizes;
4378         batch.total_data_size = 0;
4379         batch.nr = 0;
4380
             /*
              * First pass: decide which items go into the batch and log the
              * checksums of new extents. dst_index counts the items kept so far,
              * so it falls behind i whenever an item is skipped.
              */
4381         dst_index = 0;
4382         for (i = 0; i < nr; i++) {
4383                 const int src_slot = start_slot + i;
4384                 struct btrfs_root *csum_root;
4385                 struct btrfs_ordered_sum *sums;
4386                 struct btrfs_ordered_sum *sums_next;
4387                 LIST_HEAD(ordered_sums);
4388                 u64 disk_bytenr;
4389                 u64 disk_num_bytes;
4390                 u64 extent_offset;
4391                 u64 extent_num_bytes;
4392                 bool is_old_extent;
4393
4394                 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4395
4396                 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4397                         goto add_to_batch;
4398
4399                 extent = btrfs_item_ptr(src, src_slot,
4400                                         struct btrfs_file_extent_item);
4401
4402                 is_old_extent = (btrfs_file_extent_generation(src, extent) <
4403                                  trans->transid);
4404
4405                 /*
4406                  * Don't copy extents from past generations. That would make us
4407                  * log a lot more metadata for common cases like doing only a
4408                  * few random writes into a file and then fsync it for the first
4409                  * time or after the full sync flag is set on the inode. We can
4410                  * get leaves full of extent items, most of which are from past
4411                  * generations, so we can skip them - as long as the inode has
4412                  * not been the target of a reflink operation in this transaction,
4413                  * as in that case it might have had file extent items with old
4414                  * generations copied into it. We also must always log prealloc
4415                  * extents that start at or beyond eof, otherwise we would lose
4416                  * them on log replay.
4417                  */
4418                 if (is_old_extent &&
4419                     ins_keys[dst_index].offset < i_size &&
4420                     inode->last_reflink_trans < trans->transid)
4421                         continue;
4422
4423                 if (skip_csum)
4424                         goto add_to_batch;
4425
4426                 /* Only regular extents have checksums. */
4427                 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4428                         goto add_to_batch;
4429
4430                 /*
4431                  * If it's an extent created in a past transaction, then its
4432                  * checksums are already accessible from the committed csum tree,
4433                  * no need to log them.
4434                  */
4435                 if (is_old_extent)
4436                         goto add_to_batch;
4437
4438                 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4439                 /* If it's an explicit hole, there are no checksums. */
4440                 if (disk_bytenr == 0)
4441                         goto add_to_batch;
4442
4443                 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4444
                     /*
                      * Compressed extents get the csums of the whole on-disk extent,
                      * others only those of the range this item references.
                      */
4445                 if (btrfs_file_extent_compression(src, extent)) {
4446                         extent_offset = 0;
4447                         extent_num_bytes = disk_num_bytes;
4448                 } else {
4449                         extent_offset = btrfs_file_extent_offset(src, extent);
4450                         extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4451                 }
4452
4453                 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4454                 disk_bytenr += extent_offset;
4455                 ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
4456                                                disk_bytenr + extent_num_bytes - 1,
4457                                                &ordered_sums, 0, false);
4458                 if (ret)
4459                         goto out;
4460
                     /* Free every looked-up entry; stop logging after the first error. */
4461                 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4462                         if (!ret)
4463                                 ret = log_csums(trans, inode, log, sums);
4464                         list_del(&sums->list);
4465                         kfree(sums);
4466                 }
4467                 if (ret)
4468                         goto out;
4469
4470 add_to_batch:
4471                 ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4472                 batch.total_data_size += ins_sizes[dst_index];
4473                 batch.nr++;
4474                 dst_index++;
4475         }
4476
4477         /*
4478          * We have a leaf full of old extent items that don't need to be logged,
4479          * so we don't need to do anything.
4480          */
4481         if (batch.nr == 0)
4482                 goto out;
4483
4484         ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4485         if (ret)
4486                 goto out;
4487
             /*
              * Second pass: fill the freshly inserted items. The skip condition
              * must match the first pass exactly so that dst_index selects the
              * same source items that were sized into the batch.
              */
4488         dst_index = 0;
4489         for (i = 0; i < nr; i++) {
4490                 const int src_slot = start_slot + i;
4491                 const int dst_slot = dst_path->slots[0] + dst_index;
4492                 struct btrfs_key key;
4493                 unsigned long src_offset;
4494                 unsigned long dst_offset;
4495
4496                 /*
4497                  * We're done, all the remaining items in the source leaf
4498                  * correspond to old file extent items.
4499                  */
4500                 if (dst_index >= batch.nr)
4501                         break;
4502
4503                 btrfs_item_key_to_cpu(src, &key, src_slot);
4504
4505                 if (key.type != BTRFS_EXTENT_DATA_KEY)
4506                         goto copy_item;
4507
4508                 extent = btrfs_item_ptr(src, src_slot,
4509                                         struct btrfs_file_extent_item);
4510
4511                 /* See the comment in the previous loop, same logic. */
4512                 if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4513                     key.offset < i_size &&
4514                     inode->last_reflink_trans < trans->transid)
4515                         continue;
4516
4517 copy_item:
4518                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4519                 src_offset = btrfs_item_ptr_offset(src, src_slot);
4520
                     /* Inode items are filled from the in-memory inode, not copied. */
4521                 if (key.type == BTRFS_INODE_ITEM_KEY) {
4522                         struct btrfs_inode_item *inode_item;
4523
4524                         inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4525                                                     struct btrfs_inode_item);
4526                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
4527                                         &inode->vfs_inode,
4528                                         inode_only == LOG_INODE_EXISTS,
4529                                         logged_isize);
4530                 } else {
4531                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4532                                            src_offset, ins_sizes[dst_index]);
4533                 }
4534
4535                 dst_index++;
4536         }
4537
4538         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4539         btrfs_release_path(dst_path);
4540 out:
4541         kfree(ins_data);
4542
4543         return ret;
4544 }
4545
4546 static int extent_cmp(void *priv, const struct list_head *a,
4547                       const struct list_head *b)
4548 {
4549         const struct extent_map *em1, *em2;
4550
4551         em1 = list_entry(a, struct extent_map, list);
4552         em2 = list_entry(b, struct extent_map, list);
4553
4554         if (em1->start < em2->start)
4555                 return -1;
4556         else if (em1->start > em2->start)
4557                 return 1;
4558         return 0;
4559 }
4560
/*
 * Log the data checksums that cover the modified range of extent map @em
 * (em->mod_start, em->mod_len) into @log_root.
 *
 * Nothing is logged for NODATASUM inodes, prealloc extents or holes.
 * Checksums attached to the ordered extents in @ctx->ordered_extents that
 * overlap the range are logged first; each ordered extent is logged at most
 * once, tracked with BTRFS_ORDERED_LOGGED_CSUM. Whatever part of the range is
 * left afterwards is looked up in the csum root and logged from there.
 *
 * Returns 0 on success, or the error returned by log_csums() or
 * btrfs_lookup_csums_range().
 */
4561 static int log_extent_csums(struct btrfs_trans_handle *trans,
4562                             struct btrfs_inode *inode,
4563                             struct btrfs_root *log_root,
4564                             const struct extent_map *em,
4565                             struct btrfs_log_ctx *ctx)
4566 {
4567         struct btrfs_ordered_extent *ordered;
4568         struct btrfs_root *csum_root;
4569         u64 csum_offset;
4570         u64 csum_len;
4571         u64 mod_start = em->mod_start;
4572         u64 mod_len = em->mod_len;
4573         LIST_HEAD(ordered_sums);
4574         int ret = 0;
4575
4576         if (inode->flags & BTRFS_INODE_NODATASUM ||
4577             test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4578             em->block_start == EXTENT_MAP_HOLE)
4579                 return 0;
4580
4581         list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4582                 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4583                 const u64 mod_end = mod_start + mod_len;
4584                 struct btrfs_ordered_sum *sums;
4585
4586                 if (mod_len == 0)
4587                         break;
4588
                     /* Ordered extent ends before the remaining range: no overlap. */
4589                 if (ordered_end <= mod_start)
4590                         continue;
                     /*
                      * NOTE(review): stopping here presumably relies on
                      * ctx->ordered_extents being sorted by file offset - confirm.
                      */
4591                 if (mod_end <= ordered->file_offset)
4592                         break;
4593
4594                 /*
4595                  * We are going to copy all the csums on this ordered extent, so
4596                  * go ahead and adjust mod_start and mod_len in case this ordered
4597                  * extent has already been logged.
4598                  */
4599                 if (ordered->file_offset > mod_start) {
4600                         if (ordered_end >= mod_end)
4601                                 mod_len = ordered->file_offset - mod_start;
4602                         /*
4603                          * If we have this case
4604                          *
4605                          * |--------- logged extent ---------|
4606                          *       |----- ordered extent ----|
4607                          *
4608                          * Just don't mess with mod_start and mod_len, we'll
4609                          * just end up logging more csums than we need and it
4610                          * will be ok.
4611                          */
4612                 } else {
4613                         if (ordered_end < mod_end) {
4614                                 mod_len = mod_end - ordered_end;
4615                                 mod_start = ordered_end;
4616                         } else {
4617                                 mod_len = 0;
4618                         }
4619                 }
4620
4621                 /*
4622                  * To keep us from looping for the above case of an ordered
4623                  * extent that falls inside of the logged extent.
4624                  */
4625                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4626                         continue;
4627
                     /* These sums stay on ordered->list; they are not freed here. */
4628                 list_for_each_entry(sums, &ordered->list, list) {
4629                         ret = log_csums(trans, inode, log_root, sums);
4630                         if (ret)
4631                                 return ret;
4632                 }
4633         }
4634
4635         /* We're done, found all csums in the ordered extents. */
4636         if (mod_len == 0)
4637                 return 0;
4638
4639         /* If we're compressed we have to save the entire range of csums. */
4640         if (em->compress_type) {
4641                 csum_offset = 0;
4642                 csum_len = max(em->block_len, em->orig_block_len);
4643         } else {
4644                 csum_offset = mod_start - em->start;
4645                 csum_len = mod_len;
4646         }
4647
4648         /* block start is already adjusted for the file extent offset. */
4649         csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
4650         ret = btrfs_lookup_csums_range(csum_root,
4651                                        em->block_start + csum_offset,
4652                                        em->block_start + csum_offset +
4653                                        csum_len - 1, &ordered_sums, 0, false);
4654         if (ret)
4655                 return ret;
4656
             /*
              * Drain the lookup result: every entry is unlinked and freed, but
              * logging stops after the first failure.
              */
4657         while (!list_empty(&ordered_sums)) {
4658                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4659                                                    struct btrfs_ordered_sum,
4660                                                    list);
4661                 if (!ret)
4662                         ret = log_csums(trans, inode, log_root, sums);
4663                 list_del(&sums->list);
4664                 kfree(sums);
4665         }
4666
4667         return ret;
4668 }
4669
4670 static int log_one_extent(struct btrfs_trans_handle *trans,
4671                           struct btrfs_inode *inode,
4672                           const struct extent_map *em,
4673                           struct btrfs_path *path,
4674                           struct btrfs_log_ctx *ctx)
4675 {
4676         struct btrfs_drop_extents_args drop_args = { 0 };
4677         struct btrfs_root *log = inode->root->log_root;
4678         struct btrfs_file_extent_item fi = { 0 };
4679         struct extent_buffer *leaf;
4680         struct btrfs_key key;
4681         u64 extent_offset = em->start - em->orig_start;
4682         u64 block_len;
4683         int ret;
4684
4685         btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4686         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4687                 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4688         else
4689                 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4690
4691         block_len = max(em->block_len, em->orig_block_len);
4692         if (em->compress_type != BTRFS_COMPRESS_NONE) {
4693                 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
4694                 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4695         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4696                 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
4697                                                         extent_offset);
4698                 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4699         }
4700
4701         btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4702         btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4703         btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4704         btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
4705
4706         ret = log_extent_csums(trans, inode, log, em, ctx);
4707         if (ret)
4708                 return ret;
4709
4710         /*
4711          * If this is the first time we are logging the inode in the current
4712          * transaction, we can avoid btrfs_drop_extents(), which is expensive
4713          * because it does a deletion search, which always acquires write locks
4714          * for extent buffers at levels 2, 1 and 0. This not only wastes time
4715          * but also adds significant contention in a log tree, since log trees
4716          * are small, with a root at level 2 or 3 at most, due to their short
4717          * life span.
4718          */
4719         if (ctx->logged_before) {
4720                 drop_args.path = path;
4721                 drop_args.start = em->start;
4722                 drop_args.end = em->start + em->len;
4723                 drop_args.replace_extent = true;
4724                 drop_args.extent_item_size = sizeof(fi);
4725                 ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4726                 if (ret)
4727                         return ret;
4728         }
4729
4730         if (!drop_args.extent_inserted) {
4731                 key.objectid = btrfs_ino(inode);
4732                 key.type = BTRFS_EXTENT_DATA_KEY;
4733                 key.offset = em->start;
4734
4735                 ret = btrfs_insert_empty_item(trans, log, path, &key,
4736                                               sizeof(fi));
4737                 if (ret)
4738                         return ret;
4739         }
4740         leaf = path->nodes[0];
4741         write_extent_buffer(leaf, &fi,
4742                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4743                             sizeof(fi));
4744         btrfs_mark_buffer_dirty(leaf);
4745
4746         btrfs_release_path(path);
4747
4748         return ret;
4749 }
4750
4751 /*
4752  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4753  * lose them after doing a full/fast fsync and replaying the log. We scan the
4754  * subvolume's root instead of iterating the inode's extent map tree because
4755  * otherwise we can log incorrect extent items based on extent map conversion.
4756  * That can happen due to the fact that extent maps are merged when they
4757  * are not in the extent map tree's list of modified extents.
4758  */
4759 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4760                                       struct btrfs_inode *inode,
4761                                       struct btrfs_path *path)
4762 {
4763         struct btrfs_root *root = inode->root;
4764         struct btrfs_key key;
4765         const u64 i_size = i_size_read(&inode->vfs_inode);
4766         const u64 ino = btrfs_ino(inode);
4767         struct btrfs_path *dst_path = NULL;
4768         bool dropped_extents = false;
4769         u64 truncate_offset = i_size;
4770         struct extent_buffer *leaf;
4771         int slot;
4772         int ins_nr = 0;
4773         int start_slot;
4774         int ret;
4775
4776         if (!(inode->flags & BTRFS_INODE_PREALLOC))
4777                 return 0;
4778
4779         key.objectid = ino;
4780         key.type = BTRFS_EXTENT_DATA_KEY;
4781         key.offset = i_size;
4782         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4783         if (ret < 0)
4784                 goto out;
4785
4786         /*
4787          * We must check if there is a prealloc extent that starts before the
4788          * i_size and crosses the i_size boundary. This is to ensure later we
4789          * truncate down to the end of that extent and not to the i_size, as
4790          * otherwise we end up losing part of the prealloc extent after a log
4791          * replay and with an implicit hole if there is another prealloc extent
4792          * that starts at an offset beyond i_size.
4793          */
4794         ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4795         if (ret < 0)
4796                 goto out;
4797
4798         if (ret == 0) {
4799                 struct btrfs_file_extent_item *ei;
4800
4801                 leaf = path->nodes[0];
4802                 slot = path->slots[0];
4803                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4804
4805                 if (btrfs_file_extent_type(leaf, ei) ==
4806                     BTRFS_FILE_EXTENT_PREALLOC) {
4807                         u64 extent_end;
4808
4809                         btrfs_item_key_to_cpu(leaf, &key, slot);
4810                         extent_end = key.offset +
4811                                 btrfs_file_extent_num_bytes(leaf, ei);
4812
4813                         if (extent_end > i_size)
4814                                 truncate_offset = extent_end;
4815                 }
4816         } else {
4817                 ret = 0;
4818         }
4819
4820         while (true) {
4821                 leaf = path->nodes[0];
4822                 slot = path->slots[0];
4823
4824                 if (slot >= btrfs_header_nritems(leaf)) {
4825                         if (ins_nr > 0) {
4826                                 ret = copy_items(trans, inode, dst_path, path,
4827                                                  start_slot, ins_nr, 1, 0);
4828                                 if (ret < 0)
4829                                         goto out;
4830                                 ins_nr = 0;
4831                         }
4832                         ret = btrfs_next_leaf(root, path);
4833                         if (ret < 0)
4834                                 goto out;
4835                         if (ret > 0) {
4836                                 ret = 0;
4837                                 break;
4838                         }
4839                         continue;
4840                 }
4841
4842                 btrfs_item_key_to_cpu(leaf, &key, slot);
4843                 if (key.objectid > ino)
4844                         break;
4845                 if (WARN_ON_ONCE(key.objectid < ino) ||
4846                     key.type < BTRFS_EXTENT_DATA_KEY ||
4847                     key.offset < i_size) {
4848                         path->slots[0]++;
4849                         continue;
4850                 }
4851                 if (!dropped_extents) {
4852                         /*
4853                          * Avoid logging extent items logged in past fsync calls
4854                          * and leading to duplicate keys in the log tree.
4855                          */
4856                         ret = truncate_inode_items(trans, root->log_root, inode,
4857                                                    truncate_offset,
4858                                                    BTRFS_EXTENT_DATA_KEY);
4859                         if (ret)
4860                                 goto out;
4861                         dropped_extents = true;
4862                 }
4863                 if (ins_nr == 0)
4864                         start_slot = slot;
4865                 ins_nr++;
4866                 path->slots[0]++;
4867                 if (!dst_path) {
4868                         dst_path = btrfs_alloc_path();
4869                         if (!dst_path) {
4870                                 ret = -ENOMEM;
4871                                 goto out;
4872                         }
4873                 }
4874         }
4875         if (ins_nr > 0)
4876                 ret = copy_items(trans, inode, dst_path, path,
4877                                  start_slot, ins_nr, 1, 0);
4878 out:
4879         btrfs_release_path(path);
4880         btrfs_free_path(dst_path);
4881         return ret;
4882 }
4883
/*
 * Log the extents of @inode that were modified in the current transaction.
 *
 * The inode's modified_extents list is drained under the extent map tree
 * lock; extent maps from the current transaction (other than prealloc extents
 * starting at or beyond i_size) are queued, sorted by start offset and logged
 * one by one with log_one_extent(). Prealloc extents beyond i_size are then
 * logged by btrfs_log_prealloc_extents(). Finally every ordered extent in
 * @ctx->ordered_extents is flagged as logged, and those not yet complete are
 * counted as pending on the transaction.
 *
 * Returns 0 on success, -EFBIG when the modified extent count passes the
 * limit, or the error from log_one_extent() / btrfs_log_prealloc_extents().
 */
4884 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4885                                      struct btrfs_inode *inode,
4886                                      struct btrfs_path *path,
4887                                      struct btrfs_log_ctx *ctx)
4888 {
4889         struct btrfs_ordered_extent *ordered;
4890         struct btrfs_ordered_extent *tmp;
4891         struct extent_map *em, *n;
4892         struct list_head extents;
4893         struct extent_map_tree *tree = &inode->extent_tree;
4894         int ret = 0;
4895         int num = 0;
4896
4897         INIT_LIST_HEAD(&extents);
4898
4899         write_lock(&tree->lock);
4900
4901         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4902                 list_del_init(&em->list);
4903                 /*
4904                  * Just an arbitrary number, this can be really CPU intensive
4905                  * once we start getting a lot of extents, and really once we
4906                  * have a bunch of extents we just want to commit since it will
4907                  * be faster.
4908                  */
4909                 if (++num > 32768) {
4910                         list_del_init(&tree->modified_extents);
4911                         ret = -EFBIG;
4912                         goto process;
4913                 }
4914
                     /* Not from this transaction: dropped from the list, not logged. */
4915                 if (em->generation < trans->transid)
4916                         continue;
4917
4918                 /* We log prealloc extents beyond eof later. */
4919                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4920                     em->start >= i_size_read(&inode->vfs_inode))
4921                         continue;
4922
4923                 /* Need a ref to keep it from getting evicted from cache */
4924                 refcount_inc(&em->refs);
4925                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4926                 list_add_tail(&em->list, &extents);
                     /*
                      * NOTE(review): num was already incremented by the limit check
                      * above, so every queued extent counts twice towards the
                      * limit - confirm this is intended.
                      */
4927                 num++;
4928         }
4929
4930         list_sort(NULL, &extents, extent_cmp);
4931 process:
4932         while (!list_empty(&extents)) {
4933                 em = list_entry(extents.next, struct extent_map, list);
4934
4935                 list_del_init(&em->list);
4936
4937                 /*
4938                  * If we had an error we just need to delete everybody from our
4939                  * private list.
4940                  */
4941                 if (ret) {
4942                         clear_em_logging(tree, em);
4943                         free_extent_map(em);
4944                         continue;
4945                 }
4946
                     /*
                      * The tree lock is not held across log_one_extent(); the
                      * reference taken when queueing is dropped after relocking.
                      */
4947                 write_unlock(&tree->lock);
4948
4949                 ret = log_one_extent(trans, inode, em, path, ctx);
4950                 write_lock(&tree->lock);
4951                 clear_em_logging(tree, em);
4952                 free_extent_map(em);
4953         }
4954         WARN_ON(!list_empty(&extents));
4955         write_unlock(&tree->lock);
4956
4957         if (!ret)
4958                 ret = btrfs_log_prealloc_extents(trans, inode, path);
4959         if (ret)
4960                 return ret;
4961
4962         /*
4963          * We have logged all extents successfully, now make sure the commit of
4964          * the current transaction waits for the ordered extents to complete
4965          * before it commits and wipes out the log trees, otherwise we would
4966          * lose data if an ordered extents completes after the transaction
4967          * commits and a power failure happens after the transaction commit.
4968          */
4969         list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4970                 list_del_init(&ordered->log_list);
4971                 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4972
                     /* Unlocked pre-check, repeated under the ordered tree lock. */
4973                 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4974                         spin_lock_irq(&inode->ordered_tree.lock);
4975                         if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4976                                 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4977                                 atomic_inc(&trans->transaction->pending_ordered);
4978                         }
4979                         spin_unlock_irq(&inode->ordered_tree.lock);
4980                 }
4981                 btrfs_put_ordered_extent(ordered);
4982         }
4983
4984         return 0;
4985 }
4986
4987 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4988                              struct btrfs_path *path, u64 *size_ret)
4989 {
4990         struct btrfs_key key;
4991         int ret;
4992
4993         key.objectid = btrfs_ino(inode);
4994         key.type = BTRFS_INODE_ITEM_KEY;
4995         key.offset = 0;
4996
4997         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4998         if (ret < 0) {
4999                 return ret;
5000         } else if (ret > 0) {
5001                 *size_ret = 0;
5002         } else {
5003                 struct btrfs_inode_item *item;
5004
5005                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5006                                       struct btrfs_inode_item);
5007                 *size_ret = btrfs_inode_size(path->nodes[0], item);
5008                 /*
5009                  * If the in-memory inode's i_size is smaller then the inode
5010                  * size stored in the btree, return the inode's i_size, so
5011                  * that we get a correct inode size after replaying the log
5012                  * when before a power failure we had a shrinking truncate
5013                  * followed by addition of a new name (rename / new hard link).
5014                  * Otherwise return the inode size from the btree, to avoid
5015                  * data loss when replaying a log due to previously doing a
5016                  * write that expands the inode's size and logging a new name
5017                  * immediately after.
5018                  */
5019                 if (*size_ret > inode->vfs_inode.i_size)
5020                         *size_ret = inode->vfs_inode.i_size;
5021         }
5022
5023         btrfs_release_path(path);
5024         return 0;
5025 }
5026
5027 /*
5028  * At the moment we always log all xattrs. This is to figure out at log replay
5029  * time which xattrs must have their deletion replayed. If a xattr is missing
5030  * in the log tree and exists in the fs/subvol tree, we delete it. This is
5031  * because if a xattr is deleted, the inode is fsynced and a power failure
5032  * happens, causing the log to be replayed the next time the fs is mounted,
5033  * we want the xattr to not exist anymore (same behaviour as other filesystems
5034  * with a journal, ext3/4, xfs, f2fs, etc).
5035  */
5036 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5037                                 struct btrfs_inode *inode,
5038                                 struct btrfs_path *path,
5039                                 struct btrfs_path *dst_path)
5040 {
5041         struct btrfs_root *root = inode->root;
5042         int ret;
5043         struct btrfs_key key;
5044         const u64 ino = btrfs_ino(inode);
5045         int ins_nr = 0;
5046         int start_slot = 0;
5047         bool found_xattrs = false;
5048
5049         if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5050                 return 0;
5051
5052         key.objectid = ino;
5053         key.type = BTRFS_XATTR_ITEM_KEY;
5054         key.offset = 0;
5055
5056         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5057         if (ret < 0)
5058                 return ret;
5059
5060         while (true) {
5061                 int slot = path->slots[0];
5062                 struct extent_buffer *leaf = path->nodes[0];
5063                 int nritems = btrfs_header_nritems(leaf);
5064
5065                 if (slot >= nritems) {
5066                         if (ins_nr > 0) {
5067                                 ret = copy_items(trans, inode, dst_path, path,
5068                                                  start_slot, ins_nr, 1, 0);
5069                                 if (ret < 0)
5070                                         return ret;
5071                                 ins_nr = 0;
5072                         }
5073                         ret = btrfs_next_leaf(root, path);
5074                         if (ret < 0)
5075                                 return ret;
5076                         else if (ret > 0)
5077                                 break;
5078                         continue;
5079                 }
5080
5081                 btrfs_item_key_to_cpu(leaf, &key, slot);
5082                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5083                         break;
5084
5085                 if (ins_nr == 0)
5086                         start_slot = slot;
5087                 ins_nr++;
5088                 path->slots[0]++;
5089                 found_xattrs = true;
5090                 cond_resched();
5091         }
5092         if (ins_nr > 0) {
5093                 ret = copy_items(trans, inode, dst_path, path,
5094                                  start_slot, ins_nr, 1, 0);
5095                 if (ret < 0)
5096                         return ret;
5097         }
5098
5099         if (!found_xattrs)
5100                 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5101
5102         return 0;
5103 }
5104
5105 /*
5106  * When using the NO_HOLES feature if we punched a hole that causes the
5107  * deletion of entire leafs or all the extent items of the first leaf (the one
5108  * that contains the inode item and references) we may end up not processing
5109  * any extents, because there are no leafs with a generation matching the
5110  * current transaction that have extent items for our inode. So we need to find
5111  * if any holes exist and then log them. We also need to log holes after any
5112  * truncate operation that changes the inode's size.
5113  */
5114 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5115                            struct btrfs_inode *inode,
5116                            struct btrfs_path *path)
5117 {
5118         struct btrfs_root *root = inode->root;
5119         struct btrfs_fs_info *fs_info = root->fs_info;
5120         struct btrfs_key key;
5121         const u64 ino = btrfs_ino(inode);
5122         const u64 i_size = i_size_read(&inode->vfs_inode);
5123         u64 prev_extent_end = 0;
5124         int ret;
5125
5126         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5127                 return 0;
5128
5129         key.objectid = ino;
5130         key.type = BTRFS_EXTENT_DATA_KEY;
5131         key.offset = 0;
5132
5133         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5134         if (ret < 0)
5135                 return ret;
5136
5137         while (true) {
5138                 struct extent_buffer *leaf = path->nodes[0];
5139
5140                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5141                         ret = btrfs_next_leaf(root, path);
5142                         if (ret < 0)
5143                                 return ret;
5144                         if (ret > 0) {
5145                                 ret = 0;
5146                                 break;
5147                         }
5148                         leaf = path->nodes[0];
5149                 }
5150
5151                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5152                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5153                         break;
5154
5155                 /* We have a hole, log it. */
5156                 if (prev_extent_end < key.offset) {
5157                         const u64 hole_len = key.offset - prev_extent_end;
5158
5159                         /*
5160                          * Release the path to avoid deadlocks with other code
5161                          * paths that search the root while holding locks on
5162                          * leafs from the log root.
5163                          */
5164                         btrfs_release_path(path);
5165                         ret = btrfs_insert_hole_extent(trans, root->log_root,
5166                                                        ino, prev_extent_end,
5167                                                        hole_len);
5168                         if (ret < 0)
5169                                 return ret;
5170
5171                         /*
5172                          * Search for the same key again in the root. Since it's
5173                          * an extent item and we are holding the inode lock, the
5174                          * key must still exist. If it doesn't just emit warning
5175                          * and return an error to fall back to a transaction
5176                          * commit.
5177                          */
5178                         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5179                         if (ret < 0)
5180                                 return ret;
5181                         if (WARN_ON(ret > 0))
5182                                 return -ENOENT;
5183                         leaf = path->nodes[0];
5184                 }
5185
5186                 prev_extent_end = btrfs_file_extent_end(path);
5187                 path->slots[0]++;
5188                 cond_resched();
5189         }
5190
5191         if (prev_extent_end < i_size) {
5192                 u64 hole_len;
5193
5194                 btrfs_release_path(path);
5195                 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5196                 ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5197                                                prev_extent_end, hole_len);
5198                 if (ret < 0)
5199                         return ret;
5200         }
5201
5202         return 0;
5203 }
5204
5205 /*
5206  * When we are logging a new inode X, check if it doesn't have a reference that
5207  * matches the reference from some other inode Y created in a past transaction
5208  * and that was renamed in the current transaction. If we don't do this, then at
5209  * log replay time we can lose inode Y (and all its files if it's a directory):
5210  *
5211  * mkdir /mnt/x
5212  * echo "hello world" > /mnt/x/foobar
5213  * sync
5214  * mv /mnt/x /mnt/y
5215  * mkdir /mnt/x                 # or touch /mnt/x
5216  * xfs_io -c fsync /mnt/x
5217  * <power fail>
5218  * mount fs, trigger log replay
5219  *
5220  * After the log replay procedure, we would lose the first directory and all its
5221  * files (file foobar).
5222  * For the case where inode Y is not a directory we simply end up losing it:
5223  *
5224  * echo "123" > /mnt/foo
5225  * sync
5226  * mv /mnt/foo /mnt/bar
5227  * echo "abc" > /mnt/foo
5228  * xfs_io -c fsync /mnt/foo
5229  * <power fail>
5230  *
5231  * We also need this for cases where a snapshot entry is replaced by some other
5232  * entry (file or directory) otherwise we end up with an unreplayable log due to
5233  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5234  * if it were a regular entry:
5235  *
5236  * mkdir /mnt/x
5237  * btrfs subvolume snapshot /mnt /mnt/x/snap
5238  * btrfs subvolume delete /mnt/x/snap
5239  * rmdir /mnt/x
5240  * mkdir /mnt/x
5241  * fsync /mnt/x or fsync some new file inside it
5242  * <power fail>
5243  *
5244  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5245  * the same transaction.
5246  */
5247 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5248                                          const int slot,
5249                                          const struct btrfs_key *key,
5250                                          struct btrfs_inode *inode,
5251                                          u64 *other_ino, u64 *other_parent)
5252 {
5253         int ret;
5254         struct btrfs_path *search_path;
5255         char *name = NULL;
5256         u32 name_len = 0;
5257         u32 item_size = btrfs_item_size(eb, slot);
5258         u32 cur_offset = 0;
5259         unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5260
5261         search_path = btrfs_alloc_path();
5262         if (!search_path)
5263                 return -ENOMEM;
5264         search_path->search_commit_root = 1;
5265         search_path->skip_locking = 1;
5266
5267         while (cur_offset < item_size) {
5268                 u64 parent;
5269                 u32 this_name_len;
5270                 u32 this_len;
5271                 unsigned long name_ptr;
5272                 struct btrfs_dir_item *di;
5273
5274                 if (key->type == BTRFS_INODE_REF_KEY) {
5275                         struct btrfs_inode_ref *iref;
5276
5277                         iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5278                         parent = key->offset;
5279                         this_name_len = btrfs_inode_ref_name_len(eb, iref);
5280                         name_ptr = (unsigned long)(iref + 1);
5281                         this_len = sizeof(*iref) + this_name_len;
5282                 } else {
5283                         struct btrfs_inode_extref *extref;
5284
5285                         extref = (struct btrfs_inode_extref *)(ptr +
5286                                                                cur_offset);
5287                         parent = btrfs_inode_extref_parent(eb, extref);
5288                         this_name_len = btrfs_inode_extref_name_len(eb, extref);
5289                         name_ptr = (unsigned long)&extref->name;
5290                         this_len = sizeof(*extref) + this_name_len;
5291                 }
5292
5293                 if (this_name_len > name_len) {
5294                         char *new_name;
5295
5296                         new_name = krealloc(name, this_name_len, GFP_NOFS);
5297                         if (!new_name) {
5298                                 ret = -ENOMEM;
5299                                 goto out;
5300                         }
5301                         name_len = this_name_len;
5302                         name = new_name;
5303                 }
5304
5305                 read_extent_buffer(eb, name, name_ptr, this_name_len);
5306                 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5307                                 parent, name, this_name_len, 0);
5308                 if (di && !IS_ERR(di)) {
5309                         struct btrfs_key di_key;
5310
5311                         btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5312                                                   di, &di_key);
5313                         if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5314                                 if (di_key.objectid != key->objectid) {
5315                                         ret = 1;
5316                                         *other_ino = di_key.objectid;
5317                                         *other_parent = parent;
5318                                 } else {
5319                                         ret = 0;
5320                                 }
5321                         } else {
5322                                 ret = -EAGAIN;
5323                         }
5324                         goto out;
5325                 } else if (IS_ERR(di)) {
5326                         ret = PTR_ERR(di);
5327                         goto out;
5328                 }
5329                 btrfs_release_path(search_path);
5330
5331                 cur_offset += this_len;
5332         }
5333         ret = 0;
5334 out:
5335         btrfs_free_path(search_path);
5336         kfree(name);
5337         return ret;
5338 }
5339
5340 /*
5341  * Check if we need to log an inode. This is used in contexts where while
5342  * logging an inode we need to log another inode (either that it exists or in
5343  * full mode). This is used instead of btrfs_inode_in_log() because the later
5344  * requires the inode to be in the log and have the log transaction committed,
5345  * while here we do not care if the log transaction was already committed - our
5346  * caller will commit the log later - and we want to avoid logging an inode
5347  * multiple times when multiple tasks have joined the same log transaction.
5348  */
5349 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5350                            const struct btrfs_inode *inode)
5351 {
5352         /*
5353          * If a directory was not modified, no dentries added or removed, we can
5354          * and should avoid logging it.
5355          */
5356         if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5357                 return false;
5358
5359         /*
5360          * If this inode does not have new/updated/deleted xattrs since the last
5361          * time it was logged and is flagged as logged in the current transaction,
5362          * we can skip logging it. As for new/deleted names, those are updated in
5363          * the log by link/unlink/rename operations.
5364          * In case the inode was logged and then evicted and reloaded, its
5365          * logged_trans will be 0, in which case we have to fully log it since
5366          * logged_trans is a transient field, not persisted.
5367          */
5368         if (inode->logged_trans == trans->transid &&
5369             !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5370                 return false;
5371
5372         return true;
5373 }
5374
5375 struct btrfs_dir_list {	/* Queue entry for a directory whose new dentries still have to be logged. */
5376         u64 ino;	/* Inode number of the queued directory. */
5377         struct list_head list;	/* Link in the local dir_list of log_new_dir_dentries(). */
5378 };
5379
5380 /*
5381  * Log the inodes of the new dentries of a directory.
5382  * See process_dir_items_leaf() for details about why it is needed.
5383  * This is a recursive operation - if an existing dentry corresponds to a
5384  * directory, that directory's new entries are logged too (same behaviour as
5385  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5386  * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5387  * complains about the following circular lock dependency / possible deadlock:
5388  *
5389  *        CPU0                                        CPU1
5390  *        ----                                        ----
5391  * lock(&type->i_mutex_dir_key#3/2);
5392  *                                            lock(sb_internal#2);
5393  *                                            lock(&type->i_mutex_dir_key#3/2);
5394  * lock(&sb->s_type->i_mutex_key#14);
5395  *
5396  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5397  * sb_start_intwrite() in btrfs_start_transaction().
5398  * Not acquiring the VFS lock of the inodes is still safe because:
5399  *
5400  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5401  *    that while logging the inode new references (names) are added or removed
5402  *    from the inode, leaving the logged inode item with a link count that does
5403  *    not match the number of logged inode reference items. This is fine because
5404  *    at log replay time we compute the real number of links and correct the
5405  *    link count in the inode item (see replay_one_buffer() and
5406  *    link_to_fixup_dir());
5407  *
5408  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5409  *    while logging the inode's items new index items (key type
5410  *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5411  *    has a size that doesn't match the sum of the lengths of all the logged
5412  *    names - this is ok, not a problem, because at log replay time we set the
5413  *    directory's i_size to the correct value (see replay_one_name() and
5414  *    do_overwrite_item()).
5415  */
5416 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5417                                 struct btrfs_inode *start_inode,
5418                                 struct btrfs_log_ctx *ctx)
5419 {
5420         struct btrfs_root *root = start_inode->root;
5421         struct btrfs_fs_info *fs_info = root->fs_info;
5422         struct btrfs_path *path;
5423         LIST_HEAD(dir_list);
5424         struct btrfs_dir_list *dir_elem;
5425         u64 ino = btrfs_ino(start_inode);
5426         int ret = 0;
5427
5428         /*
5429          * If we are logging a new name, as part of a link or rename operation,
5430          * don't bother logging new dentries, as we just want to log the names
5431          * of an inode and that any new parents exist.
5432          */
5433         if (ctx->logging_new_name)
5434                 return 0;
5435
5436         path = btrfs_alloc_path();
5437         if (!path)
5438                 return -ENOMEM;
5439
5440         while (true) {
5441                 struct extent_buffer *leaf;
5442                 struct btrfs_key min_key;
5443                 bool continue_curr_inode = true;
5444                 int nritems;
5445                 int i;
5446
5447                 min_key.objectid = ino;
5448                 min_key.type = BTRFS_DIR_INDEX_KEY;
5449                 min_key.offset = 0;
5450 again:
5451                 btrfs_release_path(path);
5452                 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
5453                 if (ret < 0) {
5454                         break;
5455                 } else if (ret > 0) {
5456                         ret = 0;
5457                         goto next;
5458                 }
5459
5460                 leaf = path->nodes[0];
5461                 nritems = btrfs_header_nritems(leaf);
5462                 for (i = path->slots[0]; i < nritems; i++) {
5463                         struct btrfs_dir_item *di;
5464                         struct btrfs_key di_key;
5465                         struct inode *di_inode;
5466                         int log_mode = LOG_INODE_EXISTS;
5467                         int type;
5468
5469                         btrfs_item_key_to_cpu(leaf, &min_key, i);
5470                         if (min_key.objectid != ino ||
5471                             min_key.type != BTRFS_DIR_INDEX_KEY) {
5472                                 continue_curr_inode = false;
5473                                 break;
5474                         }
5475
5476                         di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5477                         type = btrfs_dir_type(leaf, di);
5478                         if (btrfs_dir_transid(leaf, di) < trans->transid)
5479                                 continue;
5480                         btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5481                         if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5482                                 continue;
5483
5484                         btrfs_release_path(path);
5485                         di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5486                         if (IS_ERR(di_inode)) {
5487                                 ret = PTR_ERR(di_inode);
5488                                 goto out;
5489                         }
5490
5491                         if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5492                                 btrfs_add_delayed_iput(di_inode);
5493                                 break;
5494                         }
5495
5496                         ctx->log_new_dentries = false;
5497                         if (type == BTRFS_FT_DIR)
5498                                 log_mode = LOG_INODE_ALL;
5499                         ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
5500                                               log_mode, ctx);
5501                         btrfs_add_delayed_iput(di_inode);
5502                         if (ret)
5503                                 goto out;
5504                         if (ctx->log_new_dentries) {
5505                                 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5506                                 if (!dir_elem) {
5507                                         ret = -ENOMEM;
5508                                         goto out;
5509                                 }
5510                                 dir_elem->ino = di_key.objectid;
5511                                 list_add_tail(&dir_elem->list, &dir_list);
5512                         }
5513                         break;
5514                 }
5515
5516                 if (continue_curr_inode && min_key.offset < (u64)-1) {
5517                         min_key.offset++;
5518                         goto again;
5519                 }
5520
5521 next:
5522                 if (list_empty(&dir_list))
5523                         break;
5524
5525                 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5526                 ino = dir_elem->ino;
5527                 list_del(&dir_elem->list);
5528                 kfree(dir_elem);
5529         }
5530 out:
5531         btrfs_free_path(path);
5532         if (ret) {
5533                 struct btrfs_dir_list *next;
5534
5535                 list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5536                         kfree(dir_elem);
5537         }
5538
5539         return ret;
5540 }
5541
5542 struct btrfs_ino_list {	/* Entry of the ctx->conflict_inodes list, see add_conflicting_inode(). */
5543         u64 ino;	/* Number of the conflicting inode. */
5544         u64 parent;	/* Parent directory of the conflicting dir entry; logged instead when the inode is gone. */
5545         struct list_head list;	/* Link in ctx->conflict_inodes. */
5546 };
5547
5548 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5549 {
5550         struct btrfs_ino_list *curr;
5551         struct btrfs_ino_list *next;
5552
5553         list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
5554                 list_del(&curr->list);
5555                 kfree(curr);
5556         }
5557 }
5558
5559 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
5560                                     struct btrfs_path *path)
5561 {
5562         struct btrfs_key key;
5563         int ret;
5564
5565         key.objectid = ino;
5566         key.type = BTRFS_INODE_ITEM_KEY;
5567         key.offset = 0;
5568
5569         path->search_commit_root = 1;
5570         path->skip_locking = 1;
5571
5572         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5573         if (WARN_ON_ONCE(ret > 0)) {
5574                 /*
5575                  * We have previously found the inode through the commit root
5576                  * so this should not happen. If it does, just error out and
5577                  * fallback to a transaction commit.
5578                  */
5579                 ret = -ENOENT;
5580         } else if (ret == 0) {
5581                 struct btrfs_inode_item *item;
5582
5583                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5584                                       struct btrfs_inode_item);
5585                 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
5586                         ret = 1;
5587         }
5588
5589         btrfs_release_path(path);
5590         path->search_commit_root = 0;
5591         path->skip_locking = 0;
5592
5593         return ret;
5594 }
5595
5596 static int add_conflicting_inode(struct btrfs_trans_handle *trans,
5597                                  struct btrfs_root *root,
5598                                  struct btrfs_path *path,
5599                                  u64 ino, u64 parent,
5600                                  struct btrfs_log_ctx *ctx)
5601 {
5602         struct btrfs_ino_list *ino_elem;
5603         struct inode *inode;
5604
5605         /*
5606          * It's rare to have a lot of conflicting inodes, in practice it is not
5607          * common to have more than 1 or 2. We don't want to collect too many,
5608          * as we could end up logging too many inodes (even if only in
5609          * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
5610          * commits.
5611          */
5612         if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
5613                 return BTRFS_LOG_FORCE_COMMIT;
5614
5615         inode = btrfs_iget(root->fs_info->sb, ino, root);
5616         /*
5617          * If the other inode that had a conflicting dir entry was deleted in
5618          * the current transaction then we either:
5619          *
5620          * 1) Log the parent directory (later after adding it to the list) if
5621          *    the inode is a directory. This is because it may be a deleted
5622          *    subvolume/snapshot or it may be a regular directory that had
5623          *    deleted subvolumes/snapshots (or subdirectories that had them),
5624          *    and at the moment we can't deal with dropping subvolumes/snapshots
5625          *    during log replay. So we just log the parent, which will result in
5626          *    a fallback to a transaction commit if we are dealing with those
5627          *    cases (last_unlink_trans will match the current transaction);
5628          *
5629          * 2) Do nothing if it's not a directory. During log replay we simply
5630          *    unlink the conflicting dentry from the parent directory and then
5631          *    add the dentry for our inode. Like this we can avoid logging the
5632          *    parent directory (and maybe fallback to a transaction commit in
5633          *    case it has a last_unlink_trans == trans->transid, due to moving
5634          *    some inode from it to some other directory).
5635          */
5636         if (IS_ERR(inode)) {
5637                 int ret = PTR_ERR(inode);
5638
5639                 if (ret != -ENOENT)
5640                         return ret;
5641
5642                 ret = conflicting_inode_is_dir(root, ino, path);
5643                 /* Not a directory or we got an error. */
5644                 if (ret <= 0)
5645                         return ret;
5646
5647                 /* Conflicting inode is a directory, so we'll log its parent. */
5648                 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5649                 if (!ino_elem)
5650                         return -ENOMEM;
5651                 ino_elem->ino = ino;
5652                 ino_elem->parent = parent;
5653                 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5654                 ctx->num_conflict_inodes++;
5655
5656                 return 0;
5657         }
5658
5659         /*
5660          * If the inode was already logged skip it - otherwise we can hit an
5661          * infinite loop. Example:
5662          *
5663          * From the commit root (previous transaction) we have the following
5664          * inodes:
5665          *
5666          * inode 257 a directory
5667          * inode 258 with references "zz" and "zz_link" on inode 257
5668          * inode 259 with reference "a" on inode 257
5669          *
5670          * And in the current (uncommitted) transaction we have:
5671          *
5672          * inode 257 a directory, unchanged
5673          * inode 258 with references "a" and "a2" on inode 257
5674          * inode 259 with reference "zz_link" on inode 257
5675          * inode 261 with reference "zz" on inode 257
5676          *
5677          * When logging inode 261 the following infinite loop could
5678          * happen if we don't skip already logged inodes:
5679          *
5680          * - we detect inode 258 as a conflicting inode, with inode 261
5681          *   on reference "zz", and log it;
5682          *
5683          * - we detect inode 259 as a conflicting inode, with inode 258
5684          *   on reference "a", and log it;
5685          *
5686          * - we detect inode 258 as a conflicting inode, with inode 259
5687          *   on reference "zz_link", and log it - again! After this we
5688          *   repeat the above steps forever.
5689          *
5690          * Here we can use need_log_inode() because we only need to log the
5691          * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5692          * so that the log ends up with the new name and without the old name.
5693          */
5694         if (!need_log_inode(trans, BTRFS_I(inode))) {
5695                 btrfs_add_delayed_iput(inode);
5696                 return 0;
5697         }
5698
5699         btrfs_add_delayed_iput(inode);
5700
5701         ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5702         if (!ino_elem)
5703                 return -ENOMEM;
5704         ino_elem->ino = ino;
5705         ino_elem->parent = parent;
5706         list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5707         ctx->num_conflict_inodes++;
5708
5709         return 0;
5710 }
5711
5712 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5713                                   struct btrfs_root *root,
5714                                   struct btrfs_log_ctx *ctx)
5715 {
5716         struct btrfs_fs_info *fs_info = root->fs_info;
5717         int ret = 0;
5718
5719         /*
5720          * Conflicting inodes are logged by the first call to btrfs_log_inode(),
5721          * otherwise we could have unbounded recursion of btrfs_log_inode()
5722          * calls. This check guarantees we can have only 1 level of recursion.
5723          */
5724         if (ctx->logging_conflict_inodes)
5725                 return 0;
5726
5727         ctx->logging_conflict_inodes = true;
5728
5729         /*
5730          * New conflicting inodes may be found and added to the list while we
5731          * are logging a conflicting inode, so keep iterating while the list is
5732          * not empty.
5733          */
5734         while (!list_empty(&ctx->conflict_inodes)) {
5735                 struct btrfs_ino_list *curr;
5736                 struct inode *inode;
5737                 u64 ino;
5738                 u64 parent;
5739
5740                 curr = list_first_entry(&ctx->conflict_inodes,
5741                                         struct btrfs_ino_list, list);
5742                 ino = curr->ino;
5743                 parent = curr->parent;
5744                 list_del(&curr->list);
5745                 kfree(curr);
5746
5747                 inode = btrfs_iget(fs_info->sb, ino, root);
5748                 /*
5749                  * If the other inode that had a conflicting dir entry was
5750                  * deleted in the current transaction, we need to log its parent
5751                  * directory. See the comment at add_conflicting_inode().
5752                  */
5753                 if (IS_ERR(inode)) {
5754                         ret = PTR_ERR(inode);
5755                         if (ret != -ENOENT)
5756                                 break;
5757
5758                         inode = btrfs_iget(fs_info->sb, parent, root);
5759                         if (IS_ERR(inode)) {
5760                                 ret = PTR_ERR(inode);
5761                                 break;
5762                         }
5763
5764                         /*
5765                          * Always log the directory, we cannot make this
5766                          * conditional on need_log_inode() because the directory
5767                          * might have been logged in LOG_INODE_EXISTS mode or
5768                          * the dir index of the conflicting inode is not in a
5769                          * dir index key range logged for the directory. So we
5770                          * must make sure the deletion is recorded.
5771                          */
5772                         ret = btrfs_log_inode(trans, BTRFS_I(inode),
5773                                               LOG_INODE_ALL, ctx);
5774                         btrfs_add_delayed_iput(inode);
5775                         if (ret)
5776                                 break;
5777                         continue;
5778                 }
5779
5780                 /*
5781                  * Here we can use need_log_inode() because we only need to log
5782                  * the inode in LOG_INODE_EXISTS mode and rename operations
5783                  * update the log, so that the log ends up with the new name and
5784                  * without the old name.
5785                  *
5786                  * We did this check at add_conflicting_inode(), but here we do
5787                  * it again because if some other task logged the inode after
5788                  * that, we can avoid doing it again.
5789                  */
5790                 if (!need_log_inode(trans, BTRFS_I(inode))) {
5791                         btrfs_add_delayed_iput(inode);
5792                         continue;
5793                 }
5794
5795                 /*
5796                  * We are safe logging the other inode without acquiring its
5797                  * lock as long as we log with the LOG_INODE_EXISTS mode. We
5798                  * are safe against concurrent renames of the other inode as
5799                  * well because during a rename we pin the log and update the
5800                  * log with the new name before we unpin it.
5801                  */
5802                 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
5803                 btrfs_add_delayed_iput(inode);
5804                 if (ret)
5805                         break;
5806         }
5807
5808         ctx->logging_conflict_inodes = false;
5809         if (ret)
5810                 free_conflicting_inodes(ctx);
5811
5812         return ret;
5813 }
5814
5815 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5816                                    struct btrfs_inode *inode,
5817                                    struct btrfs_key *min_key,
5818                                    const struct btrfs_key *max_key,
5819                                    struct btrfs_path *path,
5820                                    struct btrfs_path *dst_path,
5821                                    const u64 logged_isize,
5822                                    const int inode_only,
5823                                    struct btrfs_log_ctx *ctx,
5824                                    bool *need_log_inode_item)
5825 {
5826         const u64 i_size = i_size_read(&inode->vfs_inode);
5827         struct btrfs_root *root = inode->root;
5828         int ins_start_slot = 0;
5829         int ins_nr = 0;
5830         int ret;
5831
5832         while (1) {
5833                 ret = btrfs_search_forward(root, min_key, path, trans->transid);
5834                 if (ret < 0)
5835                         return ret;
5836                 if (ret > 0) {
5837                         ret = 0;
5838                         break;
5839                 }
5840 again:
5841                 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5842                 if (min_key->objectid != max_key->objectid)
5843                         break;
5844                 if (min_key->type > max_key->type)
5845                         break;
5846
5847                 if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5848                         *need_log_inode_item = false;
5849                 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5850                            min_key->offset >= i_size) {
5851                         /*
5852                          * Extents at and beyond eof are logged with
5853                          * btrfs_log_prealloc_extents().
5854                          * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5855                          * and no keys greater than that, so bail out.
5856                          */
5857                         break;
5858                 } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5859                             min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5860                            (inode->generation == trans->transid ||
5861                             ctx->logging_conflict_inodes)) {
5862                         u64 other_ino = 0;
5863                         u64 other_parent = 0;
5864
5865                         ret = btrfs_check_ref_name_override(path->nodes[0],
5866                                         path->slots[0], min_key, inode,
5867                                         &other_ino, &other_parent);
5868                         if (ret < 0) {
5869                                 return ret;
5870                         } else if (ret > 0 &&
5871                                    other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5872                                 if (ins_nr > 0) {
5873                                         ins_nr++;
5874                                 } else {
5875                                         ins_nr = 1;
5876                                         ins_start_slot = path->slots[0];
5877                                 }
5878                                 ret = copy_items(trans, inode, dst_path, path,
5879                                                  ins_start_slot, ins_nr,
5880                                                  inode_only, logged_isize);
5881                                 if (ret < 0)
5882                                         return ret;
5883                                 ins_nr = 0;
5884
5885                                 btrfs_release_path(path);
5886                                 ret = add_conflicting_inode(trans, root, path,
5887                                                             other_ino,
5888                                                             other_parent, ctx);
5889                                 if (ret)
5890                                         return ret;
5891                                 goto next_key;
5892                         }
5893                 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5894                         /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5895                         if (ins_nr == 0)
5896                                 goto next_slot;
5897                         ret = copy_items(trans, inode, dst_path, path,
5898                                          ins_start_slot,
5899                                          ins_nr, inode_only, logged_isize);
5900                         if (ret < 0)
5901                                 return ret;
5902                         ins_nr = 0;
5903                         goto next_slot;
5904                 }
5905
5906                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5907                         ins_nr++;
5908                         goto next_slot;
5909                 } else if (!ins_nr) {
5910                         ins_start_slot = path->slots[0];
5911                         ins_nr = 1;
5912                         goto next_slot;
5913                 }
5914
5915                 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5916                                  ins_nr, inode_only, logged_isize);
5917                 if (ret < 0)
5918                         return ret;
5919                 ins_nr = 1;
5920                 ins_start_slot = path->slots[0];
5921 next_slot:
5922                 path->slots[0]++;
5923                 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5924                         btrfs_item_key_to_cpu(path->nodes[0], min_key,
5925                                               path->slots[0]);
5926                         goto again;
5927                 }
5928                 if (ins_nr) {
5929                         ret = copy_items(trans, inode, dst_path, path,
5930                                          ins_start_slot, ins_nr, inode_only,
5931                                          logged_isize);
5932                         if (ret < 0)
5933                                 return ret;
5934                         ins_nr = 0;
5935                 }
5936                 btrfs_release_path(path);
5937 next_key:
5938                 if (min_key->offset < (u64)-1) {
5939                         min_key->offset++;
5940                 } else if (min_key->type < max_key->type) {
5941                         min_key->type++;
5942                         min_key->offset = 0;
5943                 } else {
5944                         break;
5945                 }
5946
5947                 /*
5948                  * We may process many leaves full of items for our inode, so
5949                  * avoid monopolizing a cpu for too long by rescheduling while
5950                  * not holding locks on any tree.
5951                  */
5952                 cond_resched();
5953         }
5954         if (ins_nr) {
5955                 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5956                                  ins_nr, inode_only, logged_isize);
5957                 if (ret)
5958                         return ret;
5959         }
5960
5961         if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5962                 /*
5963                  * Release the path because otherwise we might attempt to double
5964                  * lock the same leaf with btrfs_log_prealloc_extents() below.
5965                  */
5966                 btrfs_release_path(path);
5967                 ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5968         }
5969
5970         return ret;
5971 }
5972
5973 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
5974                                       struct btrfs_root *log,
5975                                       struct btrfs_path *path,
5976                                       const struct btrfs_item_batch *batch,
5977                                       const struct btrfs_delayed_item *first_item)
5978 {
5979         const struct btrfs_delayed_item *curr = first_item;
5980         int ret;
5981
5982         ret = btrfs_insert_empty_items(trans, log, path, batch);
5983         if (ret)
5984                 return ret;
5985
5986         for (int i = 0; i < batch->nr; i++) {
5987                 char *data_ptr;
5988
5989                 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
5990                 write_extent_buffer(path->nodes[0], &curr->data,
5991                                     (unsigned long)data_ptr, curr->data_len);
5992                 curr = list_next_entry(curr, log_list);
5993                 path->slots[0]++;
5994         }
5995
5996         btrfs_release_path(path);
5997
5998         return 0;
5999 }
6000
6001 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6002                                        struct btrfs_inode *inode,
6003                                        struct btrfs_path *path,
6004                                        const struct list_head *delayed_ins_list,
6005                                        struct btrfs_log_ctx *ctx)
6006 {
6007         /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
6008         const int max_batch_size = 195;
6009         const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
6010         const u64 ino = btrfs_ino(inode);
6011         struct btrfs_root *log = inode->root->log_root;
6012         struct btrfs_item_batch batch = {
6013                 .nr = 0,
6014                 .total_data_size = 0,
6015         };
6016         const struct btrfs_delayed_item *first = NULL;
6017         const struct btrfs_delayed_item *curr;
6018         char *ins_data;
6019         struct btrfs_key *ins_keys;
6020         u32 *ins_sizes;
6021         u64 curr_batch_size = 0;
6022         int batch_idx = 0;
6023         int ret;
6024
6025         /* We are adding dir index items to the log tree. */
6026         lockdep_assert_held(&inode->log_mutex);
6027
6028         /*
6029          * We collect delayed items before copying index keys from the subvolume
6030          * to the log tree. However just after we collected them, they may have
6031          * been flushed (all of them or just some of them), and therefore we
6032          * could have copied them from the subvolume tree to the log tree.
6033          * So find the first delayed item that was not yet logged (they are
6034          * sorted by index number).
6035          */
6036         list_for_each_entry(curr, delayed_ins_list, log_list) {
6037                 if (curr->index > inode->last_dir_index_offset) {
6038                         first = curr;
6039                         break;
6040                 }
6041         }
6042
6043         /* Empty list or all delayed items were already logged. */
6044         if (!first)
6045                 return 0;
6046
6047         ins_data = kmalloc(max_batch_size * sizeof(u32) +
6048                            max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
6049         if (!ins_data)
6050                 return -ENOMEM;
6051         ins_sizes = (u32 *)ins_data;
6052         batch.data_sizes = ins_sizes;
6053         ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6054         batch.keys = ins_keys;
6055
6056         curr = first;
6057         while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6058                 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6059
6060                 if (curr_batch_size + curr_size > leaf_data_size ||
6061                     batch.nr == max_batch_size) {
6062                         ret = insert_delayed_items_batch(trans, log, path,
6063                                                          &batch, first);
6064                         if (ret)
6065                                 goto out;
6066                         batch_idx = 0;
6067                         batch.nr = 0;
6068                         batch.total_data_size = 0;
6069                         curr_batch_size = 0;
6070                         first = curr;
6071                 }
6072
6073                 ins_sizes[batch_idx] = curr->data_len;
6074                 ins_keys[batch_idx].objectid = ino;
6075                 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6076                 ins_keys[batch_idx].offset = curr->index;
6077                 curr_batch_size += curr_size;
6078                 batch.total_data_size += curr->data_len;
6079                 batch.nr++;
6080                 batch_idx++;
6081                 curr = list_next_entry(curr, log_list);
6082         }
6083
6084         ASSERT(batch.nr >= 1);
6085         ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6086
6087         curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6088                                log_list);
6089         inode->last_dir_index_offset = curr->index;
6090 out:
6091         kfree(ins_data);
6092
6093         return ret;
6094 }
6095
6096 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6097                                       struct btrfs_inode *inode,
6098                                       struct btrfs_path *path,
6099                                       const struct list_head *delayed_del_list,
6100                                       struct btrfs_log_ctx *ctx)
6101 {
6102         const u64 ino = btrfs_ino(inode);
6103         const struct btrfs_delayed_item *curr;
6104
6105         curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6106                                 log_list);
6107
6108         while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6109                 u64 first_dir_index = curr->index;
6110                 u64 last_dir_index;
6111                 const struct btrfs_delayed_item *next;
6112                 int ret;
6113
6114                 /*
6115                  * Find a range of consecutive dir index items to delete. Like
6116                  * this we log a single dir range item spanning several contiguous
6117                  * dir items instead of logging one range item per dir index item.
6118                  */
6119                 next = list_next_entry(curr, log_list);
6120                 while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6121                         if (next->index != curr->index + 1)
6122                                 break;
6123                         curr = next;
6124                         next = list_next_entry(next, log_list);
6125                 }
6126
6127                 last_dir_index = curr->index;
6128                 ASSERT(last_dir_index >= first_dir_index);
6129
6130                 ret = insert_dir_log_key(trans, inode->root->log_root, path,
6131                                          ino, first_dir_index, last_dir_index);
6132                 if (ret)
6133                         return ret;
6134                 curr = list_next_entry(curr, log_list);
6135         }
6136
6137         return 0;
6138 }
6139
6140 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6141                                         struct btrfs_inode *inode,
6142                                         struct btrfs_path *path,
6143                                         struct btrfs_log_ctx *ctx,
6144                                         const struct list_head *delayed_del_list,
6145                                         const struct btrfs_delayed_item *first,
6146                                         const struct btrfs_delayed_item **last_ret)
6147 {
6148         const struct btrfs_delayed_item *next;
6149         struct extent_buffer *leaf = path->nodes[0];
6150         const int last_slot = btrfs_header_nritems(leaf) - 1;
6151         int slot = path->slots[0] + 1;
6152         const u64 ino = btrfs_ino(inode);
6153
6154         next = list_next_entry(first, log_list);
6155
6156         while (slot < last_slot &&
6157                !list_entry_is_head(next, delayed_del_list, log_list)) {
6158                 struct btrfs_key key;
6159
6160                 btrfs_item_key_to_cpu(leaf, &key, slot);
6161                 if (key.objectid != ino ||
6162                     key.type != BTRFS_DIR_INDEX_KEY ||
6163                     key.offset != next->index)
6164                         break;
6165
6166                 slot++;
6167                 *last_ret = next;
6168                 next = list_next_entry(next, log_list);
6169         }
6170
6171         return btrfs_del_items(trans, inode->root->log_root, path,
6172                                path->slots[0], slot - path->slots[0]);
6173 }
6174
6175 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6176                                              struct btrfs_inode *inode,
6177                                              struct btrfs_path *path,
6178                                              const struct list_head *delayed_del_list,
6179                                              struct btrfs_log_ctx *ctx)
6180 {
6181         struct btrfs_root *log = inode->root->log_root;
6182         const struct btrfs_delayed_item *curr;
6183         u64 last_range_start;
6184         u64 last_range_end = 0;
6185         struct btrfs_key key;
6186
6187         key.objectid = btrfs_ino(inode);
6188         key.type = BTRFS_DIR_INDEX_KEY;
6189         curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6190                                 log_list);
6191
6192         while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6193                 const struct btrfs_delayed_item *last = curr;
6194                 u64 first_dir_index = curr->index;
6195                 u64 last_dir_index;
6196                 bool deleted_items = false;
6197                 int ret;
6198
6199                 key.offset = curr->index;
6200                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6201                 if (ret < 0) {
6202                         return ret;
6203                 } else if (ret == 0) {
6204                         ret = batch_delete_dir_index_items(trans, inode, path, ctx,
6205                                                            delayed_del_list, curr,
6206                                                            &last);
6207                         if (ret)
6208                                 return ret;
6209                         deleted_items = true;
6210                 }
6211
6212                 btrfs_release_path(path);
6213
6214                 /*
6215                  * If we deleted items from the leaf, it means we have a range
6216                  * item logging their range, so no need to add one or update an
6217                  * existing one. Otherwise we have to log a dir range item.
6218                  */
6219                 if (deleted_items)
6220                         goto next_batch;
6221
6222                 last_dir_index = last->index;
6223                 ASSERT(last_dir_index >= first_dir_index);
6224                 /*
6225                  * If this range starts right after where the previous one ends,
6226                  * then we want to reuse the previous range item and change its
6227                  * end offset to the end of this range. This is just to minimize
6228                  * leaf space usage, by avoiding adding a new range item.
6229                  */
6230                 if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6231                         first_dir_index = last_range_start;
6232
6233                 ret = insert_dir_log_key(trans, log, path, key.objectid,
6234                                          first_dir_index, last_dir_index);
6235                 if (ret)
6236                         return ret;
6237
6238                 last_range_start = first_dir_index;
6239                 last_range_end = last_dir_index;
6240 next_batch:
6241                 curr = list_next_entry(last, log_list);
6242         }
6243
6244         return 0;
6245 }
6246
6247 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6248                                       struct btrfs_inode *inode,
6249                                       struct btrfs_path *path,
6250                                       const struct list_head *delayed_del_list,
6251                                       struct btrfs_log_ctx *ctx)
6252 {
6253         /*
6254          * We are deleting dir index items from the log tree or adding range
6255          * items to it.
6256          */
6257         lockdep_assert_held(&inode->log_mutex);
6258
6259         if (list_empty(delayed_del_list))
6260                 return 0;
6261
6262         if (ctx->logged_before)
6263                 return log_delayed_deletions_incremental(trans, inode, path,
6264                                                          delayed_del_list, ctx);
6265
6266         return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6267                                           ctx);
6268 }
6269
6270 /*
6271  * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
6272  * items instead of the subvolume tree.
6273  */
6274 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6275                                     struct btrfs_inode *inode,
6276                                     const struct list_head *delayed_ins_list,
6277                                     struct btrfs_log_ctx *ctx)
6278 {
6279         const bool orig_log_new_dentries = ctx->log_new_dentries;
6280         struct btrfs_fs_info *fs_info = trans->fs_info;
6281         struct btrfs_delayed_item *item;
6282         int ret = 0;
6283
6284         /*
6285          * No need for the log mutex, plus to avoid potential deadlocks or
6286          * lockdep annotations due to nesting of delayed inode mutexes and log
6287          * mutexes.
6288          */
6289         lockdep_assert_not_held(&inode->log_mutex);
6290
6291         ASSERT(!ctx->logging_new_delayed_dentries);
6292         ctx->logging_new_delayed_dentries = true;
6293
6294         list_for_each_entry(item, delayed_ins_list, log_list) {
6295                 struct btrfs_dir_item *dir_item;
6296                 struct inode *di_inode;
6297                 struct btrfs_key key;
6298                 int log_mode = LOG_INODE_EXISTS;
6299
6300                 dir_item = (struct btrfs_dir_item *)item->data;
6301                 btrfs_disk_key_to_cpu(&key, &dir_item->location);
6302
6303                 if (key.type == BTRFS_ROOT_ITEM_KEY)
6304                         continue;
6305
6306                 di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
6307                 if (IS_ERR(di_inode)) {
6308                         ret = PTR_ERR(di_inode);
6309                         break;
6310                 }
6311
6312                 if (!need_log_inode(trans, BTRFS_I(di_inode))) {
6313                         btrfs_add_delayed_iput(di_inode);
6314                         continue;
6315                 }
6316
6317                 if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR)
6318                         log_mode = LOG_INODE_ALL;
6319
6320                 ctx->log_new_dentries = false;
6321                 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
6322
6323                 if (!ret && ctx->log_new_dentries)
6324                         ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
6325
6326                 btrfs_add_delayed_iput(di_inode);
6327
6328                 if (ret)
6329                         break;
6330         }
6331
6332         ctx->log_new_dentries = orig_log_new_dentries;
6333         ctx->logging_new_delayed_dentries = false;
6334
6335         return ret;
6336 }
6337
6338 /* log a single inode in the tree log.
6339  * At least one parent directory for this inode must exist in the tree
6340  * or be logged already.
6341  *
6342  * Any items from this inode changed by the current transaction are copied
6343  * to the log tree.  An extra reference is taken on any extents in this
6344  * file, allowing us to avoid a whole pile of corner cases around logging
6345  * blocks that have been removed from the tree.
6346  *
6347  * See LOG_INODE_ALL and related defines for a description of what inode_only
6348  * does.
6349  *
6350  * This handles both files and directories.
6351  */
6352 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6353                            struct btrfs_inode *inode,
6354                            int inode_only,
6355                            struct btrfs_log_ctx *ctx)
6356 {
6357         struct btrfs_path *path;
6358         struct btrfs_path *dst_path;
6359         struct btrfs_key min_key;
6360         struct btrfs_key max_key;
6361         struct btrfs_root *log = inode->root->log_root;
6362         int ret;
6363         bool fast_search = false;
6364         u64 ino = btrfs_ino(inode);
6365         struct extent_map_tree *em_tree = &inode->extent_tree;
6366         u64 logged_isize = 0;
6367         bool need_log_inode_item = true;
6368         bool xattrs_logged = false;
6369         bool inode_item_dropped = true;
6370         bool full_dir_logging = false;
6371         LIST_HEAD(delayed_ins_list);
6372         LIST_HEAD(delayed_del_list);
6373
6374         path = btrfs_alloc_path();
6375         if (!path)
6376                 return -ENOMEM;
6377         dst_path = btrfs_alloc_path();
6378         if (!dst_path) {
6379                 btrfs_free_path(path);
6380                 return -ENOMEM;
6381         }
6382
6383         min_key.objectid = ino;
6384         min_key.type = BTRFS_INODE_ITEM_KEY;
6385         min_key.offset = 0;
6386
6387         max_key.objectid = ino;
6388
6389
6390         /* today the code can only do partial logging of directories */
6391         if (S_ISDIR(inode->vfs_inode.i_mode) ||
6392             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6393                        &inode->runtime_flags) &&
6394              inode_only >= LOG_INODE_EXISTS))
6395                 max_key.type = BTRFS_XATTR_ITEM_KEY;
6396         else
6397                 max_key.type = (u8)-1;
6398         max_key.offset = (u64)-1;
6399
6400         if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6401                 full_dir_logging = true;
6402
6403         /*
6404          * If we are logging a directory while we are logging dentries of the
6405          * delayed items of some other inode, then we need to flush the delayed
6406          * items of this directory and not log the delayed items directly. This
6407          * is to prevent more than one level of recursion into btrfs_log_inode()
6408          * by having something like this:
6409          *
6410          *     $ mkdir -p a/b/c/d/e/f/g/h/...
6411          *     $ xfs_io -c "fsync" a
6412          *
6413          * Where all directories in the path did not exist before and are
6414          * created in the current transaction.
6415          * So in such a case we directly log the delayed items of the main
6416          * directory ("a") without flushing them first, while for each of its
6417          * subdirectories we flush their delayed items before logging them.
6418          * This prevents a potential unbounded recursion like this:
6419          *
6420          * btrfs_log_inode()
6421          *   log_new_delayed_dentries()
6422          *      btrfs_log_inode()
6423          *        log_new_delayed_dentries()
6424          *          btrfs_log_inode()
6425          *            log_new_delayed_dentries()
6426          *              (...)
6427          *
6428          * We have thresholds for the maximum number of delayed items to have in
6429          * memory, and once they are hit, the items are flushed asynchronously.
6430          * However the limit is quite high, so lets prevent deep levels of
6431          * recursion to happen by limiting the maximum depth to be 1.
6432          */
6433         if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6434                 ret = btrfs_commit_inode_delayed_items(trans, inode);
6435                 if (ret)
6436                         goto out;
6437         }
6438
6439         mutex_lock(&inode->log_mutex);
6440
6441         /*
6442          * For symlinks, we must always log their content, which is stored in an
6443          * inline extent, otherwise we could end up with an empty symlink after
6444          * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6445          * one attempts to create an empty symlink).
6446          * We don't need to worry about flushing delalloc, because when we create
6447          * the inline extent when the symlink is created (we never have delalloc
6448          * for symlinks).
6449          */
6450         if (S_ISLNK(inode->vfs_inode.i_mode))
6451                 inode_only = LOG_INODE_ALL;
6452
6453         /*
6454          * Before logging the inode item, cache the value returned by
6455          * inode_logged(), because after that we have the need to figure out if
6456          * the inode was previously logged in this transaction.
6457          */
6458         ret = inode_logged(trans, inode, path);
6459         if (ret < 0)
6460                 goto out_unlock;
6461         ctx->logged_before = (ret == 1);
6462         ret = 0;
6463
6464         /*
6465          * This is for cases where logging a directory could result in losing a
6466          * a file after replaying the log. For example, if we move a file from a
6467          * directory A to a directory B, then fsync directory A, we have no way
6468          * to known the file was moved from A to B, so logging just A would
6469          * result in losing the file after a log replay.
6470          */
6471         if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6472                 btrfs_set_log_full_commit(trans);
6473                 ret = BTRFS_LOG_FORCE_COMMIT;
6474                 goto out_unlock;
6475         }
6476
6477         /*
6478          * a brute force approach to making sure we get the most uptodate
6479          * copies of everything.
6480          */
6481         if (S_ISDIR(inode->vfs_inode.i_mode)) {
6482                 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6483                 if (ctx->logged_before)
6484                         ret = drop_inode_items(trans, log, path, inode,
6485                                                BTRFS_XATTR_ITEM_KEY);
6486         } else {
6487                 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6488                         /*
6489                          * Make sure the new inode item we write to the log has
6490                          * the same isize as the current one (if it exists).
6491                          * This is necessary to prevent data loss after log
6492                          * replay, and also to prevent doing a wrong expanding
6493                          * truncate - for e.g. create file, write 4K into offset
6494                          * 0, fsync, write 4K into offset 4096, add hard link,
6495                          * fsync some other file (to sync log), power fail - if
6496                          * we use the inode's current i_size, after log replay
6497                          * we get a 8Kb file, with the last 4Kb extent as a hole
6498                          * (zeroes), as if an expanding truncate happened,
6499                          * instead of getting a file of 4Kb only.
6500                          */
6501                         ret = logged_inode_size(log, inode, path, &logged_isize);
6502                         if (ret)
6503                                 goto out_unlock;
6504                 }
6505                 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6506                              &inode->runtime_flags)) {
6507                         if (inode_only == LOG_INODE_EXISTS) {
6508                                 max_key.type = BTRFS_XATTR_ITEM_KEY;
6509                                 if (ctx->logged_before)
6510                                         ret = drop_inode_items(trans, log, path,
6511                                                                inode, max_key.type);
6512                         } else {
6513                                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6514                                           &inode->runtime_flags);
6515                                 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6516                                           &inode->runtime_flags);
6517                                 if (ctx->logged_before)
6518                                         ret = truncate_inode_items(trans, log,
6519                                                                    inode, 0, 0);
6520                         }
6521                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6522                                               &inode->runtime_flags) ||
6523                            inode_only == LOG_INODE_EXISTS) {
6524                         if (inode_only == LOG_INODE_ALL)
6525                                 fast_search = true;
6526                         max_key.type = BTRFS_XATTR_ITEM_KEY;
6527                         if (ctx->logged_before)
6528                                 ret = drop_inode_items(trans, log, path, inode,
6529                                                        max_key.type);
6530                 } else {
6531                         if (inode_only == LOG_INODE_ALL)
6532                                 fast_search = true;
6533                         inode_item_dropped = false;
6534                         goto log_extents;
6535                 }
6536
6537         }
6538         if (ret)
6539                 goto out_unlock;
6540
6541         /*
6542          * If we are logging a directory in full mode, collect the delayed items
6543          * before iterating the subvolume tree, so that we don't miss any new
6544          * dir index items in case they get flushed while or right after we are
6545          * iterating the subvolume tree.
6546          */
6547         if (full_dir_logging && !ctx->logging_new_delayed_dentries)
6548                 btrfs_log_get_delayed_items(inode, &delayed_ins_list,
6549                                             &delayed_del_list);
6550
6551         ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
6552                                       path, dst_path, logged_isize,
6553                                       inode_only, ctx,
6554                                       &need_log_inode_item);
6555         if (ret)
6556                 goto out_unlock;
6557
6558         btrfs_release_path(path);
6559         btrfs_release_path(dst_path);
6560         ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
6561         if (ret)
6562                 goto out_unlock;
6563         xattrs_logged = true;
6564         if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
6565                 btrfs_release_path(path);
6566                 btrfs_release_path(dst_path);
6567                 ret = btrfs_log_holes(trans, inode, path);
6568                 if (ret)
6569                         goto out_unlock;
6570         }
6571 log_extents:
6572         btrfs_release_path(path);
6573         btrfs_release_path(dst_path);
6574         if (need_log_inode_item) {
6575                 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6576                 if (ret)
6577                         goto out_unlock;
6578                 /*
6579                  * If we are doing a fast fsync and the inode was logged before
6580                  * in this transaction, we don't need to log the xattrs because
6581                  * they were logged before. If xattrs were added, changed or
6582                  * deleted since the last time we logged the inode, then we have
6583                  * already logged them because the inode had the runtime flag
6584                  * BTRFS_INODE_COPY_EVERYTHING set.
6585                  */
6586                 if (!xattrs_logged && inode->logged_trans < trans->transid) {
6587                         ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
6588                         if (ret)
6589                                 goto out_unlock;
6590                         btrfs_release_path(path);
6591                 }
6592         }
6593         if (fast_search) {
6594                 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
6595                 if (ret)
6596                         goto out_unlock;
6597         } else if (inode_only == LOG_INODE_ALL) {
6598                 struct extent_map *em, *n;
6599
6600                 write_lock(&em_tree->lock);
6601                 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
6602                         list_del_init(&em->list);
6603                 write_unlock(&em_tree->lock);
6604         }
6605
6606         if (full_dir_logging) {
6607                 ret = log_directory_changes(trans, inode, path, dst_path, ctx);
6608                 if (ret)
6609                         goto out_unlock;
6610                 ret = log_delayed_insertion_items(trans, inode, path,
6611                                                   &delayed_ins_list, ctx);
6612                 if (ret)
6613                         goto out_unlock;
6614                 ret = log_delayed_deletion_items(trans, inode, path,
6615                                                  &delayed_del_list, ctx);
6616                 if (ret)
6617                         goto out_unlock;
6618         }
6619
6620         spin_lock(&inode->lock);
6621         inode->logged_trans = trans->transid;
6622         /*
6623          * Don't update last_log_commit if we logged that an inode exists.
6624          * We do this for three reasons:
6625          *
6626          * 1) We might have had buffered writes to this inode that were
6627          *    flushed and had their ordered extents completed in this
6628          *    transaction, but we did not previously log the inode with
6629          *    LOG_INODE_ALL. Later the inode was evicted and after that
6630          *    it was loaded again and this LOG_INODE_EXISTS log operation
6631          *    happened. We must make sure that if an explicit fsync against
6632          *    the inode is performed later, it logs the new extents, an
6633          *    updated inode item, etc, and syncs the log. The same logic
6634          *    applies to direct IO writes instead of buffered writes.
6635          *
6636          * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6637          *    is logged with an i_size of 0 or whatever value was logged
6638          *    before. If later the i_size of the inode is increased by a
6639          *    truncate operation, the log is synced through an fsync of
6640          *    some other inode and then finally an explicit fsync against
6641          *    this inode is made, we must make sure this fsync logs the
6642          *    inode with the new i_size, the hole between old i_size and
6643          *    the new i_size, and syncs the log.
6644          *
6645          * 3) If we are logging that an ancestor inode exists as part of
6646          *    logging a new name from a link or rename operation, don't update
6647          *    its last_log_commit - otherwise if an explicit fsync is made
6648          *    against an ancestor, the fsync considers the inode in the log
6649          *    and doesn't sync the log, resulting in the ancestor missing after
6650          *    a power failure unless the log was synced as part of an fsync
6651          *    against any other unrelated inode.
6652          */
6653         if (inode_only != LOG_INODE_EXISTS)
6654                 inode->last_log_commit = inode->last_sub_trans;
6655         spin_unlock(&inode->lock);
6656
6657         /*
6658          * Reset the last_reflink_trans so that the next fsync does not need to
6659          * go through the slower path when logging extents and their checksums.
6660          */
6661         if (inode_only == LOG_INODE_ALL)
6662                 inode->last_reflink_trans = 0;
6663
6664 out_unlock:
6665         mutex_unlock(&inode->log_mutex);
6666 out:
6667         btrfs_free_path(path);
6668         btrfs_free_path(dst_path);
6669
6670         if (ret)
6671                 free_conflicting_inodes(ctx);
6672         else
6673                 ret = log_conflicting_inodes(trans, inode->root, ctx);
6674
6675         if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
6676                 if (!ret)
6677                         ret = log_new_delayed_dentries(trans, inode,
6678                                                        &delayed_ins_list, ctx);
6679
6680                 btrfs_log_put_delayed_items(inode, &delayed_ins_list,
6681                                             &delayed_del_list);
6682         }
6683
6684         return ret;
6685 }
6686
6687 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6688                                  struct btrfs_inode *inode,
6689                                  struct btrfs_log_ctx *ctx)
6690 {
6691         struct btrfs_fs_info *fs_info = trans->fs_info;
6692         int ret;
6693         struct btrfs_path *path;
6694         struct btrfs_key key;
6695         struct btrfs_root *root = inode->root;
6696         const u64 ino = btrfs_ino(inode);
6697
6698         path = btrfs_alloc_path();
6699         if (!path)
6700                 return -ENOMEM;
6701         path->skip_locking = 1;
6702         path->search_commit_root = 1;
6703
6704         key.objectid = ino;
6705         key.type = BTRFS_INODE_REF_KEY;
6706         key.offset = 0;
6707         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6708         if (ret < 0)
6709                 goto out;
6710
6711         while (true) {
6712                 struct extent_buffer *leaf = path->nodes[0];
6713                 int slot = path->slots[0];
6714                 u32 cur_offset = 0;
6715                 u32 item_size;
6716                 unsigned long ptr;
6717
6718                 if (slot >= btrfs_header_nritems(leaf)) {
6719                         ret = btrfs_next_leaf(root, path);
6720                         if (ret < 0)
6721                                 goto out;
6722                         else if (ret > 0)
6723                                 break;
6724                         continue;
6725                 }
6726
6727                 btrfs_item_key_to_cpu(leaf, &key, slot);
6728                 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6729                 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6730                         break;
6731
6732                 item_size = btrfs_item_size(leaf, slot);
6733                 ptr = btrfs_item_ptr_offset(leaf, slot);
6734                 while (cur_offset < item_size) {
6735                         struct btrfs_key inode_key;
6736                         struct inode *dir_inode;
6737
6738                         inode_key.type = BTRFS_INODE_ITEM_KEY;
6739                         inode_key.offset = 0;
6740
6741                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
6742                                 struct btrfs_inode_extref *extref;
6743
6744                                 extref = (struct btrfs_inode_extref *)
6745                                         (ptr + cur_offset);
6746                                 inode_key.objectid = btrfs_inode_extref_parent(
6747                                         leaf, extref);
6748                                 cur_offset += sizeof(*extref);
6749                                 cur_offset += btrfs_inode_extref_name_len(leaf,
6750                                         extref);
6751                         } else {
6752                                 inode_key.objectid = key.offset;
6753                                 cur_offset = item_size;
6754                         }
6755
6756                         dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
6757                                                root);
6758                         /*
6759                          * If the parent inode was deleted, return an error to
6760                          * fallback to a transaction commit. This is to prevent
6761                          * getting an inode that was moved from one parent A to
6762                          * a parent B, got its former parent A deleted and then
6763                          * it got fsync'ed, from existing at both parents after
6764                          * a log replay (and the old parent still existing).
6765                          * Example:
6766                          *
6767                          * mkdir /mnt/A
6768                          * mkdir /mnt/B
6769                          * touch /mnt/B/bar
6770                          * sync
6771                          * mv /mnt/B/bar /mnt/A/bar
6772                          * mv -T /mnt/A /mnt/B
6773                          * fsync /mnt/B/bar
6774                          * <power fail>
6775                          *
6776                          * If we ignore the old parent B which got deleted,
6777                          * after a log replay we would have file bar linked
6778                          * at both parents and the old parent B would still
6779                          * exist.
6780                          */
6781                         if (IS_ERR(dir_inode)) {
6782                                 ret = PTR_ERR(dir_inode);
6783                                 goto out;
6784                         }
6785
6786                         if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
6787                                 btrfs_add_delayed_iput(dir_inode);
6788                                 continue;
6789                         }
6790
6791                         ctx->log_new_dentries = false;
6792                         ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
6793                                               LOG_INODE_ALL, ctx);
6794                         if (!ret && ctx->log_new_dentries)
6795                                 ret = log_new_dir_dentries(trans,
6796                                                    BTRFS_I(dir_inode), ctx);
6797                         btrfs_add_delayed_iput(dir_inode);
6798                         if (ret)
6799                                 goto out;
6800                 }
6801                 path->slots[0]++;
6802         }
6803         ret = 0;
6804 out:
6805         btrfs_free_path(path);
6806         return ret;
6807 }
6808
6809 static int log_new_ancestors(struct btrfs_trans_handle *trans,
6810                              struct btrfs_root *root,
6811                              struct btrfs_path *path,
6812                              struct btrfs_log_ctx *ctx)
6813 {
6814         struct btrfs_key found_key;
6815
6816         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6817
6818         while (true) {
6819                 struct btrfs_fs_info *fs_info = root->fs_info;
6820                 struct extent_buffer *leaf = path->nodes[0];
6821                 int slot = path->slots[0];
6822                 struct btrfs_key search_key;
6823                 struct inode *inode;
6824                 u64 ino;
6825                 int ret = 0;
6826
6827                 btrfs_release_path(path);
6828
6829                 ino = found_key.offset;
6830
6831                 search_key.objectid = found_key.offset;
6832                 search_key.type = BTRFS_INODE_ITEM_KEY;
6833                 search_key.offset = 0;
6834                 inode = btrfs_iget(fs_info->sb, ino, root);
6835                 if (IS_ERR(inode))
6836                         return PTR_ERR(inode);
6837
6838                 if (BTRFS_I(inode)->generation >= trans->transid &&
6839                     need_log_inode(trans, BTRFS_I(inode)))
6840                         ret = btrfs_log_inode(trans, BTRFS_I(inode),
6841                                               LOG_INODE_EXISTS, ctx);
6842                 btrfs_add_delayed_iput(inode);
6843                 if (ret)
6844                         return ret;
6845
6846                 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6847                         break;
6848
6849                 search_key.type = BTRFS_INODE_REF_KEY;
6850                 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6851                 if (ret < 0)
6852                         return ret;
6853
6854                 leaf = path->nodes[0];
6855                 slot = path->slots[0];
6856                 if (slot >= btrfs_header_nritems(leaf)) {
6857                         ret = btrfs_next_leaf(root, path);
6858                         if (ret < 0)
6859                                 return ret;
6860                         else if (ret > 0)
6861                                 return -ENOENT;
6862                         leaf = path->nodes[0];
6863                         slot = path->slots[0];
6864                 }
6865
6866                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6867                 if (found_key.objectid != search_key.objectid ||
6868                     found_key.type != BTRFS_INODE_REF_KEY)
6869                         return -ENOENT;
6870         }
6871         return 0;
6872 }
6873
6874 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6875                                   struct btrfs_inode *inode,
6876                                   struct dentry *parent,
6877                                   struct btrfs_log_ctx *ctx)
6878 {
6879         struct btrfs_root *root = inode->root;
6880         struct dentry *old_parent = NULL;
6881         struct super_block *sb = inode->vfs_inode.i_sb;
6882         int ret = 0;
6883
6884         while (true) {
6885                 if (!parent || d_really_is_negative(parent) ||
6886                     sb != parent->d_sb)
6887                         break;
6888
6889                 inode = BTRFS_I(d_inode(parent));
6890                 if (root != inode->root)
6891                         break;
6892
6893                 if (inode->generation >= trans->transid &&
6894                     need_log_inode(trans, inode)) {
6895                         ret = btrfs_log_inode(trans, inode,
6896                                               LOG_INODE_EXISTS, ctx);
6897                         if (ret)
6898                                 break;
6899                 }
6900                 if (IS_ROOT(parent))
6901                         break;
6902
6903                 parent = dget_parent(parent);
6904                 dput(old_parent);
6905                 old_parent = parent;
6906         }
6907         dput(old_parent);
6908
6909         return ret;
6910 }
6911
6912 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6913                                  struct btrfs_inode *inode,
6914                                  struct dentry *parent,
6915                                  struct btrfs_log_ctx *ctx)
6916 {
6917         struct btrfs_root *root = inode->root;
6918         const u64 ino = btrfs_ino(inode);
6919         struct btrfs_path *path;
6920         struct btrfs_key search_key;
6921         int ret;
6922
6923         /*
6924          * For a single hard link case, go through a fast path that does not
6925          * need to iterate the fs/subvolume tree.
6926          */
6927         if (inode->vfs_inode.i_nlink < 2)
6928                 return log_new_ancestors_fast(trans, inode, parent, ctx);
6929
6930         path = btrfs_alloc_path();
6931         if (!path)
6932                 return -ENOMEM;
6933
6934         search_key.objectid = ino;
6935         search_key.type = BTRFS_INODE_REF_KEY;
6936         search_key.offset = 0;
6937 again:
6938         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6939         if (ret < 0)
6940                 goto out;
6941         if (ret == 0)
6942                 path->slots[0]++;
6943
6944         while (true) {
6945                 struct extent_buffer *leaf = path->nodes[0];
6946                 int slot = path->slots[0];
6947                 struct btrfs_key found_key;
6948
6949                 if (slot >= btrfs_header_nritems(leaf)) {
6950                         ret = btrfs_next_leaf(root, path);
6951                         if (ret < 0)
6952                                 goto out;
6953                         else if (ret > 0)
6954                                 break;
6955                         continue;
6956                 }
6957
6958                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6959                 if (found_key.objectid != ino ||
6960                     found_key.type > BTRFS_INODE_EXTREF_KEY)
6961                         break;
6962
6963                 /*
6964                  * Don't deal with extended references because they are rare
6965                  * cases and too complex to deal with (we would need to keep
6966                  * track of which subitem we are processing for each item in
6967                  * this loop, etc). So just return some error to fallback to
6968                  * a transaction commit.
6969                  */
6970                 if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6971                         ret = -EMLINK;
6972                         goto out;
6973                 }
6974
6975                 /*
6976                  * Logging ancestors needs to do more searches on the fs/subvol
6977                  * tree, so it releases the path as needed to avoid deadlocks.
6978                  * Keep track of the last inode ref key and resume from that key
6979                  * after logging all new ancestors for the current hard link.
6980                  */
6981                 memcpy(&search_key, &found_key, sizeof(search_key));
6982
6983                 ret = log_new_ancestors(trans, root, path, ctx);
6984                 if (ret)
6985                         goto out;
6986                 btrfs_release_path(path);
6987                 goto again;
6988         }
6989         ret = 0;
6990 out:
6991         btrfs_free_path(path);
6992         return ret;
6993 }
6994
6995 /*
6996  * helper function around btrfs_log_inode to make sure newly created
6997  * parent directories also end up in the log.  A minimal inode and backref
6998  * only logging is done of any parent directories that are older than
6999  * the last committed transaction
7000  */
7001 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
7002                                   struct btrfs_inode *inode,
7003                                   struct dentry *parent,
7004                                   int inode_only,
7005                                   struct btrfs_log_ctx *ctx)
7006 {
7007         struct btrfs_root *root = inode->root;
7008         struct btrfs_fs_info *fs_info = root->fs_info;
7009         int ret = 0;
7010         bool log_dentries = false;
7011
7012         if (btrfs_test_opt(fs_info, NOTREELOG)) {
7013                 ret = BTRFS_LOG_FORCE_COMMIT;
7014                 goto end_no_trans;
7015         }
7016
7017         if (btrfs_root_refs(&root->root_item) == 0) {
7018                 ret = BTRFS_LOG_FORCE_COMMIT;
7019                 goto end_no_trans;
7020         }
7021
7022         /*
7023          * Skip already logged inodes or inodes corresponding to tmpfiles
7024          * (since logging them is pointless, a link count of 0 means they
7025          * will never be accessible).
7026          */
7027         if ((btrfs_inode_in_log(inode, trans->transid) &&
7028              list_empty(&ctx->ordered_extents)) ||
7029             inode->vfs_inode.i_nlink == 0) {
7030                 ret = BTRFS_NO_LOG_SYNC;
7031                 goto end_no_trans;
7032         }
7033
7034         ret = start_log_trans(trans, root, ctx);
7035         if (ret)
7036                 goto end_no_trans;
7037
7038         ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7039         if (ret)
7040                 goto end_trans;
7041
7042         /*
7043          * for regular files, if its inode is already on disk, we don't
7044          * have to worry about the parents at all.  This is because
7045          * we can use the last_unlink_trans field to record renames
7046          * and other fun in this file.
7047          */
7048         if (S_ISREG(inode->vfs_inode.i_mode) &&
7049             inode->generation < trans->transid &&
7050             inode->last_unlink_trans < trans->transid) {
7051                 ret = 0;
7052                 goto end_trans;
7053         }
7054
7055         if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
7056                 log_dentries = true;
7057
7058         /*
7059          * On unlink we must make sure all our current and old parent directory
7060          * inodes are fully logged. This is to prevent leaving dangling
7061          * directory index entries in directories that were our parents but are
7062          * not anymore. Not doing this results in old parent directory being
7063          * impossible to delete after log replay (rmdir will always fail with
7064          * error -ENOTEMPTY).
7065          *
7066          * Example 1:
7067          *
7068          * mkdir testdir
7069          * touch testdir/foo
7070          * ln testdir/foo testdir/bar
7071          * sync
7072          * unlink testdir/bar
7073          * xfs_io -c fsync testdir/foo
7074          * <power failure>
7075          * mount fs, triggers log replay
7076          *
7077          * If we don't log the parent directory (testdir), after log replay the
7078          * directory still has an entry pointing to the file inode using the bar
7079          * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7080          * the file inode has a link count of 1.
7081          *
7082          * Example 2:
7083          *
7084          * mkdir testdir
7085          * touch foo
7086          * ln foo testdir/foo2
7087          * ln foo testdir/foo3
7088          * sync
7089          * unlink testdir/foo3
7090          * xfs_io -c fsync foo
7091          * <power failure>
7092          * mount fs, triggers log replay
7093          *
7094          * Similar as the first example, after log replay the parent directory
7095          * testdir still has an entry pointing to the inode file with name foo3
7096          * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7097          * and has a link count of 2.
7098          */
7099         if (inode->last_unlink_trans >= trans->transid) {
7100                 ret = btrfs_log_all_parents(trans, inode, ctx);
7101                 if (ret)
7102                         goto end_trans;
7103         }
7104
7105         ret = log_all_new_ancestors(trans, inode, parent, ctx);
7106         if (ret)
7107                 goto end_trans;
7108
7109         if (log_dentries)
7110                 ret = log_new_dir_dentries(trans, inode, ctx);
7111         else
7112                 ret = 0;
7113 end_trans:
7114         if (ret < 0) {
7115                 btrfs_set_log_full_commit(trans);
7116                 ret = BTRFS_LOG_FORCE_COMMIT;
7117         }
7118
7119         if (ret)
7120                 btrfs_remove_log_ctx(root, ctx);
7121         btrfs_end_log_trans(root);
7122 end_no_trans:
7123         return ret;
7124 }
7125
7126 /*
7127  * it is not safe to log dentry if the chunk root has added new
7128  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
7129  * If this returns 1, you must commit the transaction to safely get your
7130  * data on disk.
7131  */
7132 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7133                           struct dentry *dentry,
7134                           struct btrfs_log_ctx *ctx)
7135 {
7136         struct dentry *parent = dget_parent(dentry);
7137         int ret;
7138
7139         ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7140                                      LOG_INODE_ALL, ctx);
7141         dput(parent);
7142
7143         return ret;
7144 }
7145
7146 /*
7147  * should be called during mount to recover and replay any log trees
7148  * from the FS
7149  */
7150 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7151 {
7152         int ret;
7153         struct btrfs_path *path;
7154         struct btrfs_trans_handle *trans;
7155         struct btrfs_key key;
7156         struct btrfs_key found_key;
7157         struct btrfs_root *log;
7158         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7159         struct walk_control wc = {
7160                 .process_func = process_one_buffer,
7161                 .stage = LOG_WALK_PIN_ONLY,
7162         };
7163
7164         path = btrfs_alloc_path();
7165         if (!path)
7166                 return -ENOMEM;
7167
7168         set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7169
7170         trans = btrfs_start_transaction(fs_info->tree_root, 0);
7171         if (IS_ERR(trans)) {
7172                 ret = PTR_ERR(trans);
7173                 goto error;
7174         }
7175
7176         wc.trans = trans;
7177         wc.pin = 1;
7178
7179         ret = walk_log_tree(trans, log_root_tree, &wc);
7180         if (ret) {
7181                 btrfs_abort_transaction(trans, ret);
7182                 goto error;
7183         }
7184
7185 again:
7186         key.objectid = BTRFS_TREE_LOG_OBJECTID;
7187         key.offset = (u64)-1;
7188         key.type = BTRFS_ROOT_ITEM_KEY;
7189
7190         while (1) {
7191                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7192
7193                 if (ret < 0) {
7194                         btrfs_abort_transaction(trans, ret);
7195                         goto error;
7196                 }
7197                 if (ret > 0) {
7198                         if (path->slots[0] == 0)
7199                                 break;
7200                         path->slots[0]--;
7201                 }
7202                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7203                                       path->slots[0]);
7204                 btrfs_release_path(path);
7205                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7206                         break;
7207
7208                 log = btrfs_read_tree_root(log_root_tree, &found_key);
7209                 if (IS_ERR(log)) {
7210                         ret = PTR_ERR(log);
7211                         btrfs_abort_transaction(trans, ret);
7212                         goto error;
7213                 }
7214
7215                 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
7216                                                    true);
7217                 if (IS_ERR(wc.replay_dest)) {
7218                         ret = PTR_ERR(wc.replay_dest);
7219
7220                         /*
7221                          * We didn't find the subvol, likely because it was
7222                          * deleted.  This is ok, simply skip this log and go to
7223                          * the next one.
7224                          *
7225                          * We need to exclude the root because we can't have
7226                          * other log replays overwriting this log as we'll read
7227                          * it back in a few more times.  This will keep our
7228                          * block from being modified, and we'll just bail for
7229                          * each subsequent pass.
7230                          */
7231                         if (ret == -ENOENT)
7232                                 ret = btrfs_pin_extent_for_log_replay(trans,
7233                                                         log->node->start,
7234                                                         log->node->len);
7235                         btrfs_put_root(log);
7236
7237                         if (!ret)
7238                                 goto next;
7239                         btrfs_abort_transaction(trans, ret);
7240                         goto error;
7241                 }
7242
7243                 wc.replay_dest->log_root = log;
7244                 ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
7245                 if (ret)
7246                         /* The loop needs to continue due to the root refs */
7247                         btrfs_abort_transaction(trans, ret);
7248                 else
7249                         ret = walk_log_tree(trans, log, &wc);
7250
7251                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7252                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
7253                                                       path);
7254                         if (ret)
7255                                 btrfs_abort_transaction(trans, ret);
7256                 }
7257
7258                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7259                         struct btrfs_root *root = wc.replay_dest;
7260
7261                         btrfs_release_path(path);
7262
7263                         /*
7264                          * We have just replayed everything, and the highest
7265                          * objectid of fs roots probably has changed in case
7266                          * some inode_item's got replayed.
7267                          *
7268                          * root->objectid_mutex is not acquired as log replay
7269                          * could only happen during mount.
7270                          */
7271                         ret = btrfs_init_root_free_objectid(root);
7272                         if (ret)
7273                                 btrfs_abort_transaction(trans, ret);
7274                 }
7275
7276                 wc.replay_dest->log_root = NULL;
7277                 btrfs_put_root(wc.replay_dest);
7278                 btrfs_put_root(log);
7279
7280                 if (ret)
7281                         goto error;
7282 next:
7283                 if (found_key.offset == 0)
7284                         break;
7285                 key.offset = found_key.offset - 1;
7286         }
7287         btrfs_release_path(path);
7288
7289         /* step one is to pin it all, step two is to replay just inodes */
7290         if (wc.pin) {
7291                 wc.pin = 0;
7292                 wc.process_func = replay_one_buffer;
7293                 wc.stage = LOG_WALK_REPLAY_INODES;
7294                 goto again;
7295         }
7296         /* step three is to replay everything */
7297         if (wc.stage < LOG_WALK_REPLAY_ALL) {
7298                 wc.stage++;
7299                 goto again;
7300         }
7301
7302         btrfs_free_path(path);
7303
7304         /* step 4: commit the transaction, which also unpins the blocks */
7305         ret = btrfs_commit_transaction(trans);
7306         if (ret)
7307                 return ret;
7308
7309         log_root_tree->log_root = NULL;
7310         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7311         btrfs_put_root(log_root_tree);
7312
7313         return 0;
7314 error:
7315         if (wc.trans)
7316                 btrfs_end_transaction(wc.trans);
7317         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7318         btrfs_free_path(path);
7319         return ret;
7320 }
7321
7322 /*
7323  * there are some corner cases where we want to force a full
7324  * commit instead of allowing a directory to be logged.
7325  *
7326  * They revolve around files there were unlinked from the directory, and
7327  * this function updates the parent directory so that a full commit is
7328  * properly done if it is fsync'd later after the unlinks are done.
7329  *
7330  * Must be called before the unlink operations (updates to the subvolume tree,
7331  * inodes, etc) are done.
7332  */
7333 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7334                              struct btrfs_inode *dir, struct btrfs_inode *inode,
7335                              int for_rename)
7336 {
7337         /*
7338          * when we're logging a file, if it hasn't been renamed
7339          * or unlinked, and its inode is fully committed on disk,
7340          * we don't have to worry about walking up the directory chain
7341          * to log its parents.
7342          *
7343          * So, we use the last_unlink_trans field to put this transid
7344          * into the file.  When the file is logged we check it and
7345          * don't log the parents if the file is fully on disk.
7346          */
7347         mutex_lock(&inode->log_mutex);
7348         inode->last_unlink_trans = trans->transid;
7349         mutex_unlock(&inode->log_mutex);
7350
7351         /*
7352          * if this directory was already logged any new
7353          * names for this file/dir will get recorded
7354          */
7355         if (dir->logged_trans == trans->transid)
7356                 return;
7357
7358         /*
7359          * if the inode we're about to unlink was logged,
7360          * the log will be properly updated for any new names
7361          */
7362         if (inode->logged_trans == trans->transid)
7363                 return;
7364
7365         /*
7366          * when renaming files across directories, if the directory
7367          * there we're unlinking from gets fsync'd later on, there's
7368          * no way to find the destination directory later and fsync it
7369          * properly.  So, we have to be conservative and force commits
7370          * so the new name gets discovered.
7371          */
7372         if (for_rename)
7373                 goto record;
7374
7375         /* we can safely do the unlink without any special recording */
7376         return;
7377
7378 record:
7379         mutex_lock(&dir->log_mutex);
7380         dir->last_unlink_trans = trans->transid;
7381         mutex_unlock(&dir->log_mutex);
7382 }
7383
7384 /*
7385  * Make sure that if someone attempts to fsync the parent directory of a deleted
7386  * snapshot, it ends up triggering a transaction commit. This is to guarantee
7387  * that after replaying the log tree of the parent directory's root we will not
7388  * see the snapshot anymore and at log replay time we will not see any log tree
7389  * corresponding to the deleted snapshot's root, which could lead to replaying
7390  * it after replaying the log tree of the parent directory (which would replay
7391  * the snapshot delete operation).
7392  *
7393  * Must be called before the actual snapshot destroy operation (updates to the
7394  * parent root and tree of tree roots trees, etc) are done.
7395  */
7396 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7397                                    struct btrfs_inode *dir)
7398 {
7399         mutex_lock(&dir->log_mutex);
7400         dir->last_unlink_trans = trans->transid;
7401         mutex_unlock(&dir->log_mutex);
7402 }
7403
7404 /**
7405  * Update the log after adding a new name for an inode.
7406  *
7407  * @trans:              Transaction handle.
7408  * @old_dentry:         The dentry associated with the old name and the old
7409  *                      parent directory.
7410  * @old_dir:            The inode of the previous parent directory for the case
7411  *                      of a rename. For a link operation, it must be NULL.
7412  * @old_dir_index:      The index number associated with the old name, meaningful
7413  *                      only for rename operations (when @old_dir is not NULL).
7414  *                      Ignored for link operations.
7415  * @parent:             The dentry associated with the directory under which the
7416  *                      new name is located.
7417  *
7418  * Call this after adding a new name for an inode, as a result of a link or
7419  * rename operation, and it will properly update the log to reflect the new name.
7420  */
7421 void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7422                         struct dentry *old_dentry, struct btrfs_inode *old_dir,
7423                         u64 old_dir_index, struct dentry *parent)
7424 {
7425         struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7426         struct btrfs_root *root = inode->root;
7427         struct btrfs_log_ctx ctx;
7428         bool log_pinned = false;
7429         int ret;
7430
7431         /*
7432          * this will force the logging code to walk the dentry chain
7433          * up for the file
7434          */
7435         if (!S_ISDIR(inode->vfs_inode.i_mode))
7436                 inode->last_unlink_trans = trans->transid;
7437
7438         /*
7439          * if this inode hasn't been logged and directory we're renaming it
7440          * from hasn't been logged, we don't need to log it
7441          */
7442         ret = inode_logged(trans, inode, NULL);
7443         if (ret < 0) {
7444                 goto out;
7445         } else if (ret == 0) {
7446                 if (!old_dir)
7447                         return;
7448                 /*
7449                  * If the inode was not logged and we are doing a rename (old_dir is not
7450                  * NULL), check if old_dir was logged - if it was not we can return and
7451                  * do nothing.
7452                  */
7453                 ret = inode_logged(trans, old_dir, NULL);
7454                 if (ret < 0)
7455                         goto out;
7456                 else if (ret == 0)
7457                         return;
7458         }
7459         ret = 0;
7460
7461         /*
7462          * If we are doing a rename (old_dir is not NULL) from a directory that
7463          * was previously logged, make sure that on log replay we get the old
7464          * dir entry deleted. This is needed because we will also log the new
7465          * name of the renamed inode, so we need to make sure that after log
7466          * replay we don't end up with both the new and old dir entries existing.
7467          */
7468         if (old_dir && old_dir->logged_trans == trans->transid) {
7469                 struct btrfs_root *log = old_dir->root->log_root;
7470                 struct btrfs_path *path;
7471
7472                 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
7473
7474                 /*
7475                  * We have two inodes to update in the log, the old directory and
7476                  * the inode that got renamed, so we must pin the log to prevent
7477                  * anyone from syncing the log until we have updated both inodes
7478                  * in the log.
7479                  */
7480                 ret = join_running_log_trans(root);
7481                 /*
7482                  * At least one of the inodes was logged before, so this should
7483                  * not fail, but if it does, it's not serious, just bail out and
7484                  * mark the log for a full commit.
7485                  */
7486                 if (WARN_ON_ONCE(ret < 0))
7487                         goto out;
7488                 log_pinned = true;
7489
7490                 path = btrfs_alloc_path();
7491                 if (!path) {
7492                         ret = -ENOMEM;
7493                         goto out;
7494                 }
7495
7496                 /*
7497                  * Other concurrent task might be logging the old directory,
7498                  * as it can be triggered when logging other inode that had or
7499                  * still has a dentry in the old directory. We lock the old
7500                  * directory's log_mutex to ensure the deletion of the old
7501                  * name is persisted, because during directory logging we
7502                  * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
7503                  * the old name's dir index item is in the delayed items, so
7504                  * it could be missed by an in progress directory logging.
7505                  */
7506                 mutex_lock(&old_dir->log_mutex);
7507                 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
7508                                         old_dentry->d_name.name,
7509                                         old_dentry->d_name.len, old_dir_index);
7510                 if (ret > 0) {
7511                         /*
7512                          * The dentry does not exist in the log, so record its
7513                          * deletion.
7514                          */
7515                         btrfs_release_path(path);
7516                         ret = insert_dir_log_key(trans, log, path,
7517                                                  btrfs_ino(old_dir),
7518                                                  old_dir_index, old_dir_index);
7519                 }
7520                 mutex_unlock(&old_dir->log_mutex);
7521
7522                 btrfs_free_path(path);
7523                 if (ret < 0)
7524                         goto out;
7525         }
7526
7527         btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
7528         ctx.logging_new_name = true;
7529         /*
7530          * We don't care about the return value. If we fail to log the new name
7531          * then we know the next attempt to sync the log will fallback to a full
7532          * transaction commit (due to a call to btrfs_set_log_full_commit()), so
7533          * we don't need to worry about getting a log committed that has an
7534          * inconsistent state after a rename operation.
7535          */
7536         btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
7537         ASSERT(list_empty(&ctx.conflict_inodes));
7538 out:
7539         /*
7540          * If an error happened mark the log for a full commit because it's not
7541          * consistent and up to date or we couldn't find out if one of the
7542          * inodes was logged before in this transaction. Do it before unpinning
7543          * the log, to avoid any races with someone else trying to commit it.
7544          */
7545         if (ret < 0)
7546                 btrfs_set_log_full_commit(trans);
7547         if (log_pinned)
7548                 btrfs_end_log_trans(root);
7549 }
7550