fs/xfs/xfs_log_recover.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_shared.h"
  21 #include "xfs_format.h"
  22 #include "xfs_log_format.h"
  23 #include "xfs_trans_resv.h"
  24 #include "xfs_bit.h"
  25 #include "xfs_sb.h"
  26 #include "xfs_mount.h"
  27 #include "xfs_da_format.h"
  28 #include "xfs_da_btree.h"
  29 #include "xfs_inode.h"
  30 #include "xfs_trans.h"
  31 #include "xfs_log.h"
  32 #include "xfs_log_priv.h"
  33 #include "xfs_log_recover.h"
  34 #include "xfs_inode_item.h"
  35 #include "xfs_extfree_item.h"
  36 #include "xfs_trans_priv.h"
  37 #include "xfs_alloc.h"
  38 #include "xfs_ialloc.h"
  39 #include "xfs_quota.h"
  40 #include "xfs_cksum.h"
  41 #include "xfs_trace.h"
  42 #include "xfs_icache.h"
  43 #include "xfs_bmap_btree.h"
  44 #include "xfs_error.h"
  45 #include "xfs_dir2.h"
  46 #include "xfs_rmap_item.h"
  47 #include "xfs_buf_item.h"
  48 #include "xfs_refcount_item.h"
  49 #include "xfs_bmap_item.h"
  50
  51 #define BLK_AVG(blk1, blk2)     ((blk1+blk2) >> 1)
  52
  53 STATIC int
  54 xlog_find_zeroed(
  55         struct xlog     *,
  56         xfs_daddr_t     *);
  57 STATIC int
  58 xlog_clear_stale_blocks(
  59         struct xlog     *,
  60         xfs_lsn_t);
  61 #if defined(DEBUG)
  62 STATIC void
  63 xlog_recover_check_summary(
  64         struct xlog *);
  65 #else
  66 #define xlog_recover_check_summary(log)
  67 #endif
  68 STATIC int
  69 xlog_do_recovery_pass(
  70         struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
  71
/*
 * This structure is used during recovery to record the buf log items which
 * have been canceled and should not be replayed.
 *
 * NOTE(review): the per-field notes below are inferred from the field names
 * only; the code that fills in and consumes these fields is not part of
 * this section of the file - confirm against it.
 */
struct xfs_buf_cancel {
        xfs_daddr_t             bc_blkno;       /* presumably the block address of the canceled buffer */
        uint                    bc_len;         /* presumably the length of the canceled buffer */
        int                     bc_refcount;    /* presumably how many cancellations are outstanding */
        struct list_head        bc_list;        /* list linkage for this record */
};
  82
  83 /*
  84  * Sector aligned buffer routines for buffer create/read/write/access
  85  */
  86
  87 /*
  88  * Verify the given count of basic blocks is valid number of blocks
  89  * to specify for an operation involving the given XFS log buffer.
  90  * Returns nonzero if the count is valid, 0 otherwise.
  91  */
  92
  93 static inline int
  94 xlog_buf_bbcount_valid(
  95         struct xlog     *log,
  96         int             bbcount)
  97 {
  98         return bbcount > 0 && bbcount <= log->l_logBBsize;
  99 }
 100
 101 /*
 102  * Allocate a buffer to hold log data.  The buffer needs to be able
 103  * to map to a range of nbblks basic blocks at any valid (basic
 104  * block) offset within the log.
 105  */
 106 STATIC xfs_buf_t *
 107 xlog_get_bp(
 108         struct xlog     *log,
 109         int             nbblks)
 110 {
 111         struct xfs_buf  *bp;
 112
 113         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 114                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 115                         nbblks);
 116                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 117                 return NULL;
 118         }
 119
 120         /*
 121          * We do log I/O in units of log sectors (a power-of-2
 122          * multiple of the basic block size), so we round up the
 123          * requested size to accommodate the basic blocks required
 124          * for complete log sectors.
 125          *
 126          * In addition, the buffer may be used for a non-sector-
 127          * aligned block offset, in which case an I/O of the
 128          * requested size could extend beyond the end of the
 129          * buffer.  If the requested size is only 1 basic block it
 130          * will never straddle a sector boundary, so this won't be
 131          * an issue.  Nor will this be a problem if the log I/O is
 132          * done in basic blocks (sector size 1).  But otherwise we
 133          * extend the buffer by one extra log sector to ensure
 134          * there's space to accommodate this possibility.
 135          */
 136         if (nbblks > 1 && log->l_sectBBsize > 1)
 137                 nbblks += log->l_sectBBsize;
 138         nbblks = round_up(nbblks, log->l_sectBBsize);
 139
 140         bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
 141         if (bp)
 142                 xfs_buf_unlock(bp);
 143         return bp;
 144 }
 145
 146 STATIC void
 147 xlog_put_bp(
 148         xfs_buf_t       *bp)
 149 {
 150         xfs_buf_free(bp);
 151 }
 152
 153 /*
 154  * Return the address of the start of the given block number's data
 155  * in a log buffer.  The buffer covers a log sector-aligned region.
 156  */
 157 STATIC char *
 158 xlog_align(
 159         struct xlog     *log,
 160         xfs_daddr_t     blk_no,
 161         int             nbblks,
 162         struct xfs_buf  *bp)
 163 {
 164         xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 165
 166         ASSERT(offset + nbblks <= bp->b_length);
 167         return bp->b_addr + BBTOB(offset);
 168 }
 169
 170
 171 /*
 172  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 173  */
 174 STATIC int
 175 xlog_bread_noalign(
 176         struct xlog     *log,
 177         xfs_daddr_t     blk_no,
 178         int             nbblks,
 179         struct xfs_buf  *bp)
 180 {
 181         int             error;
 182
 183         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 184                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 185                         nbblks);
 186                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 187                 return -EFSCORRUPTED;
 188         }
 189
 190         blk_no = round_down(blk_no, log->l_sectBBsize);
 191         nbblks = round_up(nbblks, log->l_sectBBsize);
 192
 193         ASSERT(nbblks > 0);
 194         ASSERT(nbblks <= bp->b_length);
 195
 196         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 197         bp->b_flags |= XBF_READ;
 198         bp->b_io_length = nbblks;
 199         bp->b_error = 0;
 200
 201         error = xfs_buf_submit_wait(bp);
 202         if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
 203                 xfs_buf_ioerror_alert(bp, __func__);
 204         return error;
 205 }
 206
 207 STATIC int
 208 xlog_bread(
 209         struct xlog     *log,
 210         xfs_daddr_t     blk_no,
 211         int             nbblks,
 212         struct xfs_buf  *bp,
 213         char            **offset)
 214 {
 215         int             error;
 216
 217         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 218         if (error)
 219                 return error;
 220
 221         *offset = xlog_align(log, blk_no, nbblks, bp);
 222         return 0;
 223 }
 224
 225 /*
 226  * Read at an offset into the buffer. Returns with the buffer in it's original
 227  * state regardless of the result of the read.
 228  */
 229 STATIC int
 230 xlog_bread_offset(
 231         struct xlog     *log,
 232         xfs_daddr_t     blk_no,         /* block to read from */
 233         int             nbblks,         /* blocks to read */
 234         struct xfs_buf  *bp,
 235         char            *offset)
 236 {
 237         char            *orig_offset = bp->b_addr;
 238         int             orig_len = BBTOB(bp->b_length);
 239         int             error, error2;
 240
 241         error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
 242         if (error)
 243                 return error;
 244
 245         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 246
 247         /* must reset buffer pointer even on error */
 248         error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
 249         if (error)
 250                 return error;
 251         return error2;
 252 }
 253
 254 /*
 255  * Write out the buffer at the given block for the given number of blocks.
 256  * The buffer is kept locked across the write and is returned locked.
 257  * This can only be used for synchronous log writes.
 258  */
 259 STATIC int
 260 xlog_bwrite(
 261         struct xlog     *log,
 262         xfs_daddr_t     blk_no,
 263         int             nbblks,
 264         struct xfs_buf  *bp)
 265 {
 266         int             error;
 267
 268         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 269                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 270                         nbblks);
 271                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 272                 return -EFSCORRUPTED;
 273         }
 274
 275         blk_no = round_down(blk_no, log->l_sectBBsize);
 276         nbblks = round_up(nbblks, log->l_sectBBsize);
 277
 278         ASSERT(nbblks > 0);
 279         ASSERT(nbblks <= bp->b_length);
 280
 281         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 282         xfs_buf_hold(bp);
 283         xfs_buf_lock(bp);
 284         bp->b_io_length = nbblks;
 285         bp->b_error = 0;
 286
 287         error = xfs_bwrite(bp);
 288         if (error)
 289                 xfs_buf_ioerror_alert(bp, __func__);
 290         xfs_buf_relse(bp);
 291         return error;
 292 }
 293
 294 #ifdef DEBUG
 295 /*
 296  * dump debug superblock and log record information
 297  */
 298 STATIC void
 299 xlog_header_check_dump(
 300         xfs_mount_t             *mp,
 301         xlog_rec_header_t       *head)
 302 {
 303         xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
 304                 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
 305         xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
 306                 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 307 }
 308 #else
 309 #define xlog_header_check_dump(mp, head)
 310 #endif
 311
 312 /*
 313  * check log record header for recovery
 314  */
 315 STATIC int
 316 xlog_header_check_recover(
 317         xfs_mount_t             *mp,
 318         xlog_rec_header_t       *head)
 319 {
 320         ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 321
 322         /*
 323          * IRIX doesn't write the h_fmt field and leaves it zeroed
 324          * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
 325          * a dirty log created in IRIX.
 326          */
 327         if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
 328                 xfs_warn(mp,
 329         "dirty log written in incompatible format - can't recover");
 330                 xlog_header_check_dump(mp, head);
 331                 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
 332                                  XFS_ERRLEVEL_HIGH, mp);
 333                 return -EFSCORRUPTED;
 334         } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 335                 xfs_warn(mp,
 336         "dirty log entry has mismatched uuid - can't recover");
 337                 xlog_header_check_dump(mp, head);
 338                 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
 339                                  XFS_ERRLEVEL_HIGH, mp);
 340                 return -EFSCORRUPTED;
 341         }
 342         return 0;
 343 }
 344
 345 /*
 346  * read the head block of the log and check the header
 347  */
 348 STATIC int
 349 xlog_header_check_mount(
 350         xfs_mount_t             *mp,
 351         xlog_rec_header_t       *head)
 352 {
 353         ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 354
 355         if (uuid_is_null(&head->h_fs_uuid)) {
 356                 /*
 357                  * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
 358                  * h_fs_uuid is null, we assume this log was last mounted
 359                  * by IRIX and continue.
 360                  */
 361                 xfs_warn(mp, "null uuid in log - IRIX style log");
 362         } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 363                 xfs_warn(mp, "log has mismatched uuid - can't recover");
 364                 xlog_header_check_dump(mp, head);
 365                 XFS_ERROR_REPORT("xlog_header_check_mount",
 366                                  XFS_ERRLEVEL_HIGH, mp);
 367                 return -EFSCORRUPTED;
 368         }
 369         return 0;
 370 }
 371
 372 STATIC void
 373 xlog_recover_iodone(
 374         struct xfs_buf  *bp)
 375 {
 376         if (bp->b_error) {
 377                 /*
 378                  * We're not going to bother about retrying
 379                  * this during recovery. One strike!
 380                  */
 381                 if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
 382                         xfs_buf_ioerror_alert(bp, __func__);
 383                         xfs_force_shutdown(bp->b_target->bt_mount,
 384                                                 SHUTDOWN_META_IO_ERROR);
 385                 }
 386         }
 387
 388         /*
 389          * On v5 supers, a bli could be attached to update the metadata LSN.
 390          * Clean it up.
 391          */
 392         if (bp->b_fspriv)
 393                 xfs_buf_item_relse(bp);
 394         ASSERT(bp->b_fspriv == NULL);
 395
 396         bp->b_iodone = NULL;
 397         xfs_buf_ioend(bp);
 398 }
 399
 400 /*
 401  * This routine finds (to an approximation) the first block in the physical
 402  * log which contains the given cycle.  It uses a binary search algorithm.
 403  * Note that the algorithm can not be perfect because the disk will not
 404  * necessarily be perfect.
 405  */
 406 STATIC int
 407 xlog_find_cycle_start(
 408         struct xlog     *log,
 409         struct xfs_buf  *bp,
 410         xfs_daddr_t     first_blk,
 411         xfs_daddr_t     *last_blk,
 412         uint            cycle)
 413 {
 414         char            *offset;
 415         xfs_daddr_t     mid_blk;
 416         xfs_daddr_t     end_blk;
 417         uint            mid_cycle;
 418         int             error;
 419
 420         end_blk = *last_blk;
 421         mid_blk = BLK_AVG(first_blk, end_blk);
 422         while (mid_blk != first_blk && mid_blk != end_blk) {
 423                 error = xlog_bread(log, mid_blk, 1, bp, &offset);
 424                 if (error)
 425                         return error;
 426                 mid_cycle = xlog_get_cycle(offset);
 427                 if (mid_cycle == cycle)
 428                         end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 429                 else
 430                         first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 431                 mid_blk = BLK_AVG(first_blk, end_blk);
 432         }
 433         ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 434                (mid_blk == end_blk && mid_blk-1 == first_blk));
 435
 436         *last_blk = end_blk;
 437
 438         return 0;
 439 }
 440
 441 /*
 442  * Check that a range of blocks does not contain stop_on_cycle_no.
 443  * Fill in *new_blk with the block offset where such a block is
 444  * found, or with -1 (an invalid block number) if there is no such
 445  * block in the range.  The scan needs to occur from front to back
 446  * and the pointer into the region must be updated since a later
 447  * routine will need to perform another test.
 448  */
 449 STATIC int
 450 xlog_find_verify_cycle(
 451         struct xlog     *log,
 452         xfs_daddr_t     start_blk,
 453         int             nbblks,
 454         uint            stop_on_cycle_no,
 455         xfs_daddr_t     *new_blk)
 456 {
 457         xfs_daddr_t     i, j;
 458         uint            cycle;
 459         xfs_buf_t       *bp;
 460         xfs_daddr_t     bufblks;
 461         char            *buf = NULL;
 462         int             error = 0;
 463
 464         /*
 465          * Greedily allocate a buffer big enough to handle the full
 466          * range of basic blocks we'll be examining.  If that fails,
 467          * try a smaller size.  We need to be able to read at least
 468          * a log sector, or we're out of luck.
 469          */
 470         bufblks = 1 << ffs(nbblks);
 471         while (bufblks > log->l_logBBsize)
 472                 bufblks >>= 1;
 473         while (!(bp = xlog_get_bp(log, bufblks))) {
 474                 bufblks >>= 1;
 475                 if (bufblks < log->l_sectBBsize)
 476                         return -ENOMEM;
 477         }
 478
 479         for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 480                 int     bcount;
 481
 482                 bcount = min(bufblks, (start_blk + nbblks - i));
 483
 484                 error = xlog_bread(log, i, bcount, bp, &buf);
 485                 if (error)
 486                         goto out;
 487
 488                 for (j = 0; j < bcount; j++) {
 489                         cycle = xlog_get_cycle(buf);
 490                         if (cycle == stop_on_cycle_no) {
 491                                 *new_blk = i+j;
 492                                 goto out;
 493                         }
 494
 495                         buf += BBSIZE;
 496                 }
 497         }
 498
 499         *new_blk = -1;
 500
 501 out:
 502         xlog_put_bp(bp);
 503         return error;
 504 }
 505
 506 /*
 507  * Potentially backup over partial log record write.
 508  *
 509  * In the typical case, last_blk is the number of the block directly after
 510  * a good log record.  Therefore, we subtract one to get the block number
 511  * of the last block in the given buffer.  extra_bblks contains the number
 512  * of blocks we would have read on a previous read.  This happens when the
 513  * last log record is split over the end of the physical log.
 514  *
 515  * extra_bblks is the number of blocks potentially verified on a previous
 516  * call to this routine.
 517  */
 518 STATIC int
 519 xlog_find_verify_log_record(
 520         struct xlog             *log,
 521         xfs_daddr_t             start_blk,
 522         xfs_daddr_t             *last_blk,
 523         int                     extra_bblks)
 524 {
 525         xfs_daddr_t             i;
 526         xfs_buf_t               *bp;
 527         char                    *offset = NULL;
 528         xlog_rec_header_t       *head = NULL;
 529         int                     error = 0;
 530         int                     smallmem = 0;
 531         int                     num_blks = *last_blk - start_blk;
 532         int                     xhdrs;
 533
 534         ASSERT(start_blk != 0 || *last_blk != start_blk);
 535
 536         if (!(bp = xlog_get_bp(log, num_blks))) {
 537                 if (!(bp = xlog_get_bp(log, 1)))
 538                         return -ENOMEM;
 539                 smallmem = 1;
 540         } else {
 541                 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
 542                 if (error)
 543                         goto out;
 544                 offset += ((num_blks - 1) << BBSHIFT);
 545         }
 546
 547         for (i = (*last_blk) - 1; i >= 0; i--) {
 548                 if (i < start_blk) {
 549                         /* valid log record not found */
 550                         xfs_warn(log->l_mp,
 551                 "Log inconsistent (didn't find previous header)");
 552                         ASSERT(0);
 553                         error = -EIO;
 554                         goto out;
 555                 }
 556
 557                 if (smallmem) {
 558                         error = xlog_bread(log, i, 1, bp, &offset);
 559                         if (error)
 560                                 goto out;
 561                 }
 562
 563                 head = (xlog_rec_header_t *)offset;
 564
 565                 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 566                         break;
 567
 568                 if (!smallmem)
 569                         offset -= BBSIZE;
 570         }
 571
 572         /*
 573          * We hit the beginning of the physical log & still no header.  Return
 574          * to caller.  If caller can handle a return of -1, then this routine
 575          * will be called again for the end of the physical log.
 576          */
 577         if (i == -1) {
 578                 error = 1;
 579                 goto out;
 580         }
 581
 582         /*
 583          * We have the final block of the good log (the first block
 584          * of the log record _before_ the head. So we check the uuid.
 585          */
 586         if ((error = xlog_header_check_mount(log->l_mp, head)))
 587                 goto out;
 588
 589         /*
 590          * We may have found a log record header before we expected one.
 591          * last_blk will be the 1st block # with a given cycle #.  We may end
 592          * up reading an entire log record.  In this case, we don't want to
 593          * reset last_blk.  Only when last_blk points in the middle of a log
 594          * record do we update last_blk.
 595          */
 596         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 597                 uint    h_size = be32_to_cpu(head->h_size);
 598
 599                 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 600                 if (h_size % XLOG_HEADER_CYCLE_SIZE)
 601                         xhdrs++;
 602         } else {
 603                 xhdrs = 1;
 604         }
 605
 606         if (*last_blk - i + extra_bblks !=
 607             BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 608                 *last_blk = i;
 609
 610 out:
 611         xlog_put_bp(bp);
 612         return error;
 613 }
 614
 615 /*
 616  * Head is defined to be the point of the log where the next log write
 617  * could go.  This means that incomplete LR writes at the end are
 618  * eliminated when calculating the head.  We aren't guaranteed that previous
 619  * LR have complete transactions.  We only know that a cycle number of
 620  * current cycle number -1 won't be present in the log if we start writing
 621  * from our current block number.
 622  *
 623  * last_blk contains the block number of the first block with a given
 624  * cycle number.
 625  *
 626  * Return: zero if normal, non-zero if error.
 627  */
 628 STATIC int
 629 xlog_find_head(
 630         struct xlog     *log,
 631         xfs_daddr_t     *return_head_blk)
 632 {
 633         xfs_buf_t       *bp;
 634         char            *offset;
 635         xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
 636         int             num_scan_bblks;
 637         uint            first_half_cycle, last_half_cycle;
 638         uint            stop_on_cycle;
 639         int             error, log_bbnum = log->l_logBBsize;
 640
 641         /* Is the end of the log device zeroed? */
 642         error = xlog_find_zeroed(log, &first_blk);
 643         if (error < 0) {
 644                 xfs_warn(log->l_mp, "empty log check failed");
 645                 return error;
 646         }
 647         if (error == 1) {
 648                 *return_head_blk = first_blk;
 649
 650                 /* Is the whole lot zeroed? */
 651                 if (!first_blk) {
 652                         /* Linux XFS shouldn't generate totally zeroed logs -
 653                          * mkfs etc write a dummy unmount record to a fresh
 654                          * log so we can store the uuid in there
 655                          */
 656                         xfs_warn(log->l_mp, "totally zeroed log");
 657                 }
 658
 659                 return 0;
 660         }
 661
 662         first_blk = 0;                  /* get cycle # of 1st block */
 663         bp = xlog_get_bp(log, 1);
 664         if (!bp)
 665                 return -ENOMEM;
 666
 667         error = xlog_bread(log, 0, 1, bp, &offset);
 668         if (error)
 669                 goto bp_err;
 670
 671         first_half_cycle = xlog_get_cycle(offset);
 672
 673         last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
 674         error = xlog_bread(log, last_blk, 1, bp, &offset);
 675         if (error)
 676                 goto bp_err;
 677
 678         last_half_cycle = xlog_get_cycle(offset);
 679         ASSERT(last_half_cycle != 0);
 680
 681         /*
 682          * If the 1st half cycle number is equal to the last half cycle number,
 683          * then the entire log is stamped with the same cycle number.  In this
 684          * case, head_blk can't be set to zero (which makes sense).  The below
 685          * math doesn't work out properly with head_blk equal to zero.  Instead,
 686          * we set it to log_bbnum which is an invalid block number, but this
 687          * value makes the math correct.  If head_blk doesn't changed through
 688          * all the tests below, *head_blk is set to zero at the very end rather
 689          * than log_bbnum.  In a sense, log_bbnum and zero are the same block
 690          * in a circular file.
 691          */
 692         if (first_half_cycle == last_half_cycle) {
 693                 /*
 694                  * In this case we believe that the entire log should have
 695                  * cycle number last_half_cycle.  We need to scan backwards
 696                  * from the end verifying that there are no holes still
 697                  * containing last_half_cycle - 1.  If we find such a hole,
 698                  * then the start of that hole will be the new head.  The
 699                  * simple case looks like
 700                  *        x | x ... | x - 1 | x
 701                  * Another case that fits this picture would be
 702                  *        x | x + 1 | x ... | x
 703                  * In this case the head really is somewhere at the end of the
 704                  * log, as one of the latest writes at the beginning was
 705                  * incomplete.
 706                  * One more case is
 707                  *        x | x + 1 | x ... | x - 1 | x
 708                  * This is really the combination of the above two cases, and
 709                  * the head has to end up at the start of the x-1 hole at the
 710                  * end of the log.
 711                  *
 712                  * In the 256k log case, we will read from the beginning to the
 713                  * end of the log and search for cycle numbers equal to x-1.
 714                  * We don't worry about the x+1 blocks that we encounter,
 715                  * because we know that they cannot be the head since the log
 716                  * started with x.
 717                  */
 718                 head_blk = log_bbnum;
 719                 stop_on_cycle = last_half_cycle - 1;
 720         } else {
 721                 /*
 722                  * In this case we want to find the first block with cycle
 723                  * number matching last_half_cycle.  We expect the log to be
 724                  * some variation on
 725                  *        x + 1 ... | x ... | x
 726                  * The first block with cycle number x (last_half_cycle) will
 727                  * be where the new head belongs.  First we do a binary search
 728                  * for the first occurrence of last_half_cycle.  The binary
 729                  * search may not be totally accurate, so then we scan back
 730                  * from there looking for occurrences of last_half_cycle before
 731                  * us.  If that backwards scan wraps around the beginning of
 732                  * the log, then we look for occurrences of last_half_cycle - 1
 733                  * at the end of the log.  The cases we're looking for look
 734                  * like
 735                  *                               v binary search stopped here
 736                  *        x + 1 ... | x | x + 1 | x ... | x
 737                  *                   ^ but we want to locate this spot
 738                  * or
 739                  *        <---------> less than scan distance
 740                  *        x + 1 ... | x ... | x - 1 | x
 741                  *                           ^ we want to locate this spot
 742                  */
 743                 stop_on_cycle = last_half_cycle;
 744                 if ((error = xlog_find_cycle_start(log, bp, first_blk,
 745                                                 &head_blk, last_half_cycle)))
 746                         goto bp_err;
 747         }
 748
 749         /*
 750          * Now validate the answer.  Scan back some number of maximum possible
 751          * blocks and make sure each one has the expected cycle number.  The
 752          * maximum is determined by the total possible amount of buffering
 753          * in the in-core log.  The following number can be made tighter if
 754          * we actually look at the block size of the filesystem.
 755          */
 756         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 757         if (head_blk >= num_scan_bblks) {
 758                 /*
 759                  * We are guaranteed that the entire check can be performed
 760                  * in one buffer.
 761                  */
 762                 start_blk = head_blk - num_scan_bblks;
 763                 if ((error = xlog_find_verify_cycle(log,
 764                                                 start_blk, num_scan_bblks,
 765                                                 stop_on_cycle, &new_blk)))
 766                         goto bp_err;
 767                 if (new_blk != -1)
 768                         head_blk = new_blk;
 769         } else {                /* need to read 2 parts of log */
 770                 /*
 771                  * We are going to scan backwards in the log in two parts.
 772                  * First we scan the physical end of the log.  In this part
 773                  * of the log, we are looking for blocks with cycle number
 774                  * last_half_cycle - 1.
 775                  * If we find one, then we know that the log starts there, as
 776                  * we've found a hole that didn't get written in going around
 777                  * the end of the physical log.  The simple case for this is
 778                  *        x + 1 ... | x ... | x - 1 | x
 779                  *        <---------> less than scan distance
 780                  * If all of the blocks at the end of the log have cycle number
 781                  * last_half_cycle, then we check the blocks at the start of
 782                  * the log looking for occurrences of last_half_cycle.  If we
 783                  * find one, then our current estimate for the location of the
 784                  * first occurrence of last_half_cycle is wrong and we move
 785                  * back to the hole we've found.  This case looks like
 786                  *        x + 1 ... | x | x + 1 | x ...
 787                  *                               ^ binary search stopped here
 788                  * Another case we need to handle that only occurs in 256k
 789                  * logs is
 790                  *        x + 1 ... | x ... | x+1 | x ...
 791                  *                   ^ binary search stops here
 792                  * In a 256k log, the scan at the end of the log will see the
 793                  * x + 1 blocks.  We need to skip past those since that is
 794                  * certainly not the head of the log.  By searching for
 795                  * last_half_cycle-1 we accomplish that.
 796                  */
 797                 ASSERT(head_blk <= INT_MAX &&
 798                         (xfs_daddr_t) num_scan_bblks >= head_blk);
 799                 start_blk = log_bbnum - (num_scan_bblks - head_blk);
 800                 if ((error = xlog_find_verify_cycle(log, start_blk,
 801                                         num_scan_bblks - (int)head_blk,
 802                                         (stop_on_cycle - 1), &new_blk)))
 803                         goto bp_err;
 804                 if (new_blk != -1) {
 805                         head_blk = new_blk;
 806                         goto validate_head;
 807                 }
 808
 809                 /*
 810                  * Scan beginning of log now.  The last part of the physical
 811                  * log is good.  This scan needs to verify that it doesn't find
 812                  * the last_half_cycle.
 813                  */
 814                 start_blk = 0;
 815                 ASSERT(head_blk <= INT_MAX);
 816                 if ((error = xlog_find_verify_cycle(log,
 817                                         start_blk, (int)head_blk,
 818                                         stop_on_cycle, &new_blk)))
 819                         goto bp_err;
 820                 if (new_blk != -1)
 821                         head_blk = new_blk;
 822         }
 823
 824 validate_head:
 825         /*
 826          * Now we need to make sure head_blk is not pointing to a block in
 827          * the middle of a log record.
 828          */
 829         num_scan_bblks = XLOG_REC_SHIFT(log);
 830         if (head_blk >= num_scan_bblks) {
 831                 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 832
 833                 /* start ptr at last block ptr before head_blk */
 834                 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
 835                 if (error == 1)
 836                         error = -EIO;
 837                 if (error)
 838                         goto bp_err;
 839         } else {
 840                 start_blk = 0;
 841                 ASSERT(head_blk <= INT_MAX);
 842                 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
 843                 if (error < 0)
 844                         goto bp_err;
 845                 if (error == 1) {
 846                         /* We hit the beginning of the log during our search */
 847                         start_blk = log_bbnum - (num_scan_bblks - head_blk);
 848                         new_blk = log_bbnum;
 849                         ASSERT(start_blk <= INT_MAX &&
 850                                 (xfs_daddr_t) log_bbnum-start_blk >= 0);
 851                         ASSERT(head_blk <= INT_MAX);
 852                         error = xlog_find_verify_log_record(log, start_blk,
 853                                                         &new_blk, (int)head_blk);
 854                         if (error == 1)
 855                                 error = -EIO;
 856                         if (error)
 857                                 goto bp_err;
 858                         if (new_blk != log_bbnum)
 859                                 head_blk = new_blk;
 860                 } else if (error)
 861                         goto bp_err;
 862         }
 863
 864         xlog_put_bp(bp);
 865         if (head_blk == log_bbnum)
 866                 *return_head_blk = 0;
 867         else
 868                 *return_head_blk = head_blk;
 869         /*
 870          * When returning here, we have a good block number.  Bad block
 871          * means that during a previous crash, we didn't have a clean break
 872          * from cycle number N to cycle number N-1.  In this case, we need
 873          * to find the first block with cycle number N-1.
 874          */
 875         return 0;
 876
 877  bp_err:
 878         xlog_put_bp(bp);
 879
 880         if (error)
 881                 xfs_warn(log->l_mp, "failed to find log head");
 882         return error;
 883 }
 884
 885 /*
 886  * Seek backwards in the log for log record headers.
 887  *
 888  * Given a starting log block, walk backwards until we find the provided number
 889  * of records or hit the provided tail block. The return value is the number of
 890  * records encountered or a negative error code. The log block and buffer
 891  * pointer of the last record seen are returned in rblk and rhead respectively.
 892  */
 893 STATIC int
 894 xlog_rseek_logrec_hdr(
 895         struct xlog             *log,
 896         xfs_daddr_t             head_blk,
 897         xfs_daddr_t             tail_blk,
 898         int                     count,
 899         struct xfs_buf          *bp,
 900         xfs_daddr_t             *rblk,
 901         struct xlog_rec_header  **rhead,
 902         bool                    *wrapped)
 903 {
 904         int                     i;
 905         int                     error;
 906         int                     found = 0;
 907         char                    *offset = NULL;
 908         xfs_daddr_t             end_blk;
 909
 910         *wrapped = false;
 911
 912         /*
 913          * Walk backwards from the head block until we hit the tail or the first
 914          * block in the log.
 915          */
 916         end_blk = head_blk > tail_blk ? tail_blk : 0;
 917         for (i = (int) head_blk - 1; i >= end_blk; i--) {
 918                 error = xlog_bread(log, i, 1, bp, &offset);
 919                 if (error)
 920                         goto out_error;
 921
 922                 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 923                         *rblk = i;
 924                         *rhead = (struct xlog_rec_header *) offset;
 925                         if (++found == count)
 926                                 break;
 927                 }
 928         }
 929
 930         /*
 931          * If we haven't hit the tail block or the log record header count,
 932          * start looking again from the end of the physical log. Note that
 933          * callers can pass head == tail if the tail is not yet known.
 934          */
 935         if (tail_blk >= head_blk && found != count) {
 936                 for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
 937                         error = xlog_bread(log, i, 1, bp, &offset);
 938                         if (error)
 939                                 goto out_error;
 940
 941                         if (*(__be32 *)offset ==
 942                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 943                                 *wrapped = true;
 944                                 *rblk = i;
 945                                 *rhead = (struct xlog_rec_header *) offset;
 946                                 if (++found == count)
 947                                         break;
 948                         }
 949                 }
 950         }
 951
 952         return found;
 953
 954 out_error:
 955         return error;
 956 }
 957
 958 /*
 959  * Seek forward in the log for log record headers.
 960  *
 961  * Given head and tail blocks, walk forward from the tail block until we find
 962  * the provided number of records or hit the head block. The return value is the
 963  * number of records encountered or a negative error code. The log block and
 964  * buffer pointer of the last record seen are returned in rblk and rhead
 965  * respectively.
 966  */
 967 STATIC int
 968 xlog_seek_logrec_hdr(
 969         struct xlog             *log,
 970         xfs_daddr_t             head_blk,
 971         xfs_daddr_t             tail_blk,
 972         int                     count,
 973         struct xfs_buf          *bp,
 974         xfs_daddr_t             *rblk,
 975         struct xlog_rec_header  **rhead,
 976         bool                    *wrapped)
 977 {
 978         int                     i;
 979         int                     error;
 980         int                     found = 0;
 981         char                    *offset = NULL;
 982         xfs_daddr_t             end_blk;
 983
 984         *wrapped = false;
 985
 986         /*
 987          * Walk forward from the tail block until we hit the head or the last
 988          * block in the log.
 989          */
 990         end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
 991         for (i = (int) tail_blk; i <= end_blk; i++) {
 992                 error = xlog_bread(log, i, 1, bp, &offset);
 993                 if (error)
 994                         goto out_error;
 995
 996                 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 997                         *rblk = i;
 998                         *rhead = (struct xlog_rec_header *) offset;
 999                         if (++found == count)
1000                                 break;
1001                 }
1002         }
1003
1004         /*
1005          * If we haven't hit the head block or the log record header count,
1006          * start looking again from the start of the physical log.
1007          */
1008         if (tail_blk > head_blk && found != count) {
1009                 for (i = 0; i < (int) head_blk; i++) {
1010                         error = xlog_bread(log, i, 1, bp, &offset);
1011                         if (error)
1012                                 goto out_error;
1013
1014                         if (*(__be32 *)offset ==
1015                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
1016                                 *wrapped = true;
1017                                 *rblk = i;
1018                                 *rhead = (struct xlog_rec_header *) offset;
1019                                 if (++found == count)
1020                                         break;
1021                         }
1022                 }
1023         }
1024
1025         return found;
1026
1027 out_error:
1028         return error;
1029 }
1030
1031 /*
1032  * Check the log tail for torn writes. This is required when torn writes are
1033  * detected at the head and the head had to be walked back to a previous record.
1034  * The tail of the previous record must now be verified to ensure the torn
1035  * writes didn't corrupt the previous tail.
1036  *
1037  * Return an error if CRC verification fails as recovery cannot proceed.
1038  */
1039 STATIC int
1040 xlog_verify_tail(
1041         struct xlog             *log,
1042         xfs_daddr_t             head_blk,
1043         xfs_daddr_t             tail_blk)
1044 {
1045         struct xlog_rec_header  *thead;
1046         struct xfs_buf          *bp;
1047         xfs_daddr_t             first_bad;
1048         int                     count;
1049         int                     error = 0;
1050         bool                    wrapped;
1051         xfs_daddr_t             tmp_head;
1052
1053         bp = xlog_get_bp(log, 1);
1054         if (!bp)
1055                 return -ENOMEM;
1056
1057         /*
1058          * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
1059          * a temporary head block that points after the last possible
1060          * concurrently written record of the tail.
1061          */
1062         count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
1063                                      XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
1064                                      &wrapped);
1065         if (count < 0) {
1066                 error = count;
1067                 goto out;
1068         }
1069
1070         /*
1071          * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
1072          * into the actual log head. tmp_head points to the start of the record
1073          * so update it to the actual head block.
1074          */
1075         if (count < XLOG_MAX_ICLOGS + 1)
1076                 tmp_head = head_blk;
1077
1078         /*
1079          * We now have a tail and temporary head block that covers at least
1080          * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
1081          * records were completely written. Run a CRC verification pass from
1082          * tail to head and return the result.
1083          */
1084         error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
1085                                       XLOG_RECOVER_CRCPASS, &first_bad);
1086
1087 out:
1088         xlog_put_bp(bp);
1089         return error;
1090 }
1091
1092 /*
1093  * Detect and trim torn writes from the head of the log.
1094  *
1095  * Storage without sector atomicity guarantees can result in torn writes in the
1096  * log in the event of a crash. Our only means to detect this scenario is via
1097  * CRC verification. While we can't always be certain that CRC verification
1098  * failure is due to a torn write vs. an unrelated corruption, we do know that
1099  * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1100  * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1101  * the log and treat failures in this range as torn writes as a matter of
1102  * policy. In the event of CRC failure, the head is walked back to the last good
1103  * record in the log and the tail is updated from that record and verified.
1104  */
1105 STATIC int
1106 xlog_verify_head(
1107         struct xlog             *log,
1108         xfs_daddr_t             *head_blk,      /* in/out: unverified head */
1109         xfs_daddr_t             *tail_blk,      /* out: tail block */
1110         struct xfs_buf          *bp,
1111         xfs_daddr_t             *rhead_blk,     /* start blk of last record */
1112         struct xlog_rec_header  **rhead,        /* ptr to last record */
1113         bool                    *wrapped)       /* last rec. wraps phys. log */
1114 {
1115         struct xlog_rec_header  *tmp_rhead;
1116         struct xfs_buf          *tmp_bp;
1117         xfs_daddr_t             first_bad;
1118         xfs_daddr_t             tmp_rhead_blk;
1119         int                     found;
1120         int                     error;
1121         bool                    tmp_wrapped;
1122
1123         /*
1124          * Check the head of the log for torn writes. Search backwards from the
1125          * head until we hit the tail or the maximum number of log record I/Os
1126          * that could have been in flight at one time. Use a temporary buffer so
1127          * we don't trash the rhead/bp pointers from the caller.
1128          */
1129         tmp_bp = xlog_get_bp(log, 1);
1130         if (!tmp_bp)
1131                 return -ENOMEM;
1132         error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1133                                       XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
1134                                       &tmp_rhead, &tmp_wrapped);
1135         xlog_put_bp(tmp_bp);
1136         if (error < 0)
1137                 return error;
1138
1139         /*
1140          * Now run a CRC verification pass over the records starting at the
1141          * block found above to the current head. If a CRC failure occurs, the
1142          * log block of the first bad record is saved in first_bad.
1143          */
1144         error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1145                                       XLOG_RECOVER_CRCPASS, &first_bad);
1146         if (error == -EFSBADCRC) {
1147                 /*
1148                  * We've hit a potential torn write. Reset the error and warn
1149                  * about it.
1150                  */
1151                 error = 0;
1152                 xfs_warn(log->l_mp,
1153 "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1154                          first_bad, *head_blk);
1155
1156                 /*
1157                  * Get the header block and buffer pointer for the last good
1158                  * record before the bad record.
1159                  *
1160                  * Note that xlog_find_tail() clears the blocks at the new head
1161                  * (i.e., the records with invalid CRC) if the cycle number
1162                  * matches the the current cycle.
1163                  */
1164                 found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
1165                                               rhead_blk, rhead, wrapped);
1166                 if (found < 0)
1167                         return found;
1168                 if (found == 0)         /* XXX: right thing to do here? */
1169                         return -EIO;
1170
1171                 /*
1172                  * Reset the head block to the starting block of the first bad
1173                  * log record and set the tail block based on the last good
1174                  * record.
1175                  *
1176                  * Bail out if the updated head/tail match as this indicates
1177                  * possible corruption outside of the acceptable
1178                  * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1179                  */
1180                 *head_blk = first_bad;
1181                 *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1182                 if (*head_blk == *tail_blk) {
1183                         ASSERT(0);
1184                         return 0;
1185                 }
1186
1187                 /*
1188                  * Now verify the tail based on the updated head. This is
1189                  * required because the torn writes trimmed from the head could
1190                  * have been written over the tail of a previous record. Return
1191                  * any errors since recovery cannot proceed if the tail is
1192                  * corrupt.
1193                  *
1194                  * XXX: This leaves a gap in truly robust protection from torn
1195                  * writes in the log. If the head is behind the tail, the tail
1196                  * pushes forward to create some space and then a crash occurs
1197                  * causing the writes into the previous record's tail region to
1198                  * tear, log recovery isn't able to recover.
1199                  *
1200                  * How likely is this to occur? If possible, can we do something
1201                  * more intelligent here? Is it safe to push the tail forward if
1202                  * we can determine that the tail is within the range of the
1203                  * torn write (e.g., the kernel can only overwrite the tail if
1204                  * it has actually been pushed forward)? Alternatively, could we
1205                  * somehow prevent this condition at runtime?
1206                  */
1207                 error = xlog_verify_tail(log, *head_blk, *tail_blk);
1208         }
1209
1210         return error;
1211 }
1212
1213 /*
1214  * Check whether the head of the log points to an unmount record. In other
1215  * words, determine whether the log is clean. If so, update the in-core state
1216  * appropriately.
1217  */
1218 static int
1219 xlog_check_unmount_rec(
1220         struct xlog             *log,
1221         xfs_daddr_t             *head_blk,
1222         xfs_daddr_t             *tail_blk,
1223         struct xlog_rec_header  *rhead,
1224         xfs_daddr_t             rhead_blk,
1225         struct xfs_buf          *bp,
1226         bool                    *clean)
1227 {
1228         struct xlog_op_header   *op_head;
1229         xfs_daddr_t             umount_data_blk;
1230         xfs_daddr_t             after_umount_blk;
1231         int                     hblks;
1232         int                     error;
1233         char                    *offset;
1234
1235         *clean = false;
1236
1237         /*
1238          * Look for unmount record. If we find it, then we know there was a
1239          * clean unmount. Since 'i' could be the last block in the physical
1240          * log, we convert to a log block before comparing to the head_blk.
1241          *
1242          * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1243          * below. We won't want to clear the unmount record if there is one, so
1244          * we pass the lsn of the unmount record rather than the block after it.
1245          */
1246         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1247                 int     h_size = be32_to_cpu(rhead->h_size);
1248                 int     h_version = be32_to_cpu(rhead->h_version);
1249
1250                 if ((h_version & XLOG_VERSION_2) &&
1251                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1252                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1253                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
1254                                 hblks++;
1255                 } else {
1256                         hblks = 1;
1257                 }
1258         } else {
1259                 hblks = 1;
1260         }
1261         after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
1262         after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
1263         if (*head_blk == after_umount_blk &&
1264             be32_to_cpu(rhead->h_num_logops) == 1) {
1265                 umount_data_blk = rhead_blk + hblks;
1266                 umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
1267                 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1268                 if (error)
1269                         return error;
1270
1271                 op_head = (struct xlog_op_header *)offset;
1272                 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1273                         /*
1274                          * Set tail and last sync so that newly written log
1275                          * records will point recovery to after the current
1276                          * unmount record.
1277                          */
1278                         xlog_assign_atomic_lsn(&log->l_tail_lsn,
1279                                         log->l_curr_cycle, after_umount_blk);
1280                         xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1281                                         log->l_curr_cycle, after_umount_blk);
1282                         *tail_blk = after_umount_blk;
1283
1284                         *clean = true;
1285                 }
1286         }
1287
1288         return 0;
1289 }
1290
1291 static void
1292 xlog_set_state(
1293         struct xlog             *log,
1294         xfs_daddr_t             head_blk,
1295         struct xlog_rec_header  *rhead,
1296         xfs_daddr_t             rhead_blk,
1297         bool                    bump_cycle)
1298 {
1299         /*
1300          * Reset log values according to the state of the log when we
1301          * crashed.  In the case where head_blk == 0, we bump curr_cycle
1302          * one because the next write starts a new cycle rather than
1303          * continuing the cycle of the last good log record.  At this
1304          * point we have guaranteed that all partial log records have been
1305          * accounted for.  Therefore, we know that the last good log record
1306          * written was complete and ended exactly on the end boundary
1307          * of the physical log.
1308          */
1309         log->l_prev_block = rhead_blk;
1310         log->l_curr_block = (int)head_blk;
1311         log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1312         if (bump_cycle)
1313                 log->l_curr_cycle++;
1314         atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1315         atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1316         xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1317                                         BBTOB(log->l_curr_block));
1318         xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1319                                         BBTOB(log->l_curr_block));
1320 }
1321
1322 /*
1323  * Find the sync block number or the tail of the log.
1324  *
1325  * This will be the block number of the last record to have its
1326  * associated buffers synced to disk.  Every log record header has
1327  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
1328  * to get a sync block number.  The only concern is to figure out which
1329  * log record header to believe.
1330  *
1331  * The following algorithm uses the log record header with the largest
1332  * lsn.  The entire log record does not need to be valid.  We only care
1333  * that the header is valid.
1334  *
1335  * We could speed up search by using current head_blk buffer, but it is not
1336  * available.
1337  */
1338 STATIC int
1339 xlog_find_tail(
1340         struct xlog             *log,
1341         xfs_daddr_t             *head_blk,
1342         xfs_daddr_t             *tail_blk)
1343 {
1344         xlog_rec_header_t       *rhead;
1345         char                    *offset = NULL;
1346         xfs_buf_t               *bp;
1347         int                     error;
1348         xfs_daddr_t             rhead_blk;
1349         xfs_lsn_t               tail_lsn;
1350         bool                    wrapped = false;
1351         bool                    clean = false;
1352
1353         /*
1354          * Find previous log record
1355          */
1356         if ((error = xlog_find_head(log, head_blk)))
1357                 return error;
1358         ASSERT(*head_blk < INT_MAX);
1359
1360         bp = xlog_get_bp(log, 1);
1361         if (!bp)
1362                 return -ENOMEM;
1363         if (*head_blk == 0) {                           /* special case */
1364                 error = xlog_bread(log, 0, 1, bp, &offset);
1365                 if (error)
1366                         goto done;
1367
1368                 if (xlog_get_cycle(offset) == 0) {
1369                         *tail_blk = 0;
1370                         /* leave all other log inited values alone */
1371                         goto done;
1372                 }
1373         }
1374
1375         /*
1376          * Search backwards through the log looking for the log record header
1377          * block. This wraps all the way back around to the head so something is
1378          * seriously wrong if we can't find it.
1379          */
1380         error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
1381                                       &rhead_blk, &rhead, &wrapped);
1382         if (error < 0)
1383                 return error;
1384         if (!error) {
1385                 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1386                 return -EIO;
1387         }
1388         *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1389
1390         /*
1391          * Set the log state based on the current head record.
1392          */
1393         xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
1394         tail_lsn = atomic64_read(&log->l_tail_lsn);
1395
1396         /*
1397          * Look for an unmount record at the head of the log. This sets the log
1398          * state to determine whether recovery is necessary.
1399          */
1400         error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1401                                        rhead_blk, bp, &clean);
1402         if (error)
1403                 goto done;
1404
1405         /*
1406          * Verify the log head if the log is not clean (e.g., we have anything
1407          * but an unmount record at the head). This uses CRC verification to
1408          * detect and trim torn writes. If discovered, CRC failures are
1409          * considered torn writes and the log head is trimmed accordingly.
1410          *
1411          * Note that we can only run CRC verification when the log is dirty
1412          * because there's no guarantee that the log data behind an unmount
1413          * record is compatible with the current architecture.
1414          */
1415         if (!clean) {
1416                 xfs_daddr_t     orig_head = *head_blk;
1417
1418                 error = xlog_verify_head(log, head_blk, tail_blk, bp,
1419                                          &rhead_blk, &rhead, &wrapped);
1420                 if (error)
1421                         goto done;
1422
1423                 /* update in-core state again if the head changed */
1424                 if (*head_blk != orig_head) {
1425                         xlog_set_state(log, *head_blk, rhead, rhead_blk,
1426                                        wrapped);
1427                         tail_lsn = atomic64_read(&log->l_tail_lsn);
1428                         error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1429                                                        rhead, rhead_blk, bp,
1430                                                        &clean);
1431                         if (error)
1432                                 goto done;
1433                 }
1434         }
1435
1436         /*
1437          * Note that the unmount was clean. If the unmount was not clean, we
1438          * need to know this to rebuild the superblock counters from the perag
1439          * headers if we have a filesystem using non-persistent counters.
1440          */
1441         if (clean)
1442                 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1443
1444         /*
1445          * Make sure that there are no blocks in front of the head
1446          * with the same cycle number as the head.  This can happen
1447          * because we allow multiple outstanding log writes concurrently,
1448          * and the later writes might make it out before earlier ones.
1449          *
1450          * We use the lsn from before modifying it so that we'll never
1451          * overwrite the unmount record after a clean unmount.
1452          *
1453          * Do this only if we are going to recover the filesystem
1454          *
1455          * NOTE: This used to say "if (!readonly)"
1456          * However on Linux, we can & do recover a read-only filesystem.
1457          * We only skip recovery if NORECOVERY is specified on mount,
1458          * in which case we would not be here.
1459          *
1460          * But... if the -device- itself is readonly, just skip this.
1461          * We can't recover this device anyway, so it won't matter.
1462          */
1463         if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1464                 error = xlog_clear_stale_blocks(log, tail_lsn);
1465
1466 done:
1467         xlog_put_bp(bp);
1468
1469         if (error)
1470                 xfs_warn(log->l_mp, "failed to locate log tail");
1471         return error;
1472 }
1473
1474 /*
1475  * Is the log zeroed at all?
1476  *
1477  * The last binary search should be changed to perform an X block read
1478  * once X becomes small enough.  You can then search linearly through
1479  * the X blocks.  This will cut down on the number of reads we need to do.
1480  *
1481  * If the log is partially zeroed, this routine will pass back the blkno
1482  * of the first block with cycle number 0.  It won't have a complete LR
1483  * preceding it.
1484  *
1485  * Return:
1486  *      0  => the log is completely written to
1487  *      1 => use *blk_no as the first block of the log
1488  *      <0 => error has occurred
1489  */
1490 STATIC int
1491 xlog_find_zeroed(
1492         struct xlog     *log,
1493         xfs_daddr_t     *blk_no)
1494 {
1495         xfs_buf_t       *bp;
1496         char            *offset;
1497         uint            first_cycle, last_cycle;
1498         xfs_daddr_t     new_blk, last_blk, start_blk;
1499         xfs_daddr_t     num_scan_bblks;
1500         int             error, log_bbnum = log->l_logBBsize;
1501
1502         *blk_no = 0;
1503
1504         /* check totally zeroed log */
1505         bp = xlog_get_bp(log, 1);
1506         if (!bp)
1507                 return -ENOMEM;
1508         error = xlog_bread(log, 0, 1, bp, &offset);
1509         if (error)
1510                 goto bp_err;
1511
1512         first_cycle = xlog_get_cycle(offset);
1513         if (first_cycle == 0) {         /* completely zeroed log */
1514                 *blk_no = 0;
1515                 xlog_put_bp(bp);
1516                 return 1;
1517         }
1518
1519         /* check partially zeroed log */
1520         error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1521         if (error)
1522                 goto bp_err;
1523
1524         last_cycle = xlog_get_cycle(offset);
1525         if (last_cycle != 0) {          /* log completely written to */
1526                 xlog_put_bp(bp);
1527                 return 0;
1528         } else if (first_cycle != 1) {
1529                 /*
1530                  * If the cycle of the last block is zero, the cycle of
1531                  * the first block must be 1. If it's not, maybe we're
1532                  * not looking at a log... Bail out.
1533                  */
1534                 xfs_warn(log->l_mp,
1535                         "Log inconsistent or not a log (last==0, first!=1)");
1536                 error = -EINVAL;
1537                 goto bp_err;
1538         }
1539
1540         /* we have a partially zeroed log */
1541         last_blk = log_bbnum-1;
1542         if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1543                 goto bp_err;
1544
1545         /*
1546          * Validate the answer.  Because there is no way to guarantee that
1547          * the entire log is made up of log records which are the same size,
1548          * we scan over the defined maximum blocks.  At this point, the maximum
1549          * is not chosen to mean anything special.   XXXmiken
1550          */
1551         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1552         ASSERT(num_scan_bblks <= INT_MAX);
1553
1554         if (last_blk < num_scan_bblks)
1555                 num_scan_bblks = last_blk;
1556         start_blk = last_blk - num_scan_bblks;
1557
1558         /*
1559          * We search for any instances of cycle number 0 that occur before
1560          * our current estimate of the head.  What we're trying to detect is
1561          *        1 ... | 0 | 1 | 0...
1562          *                       ^ binary search ends here
1563          */
1564         if ((error = xlog_find_verify_cycle(log, start_blk,
1565                                          (int)num_scan_bblks, 0, &new_blk)))
1566                 goto bp_err;
1567         if (new_blk != -1)
1568                 last_blk = new_blk;
1569
1570         /*
1571          * Potentially backup over partial log record write.  We don't need
1572          * to search the end of the log because we know it is zero.
1573          */
1574         error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1575         if (error == 1)
1576                 error = -EIO;
1577         if (error)
1578                 goto bp_err;
1579
1580         *blk_no = last_blk;
1581 bp_err:
1582         xlog_put_bp(bp);
1583         if (error)
1584                 return error;
1585         return 1;
1586 }
1587
1588 /*
1589  * These are simple subroutines used by xlog_clear_stale_blocks() below
1590  * to initialize a buffer full of empty log record headers and write
1591  * them into the log.
1592  */
1593 STATIC void
1594 xlog_add_record(
1595         struct xlog             *log,
1596         char                    *buf,
1597         int                     cycle,
1598         int                     block,
1599         int                     tail_cycle,
1600         int                     tail_block)
1601 {
1602         xlog_rec_header_t       *recp = (xlog_rec_header_t *)buf;
1603
1604         memset(buf, 0, BBSIZE);
1605         recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1606         recp->h_cycle = cpu_to_be32(cycle);
1607         recp->h_version = cpu_to_be32(
1608                         xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1609         recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1610         recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1611         recp->h_fmt = cpu_to_be32(XLOG_FMT);
1612         memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1613 }
1614
1615 STATIC int
1616 xlog_write_log_records(
1617         struct xlog     *log,
1618         int             cycle,
1619         int             start_block,
1620         int             blocks,
1621         int             tail_cycle,
1622         int             tail_block)
1623 {
1624         char            *offset;
1625         xfs_buf_t       *bp;
1626         int             balign, ealign;
1627         int             sectbb = log->l_sectBBsize;
1628         int             end_block = start_block + blocks;
1629         int             bufblks;
1630         int             error = 0;
1631         int             i, j = 0;
1632
1633         /*
1634          * Greedily allocate a buffer big enough to handle the full
1635          * range of basic blocks to be written.  If that fails, try
1636          * a smaller size.  We need to be able to write at least a
1637          * log sector, or we're out of luck.
1638          */
1639         bufblks = 1 << ffs(blocks);
1640         while (bufblks > log->l_logBBsize)
1641                 bufblks >>= 1;
1642         while (!(bp = xlog_get_bp(log, bufblks))) {
1643                 bufblks >>= 1;
1644                 if (bufblks < sectbb)
1645                         return -ENOMEM;
1646         }
1647
1648         /* We may need to do a read at the start to fill in part of
1649          * the buffer in the starting sector not covered by the first
1650          * write below.
1651          */
1652         balign = round_down(start_block, sectbb);
1653         if (balign != start_block) {
1654                 error = xlog_bread_noalign(log, start_block, 1, bp);
1655                 if (error)
1656                         goto out_put_bp;
1657
1658                 j = start_block - balign;
1659         }
1660
1661         for (i = start_block; i < end_block; i += bufblks) {
1662                 int             bcount, endcount;
1663
1664                 bcount = min(bufblks, end_block - start_block);
1665                 endcount = bcount - j;
1666
1667                 /* We may need to do a read at the end to fill in part of
1668                  * the buffer in the final sector not covered by the write.
1669                  * If this is the same sector as the above read, skip it.
1670                  */
1671                 ealign = round_down(end_block, sectbb);
1672                 if (j == 0 && (start_block + endcount > ealign)) {
1673                         offset = bp->b_addr + BBTOB(ealign - start_block);
1674                         error = xlog_bread_offset(log, ealign, sectbb,
1675                                                         bp, offset);
1676                         if (error)
1677                                 break;
1678
1679                 }
1680
1681                 offset = xlog_align(log, start_block, endcount, bp);
1682                 for (; j < endcount; j++) {
1683                         xlog_add_record(log, offset, cycle, i+j,
1684                                         tail_cycle, tail_block);
1685                         offset += BBSIZE;
1686                 }
1687                 error = xlog_bwrite(log, start_block, endcount, bp);
1688                 if (error)
1689                         break;
1690                 start_block += endcount;
1691                 j = 0;
1692         }
1693
1694  out_put_bp:
1695         xlog_put_bp(bp);
1696         return error;
1697 }
1698
1699 /*
1700  * This routine is called to blow away any incomplete log writes out
1701  * in front of the log head.  We do this so that we won't become confused
1702  * if we come up, write only a little bit more, and then crash again.
1703  * If we leave the partial log records out there, this situation could
1704  * cause us to think those partial writes are valid blocks since they
1705  * have the current cycle number.  We get rid of them by overwriting them
1706  * with empty log records with the old cycle number rather than the
1707  * current one.
1708  *
1709  * The tail lsn is passed in rather than taken from
1710  * the log so that we will not write over the unmount record after a
1711  * clean unmount in a 512 block log.  Doing so would leave the log without
1712  * any valid log records in it until a new one was written.  If we crashed
1713  * during that time we would not be able to recover.
1714  */
1715 STATIC int
1716 xlog_clear_stale_blocks(
1717         struct xlog     *log,
1718         xfs_lsn_t       tail_lsn)
1719 {
1720         int             tail_cycle, head_cycle;
1721         int             tail_block, head_block;
1722         int             tail_distance, max_distance;
1723         int             distance;
1724         int             error;
1725
1726         tail_cycle = CYCLE_LSN(tail_lsn);
1727         tail_block = BLOCK_LSN(tail_lsn);
1728         head_cycle = log->l_curr_cycle;
1729         head_block = log->l_curr_block;
1730
1731         /*
1732          * Figure out the distance between the new head of the log
1733          * and the tail.  We want to write over any blocks beyond the
1734          * head that we may have written just before the crash, but
1735          * we don't want to overwrite the tail of the log.
1736          */
1737         if (head_cycle == tail_cycle) {
1738                 /*
1739                  * The tail is behind the head in the physical log,
1740                  * so the distance from the head to the tail is the
1741                  * distance from the head to the end of the log plus
1742                  * the distance from the beginning of the log to the
1743                  * tail.
1744                  */
1745                 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1746                         XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1747                                          XFS_ERRLEVEL_LOW, log->l_mp);
1748                         return -EFSCORRUPTED;
1749                 }
1750                 tail_distance = tail_block + (log->l_logBBsize - head_block);
1751         } else {
1752                 /*
1753                  * The head is behind the tail in the physical log,
1754                  * so the distance from the head to the tail is just
1755                  * the tail block minus the head block.
1756                  */
1757                 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1758                         XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1759                                          XFS_ERRLEVEL_LOW, log->l_mp);
1760                         return -EFSCORRUPTED;
1761                 }
1762                 tail_distance = tail_block - head_block;
1763         }
1764
1765         /*
1766          * If the head is right up against the tail, we can't clear
1767          * anything.
1768          */
1769         if (tail_distance <= 0) {
1770                 ASSERT(tail_distance == 0);
1771                 return 0;
1772         }
1773
1774         max_distance = XLOG_TOTAL_REC_SHIFT(log);
1775         /*
1776          * Take the smaller of the maximum amount of outstanding I/O
1777          * we could have and the distance to the tail to clear out.
1778          * We take the smaller so that we don't overwrite the tail and
1779          * we don't waste all day writing from the head to the tail
1780          * for no reason.
1781          */
1782         max_distance = MIN(max_distance, tail_distance);
1783
1784         if ((head_block + max_distance) <= log->l_logBBsize) {
1785                 /*
1786                  * We can stomp all the blocks we need to without
1787                  * wrapping around the end of the log.  Just do it
1788                  * in a single write.  Use the cycle number of the
1789                  * current cycle minus one so that the log will look like:
1790                  *     n ... | n - 1 ...
1791                  */
1792                 error = xlog_write_log_records(log, (head_cycle - 1),
1793                                 head_block, max_distance, tail_cycle,
1794                                 tail_block);
1795                 if (error)
1796                         return error;
1797         } else {
1798                 /*
1799                  * We need to wrap around the end of the physical log in
1800                  * order to clear all the blocks.  Do it in two separate
1801                  * I/Os.  The first write should be from the head to the
1802                  * end of the physical log, and it should use the current
1803                  * cycle number minus one just like above.
1804                  */
1805                 distance = log->l_logBBsize - head_block;
1806                 error = xlog_write_log_records(log, (head_cycle - 1),
1807                                 head_block, distance, tail_cycle,
1808                                 tail_block);
1809
1810                 if (error)
1811                         return error;
1812
1813                 /*
1814                  * Now write the blocks at the start of the physical log.
1815                  * This writes the remainder of the blocks we want to clear.
1816                  * It uses the current cycle number since we're now on the
1817                  * same cycle as the head so that we get:
1818                  *    n ... n ... | n - 1 ...
1819                  *    ^^^^^ blocks we're writing
1820                  */
1821                 distance = max_distance - (log->l_logBBsize - head_block);
1822                 error = xlog_write_log_records(log, head_cycle, 0, distance,
1823                                 tail_cycle, tail_block);
1824                 if (error)
1825                         return error;
1826         }
1827
1828         return 0;
1829 }
1830
1831 /******************************************************************************
1832  *
1833  *              Log recover routines
1834  *
1835  ******************************************************************************
1836  */
1837
1838 /*
1839  * Sort the log items in the transaction.
1840  *
1841  * The ordering constraints are defined by the inode allocation and unlink
1842  * behaviour. The rules are:
1843  *
1844  *      1. Every item is only logged once in a given transaction. Hence it
1845  *         represents the last logged state of the item. Hence ordering is
1846  *         dependent on the order in which operations need to be performed so
1847  *         required initial conditions are always met.
1848  *
1849  *      2. Cancelled buffers are recorded in pass 1 in a separate table and
1850  *         there's nothing to replay from them so we can simply cull them
1851  *         from the transaction. However, we can't do that until after we've
1852  *         replayed all the other items because they may be dependent on the
1853  *         cancelled buffer and replaying the cancelled buffer can remove it
1854  *         form the cancelled buffer table. Hence they have tobe done last.
1855  *
1856  *      3. Inode allocation buffers must be replayed before inode items that
1857  *         read the buffer and replay changes into it. For filesystems using the
1858  *         ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1859  *         treated the same as inode allocation buffers as they create and
1860  *         initialise the buffers directly.
1861  *
1862  *      4. Inode unlink buffers must be replayed after inode items are replayed.
1863  *         This ensures that inodes are completely flushed to the inode buffer
1864  *         in a "free" state before we remove the unlinked inode list pointer.
1865  *
1866  * Hence the ordering needs to be inode allocation buffers first, inode items
1867  * second, inode unlink buffers third and cancelled buffers last.
1868  *
1869  * But there's a problem with that - we can't tell an inode allocation buffer
1870  * apart from a regular buffer, so we can't separate them. We can, however,
1871  * tell an inode unlink buffer from the others, and so we can separate them out
1872  * from all the other buffers and move them to last.
1873  *
1874  * Hence, 4 lists, in order from head to tail:
1875  *      - buffer_list for all buffers except cancelled/inode unlink buffers
1876  *      - item_list for all non-buffer items
1877  *      - inode_buffer_list for inode unlink buffers
1878  *      - cancel_list for the cancelled buffers
1879  *
1880  * Note that we add objects to the tail of the lists so that first-to-last
1881  * ordering is preserved within the lists. Adding objects to the head of the
1882  * list means when we traverse from the head we walk them in last-to-first
1883  * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1884  * but for all other items there may be specific ordering that we need to
1885  * preserve.
1886  */
1887 STATIC int
1888 xlog_recover_reorder_trans(
1889         struct xlog             *log,
1890         struct xlog_recover     *trans,
1891         int                     pass)
1892 {
1893         xlog_recover_item_t     *item, *n;
1894         int                     error = 0;
1895         LIST_HEAD(sort_list);
1896         LIST_HEAD(cancel_list);
1897         LIST_HEAD(buffer_list);
1898         LIST_HEAD(inode_buffer_list);
1899         LIST_HEAD(inode_list);
1900
1901         list_splice_init(&trans->r_itemq, &sort_list);
1902         list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1903                 xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1904
1905                 switch (ITEM_TYPE(item)) {
1906                 case XFS_LI_ICREATE:
1907                         list_move_tail(&item->ri_list, &buffer_list);
1908                         break;
1909                 case XFS_LI_BUF:
1910                         if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1911                                 trace_xfs_log_recover_item_reorder_head(log,
1912                                                         trans, item, pass);
1913                                 list_move(&item->ri_list, &cancel_list);
1914                                 break;
1915                         }
1916                         if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1917                                 list_move(&item->ri_list, &inode_buffer_list);
1918                                 break;
1919                         }
1920                         list_move_tail(&item->ri_list, &buffer_list);
1921                         break;
1922                 case XFS_LI_INODE:
1923                 case XFS_LI_DQUOT:
1924                 case XFS_LI_QUOTAOFF:
1925                 case XFS_LI_EFD:
1926                 case XFS_LI_EFI:
1927                 case XFS_LI_RUI:
1928                 case XFS_LI_RUD:
1929                 case XFS_LI_CUI:
1930                 case XFS_LI_CUD:
1931                 case XFS_LI_BUI:
1932                 case XFS_LI_BUD:
1933                         trace_xfs_log_recover_item_reorder_tail(log,
1934                                                         trans, item, pass);
1935                         list_move_tail(&item->ri_list, &inode_list);
1936                         break;
1937                 default:
1938                         xfs_warn(log->l_mp,
1939                                 "%s: unrecognized type of log operation",
1940                                 __func__);
1941                         ASSERT(0);
1942                         /*
1943                          * return the remaining items back to the transaction
1944                          * item list so they can be freed in caller.
1945                          */
1946                         if (!list_empty(&sort_list))
1947                                 list_splice_init(&sort_list, &trans->r_itemq);
1948                         error = -EIO;
1949                         goto out;
1950                 }
1951         }
1952 out:
1953         ASSERT(list_empty(&sort_list));
1954         if (!list_empty(&buffer_list))
1955                 list_splice(&buffer_list, &trans->r_itemq);
1956         if (!list_empty(&inode_list))
1957                 list_splice_tail(&inode_list, &trans->r_itemq);
1958         if (!list_empty(&inode_buffer_list))
1959                 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1960         if (!list_empty(&cancel_list))
1961                 list_splice_tail(&cancel_list, &trans->r_itemq);
1962         return error;
1963 }
1964
1965 /*
1966  * Build up the table of buf cancel records so that we don't replay
1967  * cancelled data in the second pass.  For buffer records that are
1968  * not cancel records, there is nothing to do here so we just return.
1969  *
1970  * If we get a cancel record which is already in the table, this indicates
1971  * that the buffer was cancelled multiple times.  In order to ensure
1972  * that during pass 2 we keep the record in the table until we reach its
1973  * last occurrence in the log, we keep a reference count in the cancel
1974  * record in the table to tell us how many times we expect to see this
1975  * record during the second pass.
1976  */
1977 STATIC int
1978 xlog_recover_buffer_pass1(
1979         struct xlog                     *log,
1980         struct xlog_recover_item        *item)
1981 {
1982         xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1983         struct list_head        *bucket;
1984         struct xfs_buf_cancel   *bcp;
1985
1986         /*
1987          * If this isn't a cancel buffer item, then just return.
1988          */
1989         if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1990                 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1991                 return 0;
1992         }
1993
1994         /*
1995          * Insert an xfs_buf_cancel record into the hash table of them.
1996          * If there is already an identical record, bump its reference count.
1997          */
1998         bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1999         list_for_each_entry(bcp, bucket, bc_list) {
2000                 if (bcp->bc_blkno == buf_f->blf_blkno &&
2001                     bcp->bc_len == buf_f->blf_len) {
2002                         bcp->bc_refcount++;
2003                         trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
2004                         return 0;
2005                 }
2006         }
2007
2008         bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
2009         bcp->bc_blkno = buf_f->blf_blkno;
2010         bcp->bc_len = buf_f->blf_len;
2011         bcp->bc_refcount = 1;
2012         list_add_tail(&bcp->bc_list, bucket);
2013
2014         trace_xfs_log_recover_buf_cancel_add(log, buf_f);
2015         return 0;
2016 }
2017
2018 /*
2019  * Check to see whether the buffer being recovered has a corresponding
2020  * entry in the buffer cancel record table. If it is, return the cancel
2021  * buffer structure to the caller.
2022  */
2023 STATIC struct xfs_buf_cancel *
2024 xlog_peek_buffer_cancelled(
2025         struct xlog             *log,
2026         xfs_daddr_t             blkno,
2027         uint                    len,
2028         unsigned short                  flags)
2029 {
2030         struct list_head        *bucket;
2031         struct xfs_buf_cancel   *bcp;
2032
2033         if (!log->l_buf_cancel_table) {
2034                 /* empty table means no cancelled buffers in the log */
2035                 ASSERT(!(flags & XFS_BLF_CANCEL));
2036                 return NULL;
2037         }
2038
2039         bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
2040         list_for_each_entry(bcp, bucket, bc_list) {
2041                 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
2042                         return bcp;
2043         }
2044
2045         /*
2046          * We didn't find a corresponding entry in the table, so return 0 so
2047          * that the buffer is NOT cancelled.
2048          */
2049         ASSERT(!(flags & XFS_BLF_CANCEL));
2050         return NULL;
2051 }
2052
2053 /*
2054  * If the buffer is being cancelled then return 1 so that it will be cancelled,
2055  * otherwise return 0.  If the buffer is actually a buffer cancel item
2056  * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
2057  * table and remove it from the table if this is the last reference.
2058  *
2059  * We remove the cancel record from the table when we encounter its last
2060  * occurrence in the log so that if the same buffer is re-used again after its
2061  * last cancellation we actually replay the changes made at that point.
2062  */
2063 STATIC int
2064 xlog_check_buffer_cancelled(
2065         struct xlog             *log,
2066         xfs_daddr_t             blkno,
2067         uint                    len,
2068         unsigned short                  flags)
2069 {
2070         struct xfs_buf_cancel   *bcp;
2071
2072         bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
2073         if (!bcp)
2074                 return 0;
2075
2076         /*
2077          * We've go a match, so return 1 so that the recovery of this buffer
2078          * is cancelled.  If this buffer is actually a buffer cancel log
2079          * item, then decrement the refcount on the one in the table and
2080          * remove it if this is the last reference.
2081          */
2082         if (flags & XFS_BLF_CANCEL) {
2083                 if (--bcp->bc_refcount == 0) {
2084                         list_del(&bcp->bc_list);
2085                         kmem_free(bcp);
2086                 }
2087         }
2088         return 1;
2089 }
2090
2091 /*
2092  * Perform recovery for a buffer full of inodes.  In these buffers, the only
2093  * data which should be recovered is that which corresponds to the
2094  * di_next_unlinked pointers in the on disk inode structures.  The rest of the
2095  * data for the inodes is always logged through the inodes themselves rather
2096  * than the inode buffer and is recovered in xlog_recover_inode_pass2().
2097  *
2098  * The only time when buffers full of inodes are fully recovered is when the
2099  * buffer is full of newly allocated inodes.  In this case the buffer will
2100  * not be marked as an inode buffer and so will be sent to
2101  * xlog_recover_do_reg_buffer() below during recovery.
2102  */
2103 STATIC int
2104 xlog_recover_do_inode_buffer(
2105         struct xfs_mount        *mp,
2106         xlog_recover_item_t     *item,
2107         struct xfs_buf          *bp,
2108         xfs_buf_log_format_t    *buf_f)
2109 {
2110         int                     i;
2111         int                     item_index = 0;
2112         int                     bit = 0;
2113         int                     nbits = 0;
2114         int                     reg_buf_offset = 0;
2115         int                     reg_buf_bytes = 0;
2116         int                     next_unlinked_offset;
2117         int                     inodes_per_buf;
2118         xfs_agino_t             *logged_nextp;
2119         xfs_agino_t             *buffer_nextp;
2120
2121         trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
2122
2123         /*
2124          * Post recovery validation only works properly on CRC enabled
2125          * filesystems.
2126          */
2127         if (xfs_sb_version_hascrc(&mp->m_sb))
2128                 bp->b_ops = &xfs_inode_buf_ops;
2129
2130         inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
2131         for (i = 0; i < inodes_per_buf; i++) {
2132                 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
2133                         offsetof(xfs_dinode_t, di_next_unlinked);
2134
2135                 while (next_unlinked_offset >=
2136                        (reg_buf_offset + reg_buf_bytes)) {
2137                         /*
2138                          * The next di_next_unlinked field is beyond
2139                          * the current logged region.  Find the next
2140                          * logged region that contains or is beyond
2141                          * the current di_next_unlinked field.
2142                          */
2143                         bit += nbits;
2144                         bit = xfs_next_bit(buf_f->blf_data_map,
2145                                            buf_f->blf_map_size, bit);
2146
2147                         /*
2148                          * If there are no more logged regions in the
2149                          * buffer, then we're done.
2150                          */
2151                         if (bit == -1)
2152                                 return 0;
2153
2154                         nbits = xfs_contig_bits(buf_f->blf_data_map,
2155                                                 buf_f->blf_map_size, bit);
2156                         ASSERT(nbits > 0);
2157                         reg_buf_offset = bit << XFS_BLF_SHIFT;
2158                         reg_buf_bytes = nbits << XFS_BLF_SHIFT;
2159                         item_index++;
2160                 }
2161
2162                 /*
2163                  * If the current logged region starts after the current
2164                  * di_next_unlinked field, then move on to the next
2165                  * di_next_unlinked field.
2166                  */
2167                 if (next_unlinked_offset < reg_buf_offset)
2168                         continue;
2169
2170                 ASSERT(item->ri_buf[item_index].i_addr != NULL);
2171                 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
2172                 ASSERT((reg_buf_offset + reg_buf_bytes) <=
2173                                                         BBTOB(bp->b_io_length));
2174
2175                 /*
2176                  * The current logged region contains a copy of the
2177                  * current di_next_unlinked field.  Extract its value
2178                  * and copy it to the buffer copy.
2179                  */
2180                 logged_nextp = item->ri_buf[item_index].i_addr +
2181                                 next_unlinked_offset - reg_buf_offset;
2182                 if (unlikely(*logged_nextp == 0)) {
2183                         xfs_alert(mp,
2184                 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
2185                 "Trying to replay bad (0) inode di_next_unlinked field.",
2186                                 item, bp);
2187                         XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
2188                                          XFS_ERRLEVEL_LOW, mp);
2189                         return -EFSCORRUPTED;
2190                 }
2191
2192                 buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
2193                 *buffer_nextp = *logged_nextp;
2194
2195                 /*
2196                  * If necessary, recalculate the CRC in the on-disk inode. We
2197                  * have to leave the inode in a consistent state for whoever
2198                  * reads it next....
2199                  */
2200                 xfs_dinode_calc_crc(mp,
2201                                 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
2202
2203         }
2204
2205         return 0;
2206 }
2207
2208 /*
2209  * V5 filesystems know the age of the buffer on disk being recovered. We can
2210  * have newer objects on disk than we are replaying, and so for these cases we
2211  * don't want to replay the current change as that will make the buffer contents
2212  * temporarily invalid on disk.
2213  *
2214  * The magic number might not match the buffer type we are going to recover
2215  * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
2216  * extract the LSN of the existing object in the buffer based on it's current
2217  * magic number.  If we don't recognise the magic number in the buffer, then
2218  * return a LSN of -1 so that the caller knows it was an unrecognised block and
2219  * so can recover the buffer.
2220  *
2221  * Note: we cannot rely solely on magic number matches to determine that the
2222  * buffer has a valid LSN - we also need to verify that it belongs to this
2223  * filesystem, so we need to extract the object's LSN and compare it to that
2224  * which we read from the superblock. If the UUIDs don't match, then we've got a
2225  * stale metadata block from an old filesystem instance that we need to recover
2226  * over the top of.
2227  */
2228 static xfs_lsn_t
2229 xlog_recover_get_buf_lsn(
2230         struct xfs_mount        *mp,
2231         struct xfs_buf          *bp)
2232 {
2233         uint32_t                magic32;
2234         uint16_t                magic16;
2235         uint16_t                magicda;
2236         void                    *blk = bp->b_addr;
2237         uuid_t                  *uuid;
2238         xfs_lsn_t               lsn = -1;
2239
2240         /* v4 filesystems always recover immediately */
2241         if (!xfs_sb_version_hascrc(&mp->m_sb))
2242                 goto recover_immediately;
2243
2244         magic32 = be32_to_cpu(*(__be32 *)blk);
2245         switch (magic32) {
2246         case XFS_ABTB_CRC_MAGIC:
2247         case XFS_ABTC_CRC_MAGIC:
2248         case XFS_ABTB_MAGIC:
2249         case XFS_ABTC_MAGIC:
2250         case XFS_RMAP_CRC_MAGIC:
2251         case XFS_REFC_CRC_MAGIC:
2252         case XFS_IBT_CRC_MAGIC:
2253         case XFS_IBT_MAGIC: {
2254                 struct xfs_btree_block *btb = blk;
2255
2256                 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
2257                 uuid = &btb->bb_u.s.bb_uuid;
2258                 break;
2259         }
2260         case XFS_BMAP_CRC_MAGIC:
2261         case XFS_BMAP_MAGIC: {
2262                 struct xfs_btree_block *btb = blk;
2263
2264                 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
2265                 uuid = &btb->bb_u.l.bb_uuid;
2266                 break;
2267         }
2268         case XFS_AGF_MAGIC:
2269                 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2270                 uuid = &((struct xfs_agf *)blk)->agf_uuid;
2271                 break;
2272         case XFS_AGFL_MAGIC:
2273                 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2274                 uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
2275                 break;
2276         case XFS_AGI_MAGIC:
2277                 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2278                 uuid = &((struct xfs_agi *)blk)->agi_uuid;
2279                 break;
2280         case XFS_SYMLINK_MAGIC:
2281                 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2282                 uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
2283                 break;
2284         case XFS_DIR3_BLOCK_MAGIC:
2285         case XFS_DIR3_DATA_MAGIC:
2286         case XFS_DIR3_FREE_MAGIC:
2287                 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2288                 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
2289                 break;
2290         case XFS_ATTR3_RMT_MAGIC:
2291                 /*
2292                  * Remote attr blocks are written synchronously, rather than
2293                  * being logged. That means they do not contain a valid LSN
2294                  * (i.e. transactionally ordered) in them, and hence any time we
2295                  * see a buffer to replay over the top of a remote attribute
2296                  * block we should simply do so.
2297                  */
2298                 goto recover_immediately;
2299         case XFS_SB_MAGIC:
2300                 /*
2301                  * superblock uuids are magic. We may or may not have a
2302                  * sb_meta_uuid on disk, but it will be set in the in-core
2303                  * superblock. We set the uuid pointer for verification
2304                  * according to the superblock feature mask to ensure we check
2305                  * the relevant UUID in the superblock.
2306                  */
2307                 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
2308                 if (xfs_sb_version_hasmetauuid(&mp->m_sb))
2309                         uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
2310                 else
2311                         uuid = &((struct xfs_dsb *)blk)->sb_uuid;
2312                 break;
2313         default:
2314                 break;
2315         }
2316
2317         if (lsn != (xfs_lsn_t)-1) {
2318                 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
2319                         goto recover_immediately;
2320                 return lsn;
2321         }
2322
2323         magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2324         switch (magicda) {
2325         case XFS_DIR3_LEAF1_MAGIC:
2326         case XFS_DIR3_LEAFN_MAGIC:
2327         case XFS_DA3_NODE_MAGIC:
2328                 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2329                 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
2330                 break;
2331         default:
2332                 break;
2333         }
2334
2335         if (lsn != (xfs_lsn_t)-1) {
2336                 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2337                         goto recover_immediately;
2338                 return lsn;
2339         }
2340
2341         /*
2342          * We do individual object checks on dquot and inode buffers as they
2343          * have their own individual LSN records. Also, we could have a stale
2344          * buffer here, so we have to at least recognise these buffer types.
2345          *
2346          * A notd complexity here is inode unlinked list processing - it logs
2347          * the inode directly in the buffer, but we don't know which inodes have
2348          * been modified, and there is no global buffer LSN. Hence we need to
2349          * recover all inode buffer types immediately. This problem will be
2350          * fixed by logical logging of the unlinked list modifications.
2351          */
2352         magic16 = be16_to_cpu(*(__be16 *)blk);
2353         switch (magic16) {
2354         case XFS_DQUOT_MAGIC:
2355         case XFS_DINODE_MAGIC:
2356                 goto recover_immediately;
2357         default:
2358                 break;
2359         }
2360
2361         /* unknown buffer contents, recover immediately */
2362
2363 recover_immediately:
2364         return (xfs_lsn_t)-1;
2365
2366 }
2367
2368 /*
2369  * Validate the recovered buffer is of the correct type and attach the
2370  * appropriate buffer operations to them for writeback. Magic numbers are in a
2371  * few places:
2372  *      the first 16 bits of the buffer (inode buffer, dquot buffer),
2373  *      the first 32 bits of the buffer (most blocks),
2374  *      inside a struct xfs_da_blkinfo at the start of the buffer.
2375  */
2376 static void
2377 xlog_recover_validate_buf_type(
2378         struct xfs_mount        *mp,
2379         struct xfs_buf          *bp,
2380         xfs_buf_log_format_t    *buf_f,
2381         xfs_lsn_t               current_lsn)
2382 {
2383         struct xfs_da_blkinfo   *info = bp->b_addr;
2384         uint32_t                magic32;
2385         uint16_t                magic16;
2386         uint16_t                magicda;
2387         char                    *warnmsg = NULL;
2388
2389         /*
2390          * We can only do post recovery validation on items on CRC enabled
2391          * fielsystems as we need to know when the buffer was written to be able
2392          * to determine if we should have replayed the item. If we replay old
2393          * metadata over a newer buffer, then it will enter a temporarily
2394          * inconsistent state resulting in verification failures. Hence for now
2395          * just avoid the verification stage for non-crc filesystems
2396          */
2397         if (!xfs_sb_version_hascrc(&mp->m_sb))
2398                 return;
2399
2400         magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2401         magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2402         magicda = be16_to_cpu(info->magic);
2403         switch (xfs_blft_from_flags(buf_f)) {
2404         case XFS_BLFT_BTREE_BUF:
2405                 switch (magic32) {
2406                 case XFS_ABTB_CRC_MAGIC:
2407                 case XFS_ABTC_CRC_MAGIC:
2408                 case XFS_ABTB_MAGIC:
2409                 case XFS_ABTC_MAGIC:
2410                         bp->b_ops = &xfs_allocbt_buf_ops;
2411                         break;
2412                 case XFS_IBT_CRC_MAGIC:
2413                 case XFS_FIBT_CRC_MAGIC:
2414                 case XFS_IBT_MAGIC:
2415                 case XFS_FIBT_MAGIC:
2416                         bp->b_ops = &xfs_inobt_buf_ops;
2417                         break;
2418                 case XFS_BMAP_CRC_MAGIC:
2419                 case XFS_BMAP_MAGIC:
2420                         bp->b_ops = &xfs_bmbt_buf_ops;
2421                         break;
2422                 case XFS_RMAP_CRC_MAGIC:
2423                         bp->b_ops = &xfs_rmapbt_buf_ops;
2424                         break;
2425                 case XFS_REFC_CRC_MAGIC:
2426                         bp->b_ops = &xfs_refcountbt_buf_ops;
2427                         break;
2428                 default:
2429                         warnmsg = "Bad btree block magic!";
2430                         break;
2431                 }
2432                 break;
2433         case XFS_BLFT_AGF_BUF:
2434                 if (magic32 != XFS_AGF_MAGIC) {
2435                         warnmsg = "Bad AGF block magic!";
2436                         break;
2437                 }
2438                 bp->b_ops = &xfs_agf_buf_ops;
2439                 break;
2440         case XFS_BLFT_AGFL_BUF:
2441                 if (magic32 != XFS_AGFL_MAGIC) {
2442                         warnmsg = "Bad AGFL block magic!";
2443                         break;
2444                 }
2445                 bp->b_ops = &xfs_agfl_buf_ops;
2446                 break;
2447         case XFS_BLFT_AGI_BUF:
2448                 if (magic32 != XFS_AGI_MAGIC) {
2449                         warnmsg = "Bad AGI block magic!";
2450                         break;
2451                 }
2452                 bp->b_ops = &xfs_agi_buf_ops;
2453                 break;
2454         case XFS_BLFT_UDQUOT_BUF:
2455         case XFS_BLFT_PDQUOT_BUF:
2456         case XFS_BLFT_GDQUOT_BUF:
2457 #ifdef CONFIG_XFS_QUOTA
2458                 if (magic16 != XFS_DQUOT_MAGIC) {
2459                         warnmsg = "Bad DQUOT block magic!";
2460                         break;
2461                 }
2462                 bp->b_ops = &xfs_dquot_buf_ops;
2463 #else
2464                 xfs_alert(mp,
2465         "Trying to recover dquots without QUOTA support built in!");
2466                 ASSERT(0);
2467 #endif
2468                 break;
2469         case XFS_BLFT_DINO_BUF:
2470                 if (magic16 != XFS_DINODE_MAGIC) {
2471                         warnmsg = "Bad INODE block magic!";
2472                         break;
2473                 }
2474                 bp->b_ops = &xfs_inode_buf_ops;
2475                 break;
2476         case XFS_BLFT_SYMLINK_BUF:
2477                 if (magic32 != XFS_SYMLINK_MAGIC) {
2478                         warnmsg = "Bad symlink block magic!";
2479                         break;
2480                 }
2481                 bp->b_ops = &xfs_symlink_buf_ops;
2482                 break;
2483         case XFS_BLFT_DIR_BLOCK_BUF:
2484                 if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2485                     magic32 != XFS_DIR3_BLOCK_MAGIC) {
2486                         warnmsg = "Bad dir block magic!";
2487                         break;
2488                 }
2489                 bp->b_ops = &xfs_dir3_block_buf_ops;
2490                 break;
2491         case XFS_BLFT_DIR_DATA_BUF:
2492                 if (magic32 != XFS_DIR2_DATA_MAGIC &&
2493                     magic32 != XFS_DIR3_DATA_MAGIC) {
2494                         warnmsg = "Bad dir data magic!";
2495                         break;
2496                 }
2497                 bp->b_ops = &xfs_dir3_data_buf_ops;
2498                 break;
2499         case XFS_BLFT_DIR_FREE_BUF:
2500                 if (magic32 != XFS_DIR2_FREE_MAGIC &&
2501                     magic32 != XFS_DIR3_FREE_MAGIC) {
2502                         warnmsg = "Bad dir3 free magic!";
2503                         break;
2504                 }
2505                 bp->b_ops = &xfs_dir3_free_buf_ops;
2506                 break;
2507         case XFS_BLFT_DIR_LEAF1_BUF:
2508                 if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2509                     magicda != XFS_DIR3_LEAF1_MAGIC) {
2510                         warnmsg = "Bad dir leaf1 magic!";
2511                         break;
2512                 }
2513                 bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2514                 break;
2515         case XFS_BLFT_DIR_LEAFN_BUF:
2516                 if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2517                     magicda != XFS_DIR3_LEAFN_MAGIC) {
2518                         warnmsg = "Bad dir leafn magic!";
2519                         break;
2520                 }
2521                 bp->b_ops = &xfs_dir3_leafn_buf_ops;
2522                 break;
2523         case XFS_BLFT_DA_NODE_BUF:
2524                 if (magicda != XFS_DA_NODE_MAGIC &&
2525                     magicda != XFS_DA3_NODE_MAGIC) {
2526                         warnmsg = "Bad da node magic!";
2527                         break;
2528                 }
2529                 bp->b_ops = &xfs_da3_node_buf_ops;
2530                 break;
2531         case XFS_BLFT_ATTR_LEAF_BUF:
2532                 if (magicda != XFS_ATTR_LEAF_MAGIC &&
2533                     magicda != XFS_ATTR3_LEAF_MAGIC) {
2534                         warnmsg = "Bad attr leaf magic!";
2535                         break;
2536                 }
2537                 bp->b_ops = &xfs_attr3_leaf_buf_ops;
2538                 break;
2539         case XFS_BLFT_ATTR_RMT_BUF:
2540                 if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2541                         warnmsg = "Bad attr remote magic!";
2542                         break;
2543                 }
2544                 bp->b_ops = &xfs_attr3_rmt_buf_ops;
2545                 break;
2546         case XFS_BLFT_SB_BUF:
2547                 if (magic32 != XFS_SB_MAGIC) {
2548                         warnmsg = "Bad SB block magic!";
2549                         break;
2550                 }
2551                 bp->b_ops = &xfs_sb_buf_ops;
2552                 break;
2553 #ifdef CONFIG_XFS_RT
2554         case XFS_BLFT_RTBITMAP_BUF:
2555         case XFS_BLFT_RTSUMMARY_BUF:
2556                 /* no magic numbers for verification of RT buffers */
2557                 bp->b_ops = &xfs_rtbuf_ops;
2558                 break;
2559 #endif /* CONFIG_XFS_RT */
2560         default:
2561                 xfs_warn(mp, "Unknown buffer type %d!",
2562                          xfs_blft_from_flags(buf_f));
2563                 break;
2564         }
2565
2566         /*
2567          * Nothing else to do in the case of a NULL current LSN as this means
2568          * the buffer is more recent than the change in the log and will be
2569          * skipped.
2570          */
2571         if (current_lsn == NULLCOMMITLSN)
2572                 return;
2573
2574         if (warnmsg) {
2575                 xfs_warn(mp, warnmsg);
2576                 ASSERT(0);
2577         }
2578
2579         /*
2580          * We must update the metadata LSN of the buffer as it is written out to
2581          * ensure that older transactions never replay over this one and corrupt
2582          * the buffer. This can occur if log recovery is interrupted at some
2583          * point after the current transaction completes, at which point a
2584          * subsequent mount starts recovery from the beginning.
2585          *
2586          * Write verifiers update the metadata LSN from log items attached to
2587          * the buffer. Therefore, initialize a bli purely to carry the LSN to
2588          * the verifier. We'll clean it up in our ->iodone() callback.
2589          */
2590         if (bp->b_ops) {
2591                 struct xfs_buf_log_item *bip;
2592
2593                 ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
2594                 bp->b_iodone = xlog_recover_iodone;
2595                 xfs_buf_item_init(bp, mp);
2596                 bip = bp->b_fspriv;
2597                 bip->bli_item.li_lsn = current_lsn;
2598         }
2599 }
2600
2601 /*
2602  * Perform a 'normal' buffer recovery.  Each logged region of the
2603  * buffer should be copied over the corresponding region in the
2604  * given buffer.  The bitmap in the buf log format structure indicates
2605  * where to place the logged data.
2606  */
2607 STATIC void
2608 xlog_recover_do_reg_buffer(
2609         struct xfs_mount        *mp,
2610         xlog_recover_item_t     *item,
2611         struct xfs_buf          *bp,
2612         xfs_buf_log_format_t    *buf_f,
2613         xfs_lsn_t               current_lsn)
2614 {
2615         int                     i;
2616         int                     bit;
2617         int                     nbits;
2618         int                     error;
2619
2620         trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2621
2622         bit = 0;
2623         i = 1;  /* 0 is the buf format structure */
2624         while (1) {
2625                 bit = xfs_next_bit(buf_f->blf_data_map,
2626                                    buf_f->blf_map_size, bit);
2627                 if (bit == -1)
2628                         break;
2629                 nbits = xfs_contig_bits(buf_f->blf_data_map,
2630                                         buf_f->blf_map_size, bit);
2631                 ASSERT(nbits > 0);
2632                 ASSERT(item->ri_buf[i].i_addr != NULL);
2633                 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2634                 ASSERT(BBTOB(bp->b_io_length) >=
2635                        ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2636
2637                 /*
2638                  * The dirty regions logged in the buffer, even though
2639                  * contiguous, may span multiple chunks. This is because the
2640                  * dirty region may span a physical page boundary in a buffer
2641                  * and hence be split into two separate vectors for writing into
2642                  * the log. Hence we need to trim nbits back to the length of
2643                  * the current region being copied out of the log.
2644                  */
2645                 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2646                         nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2647
2648                 /*
2649                  * Do a sanity check if this is a dquot buffer. Just checking
2650                  * the first dquot in the buffer should do. XXXThis is
2651                  * probably a good thing to do for other buf types also.
2652                  */
2653                 error = 0;
2654                 if (buf_f->blf_flags &
2655                    (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2656                         if (item->ri_buf[i].i_addr == NULL) {
2657                                 xfs_alert(mp,
2658                                         "XFS: NULL dquot in %s.", __func__);
2659                                 goto next;
2660                         }
2661                         if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
2662                                 xfs_alert(mp,
2663                                         "XFS: dquot too small (%d) in %s.",
2664                                         item->ri_buf[i].i_len, __func__);
2665                                 goto next;
2666                         }
2667                         error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
2668                                                -1, 0, XFS_QMOPT_DOWARN,
2669                                                "dquot_buf_recover");
2670                         if (error)
2671                                 goto next;
2672                 }
2673
2674                 memcpy(xfs_buf_offset(bp,
2675                         (uint)bit << XFS_BLF_SHIFT),    /* dest */
2676                         item->ri_buf[i].i_addr,         /* source */
2677                         nbits<<XFS_BLF_SHIFT);          /* length */
2678  next:
2679                 i++;
2680                 bit += nbits;
2681         }
2682
2683         /* Shouldn't be any more regions */
2684         ASSERT(i == item->ri_total);
2685
2686         xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
2687 }
2688
2689 /*
2690  * Perform a dquot buffer recovery.
2691  * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2692  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2693  * Else, treat it as a regular buffer and do recovery.
2694  *
2695  * Return false if the buffer was tossed and true if we recovered the buffer to
2696  * indicate to the caller if the buffer needs writing.
2697  */
2698 STATIC bool
2699 xlog_recover_do_dquot_buffer(
2700         struct xfs_mount                *mp,
2701         struct xlog                     *log,
2702         struct xlog_recover_item        *item,
2703         struct xfs_buf                  *bp,
2704         struct xfs_buf_log_format       *buf_f)
2705 {
2706         uint                    type;
2707
2708         trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2709
2710         /*
2711          * Filesystems are required to send in quota flags at mount time.
2712          */
2713         if (!mp->m_qflags)
2714                 return false;
2715
2716         type = 0;
2717         if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2718                 type |= XFS_DQ_USER;
2719         if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2720                 type |= XFS_DQ_PROJ;
2721         if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2722                 type |= XFS_DQ_GROUP;
2723         /*
2724          * This type of quotas was turned off, so ignore this buffer
2725          */
2726         if (log->l_quotaoffs_flag & type)
2727                 return false;
2728
2729         xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
2730         return true;
2731 }
2732
2733 /*
2734  * This routine replays a modification made to a buffer at runtime.
2735  * There are actually two types of buffer, regular and inode, which
2736  * are handled differently.  Inode buffers are handled differently
2737  * in that we only recover a specific set of data from them, namely
2738  * the inode di_next_unlinked fields.  This is because all other inode
2739  * data is actually logged via inode records and any data we replay
2740  * here which overlaps that may be stale.
2741  *
2742  * When meta-data buffers are freed at run time we log a buffer item
2743  * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2744  * of the buffer in the log should not be replayed at recovery time.
2745  * This is so that if the blocks covered by the buffer are reused for
2746  * file data before we crash we don't end up replaying old, freed
2747  * meta-data into a user's file.
2748  *
2749  * To handle the cancellation of buffer log items, we make two passes
2750  * over the log during recovery.  During the first we build a table of
2751  * those buffers which have been cancelled, and during the second we
2752  * only replay those buffers which do not have corresponding cancel
2753  * records in the table.  See xlog_recover_buffer_pass[1,2] above
2754  * for more details on the implementation of the table of cancel records.
2755  */
2756 STATIC int
2757 xlog_recover_buffer_pass2(
2758         struct xlog                     *log,
2759         struct list_head                *buffer_list,
2760         struct xlog_recover_item        *item,
2761         xfs_lsn_t                       current_lsn)
2762 {
2763         xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
2764         xfs_mount_t             *mp = log->l_mp;
2765         xfs_buf_t               *bp;
2766         int                     error;
2767         uint                    buf_flags;
2768         xfs_lsn_t               lsn;
2769
2770         /*
2771          * In this pass we only want to recover all the buffers which have
2772          * not been cancelled and are not cancellation buffers themselves.
2773          */
2774         if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2775                         buf_f->blf_len, buf_f->blf_flags)) {
2776                 trace_xfs_log_recover_buf_cancel(log, buf_f);
2777                 return 0;
2778         }
2779
2780         trace_xfs_log_recover_buf_recover(log, buf_f);
2781
2782         buf_flags = 0;
2783         if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2784                 buf_flags |= XBF_UNMAPPED;
2785
2786         bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2787                           buf_flags, NULL);
2788         if (!bp)
2789                 return -ENOMEM;
2790         error = bp->b_error;
2791         if (error) {
2792                 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2793                 goto out_release;
2794         }
2795
2796         /*
2797          * Recover the buffer only if we get an LSN from it and it's less than
2798          * the lsn of the transaction we are replaying.
2799          *
2800          * Note that we have to be extremely careful of readahead here.
2801          * Readahead does not attach verfiers to the buffers so if we don't
2802          * actually do any replay after readahead because of the LSN we found
2803          * in the buffer if more recent than that current transaction then we
2804          * need to attach the verifier directly. Failure to do so can lead to
2805          * future recovery actions (e.g. EFI and unlinked list recovery) can
2806          * operate on the buffers and they won't get the verifier attached. This
2807          * can lead to blocks on disk having the correct content but a stale
2808          * CRC.
2809          *
2810          * It is safe to assume these clean buffers are currently up to date.
2811          * If the buffer is dirtied by a later transaction being replayed, then
2812          * the verifier will be reset to match whatever recover turns that
2813          * buffer into.
2814          */
2815         lsn = xlog_recover_get_buf_lsn(mp, bp);
2816         if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2817                 trace_xfs_log_recover_buf_skip(log, buf_f);
2818                 xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
2819                 goto out_release;
2820         }
2821
2822         if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2823                 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2824                 if (error)
2825                         goto out_release;
2826         } else if (buf_f->blf_flags &
2827                   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2828                 bool    dirty;
2829
2830                 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2831                 if (!dirty)
2832                         goto out_release;
2833         } else {
2834                 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
2835         }
2836
2837         /*
2838          * Perform delayed write on the buffer.  Asynchronous writes will be
2839          * slower when taking into account all the buffers to be flushed.
2840          *
2841          * Also make sure that only inode buffers with good sizes stay in
2842          * the buffer cache.  The kernel moves inodes in buffers of 1 block
2843          * or mp->m_inode_cluster_size bytes, whichever is bigger.  The inode
2844          * buffers in the log can be a different size if the log was generated
2845          * by an older kernel using unclustered inode buffers or a newer kernel
2846          * running with a different inode cluster size.  Regardless, if the
2847          * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
2848          * for *our* value of mp->m_inode_cluster_size, then we need to keep
2849          * the buffer out of the buffer cache so that the buffer won't
2850          * overlap with future reads of those inodes.
2851          */
2852         if (XFS_DINODE_MAGIC ==
2853             be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2854             (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2855                         (uint32_t)log->l_mp->m_inode_cluster_size))) {
2856                 xfs_buf_stale(bp);
2857                 error = xfs_bwrite(bp);
2858         } else {
2859                 ASSERT(bp->b_target->bt_mount == mp);
2860                 bp->b_iodone = xlog_recover_iodone;
2861                 xfs_buf_delwri_queue(bp, buffer_list);
2862         }
2863
2864 out_release:
2865         xfs_buf_relse(bp);
2866         return error;
2867 }
2868
2869 /*
2870  * Inode fork owner changes
2871  *
2872  * If we have been told that we have to reparent the inode fork, it's because an
2873  * extent swap operation on a CRC enabled filesystem has been done and we are
2874  * replaying it. We need to walk the BMBT of the appropriate fork and change the
2875  * owners of it.
2876  *
2877  * The complexity here is that we don't have an inode context to work with, so
2878  * after we've replayed the inode we need to instantiate one.  This is where the
2879  * fun begins.
2880  *
2881  * We are in the middle of log recovery, so we can't run transactions. That
2882  * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2883  * that will result in the corresponding iput() running the inode through
2884  * xfs_inactive(). If we've just replayed an inode core that changes the link
2885  * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2886  * transactions (bad!).
2887  *
2888  * So, to avoid this, we instantiate an inode directly from the inode core we've
2889  * just recovered. We have the buffer still locked, and all we really need to
2890  * instantiate is the inode core and the forks being modified. We can do this
2891  * manually, then run the inode btree owner change, and then tear down the
2892  * xfs_inode without having to run any transactions at all.
2893  *
2894  * Also, because we don't have a transaction context available here but need to
2895  * gather all the buffers we modify for writeback so we pass the buffer_list
2896  * instead for the operation to use.
2897  */
2898
2899 STATIC int
2900 xfs_recover_inode_owner_change(
2901         struct xfs_mount        *mp,
2902         struct xfs_dinode       *dip,
2903         struct xfs_inode_log_format *in_f,
2904         struct list_head        *buffer_list)
2905 {
2906         struct xfs_inode        *ip;
2907         int                     error;
2908
2909         ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2910
2911         ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2912         if (!ip)
2913                 return -ENOMEM;
2914
2915         /* instantiate the inode */
2916         xfs_inode_from_disk(ip, dip);
2917         ASSERT(ip->i_d.di_version >= 3);
2918
2919         error = xfs_iformat_fork(ip, dip);
2920         if (error)
2921                 goto out_free_ip;
2922
2923
2924         if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2925                 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2926                 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2927                                               ip->i_ino, buffer_list);
2928                 if (error)
2929                         goto out_free_ip;
2930         }
2931
2932         if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2933                 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2934                 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2935                                               ip->i_ino, buffer_list);
2936                 if (error)
2937                         goto out_free_ip;
2938         }
2939
2940 out_free_ip:
2941         xfs_inode_free(ip);
2942         return error;
2943 }
2944
2945 STATIC int
2946 xlog_recover_inode_pass2(
2947         struct xlog                     *log,
2948         struct list_head                *buffer_list,
2949         struct xlog_recover_item        *item,
2950         xfs_lsn_t                       current_lsn)
2951 {
2952         xfs_inode_log_format_t  *in_f;
2953         xfs_mount_t             *mp = log->l_mp;
2954         xfs_buf_t               *bp;
2955         xfs_dinode_t            *dip;
2956         int                     len;
2957         char                    *src;
2958         char                    *dest;
2959         int                     error;
2960         int                     attr_index;
2961         uint                    fields;
2962         struct xfs_log_dinode   *ldip;
2963         uint                    isize;
2964         int                     need_free = 0;
2965
2966         if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2967                 in_f = item->ri_buf[0].i_addr;
2968         } else {
2969                 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2970                 need_free = 1;
2971                 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2972                 if (error)
2973                         goto error;
2974         }
2975
2976         /*
2977          * Inode buffers can be freed, look out for it,
2978          * and do not replay the inode.
2979          */
2980         if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2981                                         in_f->ilf_len, 0)) {
2982                 error = 0;
2983                 trace_xfs_log_recover_inode_cancel(log, in_f);
2984                 goto error;
2985         }
2986         trace_xfs_log_recover_inode_recover(log, in_f);
2987
2988         bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2989                           &xfs_inode_buf_ops);
2990         if (!bp) {
2991                 error = -ENOMEM;
2992                 goto error;
2993         }
2994         error = bp->b_error;
2995         if (error) {
2996                 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2997                 goto out_release;
2998         }
2999         ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
3000         dip = xfs_buf_offset(bp, in_f->ilf_boffset);
3001
3002         /*
3003          * Make sure the place we're flushing out to really looks
3004          * like an inode!
3005          */
3006         if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
3007                 xfs_alert(mp,
3008         "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
3009                         __func__, dip, bp, in_f->ilf_ino);
3010                 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
3011                                  XFS_ERRLEVEL_LOW, mp);
3012                 error = -EFSCORRUPTED;
3013                 goto out_release;
3014         }
3015         ldip = item->ri_buf[1].i_addr;
3016         if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
3017                 xfs_alert(mp,
3018                         "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
3019                         __func__, item, in_f->ilf_ino);
3020                 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
3021                                  XFS_ERRLEVEL_LOW, mp);
3022                 error = -EFSCORRUPTED;
3023                 goto out_release;
3024         }
3025
3026         /*
3027          * If the inode has an LSN in it, recover the inode only if it's less
3028          * than the lsn of the transaction we are replaying. Note: we still
3029          * need to replay an owner change even though the inode is more recent
3030          * than the transaction as there is no guarantee that all the btree
3031          * blocks are more recent than this transaction, too.
3032          */
3033         if (dip->di_version >= 3) {
3034                 xfs_lsn_t       lsn = be64_to_cpu(dip->di_lsn);
3035
3036                 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3037                         trace_xfs_log_recover_inode_skip(log, in_f);
3038                         error = 0;
3039                         goto out_owner_change;
3040                 }
3041         }
3042
3043         /*
3044          * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
3045          * are transactional and if ordering is necessary we can determine that
3046          * more accurately by the LSN field in the V3 inode core. Don't trust
3047          * the inode versions we might be changing them here - use the
3048          * superblock flag to determine whether we need to look at di_flushiter
3049          * to skip replay when the on disk inode is newer than the log one
3050          */
3051         if (!xfs_sb_version_hascrc(&mp->m_sb) &&
3052             ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
3053                 /*
3054                  * Deal with the wrap case, DI_MAX_FLUSH is less
3055                  * than smaller numbers
3056                  */
3057                 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
3058                     ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
3059                         /* do nothing */
3060                 } else {
3061                         trace_xfs_log_recover_inode_skip(log, in_f);
3062                         error = 0;
3063                         goto out_release;
3064                 }
3065         }
3066
3067         /* Take the opportunity to reset the flush iteration count */
3068         ldip->di_flushiter = 0;
3069
3070         if (unlikely(S_ISREG(ldip->di_mode))) {
3071                 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3072                     (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
3073                         XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
3074                                          XFS_ERRLEVEL_LOW, mp, ldip);
3075                         xfs_alert(mp,
3076                 "%s: Bad regular inode log record, rec ptr 0x%p, "
3077                 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
3078                                 __func__, item, dip, bp, in_f->ilf_ino);
3079                         error = -EFSCORRUPTED;
3080                         goto out_release;
3081                 }
3082         } else if (unlikely(S_ISDIR(ldip->di_mode))) {
3083                 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3084                     (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
3085                     (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
3086                         XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
3087                                              XFS_ERRLEVEL_LOW, mp, ldip);
3088                         xfs_alert(mp,
3089                 "%s: Bad dir inode log record, rec ptr 0x%p, "
3090                 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
3091                                 __func__, item, dip, bp, in_f->ilf_ino);
3092                         error = -EFSCORRUPTED;
3093                         goto out_release;
3094                 }
3095         }
3096         if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
3097                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
3098                                      XFS_ERRLEVEL_LOW, mp, ldip);
3099                 xfs_alert(mp,
3100         "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
3101         "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
3102                         __func__, item, dip, bp, in_f->ilf_ino,
3103                         ldip->di_nextents + ldip->di_anextents,
3104                         ldip->di_nblocks);
3105                 error = -EFSCORRUPTED;
3106                 goto out_release;
3107         }
3108         if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
3109                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
3110                                      XFS_ERRLEVEL_LOW, mp, ldip);
3111                 xfs_alert(mp,
3112         "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
3113         "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
3114                         item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
3115                 error = -EFSCORRUPTED;
3116                 goto out_release;
3117         }
3118         isize = xfs_log_dinode_size(ldip->di_version);
3119         if (unlikely(item->ri_buf[1].i_len > isize)) {
3120                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
3121                                      XFS_ERRLEVEL_LOW, mp, ldip);
3122                 xfs_alert(mp,
3123                         "%s: Bad inode log record length %d, rec ptr 0x%p",
3124                         __func__, item->ri_buf[1].i_len, item);
3125                 error = -EFSCORRUPTED;
3126                 goto out_release;
3127         }
3128
3129         /* recover the log dinode inode into the on disk inode */
3130         xfs_log_dinode_to_disk(ldip, dip);
3131
3132         /* the rest is in on-disk format */
3133         if (item->ri_buf[1].i_len > isize) {
3134                 memcpy((char *)dip + isize,
3135                         item->ri_buf[1].i_addr + isize,
3136                         item->ri_buf[1].i_len - isize);
3137         }
3138
3139         fields = in_f->ilf_fields;
3140         switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
3141         case XFS_ILOG_DEV:
3142                 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
3143                 break;
3144         case XFS_ILOG_UUID:
3145                 memcpy(XFS_DFORK_DPTR(dip),
3146                        &in_f->ilf_u.ilfu_uuid,
3147                        sizeof(uuid_t));
3148                 break;
3149         }
3150
3151         if (in_f->ilf_size == 2)
3152                 goto out_owner_change;
3153         len = item->ri_buf[2].i_len;
3154         src = item->ri_buf[2].i_addr;
3155         ASSERT(in_f->ilf_size <= 4);
3156         ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
3157         ASSERT(!(fields & XFS_ILOG_DFORK) ||
3158                (len == in_f->ilf_dsize));
3159
3160         switch (fields & XFS_ILOG_DFORK) {
3161         case XFS_ILOG_DDATA:
3162         case XFS_ILOG_DEXT:
3163                 memcpy(XFS_DFORK_DPTR(dip), src, len);
3164                 break;
3165
3166         case XFS_ILOG_DBROOT:
3167                 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
3168                                  (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
3169                                  XFS_DFORK_DSIZE(dip, mp));
3170                 break;
3171
3172         default:
3173                 /*
3174                  * There are no data fork flags set.
3175                  */
3176                 ASSERT((fields & XFS_ILOG_DFORK) == 0);
3177                 break;
3178         }
3179
3180         /*
3181          * If we logged any attribute data, recover it.  There may or
3182          * may not have been any other non-core data logged in this
3183          * transaction.
3184          */
3185         if (in_f->ilf_fields & XFS_ILOG_AFORK) {
3186                 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
3187                         attr_index = 3;
3188                 } else {
3189                         attr_index = 2;
3190                 }
3191                 len = item->ri_buf[attr_index].i_len;
3192                 src = item->ri_buf[attr_index].i_addr;
3193                 ASSERT(len == in_f->ilf_asize);
3194
3195                 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
3196                 case XFS_ILOG_ADATA:
3197                 case XFS_ILOG_AEXT:
3198                         dest = XFS_DFORK_APTR(dip);
3199                         ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
3200                         memcpy(dest, src, len);
3201                         break;
3202
3203                 case XFS_ILOG_ABROOT:
3204                         dest = XFS_DFORK_APTR(dip);
3205                         xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
3206                                          len, (xfs_bmdr_block_t*)dest,
3207                                          XFS_DFORK_ASIZE(dip, mp));
3208                         break;
3209
3210                 default:
3211                         xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
3212                         ASSERT(0);
3213                         error = -EIO;
3214                         goto out_release;
3215                 }
3216         }
3217
3218 out_owner_change:
3219         if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
3220                 error = xfs_recover_inode_owner_change(mp, dip, in_f,
3221                                                        buffer_list);
3222         /* re-generate the checksum. */
3223         xfs_dinode_calc_crc(log->l_mp, dip);
3224
3225         ASSERT(bp->b_target->bt_mount == mp);
3226         bp->b_iodone = xlog_recover_iodone;
3227         xfs_buf_delwri_queue(bp, buffer_list);
3228
3229 out_release:
3230         xfs_buf_relse(bp);
3231 error:
3232         if (need_free)
3233                 kmem_free(in_f);
3234         return error;
3235 }
3236
3237 /*
3238  * Recover QUOTAOFF records. We simply make a note of it in the xlog
3239  * structure, so that we know not to do any dquot item or dquot buffer recovery,
3240  * of that type.
3241  */
3242 STATIC int
3243 xlog_recover_quotaoff_pass1(
3244         struct xlog                     *log,
3245         struct xlog_recover_item        *item)
3246 {
3247         xfs_qoff_logformat_t    *qoff_f = item->ri_buf[0].i_addr;
3248         ASSERT(qoff_f);
3249
3250         /*
3251          * The logitem format's flag tells us if this was user quotaoff,
3252          * group/project quotaoff or both.
3253          */
3254         if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
3255                 log->l_quotaoffs_flag |= XFS_DQ_USER;
3256         if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
3257                 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
3258         if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
3259                 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
3260
3261         return 0;
3262 }
3263
3264 /*
3265  * Recover a dquot record
3266  */
3267 STATIC int
3268 xlog_recover_dquot_pass2(
3269         struct xlog                     *log,
3270         struct list_head                *buffer_list,
3271         struct xlog_recover_item        *item,
3272         xfs_lsn_t                       current_lsn)
3273 {
3274         xfs_mount_t             *mp = log->l_mp;
3275         xfs_buf_t               *bp;
3276         struct xfs_disk_dquot   *ddq, *recddq;
3277         int                     error;
3278         xfs_dq_logformat_t      *dq_f;
3279         uint                    type;
3280
3281
3282         /*
3283          * Filesystems are required to send in quota flags at mount time.
3284          */
3285         if (mp->m_qflags == 0)
3286                 return 0;
3287
3288         recddq = item->ri_buf[1].i_addr;
3289         if (recddq == NULL) {
3290                 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
3291                 return -EIO;
3292         }
3293         if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
3294                 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
3295                         item->ri_buf[1].i_len, __func__);
3296                 return -EIO;
3297         }
3298
3299         /*
3300          * This type of quotas was turned off, so ignore this record.
3301          */
3302         type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3303         ASSERT(type);
3304         if (log->l_quotaoffs_flag & type)
3305                 return 0;
3306
3307         /*
3308          * At this point we know that quota was _not_ turned off.
3309          * Since the mount flags are not indicating to us otherwise, this
3310          * must mean that quota is on, and the dquot needs to be replayed.
3311          * Remember that we may not have fully recovered the superblock yet,
3312          * so we can't do the usual trick of looking at the SB quota bits.
3313          *
3314          * The other possibility, of course, is that the quota subsystem was
3315          * removed since the last mount - ENOSYS.
3316          */
3317         dq_f = item->ri_buf[0].i_addr;
3318         ASSERT(dq_f);
3319         error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
3320                            "xlog_recover_dquot_pass2 (log copy)");
3321         if (error)
3322                 return -EIO;
3323         ASSERT(dq_f->qlf_len == 1);
3324
3325         /*
3326          * At this point we are assuming that the dquots have been allocated
3327          * and hence the buffer has valid dquots stamped in it. It should,
3328          * therefore, pass verifier validation. If the dquot is bad, then the
3329          * we'll return an error here, so we don't need to specifically check
3330          * the dquot in the buffer after the verifier has run.
3331          */
3332         error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3333                                    XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3334                                    &xfs_dquot_buf_ops);
3335         if (error)
3336                 return error;
3337
3338         ASSERT(bp);
3339         ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
3340
3341         /*
3342          * If the dquot has an LSN in it, recover the dquot only if it's less
3343          * than the lsn of the transaction we are replaying.
3344          */
3345         if (xfs_sb_version_hascrc(&mp->m_sb)) {
3346                 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3347                 xfs_lsn_t       lsn = be64_to_cpu(dqb->dd_lsn);
3348
3349                 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3350                         goto out_release;
3351                 }
3352         }
3353
3354         memcpy(ddq, recddq, item->ri_buf[1].i_len);
3355         if (xfs_sb_version_hascrc(&mp->m_sb)) {
3356                 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
3357                                  XFS_DQUOT_CRC_OFF);
3358         }
3359
3360         ASSERT(dq_f->qlf_size == 2);
3361         ASSERT(bp->b_target->bt_mount == mp);
3362         bp->b_iodone = xlog_recover_iodone;
3363         xfs_buf_delwri_queue(bp, buffer_list);
3364
3365 out_release:
3366         xfs_buf_relse(bp);
3367         return 0;
3368 }
3369
3370 /*
3371  * This routine is called to create an in-core extent free intent
3372  * item from the efi format structure which was logged on disk.
3373  * It allocates an in-core efi, copies the extents from the format
3374  * structure into it, and adds the efi to the AIL with the given
3375  * LSN.
3376  */
3377 STATIC int
3378 xlog_recover_efi_pass2(
3379         struct xlog                     *log,
3380         struct xlog_recover_item        *item,
3381         xfs_lsn_t                       lsn)
3382 {
3383         int                             error;
3384         struct xfs_mount                *mp = log->l_mp;
3385         struct xfs_efi_log_item         *efip;
3386         struct xfs_efi_log_format       *efi_formatp;
3387
3388         efi_formatp = item->ri_buf[0].i_addr;
3389
3390         efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
3391         error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
3392         if (error) {
3393                 xfs_efi_item_free(efip);
3394                 return error;
3395         }
3396         atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
3397
3398         spin_lock(&log->l_ailp->xa_lock);
3399         /*
3400          * The EFI has two references. One for the EFD and one for EFI to ensure
3401          * it makes it into the AIL. Insert the EFI into the AIL directly and
3402          * drop the EFI reference. Note that xfs_trans_ail_update() drops the
3403          * AIL lock.
3404          */
3405         xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
3406         xfs_efi_release(efip);
3407         return 0;
3408 }
3409
3410
3411 /*
3412  * This routine is called when an EFD format structure is found in a committed
3413  * transaction in the log. Its purpose is to cancel the corresponding EFI if it
3414  * was still in the log. To do this it searches the AIL for the EFI with an id
3415  * equal to that in the EFD format structure. If we find it we drop the EFD
3416  * reference, which removes the EFI from the AIL and frees it.
3417  */
3418 STATIC int
3419 xlog_recover_efd_pass2(
3420         struct xlog                     *log,
3421         struct xlog_recover_item        *item)
3422 {
3423         xfs_efd_log_format_t    *efd_formatp;
3424         xfs_efi_log_item_t      *efip = NULL;
3425         xfs_log_item_t          *lip;
3426         uint64_t                efi_id;
3427         struct xfs_ail_cursor   cur;
3428         struct xfs_ail          *ailp = log->l_ailp;
3429
3430         efd_formatp = item->ri_buf[0].i_addr;
3431         ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
3432                 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
3433                (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
3434                 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
3435         efi_id = efd_formatp->efd_efi_id;
3436
3437         /*
3438          * Search for the EFI with the id in the EFD format structure in the
3439          * AIL.
3440          */
3441         spin_lock(&ailp->xa_lock);
3442         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3443         while (lip != NULL) {
3444                 if (lip->li_type == XFS_LI_EFI) {
3445                         efip = (xfs_efi_log_item_t *)lip;
3446                         if (efip->efi_format.efi_id == efi_id) {
3447                                 /*
3448                                  * Drop the EFD reference to the EFI. This
3449                                  * removes the EFI from the AIL and frees it.
3450                                  */
3451                                 spin_unlock(&ailp->xa_lock);
3452                                 xfs_efi_release(efip);
3453                                 spin_lock(&ailp->xa_lock);
3454                                 break;
3455                         }
3456                 }
3457                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3458         }
3459
3460         xfs_trans_ail_cursor_done(&cur);
3461         spin_unlock(&ailp->xa_lock);
3462
3463         return 0;
3464 }
3465
3466 /*
3467  * This routine is called to create an in-core extent rmap update
3468  * item from the rui format structure which was logged on disk.
3469  * It allocates an in-core rui, copies the extents from the format
3470  * structure into it, and adds the rui to the AIL with the given
3471  * LSN.
3472  */
3473 STATIC int
3474 xlog_recover_rui_pass2(
3475         struct xlog                     *log,
3476         struct xlog_recover_item        *item,
3477         xfs_lsn_t                       lsn)
3478 {
3479         int                             error;
3480         struct xfs_mount                *mp = log->l_mp;
3481         struct xfs_rui_log_item         *ruip;
3482         struct xfs_rui_log_format       *rui_formatp;
3483
3484         rui_formatp = item->ri_buf[0].i_addr;
3485
3486         ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
3487         error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
3488         if (error) {
3489                 xfs_rui_item_free(ruip);
3490                 return error;
3491         }
3492         atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
3493
3494         spin_lock(&log->l_ailp->xa_lock);
3495         /*
3496          * The RUI has two references. One for the RUD and one for RUI to ensure
3497          * it makes it into the AIL. Insert the RUI into the AIL directly and
3498          * drop the RUI reference. Note that xfs_trans_ail_update() drops the
3499          * AIL lock.
3500          */
3501         xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
3502         xfs_rui_release(ruip);
3503         return 0;
3504 }
3505
3506
3507 /*
3508  * This routine is called when an RUD format structure is found in a committed
3509  * transaction in the log. Its purpose is to cancel the corresponding RUI if it
3510  * was still in the log. To do this it searches the AIL for the RUI with an id
3511  * equal to that in the RUD format structure. If we find it we drop the RUD
3512  * reference, which removes the RUI from the AIL and frees it.
3513  */
3514 STATIC int
3515 xlog_recover_rud_pass2(
3516         struct xlog                     *log,
3517         struct xlog_recover_item        *item)
3518 {
3519         struct xfs_rud_log_format       *rud_formatp;
3520         struct xfs_rui_log_item         *ruip = NULL;
3521         struct xfs_log_item             *lip;
3522         uint64_t                        rui_id;
3523         struct xfs_ail_cursor           cur;
3524         struct xfs_ail                  *ailp = log->l_ailp;
3525
3526         rud_formatp = item->ri_buf[0].i_addr;
3527         ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
3528         rui_id = rud_formatp->rud_rui_id;
3529
3530         /*
3531          * Search for the RUI with the id in the RUD format structure in the
3532          * AIL.
3533          */
3534         spin_lock(&ailp->xa_lock);
3535         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3536         while (lip != NULL) {
3537                 if (lip->li_type == XFS_LI_RUI) {
3538                         ruip = (struct xfs_rui_log_item *)lip;
3539                         if (ruip->rui_format.rui_id == rui_id) {
3540                                 /*
3541                                  * Drop the RUD reference to the RUI. This
3542                                  * removes the RUI from the AIL and frees it.
3543                                  */
3544                                 spin_unlock(&ailp->xa_lock);
3545                                 xfs_rui_release(ruip);
3546                                 spin_lock(&ailp->xa_lock);
3547                                 break;
3548                         }
3549                 }
3550                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3551         }
3552
3553         xfs_trans_ail_cursor_done(&cur);
3554         spin_unlock(&ailp->xa_lock);
3555
3556         return 0;
3557 }
3558
3559 /*
3560  * Copy an CUI format buffer from the given buf, and into the destination
3561  * CUI format structure.  The CUI/CUD items were designed not to need any
3562  * special alignment handling.
3563  */
3564 static int
3565 xfs_cui_copy_format(
3566         struct xfs_log_iovec            *buf,
3567         struct xfs_cui_log_format       *dst_cui_fmt)
3568 {
3569         struct xfs_cui_log_format       *src_cui_fmt;
3570         uint                            len;
3571
3572         src_cui_fmt = buf->i_addr;
3573         len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
3574
3575         if (buf->i_len == len) {
3576                 memcpy(dst_cui_fmt, src_cui_fmt, len);
3577                 return 0;
3578         }
3579         return -EFSCORRUPTED;
3580 }
3581
3582 /*
3583  * This routine is called to create an in-core extent refcount update
3584  * item from the cui format structure which was logged on disk.
3585  * It allocates an in-core cui, copies the extents from the format
3586  * structure into it, and adds the cui to the AIL with the given
3587  * LSN.
3588  */
3589 STATIC int
3590 xlog_recover_cui_pass2(
3591         struct xlog                     *log,
3592         struct xlog_recover_item        *item,
3593         xfs_lsn_t                       lsn)
3594 {
3595         int                             error;
3596         struct xfs_mount                *mp = log->l_mp;
3597         struct xfs_cui_log_item         *cuip;
3598         struct xfs_cui_log_format       *cui_formatp;
3599
3600         cui_formatp = item->ri_buf[0].i_addr;
3601
3602         cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
3603         error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
3604         if (error) {
3605                 xfs_cui_item_free(cuip);
3606                 return error;
3607         }
3608         atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
3609
3610         spin_lock(&log->l_ailp->xa_lock);
3611         /*
3612          * The CUI has two references. One for the CUD and one for CUI to ensure
3613          * it makes it into the AIL. Insert the CUI into the AIL directly and
3614          * drop the CUI reference. Note that xfs_trans_ail_update() drops the
3615          * AIL lock.
3616          */
3617         xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
3618         xfs_cui_release(cuip);
3619         return 0;
3620 }
3621
3622
3623 /*
3624  * This routine is called when an CUD format structure is found in a committed
3625  * transaction in the log. Its purpose is to cancel the corresponding CUI if it
3626  * was still in the log. To do this it searches the AIL for the CUI with an id
3627  * equal to that in the CUD format structure. If we find it we drop the CUD
3628  * reference, which removes the CUI from the AIL and frees it.
3629  */
3630 STATIC int
3631 xlog_recover_cud_pass2(
3632         struct xlog                     *log,
3633         struct xlog_recover_item        *item)
3634 {
3635         struct xfs_cud_log_format       *cud_formatp;
3636         struct xfs_cui_log_item         *cuip = NULL;
3637         struct xfs_log_item             *lip;
3638         uint64_t                        cui_id;
3639         struct xfs_ail_cursor           cur;
3640         struct xfs_ail                  *ailp = log->l_ailp;
3641
3642         cud_formatp = item->ri_buf[0].i_addr;
3643         if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
3644                 return -EFSCORRUPTED;
3645         cui_id = cud_formatp->cud_cui_id;
3646
3647         /*
3648          * Search for the CUI with the id in the CUD format structure in the
3649          * AIL.
3650          */
3651         spin_lock(&ailp->xa_lock);
3652         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3653         while (lip != NULL) {
3654                 if (lip->li_type == XFS_LI_CUI) {
3655                         cuip = (struct xfs_cui_log_item *)lip;
3656                         if (cuip->cui_format.cui_id == cui_id) {
3657                                 /*
3658                                  * Drop the CUD reference to the CUI. This
3659                                  * removes the CUI from the AIL and frees it.
3660                                  */
3661                                 spin_unlock(&ailp->xa_lock);
3662                                 xfs_cui_release(cuip);
3663                                 spin_lock(&ailp->xa_lock);
3664                                 break;
3665                         }
3666                 }
3667                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3668         }
3669
3670         xfs_trans_ail_cursor_done(&cur);
3671         spin_unlock(&ailp->xa_lock);
3672
3673         return 0;
3674 }
3675
3676 /*
3677  * Copy an BUI format buffer from the given buf, and into the destination
3678  * BUI format structure.  The BUI/BUD items were designed not to need any
3679  * special alignment handling.
3680  */
3681 static int
3682 xfs_bui_copy_format(
3683         struct xfs_log_iovec            *buf,
3684         struct xfs_bui_log_format       *dst_bui_fmt)
3685 {
3686         struct xfs_bui_log_format       *src_bui_fmt;
3687         uint                            len;
3688
3689         src_bui_fmt = buf->i_addr;
3690         len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
3691
3692         if (buf->i_len == len) {
3693                 memcpy(dst_bui_fmt, src_bui_fmt, len);
3694                 return 0;
3695         }
3696         return -EFSCORRUPTED;
3697 }
3698
3699 /*
3700  * This routine is called to create an in-core extent bmap update
3701  * item from the bui format structure which was logged on disk.
3702  * It allocates an in-core bui, copies the extents from the format
3703  * structure into it, and adds the bui to the AIL with the given
3704  * LSN.
3705  */
3706 STATIC int
3707 xlog_recover_bui_pass2(
3708         struct xlog                     *log,
3709         struct xlog_recover_item        *item,
3710         xfs_lsn_t                       lsn)
3711 {
3712         int                             error;
3713         struct xfs_mount                *mp = log->l_mp;
3714         struct xfs_bui_log_item         *buip;
3715         struct xfs_bui_log_format       *bui_formatp;
3716
3717         bui_formatp = item->ri_buf[0].i_addr;
3718
3719         if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
3720                 return -EFSCORRUPTED;
3721         buip = xfs_bui_init(mp);
3722         error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
3723         if (error) {
3724                 xfs_bui_item_free(buip);
3725                 return error;
3726         }
3727         atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
3728
3729         spin_lock(&log->l_ailp->xa_lock);
3730         /*
3731          * The RUI has two references. One for the RUD and one for RUI to ensure
3732          * it makes it into the AIL. Insert the RUI into the AIL directly and
3733          * drop the RUI reference. Note that xfs_trans_ail_update() drops the
3734          * AIL lock.
3735          */
3736         xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
3737         xfs_bui_release(buip);
3738         return 0;
3739 }
3740
3741
3742 /*
3743  * This routine is called when an BUD format structure is found in a committed
3744  * transaction in the log. Its purpose is to cancel the corresponding BUI if it
3745  * was still in the log. To do this it searches the AIL for the BUI with an id
3746  * equal to that in the BUD format structure. If we find it we drop the BUD
3747  * reference, which removes the BUI from the AIL and frees it.
3748  */
3749 STATIC int
3750 xlog_recover_bud_pass2(
3751         struct xlog                     *log,
3752         struct xlog_recover_item        *item)
3753 {
3754         struct xfs_bud_log_format       *bud_formatp;
3755         struct xfs_bui_log_item         *buip = NULL;
3756         struct xfs_log_item             *lip;
3757         uint64_t                        bui_id;
3758         struct xfs_ail_cursor           cur;
3759         struct xfs_ail                  *ailp = log->l_ailp;
3760
3761         bud_formatp = item->ri_buf[0].i_addr;
3762         if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
3763                 return -EFSCORRUPTED;
3764         bui_id = bud_formatp->bud_bui_id;
3765
3766         /*
3767          * Search for the BUI with the id in the BUD format structure in the
3768          * AIL.
3769          */
3770         spin_lock(&ailp->xa_lock);
3771         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3772         while (lip != NULL) {
3773                 if (lip->li_type == XFS_LI_BUI) {
3774                         buip = (struct xfs_bui_log_item *)lip;
3775                         if (buip->bui_format.bui_id == bui_id) {
3776                                 /*
3777                                  * Drop the BUD reference to the BUI. This
3778                                  * removes the BUI from the AIL and frees it.
3779                                  */
3780                                 spin_unlock(&ailp->xa_lock);
3781                                 xfs_bui_release(buip);
3782                                 spin_lock(&ailp->xa_lock);
3783                                 break;
3784                         }
3785                 }
3786                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3787         }
3788
3789         xfs_trans_ail_cursor_done(&cur);
3790         spin_unlock(&ailp->xa_lock);
3791
3792         return 0;
3793 }
3794
3795 /*
3796  * This routine is called when an inode create format structure is found in a
3797  * committed transaction in the log.  It's purpose is to initialise the inodes
3798  * being allocated on disk. This requires us to get inode cluster buffers that
3799  * match the range to be initialised, stamped with inode templates and written
3800  * by delayed write so that subsequent modifications will hit the cached buffer
3801  * and only need writing out at the end of recovery.
3802  */
3803 STATIC int
3804 xlog_recover_do_icreate_pass2(
3805         struct xlog             *log,
3806         struct list_head        *buffer_list,
3807         xlog_recover_item_t     *item)
3808 {
3809         struct xfs_mount        *mp = log->l_mp;
3810         struct xfs_icreate_log  *icl;
3811         xfs_agnumber_t          agno;
3812         xfs_agblock_t           agbno;
3813         unsigned int            count;
3814         unsigned int            isize;
3815         xfs_agblock_t           length;
3816         int                     blks_per_cluster;
3817         int                     bb_per_cluster;
3818         int                     cancel_count;
3819         int                     nbufs;
3820         int                     i;
3821
3822         icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3823         if (icl->icl_type != XFS_LI_ICREATE) {
3824                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3825                 return -EINVAL;
3826         }
3827
3828         if (icl->icl_size != 1) {
3829                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3830                 return -EINVAL;
3831         }
3832
3833         agno = be32_to_cpu(icl->icl_ag);
3834         if (agno >= mp->m_sb.sb_agcount) {
3835                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3836                 return -EINVAL;
3837         }
3838         agbno = be32_to_cpu(icl->icl_agbno);
3839         if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3840                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3841                 return -EINVAL;
3842         }
3843         isize = be32_to_cpu(icl->icl_isize);
3844         if (isize != mp->m_sb.sb_inodesize) {
3845                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3846                 return -EINVAL;
3847         }
3848         count = be32_to_cpu(icl->icl_count);
3849         if (!count) {
3850                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3851                 return -EINVAL;
3852         }
3853         length = be32_to_cpu(icl->icl_length);
3854         if (!length || length >= mp->m_sb.sb_agblocks) {
3855                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3856                 return -EINVAL;
3857         }
3858
3859         /*
3860          * The inode chunk is either full or sparse and we only support
3861          * m_ialloc_min_blks sized sparse allocations at this time.
3862          */
3863         if (length != mp->m_ialloc_blks &&
3864             length != mp->m_ialloc_min_blks) {
3865                 xfs_warn(log->l_mp,
3866                          "%s: unsupported chunk length", __FUNCTION__);
3867                 return -EINVAL;
3868         }
3869
3870         /* verify inode count is consistent with extent length */
3871         if ((count >> mp->m_sb.sb_inopblog) != length) {
3872                 xfs_warn(log->l_mp,
3873                          "%s: inconsistent inode count and chunk length",
3874                          __FUNCTION__);
3875                 return -EINVAL;
3876         }
3877
3878         /*
3879          * The icreate transaction can cover multiple cluster buffers and these
3880          * buffers could have been freed and reused. Check the individual
3881          * buffers for cancellation so we don't overwrite anything written after
3882          * a cancellation.
3883          */
3884         blks_per_cluster = xfs_icluster_size_fsb(mp);
3885         bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
3886         nbufs = length / blks_per_cluster;
3887         for (i = 0, cancel_count = 0; i < nbufs; i++) {
3888                 xfs_daddr_t     daddr;
3889
3890                 daddr = XFS_AGB_TO_DADDR(mp, agno,
3891                                          agbno + i * blks_per_cluster);
3892                 if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
3893                         cancel_count++;
3894         }
3895
3896         /*
3897          * We currently only use icreate for a single allocation at a time. This
3898          * means we should expect either all or none of the buffers to be
3899          * cancelled. Be conservative and skip replay if at least one buffer is
3900          * cancelled, but warn the user that something is awry if the buffers
3901          * are not consistent.
3902          *
3903          * XXX: This must be refined to only skip cancelled clusters once we use
3904          * icreate for multiple chunk allocations.
3905          */
3906         ASSERT(!cancel_count || cancel_count == nbufs);
3907         if (cancel_count) {
3908                 if (cancel_count != nbufs)
3909                         xfs_warn(mp,
3910         "WARNING: partial inode chunk cancellation, skipped icreate.");
3911                 trace_xfs_log_recover_icreate_cancel(log, icl);
3912                 return 0;
3913         }
3914
3915         trace_xfs_log_recover_icreate_recover(log, icl);
3916         return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
3917                                      length, be32_to_cpu(icl->icl_gen));
3918 }
3919
3920 STATIC void
3921 xlog_recover_buffer_ra_pass2(
3922         struct xlog                     *log,
3923         struct xlog_recover_item        *item)
3924 {
3925         struct xfs_buf_log_format       *buf_f = item->ri_buf[0].i_addr;
3926         struct xfs_mount                *mp = log->l_mp;
3927
3928         if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3929                         buf_f->blf_len, buf_f->blf_flags)) {
3930                 return;
3931         }
3932
3933         xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3934                                 buf_f->blf_len, NULL);
3935 }
3936
3937 STATIC void
3938 xlog_recover_inode_ra_pass2(
3939         struct xlog                     *log,
3940         struct xlog_recover_item        *item)
3941 {
3942         struct xfs_inode_log_format     ilf_buf;
3943         struct xfs_inode_log_format     *ilfp;
3944         struct xfs_mount                *mp = log->l_mp;
3945         int                     error;
3946
3947         if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3948                 ilfp = item->ri_buf[0].i_addr;
3949         } else {
3950                 ilfp = &ilf_buf;
3951                 memset(ilfp, 0, sizeof(*ilfp));
3952                 error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3953                 if (error)
3954                         return;
3955         }
3956
3957         if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3958                 return;
3959
3960         xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3961                                 ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3962 }
3963
3964 STATIC void
3965 xlog_recover_dquot_ra_pass2(
3966         struct xlog                     *log,
3967         struct xlog_recover_item        *item)
3968 {
3969         struct xfs_mount        *mp = log->l_mp;
3970         struct xfs_disk_dquot   *recddq;
3971         struct xfs_dq_logformat *dq_f;
3972         uint                    type;
3973         int                     len;
3974
3975
3976         if (mp->m_qflags == 0)
3977                 return;
3978
3979         recddq = item->ri_buf[1].i_addr;
3980         if (recddq == NULL)
3981                 return;
3982         if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
3983                 return;
3984
3985         type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3986         ASSERT(type);
3987         if (log->l_quotaoffs_flag & type)
3988                 return;
3989
3990         dq_f = item->ri_buf[0].i_addr;
3991         ASSERT(dq_f);
3992         ASSERT(dq_f->qlf_len == 1);
3993
3994         len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
3995         if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
3996                 return;
3997
3998         xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
3999                           &xfs_dquot_buf_ra_ops);
4000 }
4001
4002 STATIC void
4003 xlog_recover_ra_pass2(
4004         struct xlog                     *log,
4005         struct xlog_recover_item        *item)
4006 {
4007         switch (ITEM_TYPE(item)) {
4008         case XFS_LI_BUF:
4009                 xlog_recover_buffer_ra_pass2(log, item);
4010                 break;
4011         case XFS_LI_INODE:
4012                 xlog_recover_inode_ra_pass2(log, item);
4013                 break;
4014         case XFS_LI_DQUOT:
4015                 xlog_recover_dquot_ra_pass2(log, item);
4016                 break;
4017         case XFS_LI_EFI:
4018         case XFS_LI_EFD:
4019         case XFS_LI_QUOTAOFF:
4020         case XFS_LI_RUI:
4021         case XFS_LI_RUD:
4022         case XFS_LI_CUI:
4023         case XFS_LI_CUD:
4024         case XFS_LI_BUI:
4025         case XFS_LI_BUD:
4026         default:
4027                 break;
4028         }
4029 }
4030
4031 STATIC int
4032 xlog_recover_commit_pass1(
4033         struct xlog                     *log,
4034         struct xlog_recover             *trans,
4035         struct xlog_recover_item        *item)
4036 {
4037         trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
4038
4039         switch (ITEM_TYPE(item)) {
4040         case XFS_LI_BUF:
4041                 return xlog_recover_buffer_pass1(log, item);
4042         case XFS_LI_QUOTAOFF:
4043                 return xlog_recover_quotaoff_pass1(log, item);
4044         case XFS_LI_INODE:
4045         case XFS_LI_EFI:
4046         case XFS_LI_EFD:
4047         case XFS_LI_DQUOT:
4048         case XFS_LI_ICREATE:
4049         case XFS_LI_RUI:
4050         case XFS_LI_RUD:
4051         case XFS_LI_CUI:
4052         case XFS_LI_CUD:
4053         case XFS_LI_BUI:
4054         case XFS_LI_BUD:
4055                 /* nothing to do in pass 1 */
4056                 return 0;
4057         default:
4058                 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4059                         __func__, ITEM_TYPE(item));
4060                 ASSERT(0);
4061                 return -EIO;
4062         }
4063 }
4064
4065 STATIC int
4066 xlog_recover_commit_pass2(
4067         struct xlog                     *log,
4068         struct xlog_recover             *trans,
4069         struct list_head                *buffer_list,
4070         struct xlog_recover_item        *item)
4071 {
4072         trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
4073
4074         switch (ITEM_TYPE(item)) {
4075         case XFS_LI_BUF:
4076                 return xlog_recover_buffer_pass2(log, buffer_list, item,
4077                                                  trans->r_lsn);
4078         case XFS_LI_INODE:
4079                 return xlog_recover_inode_pass2(log, buffer_list, item,
4080                                                  trans->r_lsn);
4081         case XFS_LI_EFI:
4082                 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
4083         case XFS_LI_EFD:
4084                 return xlog_recover_efd_pass2(log, item);
4085         case XFS_LI_RUI:
4086                 return xlog_recover_rui_pass2(log, item, trans->r_lsn);
4087         case XFS_LI_RUD:
4088                 return xlog_recover_rud_pass2(log, item);
4089         case XFS_LI_CUI:
4090                 return xlog_recover_cui_pass2(log, item, trans->r_lsn);
4091         case XFS_LI_CUD:
4092                 return xlog_recover_cud_pass2(log, item);
4093         case XFS_LI_BUI:
4094                 return xlog_recover_bui_pass2(log, item, trans->r_lsn);
4095         case XFS_LI_BUD:
4096                 return xlog_recover_bud_pass2(log, item);
4097         case XFS_LI_DQUOT:
4098                 return xlog_recover_dquot_pass2(log, buffer_list, item,
4099                                                 trans->r_lsn);
4100         case XFS_LI_ICREATE:
4101                 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
4102         case XFS_LI_QUOTAOFF:
4103                 /* nothing to do in pass2 */
4104                 return 0;
4105         default:
4106                 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4107                         __func__, ITEM_TYPE(item));
4108                 ASSERT(0);
4109                 return -EIO;
4110         }
4111 }
4112
4113 STATIC int
4114 xlog_recover_items_pass2(
4115         struct xlog                     *log,
4116         struct xlog_recover             *trans,
4117         struct list_head                *buffer_list,
4118         struct list_head                *item_list)
4119 {
4120         struct xlog_recover_item        *item;
4121         int                             error = 0;
4122
4123         list_for_each_entry(item, item_list, ri_list) {
4124                 error = xlog_recover_commit_pass2(log, trans,
4125                                           buffer_list, item);
4126                 if (error)
4127                         return error;
4128         }
4129
4130         return error;
4131 }
4132
4133 /*
4134  * Perform the transaction.
4135  *
4136  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
4137  * EFIs and EFDs get queued up by adding entries into the AIL for them.
4138  */
4139 STATIC int
4140 xlog_recover_commit_trans(
4141         struct xlog             *log,
4142         struct xlog_recover     *trans,
4143         int                     pass,
4144         struct list_head        *buffer_list)
4145 {
4146         int                             error = 0;
4147         int                             items_queued = 0;
4148         struct xlog_recover_item        *item;
4149         struct xlog_recover_item        *next;
4150         LIST_HEAD                       (ra_list);
4151         LIST_HEAD                       (done_list);
4152
4153         #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
4154
4155         hlist_del_init(&trans->r_list);
4156
4157         error = xlog_recover_reorder_trans(log, trans, pass);
4158         if (error)
4159                 return error;
4160
4161         list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
4162                 switch (pass) {
4163                 case XLOG_RECOVER_PASS1:
4164                         error = xlog_recover_commit_pass1(log, trans, item);
4165                         break;
4166                 case XLOG_RECOVER_PASS2:
4167                         xlog_recover_ra_pass2(log, item);
4168                         list_move_tail(&item->ri_list, &ra_list);
4169                         items_queued++;
4170                         if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
4171                                 error = xlog_recover_items_pass2(log, trans,
4172                                                 buffer_list, &ra_list);
4173                                 list_splice_tail_init(&ra_list, &done_list);
4174                                 items_queued = 0;
4175                         }
4176
4177                         break;
4178                 default:
4179                         ASSERT(0);
4180                 }
4181
4182                 if (error)
4183                         goto out;
4184         }
4185
4186 out:
4187         if (!list_empty(&ra_list)) {
4188                 if (!error)
4189                         error = xlog_recover_items_pass2(log, trans,
4190                                         buffer_list, &ra_list);
4191                 list_splice_tail_init(&ra_list, &done_list);
4192         }
4193
4194         if (!list_empty(&done_list))
4195                 list_splice_init(&done_list, &trans->r_itemq);
4196
4197         return error;
4198 }
4199
4200 STATIC void
4201 xlog_recover_add_item(
4202         struct list_head        *head)
4203 {
4204         xlog_recover_item_t     *item;
4205
4206         item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
4207         INIT_LIST_HEAD(&item->ri_list);
4208         list_add_tail(&item->ri_list, head);
4209 }
4210
4211 STATIC int
4212 xlog_recover_add_to_cont_trans(
4213         struct xlog             *log,
4214         struct xlog_recover     *trans,
4215         char                    *dp,
4216         int                     len)
4217 {
4218         xlog_recover_item_t     *item;
4219         char                    *ptr, *old_ptr;
4220         int                     old_len;
4221
4222         /*
4223          * If the transaction is empty, the header was split across this and the
4224          * previous record. Copy the rest of the header.
4225          */
4226         if (list_empty(&trans->r_itemq)) {
4227                 ASSERT(len <= sizeof(struct xfs_trans_header));
4228                 if (len > sizeof(struct xfs_trans_header)) {
4229                         xfs_warn(log->l_mp, "%s: bad header length", __func__);
4230                         return -EIO;
4231                 }
4232
4233                 xlog_recover_add_item(&trans->r_itemq);
4234                 ptr = (char *)&trans->r_theader +
4235                                 sizeof(struct xfs_trans_header) - len;
4236                 memcpy(ptr, dp, len);
4237                 return 0;
4238         }
4239
4240         /* take the tail entry */
4241         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
4242
4243         old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
4244         old_len = item->ri_buf[item->ri_cnt-1].i_len;
4245
4246         ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
4247         memcpy(&ptr[old_len], dp, len);
4248         item->ri_buf[item->ri_cnt-1].i_len += len;
4249         item->ri_buf[item->ri_cnt-1].i_addr = ptr;
4250         trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
4251         return 0;
4252 }
4253
4254 /*
4255  * The next region to add is the start of a new region.  It could be
4256  * a whole region or it could be the first part of a new region.  Because
4257  * of this, the assumption here is that the type and size fields of all
4258  * format structures fit into the first 32 bits of the structure.
4259  *
4260  * This works because all regions must be 32 bit aligned.  Therefore, we
4261  * either have both fields or we have neither field.  In the case we have
4262  * neither field, the data part of the region is zero length.  We only have
4263  * a log_op_header and can throw away the header since a new one will appear
4264  * later.  If we have at least 4 bytes, then we can determine how many regions
4265  * will appear in the current log item.
4266  */
4267 STATIC int
4268 xlog_recover_add_to_trans(
4269         struct xlog             *log,
4270         struct xlog_recover     *trans,
4271         char                    *dp,
4272         int                     len)
4273 {
4274         xfs_inode_log_format_t  *in_f;                  /* any will do */
4275         xlog_recover_item_t     *item;
4276         char                    *ptr;
4277
4278         if (!len)
4279                 return 0;
4280         if (list_empty(&trans->r_itemq)) {
4281                 /* we need to catch log corruptions here */
4282                 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
4283                         xfs_warn(log->l_mp, "%s: bad header magic number",
4284                                 __func__);
4285                         ASSERT(0);
4286                         return -EIO;
4287                 }
4288
4289                 if (len > sizeof(struct xfs_trans_header)) {
4290                         xfs_warn(log->l_mp, "%s: bad header length", __func__);
4291                         ASSERT(0);
4292                         return -EIO;
4293                 }
4294
4295                 /*
4296                  * The transaction header can be arbitrarily split across op
4297                  * records. If we don't have the whole thing here, copy what we
4298                  * do have and handle the rest in the next record.
4299                  */
4300                 if (len == sizeof(struct xfs_trans_header))
4301                         xlog_recover_add_item(&trans->r_itemq);
4302                 memcpy(&trans->r_theader, dp, len);
4303                 return 0;
4304         }
4305
4306         ptr = kmem_alloc(len, KM_SLEEP);
4307         memcpy(ptr, dp, len);
4308         in_f = (xfs_inode_log_format_t *)ptr;
4309
4310         /* take the tail entry */
4311         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
4312         if (item->ri_total != 0 &&
4313              item->ri_total == item->ri_cnt) {
4314                 /* tail item is in use, get a new one */
4315                 xlog_recover_add_item(&trans->r_itemq);
4316                 item = list_entry(trans->r_itemq.prev,
4317                                         xlog_recover_item_t, ri_list);
4318         }
4319
4320         if (item->ri_total == 0) {              /* first region to be added */
4321                 if (in_f->ilf_size == 0 ||
4322                     in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
4323                         xfs_warn(log->l_mp,
4324                 "bad number of regions (%d) in inode log format",
4325                                   in_f->ilf_size);
4326                         ASSERT(0);
4327                         kmem_free(ptr);
4328                         return -EIO;
4329                 }
4330
4331                 item->ri_total = in_f->ilf_size;
4332                 item->ri_buf =
4333                         kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
4334                                     KM_SLEEP);
4335         }
4336         ASSERT(item->ri_total > item->ri_cnt);
4337         /* Description region is ri_buf[0] */
4338         item->ri_buf[item->ri_cnt].i_addr = ptr;
4339         item->ri_buf[item->ri_cnt].i_len  = len;
4340         item->ri_cnt++;
4341         trace_xfs_log_recover_item_add(log, trans, item, 0);
4342         return 0;
4343 }
4344
4345 /*
4346  * Free up any resources allocated by the transaction
4347  *
4348  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
4349  */
4350 STATIC void
4351 xlog_recover_free_trans(
4352         struct xlog_recover     *trans)
4353 {
4354         xlog_recover_item_t     *item, *n;
4355         int                     i;
4356
4357         hlist_del_init(&trans->r_list);
4358
4359         list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
4360                 /* Free the regions in the item. */
4361                 list_del(&item->ri_list);
4362                 for (i = 0; i < item->ri_cnt; i++)
4363                         kmem_free(item->ri_buf[i].i_addr);
4364                 /* Free the item itself */
4365                 kmem_free(item->ri_buf);
4366                 kmem_free(item);
4367         }
4368         /* Free the transaction recover structure */
4369         kmem_free(trans);
4370 }
4371
4372 /*
4373  * On error or completion, trans is freed.
4374  */
4375 STATIC int
4376 xlog_recovery_process_trans(
4377         struct xlog             *log,
4378         struct xlog_recover     *trans,
4379         char                    *dp,
4380         unsigned int            len,
4381         unsigned int            flags,
4382         int                     pass,
4383         struct list_head        *buffer_list)
4384 {
4385         int                     error = 0;
4386         bool                    freeit = false;
4387
4388         /* mask off ophdr transaction container flags */
4389         flags &= ~XLOG_END_TRANS;
4390         if (flags & XLOG_WAS_CONT_TRANS)
4391                 flags &= ~XLOG_CONTINUE_TRANS;
4392
4393         /*
4394          * Callees must not free the trans structure. We'll decide if we need to
4395          * free it or not based on the operation being done and it's result.
4396          */
4397         switch (flags) {
4398         /* expected flag values */
4399         case 0:
4400         case XLOG_CONTINUE_TRANS:
4401                 error = xlog_recover_add_to_trans(log, trans, dp, len);
4402                 break;
4403         case XLOG_WAS_CONT_TRANS:
4404                 error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
4405                 break;
4406         case XLOG_COMMIT_TRANS:
4407                 error = xlog_recover_commit_trans(log, trans, pass,
4408                                                   buffer_list);
4409                 /* success or fail, we are now done with this transaction. */
4410                 freeit = true;
4411                 break;
4412
4413         /* unexpected flag values */
4414         case XLOG_UNMOUNT_TRANS:
4415                 /* just skip trans */
4416                 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
4417                 freeit = true;
4418                 break;
4419         case XLOG_START_TRANS:
4420         default:
4421                 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
4422                 ASSERT(0);
4423                 error = -EIO;
4424                 break;
4425         }
4426         if (error || freeit)
4427                 xlog_recover_free_trans(trans);
4428         return error;
4429 }
4430
4431 /*
4432  * Lookup the transaction recovery structure associated with the ID in the
4433  * current ophdr. If the transaction doesn't exist and the start flag is set in
4434  * the ophdr, then allocate a new transaction for future ID matches to find.
4435  * Either way, return what we found during the lookup - an existing transaction
4436  * or nothing.
4437  */
4438 STATIC struct xlog_recover *
4439 xlog_recover_ophdr_to_trans(
4440         struct hlist_head       rhash[],
4441         struct xlog_rec_header  *rhead,
4442         struct xlog_op_header   *ohead)
4443 {
4444         struct xlog_recover     *trans;
4445         xlog_tid_t              tid;
4446         struct hlist_head       *rhp;
4447
4448         tid = be32_to_cpu(ohead->oh_tid);
4449         rhp = &rhash[XLOG_RHASH(tid)];
4450         hlist_for_each_entry(trans, rhp, r_list) {
4451                 if (trans->r_log_tid == tid)
4452                         return trans;
4453         }
4454
4455         /*
4456          * skip over non-start transaction headers - we could be
4457          * processing slack space before the next transaction starts
4458          */
4459         if (!(ohead->oh_flags & XLOG_START_TRANS))
4460                 return NULL;
4461
4462         ASSERT(be32_to_cpu(ohead->oh_len) == 0);
4463
4464         /*
4465          * This is a new transaction so allocate a new recovery container to
4466          * hold the recovery ops that will follow.
4467          */
4468         trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
4469         trans->r_log_tid = tid;
4470         trans->r_lsn = be64_to_cpu(rhead->h_lsn);
4471         INIT_LIST_HEAD(&trans->r_itemq);
4472         INIT_HLIST_NODE(&trans->r_list);
4473         hlist_add_head(&trans->r_list, rhp);
4474
4475         /*
4476          * Nothing more to do for this ophdr. Items to be added to this new
4477          * transaction will be in subsequent ophdr containers.
4478          */
4479         return NULL;
4480 }
4481
4482 STATIC int
4483 xlog_recover_process_ophdr(
4484         struct xlog             *log,
4485         struct hlist_head       rhash[],
4486         struct xlog_rec_header  *rhead,
4487         struct xlog_op_header   *ohead,
4488         char                    *dp,
4489         char                    *end,
4490         int                     pass,
4491         struct list_head        *buffer_list)
4492 {
4493         struct xlog_recover     *trans;
4494         unsigned int            len;
4495         int                     error;
4496
4497         /* Do we understand who wrote this op? */
4498         if (ohead->oh_clientid != XFS_TRANSACTION &&
4499             ohead->oh_clientid != XFS_LOG) {
4500                 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
4501                         __func__, ohead->oh_clientid);
4502                 ASSERT(0);
4503                 return -EIO;
4504         }
4505
4506         /*
4507          * Check the ophdr contains all the data it is supposed to contain.
4508          */
4509         len = be32_to_cpu(ohead->oh_len);
4510         if (dp + len > end) {
4511                 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
4512                 WARN_ON(1);
4513                 return -EIO;
4514         }
4515
4516         trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
4517         if (!trans) {
4518                 /* nothing to do, so skip over this ophdr */
4519                 return 0;
4520         }
4521
4522         /*
4523          * The recovered buffer queue is drained only once we know that all
4524          * recovery items for the current LSN have been processed. This is
4525          * required because:
4526          *
4527          * - Buffer write submission updates the metadata LSN of the buffer.
4528          * - Log recovery skips items with a metadata LSN >= the current LSN of
4529          *   the recovery item.
4530          * - Separate recovery items against the same metadata buffer can share
4531          *   a current LSN. I.e., consider that the LSN of a recovery item is
4532          *   defined as the starting LSN of the first record in which its
4533          *   transaction appears, that a record can hold multiple transactions,
4534          *   and/or that a transaction can span multiple records.
4535          *
4536          * In other words, we are allowed to submit a buffer from log recovery
4537          * once per current LSN. Otherwise, we may incorrectly skip recovery
4538          * items and cause corruption.
4539          *
4540          * We don't know up front whether buffers are updated multiple times per
4541          * LSN. Therefore, track the current LSN of each commit log record as it
4542          * is processed and drain the queue when it changes. Use commit records
4543          * because they are ordered correctly by the logging code.
4544          */
4545         if (log->l_recovery_lsn != trans->r_lsn &&
4546             ohead->oh_flags & XLOG_COMMIT_TRANS) {
4547                 error = xfs_buf_delwri_submit(buffer_list);
4548                 if (error)
4549                         return error;
4550                 log->l_recovery_lsn = trans->r_lsn;
4551         }
4552
4553         return xlog_recovery_process_trans(log, trans, dp, len,
4554                                            ohead->oh_flags, pass, buffer_list);
4555 }
4556
4557 /*
4558  * There are two valid states of the r_state field.  0 indicates that the
4559  * transaction structure is in a normal state.  We have either seen the
4560  * start of the transaction or the last operation we added was not a partial
4561  * operation.  If the last operation we added to the transaction was a
4562  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
4563  *
4564  * NOTE: skip LRs with 0 data length.
4565  */
4566 STATIC int
4567 xlog_recover_process_data(
4568         struct xlog             *log,
4569         struct hlist_head       rhash[],
4570         struct xlog_rec_header  *rhead,
4571         char                    *dp,
4572         int                     pass,
4573         struct list_head        *buffer_list)
4574 {
4575         struct xlog_op_header   *ohead;
4576         char                    *end;
4577         int                     num_logops;
4578         int                     error;
4579
4580         end = dp + be32_to_cpu(rhead->h_len);
4581         num_logops = be32_to_cpu(rhead->h_num_logops);
4582
4583         /* check the log format matches our own - else we can't recover */
4584         if (xlog_header_check_recover(log->l_mp, rhead))
4585                 return -EIO;
4586
4587         trace_xfs_log_recover_record(log, rhead, pass);
4588         while ((dp < end) && num_logops) {
4589
4590                 ohead = (struct xlog_op_header *)dp;
4591                 dp += sizeof(*ohead);
4592                 ASSERT(dp <= end);
4593
4594                 /* errors will abort recovery */
4595                 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
4596                                                    dp, end, pass, buffer_list);
4597                 if (error)
4598                         return error;
4599
4600                 dp += be32_to_cpu(ohead->oh_len);
4601                 num_logops--;
4602         }
4603         return 0;
4604 }
4605
4606 /* Recover the EFI if necessary. */
4607 STATIC int
4608 xlog_recover_process_efi(
4609         struct xfs_mount                *mp,
4610         struct xfs_ail                  *ailp,
4611         struct xfs_log_item             *lip)
4612 {
4613         struct xfs_efi_log_item         *efip;
4614         int                             error;
4615
4616         /*
4617          * Skip EFIs that we've already processed.
4618          */
4619         efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4620         if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
4621                 return 0;
4622
4623         spin_unlock(&ailp->xa_lock);
4624         error = xfs_efi_recover(mp, efip);
4625         spin_lock(&ailp->xa_lock);
4626
4627         return error;
4628 }
4629
4630 /* Release the EFI since we're cancelling everything. */
4631 STATIC void
4632 xlog_recover_cancel_efi(
4633         struct xfs_mount                *mp,
4634         struct xfs_ail                  *ailp,
4635         struct xfs_log_item             *lip)
4636 {
4637         struct xfs_efi_log_item         *efip;
4638
4639         efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4640
4641         spin_unlock(&ailp->xa_lock);
4642         xfs_efi_release(efip);
4643         spin_lock(&ailp->xa_lock);
4644 }
4645
4646 /* Recover the RUI if necessary. */
4647 STATIC int
4648 xlog_recover_process_rui(
4649         struct xfs_mount                *mp,
4650         struct xfs_ail                  *ailp,
4651         struct xfs_log_item             *lip)
4652 {
4653         struct xfs_rui_log_item         *ruip;
4654         int                             error;
4655
4656         /*
4657          * Skip RUIs that we've already processed.
4658          */
4659         ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4660         if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
4661                 return 0;
4662
4663         spin_unlock(&ailp->xa_lock);
4664         error = xfs_rui_recover(mp, ruip);
4665         spin_lock(&ailp->xa_lock);
4666
4667         return error;
4668 }
4669
4670 /* Release the RUI since we're cancelling everything. */
4671 STATIC void
4672 xlog_recover_cancel_rui(
4673         struct xfs_mount                *mp,
4674         struct xfs_ail                  *ailp,
4675         struct xfs_log_item             *lip)
4676 {
4677         struct xfs_rui_log_item         *ruip;
4678
4679         ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4680
4681         spin_unlock(&ailp->xa_lock);
4682         xfs_rui_release(ruip);
4683         spin_lock(&ailp->xa_lock);
4684 }
4685
4686 /* Recover the CUI if necessary. */
4687 STATIC int
4688 xlog_recover_process_cui(
4689         struct xfs_mount                *mp,
4690         struct xfs_ail                  *ailp,
4691         struct xfs_log_item             *lip)
4692 {
4693         struct xfs_cui_log_item         *cuip;
4694         int                             error;
4695
4696         /*
4697          * Skip CUIs that we've already processed.
4698          */
4699         cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4700         if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
4701                 return 0;
4702
4703         spin_unlock(&ailp->xa_lock);
4704         error = xfs_cui_recover(mp, cuip);
4705         spin_lock(&ailp->xa_lock);
4706
4707         return error;
4708 }
4709
4710 /* Release the CUI since we're cancelling everything. */
4711 STATIC void
4712 xlog_recover_cancel_cui(
4713         struct xfs_mount                *mp,
4714         struct xfs_ail                  *ailp,
4715         struct xfs_log_item             *lip)
4716 {
4717         struct xfs_cui_log_item         *cuip;
4718
4719         cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4720
4721         spin_unlock(&ailp->xa_lock);
4722         xfs_cui_release(cuip);
4723         spin_lock(&ailp->xa_lock);
4724 }
4725
4726 /* Recover the BUI if necessary. */
4727 STATIC int
4728 xlog_recover_process_bui(
4729         struct xfs_mount                *mp,
4730         struct xfs_ail                  *ailp,
4731         struct xfs_log_item             *lip)
4732 {
4733         struct xfs_bui_log_item         *buip;
4734         int                             error;
4735
4736         /*
4737          * Skip BUIs that we've already processed.
4738          */
4739         buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4740         if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
4741                 return 0;
4742
4743         spin_unlock(&ailp->xa_lock);
4744         error = xfs_bui_recover(mp, buip);
4745         spin_lock(&ailp->xa_lock);
4746
4747         return error;
4748 }
4749
4750 /* Release the BUI since we're cancelling everything. */
4751 STATIC void
4752 xlog_recover_cancel_bui(
4753         struct xfs_mount                *mp,
4754         struct xfs_ail                  *ailp,
4755         struct xfs_log_item             *lip)
4756 {
4757         struct xfs_bui_log_item         *buip;
4758
4759         buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4760
4761         spin_unlock(&ailp->xa_lock);
4762         xfs_bui_release(buip);
4763         spin_lock(&ailp->xa_lock);
4764 }
4765
4766 /* Is this log item a deferred action intent? */
4767 static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
4768 {
4769         switch (lip->li_type) {
4770         case XFS_LI_EFI:
4771         case XFS_LI_RUI:
4772         case XFS_LI_CUI:
4773         case XFS_LI_BUI:
4774                 return true;
4775         default:
4776                 return false;
4777         }
4778 }
4779
4780 /*
4781  * When this is called, all of the log intent items which did not have
4782  * corresponding log done items should be in the AIL.  What we do now
4783  * is update the data structures associated with each one.
4784  *
4785  * Since we process the log intent items in normal transactions, they
4786  * will be removed at some point after the commit.  This prevents us
4787  * from just walking down the list processing each one.  We'll use a
4788  * flag in the intent item to skip those that we've already processed
4789  * and use the AIL iteration mechanism's generation count to try to
4790  * speed this up at least a bit.
4791  *
4792  * When we start, we know that the intents are the only things in the
4793  * AIL.  As we process them, however, other items are added to the
4794  * AIL.
4795  */
4796 STATIC int
4797 xlog_recover_process_intents(
4798         struct xlog             *log)
4799 {
4800         struct xfs_log_item     *lip;
4801         int                     error = 0;
4802         struct xfs_ail_cursor   cur;
4803         struct xfs_ail          *ailp;
4804         xfs_lsn_t               last_lsn;
4805
4806         ailp = log->l_ailp;
4807         spin_lock(&ailp->xa_lock);
4808         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4809         last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
4810         while (lip != NULL) {
4811                 /*
4812                  * We're done when we see something other than an intent.
4813                  * There should be no intents left in the AIL now.
4814                  */
4815                 if (!xlog_item_is_intent(lip)) {
4816 #ifdef DEBUG
4817                         for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4818                                 ASSERT(!xlog_item_is_intent(lip));
4819 #endif
4820                         break;
4821                 }
4822
4823                 /*
4824                  * We should never see a redo item with a LSN higher than
4825                  * the last transaction we found in the log at the start
4826                  * of recovery.
4827                  */
4828                 ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
4829
4830                 switch (lip->li_type) {
4831                 case XFS_LI_EFI:
4832                         error = xlog_recover_process_efi(log->l_mp, ailp, lip);
4833                         break;
4834                 case XFS_LI_RUI:
4835                         error = xlog_recover_process_rui(log->l_mp, ailp, lip);
4836                         break;
4837                 case XFS_LI_CUI:
4838                         error = xlog_recover_process_cui(log->l_mp, ailp, lip);
4839                         break;
4840                 case XFS_LI_BUI:
4841                         error = xlog_recover_process_bui(log->l_mp, ailp, lip);
4842                         break;
4843                 }
4844                 if (error)
4845                         goto out;
4846                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
4847         }
4848 out:
4849         xfs_trans_ail_cursor_done(&cur);
4850         spin_unlock(&ailp->xa_lock);
4851         return error;
4852 }
4853
4854 /*
4855  * A cancel occurs when the mount has failed and we're bailing out.
4856  * Release all pending log intent items so they don't pin the AIL.
4857  */
4858 STATIC int
4859 xlog_recover_cancel_intents(
4860         struct xlog             *log)
4861 {
4862         struct xfs_log_item     *lip;
4863         int                     error = 0;
4864         struct xfs_ail_cursor   cur;
4865         struct xfs_ail          *ailp;
4866
4867         ailp = log->l_ailp;
4868         spin_lock(&ailp->xa_lock);
4869         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4870         while (lip != NULL) {
4871                 /*
4872                  * We're done when we see something other than an intent.
4873                  * There should be no intents left in the AIL now.
4874                  */
4875                 if (!xlog_item_is_intent(lip)) {
4876 #ifdef DEBUG
4877                         for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4878                                 ASSERT(!xlog_item_is_intent(lip));
4879 #endif
4880                         break;
4881                 }
4882
4883                 switch (lip->li_type) {
4884                 case XFS_LI_EFI:
4885                         xlog_recover_cancel_efi(log->l_mp, ailp, lip);
4886                         break;
4887                 case XFS_LI_RUI:
4888                         xlog_recover_cancel_rui(log->l_mp, ailp, lip);
4889                         break;
4890                 case XFS_LI_CUI:
4891                         xlog_recover_cancel_cui(log->l_mp, ailp, lip);
4892                         break;
4893                 case XFS_LI_BUI:
4894                         xlog_recover_cancel_bui(log->l_mp, ailp, lip);
4895                         break;
4896                 }
4897
4898                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
4899         }
4900
4901         xfs_trans_ail_cursor_done(&cur);
4902         spin_unlock(&ailp->xa_lock);
4903         return error;
4904 }
4905
4906 /*
4907  * This routine performs a transaction to null out a bad inode pointer
4908  * in an agi unlinked inode hash bucket.
4909  */
4910 STATIC void
4911 xlog_recover_clear_agi_bucket(
4912         xfs_mount_t     *mp,
4913         xfs_agnumber_t  agno,
4914         int             bucket)
4915 {
4916         xfs_trans_t     *tp;
4917         xfs_agi_t       *agi;
4918         xfs_buf_t       *agibp;
4919         int             offset;
4920         int             error;
4921
4922         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
4923         if (error)
4924                 goto out_error;
4925
4926         error = xfs_read_agi(mp, tp, agno, &agibp);
4927         if (error)
4928                 goto out_abort;
4929
4930         agi = XFS_BUF_TO_AGI(agibp);
4931         agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
4932         offset = offsetof(xfs_agi_t, agi_unlinked) +
4933                  (sizeof(xfs_agino_t) * bucket);
4934         xfs_trans_log_buf(tp, agibp, offset,
4935                           (offset + sizeof(xfs_agino_t) - 1));
4936
4937         error = xfs_trans_commit(tp);
4938         if (error)
4939                 goto out_error;
4940         return;
4941
4942 out_abort:
4943         xfs_trans_cancel(tp);
4944 out_error:
4945         xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
4946         return;
4947 }
4948
4949 STATIC xfs_agino_t
4950 xlog_recover_process_one_iunlink(
4951         struct xfs_mount                *mp,
4952         xfs_agnumber_t                  agno,
4953         xfs_agino_t                     agino,
4954         int                             bucket)
4955 {
4956         struct xfs_buf                  *ibp;
4957         struct xfs_dinode               *dip;
4958         struct xfs_inode                *ip;
4959         xfs_ino_t                       ino;
4960         int                             error;
4961
4962         ino = XFS_AGINO_TO_INO(mp, agno, agino);
4963         error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
4964         if (error)
4965                 goto fail;
4966
4967         /*
4968          * Get the on disk inode to find the next inode in the bucket.
4969          */
4970         error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
4971         if (error)
4972                 goto fail_iput;
4973
4974         xfs_iflags_clear(ip, XFS_IRECOVERY);
4975         ASSERT(VFS_I(ip)->i_nlink == 0);
4976         ASSERT(VFS_I(ip)->i_mode != 0);
4977
4978         /* setup for the next pass */
4979         agino = be32_to_cpu(dip->di_next_unlinked);
4980         xfs_buf_relse(ibp);
4981
4982         /*
4983          * Prevent any DMAPI event from being sent when the reference on
4984          * the inode is dropped.
4985          */
4986         ip->i_d.di_dmevmask = 0;
4987
4988         IRELE(ip);
4989         return agino;
4990
4991  fail_iput:
4992         IRELE(ip);
4993  fail:
4994         /*
4995          * We can't read in the inode this bucket points to, or this inode
4996          * is messed up.  Just ditch this bucket of inodes.  We will lose
4997          * some inodes and space, but at least we won't hang.
4998          *
4999          * Call xlog_recover_clear_agi_bucket() to perform a transaction to
5000          * clear the inode pointer in the bucket.
5001          */
5002         xlog_recover_clear_agi_bucket(mp, agno, bucket);
5003         return NULLAGINO;
5004 }
5005
5006 /*
5007  * xlog_iunlink_recover
5008  *
5009  * This is called during recovery to process any inodes which
5010  * we unlinked but not freed when the system crashed.  These
5011  * inodes will be on the lists in the AGI blocks.  What we do
5012  * here is scan all the AGIs and fully truncate and free any
5013  * inodes found on the lists.  Each inode is removed from the
5014  * lists when it has been fully truncated and is freed.  The
5015  * freeing of the inode and its removal from the list must be
5016  * atomic.
5017  */
5018 STATIC void
5019 xlog_recover_process_iunlinks(
5020         struct xlog     *log)
5021 {
5022         xfs_mount_t     *mp;
5023         xfs_agnumber_t  agno;
5024         xfs_agi_t       *agi;
5025         xfs_buf_t       *agibp;
5026         xfs_agino_t     agino;
5027         int             bucket;
5028         int             error;
5029         uint            mp_dmevmask;
5030
5031         mp = log->l_mp;
5032
5033         /*
5034          * Prevent any DMAPI event from being sent while in this function.
5035          */
5036         mp_dmevmask = mp->m_dmevmask;
5037         mp->m_dmevmask = 0;
5038
5039         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
5040                 /*
5041                  * Find the agi for this ag.
5042                  */
5043                 error = xfs_read_agi(mp, NULL, agno, &agibp);
5044                 if (error) {
5045                         /*
5046                          * AGI is b0rked. Don't process it.
5047                          *
5048                          * We should probably mark the filesystem as corrupt
5049                          * after we've recovered all the ag's we can....
5050                          */
5051                         continue;
5052                 }
5053                 /*
5054                  * Unlock the buffer so that it can be acquired in the normal
5055                  * course of the transaction to truncate and free each inode.
5056                  * Because we are not racing with anyone else here for the AGI
5057                  * buffer, we don't even need to hold it locked to read the
5058                  * initial unlinked bucket entries out of the buffer. We keep
5059                  * buffer reference though, so that it stays pinned in memory
5060                  * while we need the buffer.
5061                  */
5062                 agi = XFS_BUF_TO_AGI(agibp);
5063                 xfs_buf_unlock(agibp);
5064
5065                 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
5066                         agino = be32_to_cpu(agi->agi_unlinked[bucket]);
5067                         while (agino != NULLAGINO) {
5068                                 agino = xlog_recover_process_one_iunlink(mp,
5069                                                         agno, agino, bucket);
5070                         }
5071                 }
5072                 xfs_buf_rele(agibp);
5073         }
5074
5075         mp->m_dmevmask = mp_dmevmask;
5076 }
5077
5078 STATIC int
5079 xlog_unpack_data(
5080         struct xlog_rec_header  *rhead,
5081         char                    *dp,
5082         struct xlog             *log)
5083 {
5084         int                     i, j, k;
5085
5086         for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
5087                   i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
5088                 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
5089                 dp += BBSIZE;
5090         }
5091
5092         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
5093                 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
5094                 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
5095                         j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
5096                         k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
5097                         *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
5098                         dp += BBSIZE;
5099                 }
5100         }
5101
5102         return 0;
5103 }
5104
5105 /*
5106  * CRC check, unpack and process a log record.
5107  */
5108 STATIC int
5109 xlog_recover_process(
5110         struct xlog             *log,
5111         struct hlist_head       rhash[],
5112         struct xlog_rec_header  *rhead,
5113         char                    *dp,
5114         int                     pass,
5115         struct list_head        *buffer_list)
5116 {
5117         int                     error;
5118         __le32                  old_crc = rhead->h_crc;
5119         __le32                  crc;
5120
5121
5122         crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
5123
5124         /*
5125          * Nothing else to do if this is a CRC verification pass. Just return
5126          * if this a record with a non-zero crc. Unfortunately, mkfs always
5127          * sets old_crc to 0 so we must consider this valid even on v5 supers.
5128          * Otherwise, return EFSBADCRC on failure so the callers up the stack
5129          * know precisely what failed.
5130          */
5131         if (pass == XLOG_RECOVER_CRCPASS) {
5132                 if (old_crc && crc != old_crc)
5133                         return -EFSBADCRC;
5134                 return 0;
5135         }
5136
5137         /*
5138          * We're in the normal recovery path. Issue a warning if and only if the
5139          * CRC in the header is non-zero. This is an advisory warning and the
5140          * zero CRC check prevents warnings from being emitted when upgrading
5141          * the kernel from one that does not add CRCs by default.
5142          */
5143         if (crc != old_crc) {
5144                 if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
5145                         xfs_alert(log->l_mp,
5146                 "log record CRC mismatch: found 0x%x, expected 0x%x.",
5147                                         le32_to_cpu(old_crc),
5148                                         le32_to_cpu(crc));
5149                         xfs_hex_dump(dp, 32);
5150                 }
5151
5152                 /*
5153                  * If the filesystem is CRC enabled, this mismatch becomes a
5154                  * fatal log corruption failure.
5155                  */
5156                 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
5157                         return -EFSCORRUPTED;
5158         }
5159
5160         error = xlog_unpack_data(rhead, dp, log);
5161         if (error)
5162                 return error;
5163
5164         return xlog_recover_process_data(log, rhash, rhead, dp, pass,
5165                                          buffer_list);
5166 }
5167
5168 STATIC int
5169 xlog_valid_rec_header(
5170         struct xlog             *log,
5171         struct xlog_rec_header  *rhead,
5172         xfs_daddr_t             blkno)
5173 {
5174         int                     hlen;
5175
5176         if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
5177                 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
5178                                 XFS_ERRLEVEL_LOW, log->l_mp);
5179                 return -EFSCORRUPTED;
5180         }
5181         if (unlikely(
5182             (!rhead->h_version ||
5183             (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
5184                 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
5185                         __func__, be32_to_cpu(rhead->h_version));
5186                 return -EIO;
5187         }
5188
5189         /* LR body must have data or it wouldn't have been written */
5190         hlen = be32_to_cpu(rhead->h_len);
5191         if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
5192                 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
5193                                 XFS_ERRLEVEL_LOW, log->l_mp);
5194                 return -EFSCORRUPTED;
5195         }
5196         if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
5197                 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
5198                                 XFS_ERRLEVEL_LOW, log->l_mp);
5199                 return -EFSCORRUPTED;
5200         }
5201         return 0;
5202 }
5203
5204 /*
5205  * Read the log from tail to head and process the log records found.
5206  * Handle the two cases where the tail and head are in the same cycle
5207  * and where the active portion of the log wraps around the end of
5208  * the physical log separately.  The pass parameter is passed through
5209  * to the routines called to process the data and is not looked at
5210  * here.
5211  */
5212 STATIC int
5213 xlog_do_recovery_pass(
5214         struct xlog             *log,
5215         xfs_daddr_t             head_blk,
5216         xfs_daddr_t             tail_blk,
5217         int                     pass,
5218         xfs_daddr_t             *first_bad)     /* out: first bad log rec */
5219 {
5220         xlog_rec_header_t       *rhead;
5221         xfs_daddr_t             blk_no;
5222         xfs_daddr_t             rhead_blk;
5223         char                    *offset;
5224         xfs_buf_t               *hbp, *dbp;
5225         int                     error = 0, h_size, h_len;
5226         int                     error2 = 0;
5227         int                     bblks, split_bblks;
5228         int                     hblks, split_hblks, wrapped_hblks;
5229         int                     i;
5230         struct hlist_head       rhash[XLOG_RHASH_SIZE];
5231         LIST_HEAD               (buffer_list);
5232
5233         ASSERT(head_blk != tail_blk);
5234         rhead_blk = 0;
5235
5236         for (i = 0; i < XLOG_RHASH_SIZE; i++)
5237                 INIT_HLIST_HEAD(&rhash[i]);
5238
5239         /*
5240          * Read the header of the tail block and get the iclog buffer size from
5241          * h_size.  Use this to tell how many sectors make up the log header.
5242          */
5243         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
5244                 /*
5245                  * When using variable length iclogs, read first sector of
5246                  * iclog header and extract the header size from it.  Get a
5247                  * new hbp that is the correct size.
5248                  */
5249                 hbp = xlog_get_bp(log, 1);
5250                 if (!hbp)
5251                         return -ENOMEM;
5252
5253                 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
5254                 if (error)
5255                         goto bread_err1;
5256
5257                 rhead = (xlog_rec_header_t *)offset;
5258                 error = xlog_valid_rec_header(log, rhead, tail_blk);
5259                 if (error)
5260                         goto bread_err1;
5261
5262                 /*
5263                  * xfsprogs has a bug where record length is based on lsunit but
5264                  * h_size (iclog size) is hardcoded to 32k. Now that we
5265                  * unconditionally CRC verify the unmount record, this means the
5266                  * log buffer can be too small for the record and cause an
5267                  * overrun.
5268                  *
5269                  * Detect this condition here. Use lsunit for the buffer size as
5270                  * long as this looks like the mkfs case. Otherwise, return an
5271                  * error to avoid a buffer overrun.
5272                  */
5273                 h_size = be32_to_cpu(rhead->h_size);
5274                 h_len = be32_to_cpu(rhead->h_len);
5275                 if (h_len > h_size) {
5276                         if (h_len <= log->l_mp->m_logbsize &&
5277                             be32_to_cpu(rhead->h_num_logops) == 1) {
5278                                 xfs_warn(log->l_mp,
5279                 "invalid iclog size (%d bytes), using lsunit (%d bytes)",
5280                                          h_size, log->l_mp->m_logbsize);
5281                                 h_size = log->l_mp->m_logbsize;
5282                         } else
5283                                 return -EFSCORRUPTED;
5284                 }
5285
5286                 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
5287                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
5288                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
5289                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
5290                                 hblks++;
5291                         xlog_put_bp(hbp);
5292                         hbp = xlog_get_bp(log, hblks);
5293                 } else {
5294                         hblks = 1;
5295                 }
5296         } else {
5297                 ASSERT(log->l_sectBBsize == 1);
5298                 hblks = 1;
5299                 hbp = xlog_get_bp(log, 1);
5300                 h_size = XLOG_BIG_RECORD_BSIZE;
5301         }
5302
5303         if (!hbp)
5304                 return -ENOMEM;
5305         dbp = xlog_get_bp(log, BTOBB(h_size));
5306         if (!dbp) {
5307                 xlog_put_bp(hbp);
5308                 return -ENOMEM;
5309         }
5310
5311         memset(rhash, 0, sizeof(rhash));
5312         blk_no = rhead_blk = tail_blk;
5313         if (tail_blk > head_blk) {
5314                 /*
5315                  * Perform recovery around the end of the physical log.
5316                  * When the head is not on the same cycle number as the tail,
5317                  * we can't do a sequential recovery.
5318                  */
5319                 while (blk_no < log->l_logBBsize) {
5320                         /*
5321                          * Check for header wrapping around physical end-of-log
5322                          */
5323                         offset = hbp->b_addr;
5324                         split_hblks = 0;
5325                         wrapped_hblks = 0;
5326                         if (blk_no + hblks <= log->l_logBBsize) {
5327                                 /* Read header in one read */
5328                                 error = xlog_bread(log, blk_no, hblks, hbp,
5329                                                    &offset);
5330                                 if (error)
5331                                         goto bread_err2;
5332                         } else {
5333                                 /* This LR is split across physical log end */
5334                                 if (blk_no != log->l_logBBsize) {
5335                                         /* some data before physical log end */
5336                                         ASSERT(blk_no <= INT_MAX);
5337                                         split_hblks = log->l_logBBsize - (int)blk_no;
5338                                         ASSERT(split_hblks > 0);
5339                                         error = xlog_bread(log, blk_no,
5340                                                            split_hblks, hbp,
5341                                                            &offset);
5342                                         if (error)
5343                                                 goto bread_err2;
5344                                 }
5345
5346                                 /*
5347                                  * Note: this black magic still works with
5348                                  * large sector sizes (non-512) only because:
5349                                  * - we increased the buffer size originally
5350                                  *   by 1 sector giving us enough extra space
5351                                  *   for the second read;
5352                                  * - the log start is guaranteed to be sector
5353                                  *   aligned;
5354                                  * - we read the log end (LR header start)
5355                                  *   _first_, then the log start (LR header end)
5356                                  *   - order is important.
5357                                  */
5358                                 wrapped_hblks = hblks - split_hblks;
5359                                 error = xlog_bread_offset(log, 0,
5360                                                 wrapped_hblks, hbp,
5361                                                 offset + BBTOB(split_hblks));
5362                                 if (error)
5363                                         goto bread_err2;
5364                         }
5365                         rhead = (xlog_rec_header_t *)offset;
5366                         error = xlog_valid_rec_header(log, rhead,
5367                                                 split_hblks ? blk_no : 0);
5368                         if (error)
5369                                 goto bread_err2;
5370
5371                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
5372                         blk_no += hblks;
5373
5374                         /* Read in data for log record */
5375                         if (blk_no + bblks <= log->l_logBBsize) {
5376                                 error = xlog_bread(log, blk_no, bblks, dbp,
5377                                                    &offset);
5378                                 if (error)
5379                                         goto bread_err2;
5380                         } else {
5381                                 /* This log record is split across the
5382                                  * physical end of log */
5383                                 offset = dbp->b_addr;
5384                                 split_bblks = 0;
5385                                 if (blk_no != log->l_logBBsize) {
5386                                         /* some data is before the physical
5387                                          * end of log */
5388                                         ASSERT(!wrapped_hblks);
5389                                         ASSERT(blk_no <= INT_MAX);
5390                                         split_bblks =
5391                                                 log->l_logBBsize - (int)blk_no;
5392                                         ASSERT(split_bblks > 0);
5393                                         error = xlog_bread(log, blk_no,
5394                                                         split_bblks, dbp,
5395                                                         &offset);
5396                                         if (error)
5397                                                 goto bread_err2;
5398                                 }
5399
5400                                 /*
5401                                  * Note: this black magic still works with
5402                                  * large sector sizes (non-512) only because:
5403                                  * - we increased the buffer size originally
5404                                  *   by 1 sector giving us enough extra space
5405                                  *   for the second read;
5406                                  * - the log start is guaranteed to be sector
5407                                  *   aligned;
5408                                  * - we read the log end (LR header start)
5409                                  *   _first_, then the log start (LR header end)
5410                                  *   - order is important.
5411                                  */
5412                                 error = xlog_bread_offset(log, 0,
5413                                                 bblks - split_bblks, dbp,
5414                                                 offset + BBTOB(split_bblks));
5415                                 if (error)
5416                                         goto bread_err2;
5417                         }
5418
5419                         error = xlog_recover_process(log, rhash, rhead, offset,
5420                                                      pass, &buffer_list);
5421                         if (error)
5422                                 goto bread_err2;
5423
5424                         blk_no += bblks;
5425                         rhead_blk = blk_no;
5426                 }
5427
5428                 ASSERT(blk_no >= log->l_logBBsize);
5429                 blk_no -= log->l_logBBsize;
5430                 rhead_blk = blk_no;
5431         }
5432
5433         /* read first part of physical log */
5434         while (blk_no < head_blk) {
5435                 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
5436                 if (error)
5437                         goto bread_err2;
5438
5439                 rhead = (xlog_rec_header_t *)offset;
5440                 error = xlog_valid_rec_header(log, rhead, blk_no);
5441                 if (error)
5442                         goto bread_err2;
5443
5444                 /* blocks in data section */
5445                 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
5446                 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
5447                                    &offset);
5448                 if (error)
5449                         goto bread_err2;
5450
5451                 error = xlog_recover_process(log, rhash, rhead, offset, pass,
5452                                              &buffer_list);
5453                 if (error)
5454                         goto bread_err2;
5455
5456                 blk_no += bblks + hblks;
5457                 rhead_blk = blk_no;
5458         }
5459
5460  bread_err2:
5461         xlog_put_bp(dbp);
5462  bread_err1:
5463         xlog_put_bp(hbp);
5464
5465         /*
5466          * Submit buffers that have been added from the last record processed,
5467          * regardless of error status.
5468          */
5469         if (!list_empty(&buffer_list))
5470                 error2 = xfs_buf_delwri_submit(&buffer_list);
5471
5472         if (error && first_bad)
5473                 *first_bad = rhead_blk;
5474
5475         /*
5476          * Transactions are freed at commit time but transactions without commit
5477          * records on disk are never committed. Free any that may be left in the
5478          * hash table.
5479          */
5480         for (i = 0; i < XLOG_RHASH_SIZE; i++) {
5481                 struct hlist_node       *tmp;
5482                 struct xlog_recover     *trans;
5483
5484                 hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
5485                         xlog_recover_free_trans(trans);
5486         }
5487
5488         return error ? error : error2;
5489 }
5490
5491 /*
5492  * Do the recovery of the log.  We actually do this in two phases.
5493  * The two passes are necessary in order to implement the function
5494  * of cancelling a record written into the log.  The first pass
5495  * determines those things which have been cancelled, and the
5496  * second pass replays log items normally except for those which
5497  * have been cancelled.  The handling of the replay and cancellations
5498  * takes place in the log item type specific routines.
5499  *
5500  * The table of items which have cancel records in the log is allocated
5501  * and freed at this level, since only here do we know when all of
5502  * the log recovery has been completed.
5503  */
5504 STATIC int
5505 xlog_do_log_recovery(
5506         struct xlog     *log,
5507         xfs_daddr_t     head_blk,
5508         xfs_daddr_t     tail_blk)
5509 {
5510         int             error, i;
5511
5512         ASSERT(head_blk != tail_blk);
5513
5514         /*
5515          * First do a pass to find all of the cancelled buf log items.
5516          * Store them in the buf_cancel_table for use in the second pass.
5517          */
5518         log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
5519                                                  sizeof(struct list_head),
5520                                                  KM_SLEEP);
5521         for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5522                 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
5523
5524         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
5525                                       XLOG_RECOVER_PASS1, NULL);
5526         if (error != 0) {
5527                 kmem_free(log->l_buf_cancel_table);
5528                 log->l_buf_cancel_table = NULL;
5529                 return error;
5530         }
5531         /*
5532          * Then do a second pass to actually recover the items in the log.
5533          * When it is complete free the table of buf cancel items.
5534          */
5535         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
5536                                       XLOG_RECOVER_PASS2, NULL);
5537 #ifdef DEBUG
5538         if (!error) {
5539                 int     i;
5540
5541                 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5542                         ASSERT(list_empty(&log->l_buf_cancel_table[i]));
5543         }
5544 #endif  /* DEBUG */
5545
5546         kmem_free(log->l_buf_cancel_table);
5547         log->l_buf_cancel_table = NULL;
5548
5549         return error;
5550 }
5551
5552 /*
5553  * Do the actual recovery
5554  */
5555 STATIC int
5556 xlog_do_recover(
5557         struct xlog     *log,
5558         xfs_daddr_t     head_blk,
5559         xfs_daddr_t     tail_blk)
5560 {
5561         struct xfs_mount *mp = log->l_mp;
5562         int             error;
5563         xfs_buf_t       *bp;
5564         xfs_sb_t        *sbp;
5565
5566         /*
5567          * First replay the images in the log.
5568          */
5569         error = xlog_do_log_recovery(log, head_blk, tail_blk);
5570         if (error)
5571                 return error;
5572
5573         /*
5574          * If IO errors happened during recovery, bail out.
5575          */
5576         if (XFS_FORCED_SHUTDOWN(mp)) {
5577                 return -EIO;
5578         }
5579
5580         /*
5581          * We now update the tail_lsn since much of the recovery has completed
5582          * and there may be space available to use.  If there were no extent
5583          * or iunlinks, we can free up the entire log and set the tail_lsn to
5584          * be the last_sync_lsn.  This was set in xlog_find_tail to be the
5585          * lsn of the last known good LR on disk.  If there are extent frees
5586          * or iunlinks they will have some entries in the AIL; so we look at
5587          * the AIL to determine how to set the tail_lsn.
5588          */
5589         xlog_assign_tail_lsn(mp);
5590
5591         /*
5592          * Now that we've finished replaying all buffer and inode
5593          * updates, re-read in the superblock and reverify it.
5594          */
5595         bp = xfs_getsb(mp, 0);
5596         bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
5597         ASSERT(!(bp->b_flags & XBF_WRITE));
5598         bp->b_flags |= XBF_READ;
5599         bp->b_ops = &xfs_sb_buf_ops;
5600
5601         error = xfs_buf_submit_wait(bp);
5602         if (error) {
5603                 if (!XFS_FORCED_SHUTDOWN(mp)) {
5604                         xfs_buf_ioerror_alert(bp, __func__);
5605                         ASSERT(0);
5606                 }
5607                 xfs_buf_relse(bp);
5608                 return error;
5609         }
5610
5611         /* Convert superblock from on-disk format */
5612         sbp = &mp->m_sb;
5613         xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
5614         xfs_buf_relse(bp);
5615
5616         /* re-initialise in-core superblock and geometry structures */
5617         xfs_reinit_percpu_counters(mp);
5618         error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
5619         if (error) {
5620                 xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
5621                 return error;
5622         }
5623         mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
5624
5625         xlog_recover_check_summary(log);
5626
5627         /* Normal transactions can now occur */
5628         log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
5629         return 0;
5630 }
5631
5632 /*
5633  * Perform recovery and re-initialize some log variables in xlog_find_tail.
5634  *
5635  * Return error or zero.
5636  */
5637 int
5638 xlog_recover(
5639         struct xlog     *log)
5640 {
5641         xfs_daddr_t     head_blk, tail_blk;
5642         int             error;
5643
5644         /* find the tail of the log */
5645         error = xlog_find_tail(log, &head_blk, &tail_blk);
5646         if (error)
5647                 return error;
5648
5649         /*
5650          * The superblock was read before the log was available and thus the LSN
5651          * could not be verified. Check the superblock LSN against the current
5652          * LSN now that it's known.
5653          */
5654         if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
5655             !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
5656                 return -EINVAL;
5657
5658         if (tail_blk != head_blk) {
5659                 /* There used to be a comment here:
5660                  *
5661                  * disallow recovery on read-only mounts.  note -- mount
5662                  * checks for ENOSPC and turns it into an intelligent
5663                  * error message.
5664                  * ...but this is no longer true.  Now, unless you specify
5665                  * NORECOVERY (in which case this function would never be
5666                  * called), we just go ahead and recover.  We do this all
5667                  * under the vfs layer, so we can get away with it unless
5668                  * the device itself is read-only, in which case we fail.
5669                  */
5670                 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
5671                         return error;
5672                 }
5673
5674                 /*
5675                  * Version 5 superblock log feature mask validation. We know the
5676                  * log is dirty so check if there are any unknown log features
5677                  * in what we need to recover. If there are unknown features
5678                  * (e.g. unsupported transactions, then simply reject the
5679                  * attempt at recovery before touching anything.
5680                  */
5681                 if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
5682                     xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
5683                                         XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
5684                         xfs_warn(log->l_mp,
5685 "Superblock has unknown incompatible log features (0x%x) enabled.",
5686                                 (log->l_mp->m_sb.sb_features_log_incompat &
5687                                         XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
5688                         xfs_warn(log->l_mp,
5689 "The log can not be fully and/or safely recovered by this kernel.");
5690                         xfs_warn(log->l_mp,
5691 "Please recover the log on a kernel that supports the unknown features.");
5692                         return -EINVAL;
5693                 }
5694
5695                 /*
5696                  * Delay log recovery if the debug hook is set. This is debug
5697                  * instrumention to coordinate simulation of I/O failures with
5698                  * log recovery.
5699                  */
5700                 if (xfs_globals.log_recovery_delay) {
5701                         xfs_notice(log->l_mp,
5702                                 "Delaying log recovery for %d seconds.",
5703                                 xfs_globals.log_recovery_delay);
5704                         msleep(xfs_globals.log_recovery_delay * 1000);
5705                 }
5706
5707                 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
5708                                 log->l_mp->m_logname ? log->l_mp->m_logname
5709                                                      : "internal");
5710
5711                 error = xlog_do_recover(log, head_blk, tail_blk);
5712                 log->l_flags |= XLOG_RECOVERY_NEEDED;
5713         }
5714         return error;
5715 }
5716
5717 /*
5718  * In the first part of recovery we replay inodes and buffers and build
5719  * up the list of extent free items which need to be processed.  Here
5720  * we process the extent free items and clean up the on disk unlinked
5721  * inode lists.  This is separated from the first part of recovery so
5722  * that the root and real-time bitmap inodes can be read in from disk in
5723  * between the two stages.  This is necessary so that we can free space
5724  * in the real-time portion of the file system.
5725  */
5726 int
5727 xlog_recover_finish(
5728         struct xlog     *log)
5729 {
5730         /*
5731          * Now we're ready to do the transactions needed for the
5732          * rest of recovery.  Start with completing all the extent
5733          * free intent records and then process the unlinked inode
5734          * lists.  At this point, we essentially run in normal mode
5735          * except that we're still performing recovery actions
5736          * rather than accepting new requests.
5737          */
5738         if (log->l_flags & XLOG_RECOVERY_NEEDED) {
5739                 int     error;
5740                 error = xlog_recover_process_intents(log);
5741                 if (error) {
5742                         xfs_alert(log->l_mp, "Failed to recover intents");
5743                         return error;
5744                 }
5745
5746                 /*
5747                  * Sync the log to get all the intents out of the AIL.
5748                  * This isn't absolutely necessary, but it helps in
5749                  * case the unlink transactions would have problems
5750                  * pushing the intents out of the way.
5751                  */
5752                 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
5753
5754                 xlog_recover_process_iunlinks(log);
5755
5756                 xlog_recover_check_summary(log);
5757
5758                 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
5759                                 log->l_mp->m_logname ? log->l_mp->m_logname
5760                                                      : "internal");
5761                 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
5762         } else {
5763                 xfs_info(log->l_mp, "Ending clean mount");
5764         }
5765         return 0;
5766 }
5767
5768 int
5769 xlog_recover_cancel(
5770         struct xlog     *log)
5771 {
5772         int             error = 0;
5773
5774         if (log->l_flags & XLOG_RECOVERY_NEEDED)
5775                 error = xlog_recover_cancel_intents(log);
5776
5777         return error;
5778 }
5779
5780 #if defined(DEBUG)
5781 /*
5782  * Read all of the agf and agi counters and check that they
5783  * are consistent with the superblock counters.
5784  */
5785 void
5786 xlog_recover_check_summary(
5787         struct xlog     *log)
5788 {
5789         xfs_mount_t     *mp;
5790         xfs_agf_t       *agfp;
5791         xfs_buf_t       *agfbp;
5792         xfs_buf_t       *agibp;
5793         xfs_agnumber_t  agno;
5794         uint64_t        freeblks;
5795         uint64_t        itotal;
5796         uint64_t        ifree;
5797         int             error;
5798
5799         mp = log->l_mp;
5800
5801         freeblks = 0LL;
5802         itotal = 0LL;
5803         ifree = 0LL;
5804         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
5805                 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
5806                 if (error) {
5807                         xfs_alert(mp, "%s agf read failed agno %d error %d",
5808                                                 __func__, agno, error);
5809                 } else {
5810                         agfp = XFS_BUF_TO_AGF(agfbp);
5811                         freeblks += be32_to_cpu(agfp->agf_freeblks) +
5812                                     be32_to_cpu(agfp->agf_flcount);
5813                         xfs_buf_relse(agfbp);
5814                 }
5815
5816                 error = xfs_read_agi(mp, NULL, agno, &agibp);
5817                 if (error) {
5818                         xfs_alert(mp, "%s agi read failed agno %d error %d",
5819                                                 __func__, agno, error);
5820                 } else {
5821                         struct xfs_agi  *agi = XFS_BUF_TO_AGI(agibp);
5822
5823                         itotal += be32_to_cpu(agi->agi_count);
5824                         ifree += be32_to_cpu(agi->agi_freecount);
5825                         xfs_buf_relse(agibp);
5826                 }
5827         }
5828 }
5829 #endif /* DEBUG */