Merge tag 'xfs-5.1-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Mar 2019 17:38:51 +0000 (09:38 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Mar 2019 17:38:51 +0000 (09:38 -0800)
Pull xfs updates from Darrick Wong:
 "Here are a number of new features and bug fixes for 5.1

  They've undergone a week's worth of fstesting and merge cleanly with
  master as of this morning

  Most of the changes center on improving metadata validation and fixing
  problems with online fsck, though there's also a new cache to speed up
  unlinked inode handling and cleanup of the copy on write code in
  preparation for future features

  Changes for Linux 5.1:

   - Fix online fsck to handle inode btrees correctly on 64k block
     filesystems

   - Teach online fsck to check directory and attribute names for
     invalid characters

   - Miscellaneous fixes for online fsck

   - Introduce a new panic mask so that we can halt immediately on
     metadata corruption (for debugging purposes)

   - Fix a block mapping race during writeback

   - Cache unlinked inode list backrefs in memory to speed up list
     processing

   - Separate the bnobt/cntbt and inobt/finobt buffer verifiers so that
     we can detect crosslinked btrees

   - Refactor magic number verification so that we can standardize it

   - Strengthen ondisk metadata structure offset build time verification

   - Fix a memory corruption problem in the listxattr code

   - Fix a shutdown problem during log recovery due to unreserved finobt
     expansion

   - Fix a referential integrity problem where O_TMPFILE inodes were put
     on the unlinked list with nlink > 0 which would cause asserts
     during log recovery if the system went down immediately

   - Refactor the delayed allocation allocator to be more clever about
     the possibility that its mapping might be stale

   - Various fixes to the copy on write mechanism

   - Make CoW preallocation suitable for use even with writes that
     wouldn't otherwise require it

   - Refactor an internal API

   - Fix some statx implementation bugs

   - Fix miscellaneous compiler and static checker complaints"

* tag 'xfs-5.1-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (70 commits)
  xfs: fix reporting supported extra file attributes for statx()
  xfs: fix backwards endian conversion in scrub
  xfs: fix uninitialized error variables
  xfs: rework breaking of shared extents in xfs_file_iomap_begin
  xfs: don't pass iomap flags to xfs_reflink_allocate_cow
  xfs: fix uninitialized error variable
  xfs: introduce an always_cow mode
  xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay
  xfs: make COW fork unwritten extent conversions more robust
  xfs: merge COW handling into xfs_file_iomap_begin_delay
  xfs: also truncate holes covered by COW blocks
  xfs: don't use delalloc extents for COW on files with extsize hints
  xfs: fix SEEK_DATA for speculative COW fork preallocation
  xfs: make xfs_bmbt_to_iomap more useful
  xfs: fix xfs_buf magic number endian checks
  xfs: retry COW fork delalloc conversion when no extent was found
  xfs: remove the truncate short cut in xfs_map_blocks
  xfs: move xfs_iomap_write_allocate to xfs_aops.c
  xfs: move stat accounting to xfs_bmapi_convert_delalloc
  xfs: move transaction handling to xfs_bmapi_convert_delalloc
  ..

77 files changed:
Documentation/filesystems/xfs.txt
fs/xfs/libxfs/xfs_ag.c
fs/xfs/libxfs/xfs_ag_resv.c
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc_btree.c
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_attr.h
fs/xfs/libxfs/xfs_attr_leaf.c
fs/xfs/libxfs/xfs_attr_remote.c
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_bmap_btree.c
fs/xfs/libxfs/xfs_da_btree.c
fs/xfs/libxfs/xfs_da_format.h
fs/xfs/libxfs/xfs_dir2.c
fs/xfs/libxfs/xfs_dir2.h
fs/xfs/libxfs/xfs_dir2_block.c
fs/xfs/libxfs/xfs_dir2_data.c
fs/xfs/libxfs/xfs_dir2_leaf.c
fs/xfs/libxfs/xfs_dir2_node.c
fs/xfs/libxfs/xfs_dquot_buf.c
fs/xfs/libxfs/xfs_errortag.h
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_ialloc_btree.c
fs/xfs/libxfs/xfs_iext_tree.c
fs/xfs/libxfs/xfs_inode_buf.c
fs/xfs/libxfs/xfs_inode_fork.h
fs/xfs/libxfs/xfs_refcount_btree.c
fs/xfs/libxfs/xfs_rmap_btree.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_shared.h
fs/xfs/libxfs/xfs_symlink_remote.c
fs/xfs/libxfs/xfs_types.c
fs/xfs/libxfs/xfs_types.h
fs/xfs/scrub/agheader.c
fs/xfs/scrub/agheader_repair.c
fs/xfs/scrub/attr.c
fs/xfs/scrub/bmap.c
fs/xfs/scrub/dir.c
fs/xfs/scrub/ialloc.c
fs/xfs/scrub/repair.c
fs/xfs/scrub/repair.h
fs/xfs/scrub/rtbitmap.c
fs/xfs/scrub/trace.h
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_attr_list.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_file.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_globals.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iomap.h
fs/xfs/xfs_iops.c
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_ondisk.h
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_reflink.c
fs/xfs/xfs_reflink.h
fs/xfs/xfs_super.c
fs/xfs/xfs_sysctl.h
fs/xfs/xfs_sysfs.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans_bmap.c
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_trans_extfree.c
fs/xfs/xfs_trans_refcount.c
fs/xfs/xfs_trans_rmap.c
fs/xfs/xfs_xattr.c

index 9ccfd1bc6201862e4eae6a5be252287a512151be..a5cbb5e0e3db48b6f6275cc5098845cabd109c4f 100644 (file)
@@ -272,7 +272,7 @@ The following sysctls are available for the XFS filesystem:
                XFS_ERRLEVEL_LOW:       1
                XFS_ERRLEVEL_HIGH:      5
 
-  fs.xfs.panic_mask            (Min: 0  Default: 0  Max: 255)
+  fs.xfs.panic_mask            (Min: 0  Default: 0  Max: 256)
        Causes certain error conditions to call BUG(). Value is a bitmask;
        OR together the tags which represent errors which should cause panics:
 
@@ -285,6 +285,7 @@ The following sysctls are available for the XFS filesystem:
                XFS_PTAG_SHUTDOWN_IOERROR       0x00000020
                XFS_PTAG_SHUTDOWN_LOGERROR      0x00000040
                XFS_PTAG_FSBLOCK_ZERO           0x00000080
+               XFS_PTAG_VERIFIER_ERROR         0x00000100
 
        This option is intended for debugging only.
 
index 999ad8d00d433b278554e7df259c076dec95f14e..1ef8acf35e7d8f33aea4ed771dd8e9aaf23eedd8 100644 (file)
@@ -339,14 +339,14 @@ xfs_ag_init_headers(
        { /* BNO root block */
                .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
                .numblks = BTOBB(mp->m_sb.sb_blocksize),
-               .ops = &xfs_allocbt_buf_ops,
+               .ops = &xfs_bnobt_buf_ops,
                .work = &xfs_bnoroot_init,
                .need_init = true
        },
        { /* CNT root block */
                .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
                .numblks = BTOBB(mp->m_sb.sb_blocksize),
-               .ops = &xfs_allocbt_buf_ops,
+               .ops = &xfs_cntbt_buf_ops,
                .work = &xfs_cntroot_init,
                .need_init = true
        },
@@ -361,7 +361,7 @@ xfs_ag_init_headers(
        { /* FINO root block */
                .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
                .numblks = BTOBB(mp->m_sb.sb_blocksize),
-               .ops = &xfs_inobt_buf_ops,
+               .ops = &xfs_finobt_buf_ops,
                .work = &xfs_btroot_init,
                .type = XFS_BTNUM_FINO,
                .need_init =  xfs_sb_version_hasfinobt(&mp->m_sb)
index e701ebc36c069f5696c5b6287474bf37fad4b05c..e2ba2a3b63b20a6378283e35e1c58c939f1d2476 100644 (file)
@@ -281,7 +281,7 @@ xfs_ag_resv_init(
                         */
                        ask = used = 0;
 
-                       mp->m_inotbt_nores = true;
+                       mp->m_finobt_nores = true;
 
                        error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
                                        &used);
index b715668886a4824adf8a1c4527236469afed89fe..bc3367b8b7bb0375d38a9462800916f083a5ff10 100644 (file)
@@ -568,9 +568,9 @@ xfs_agfl_verify(
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return NULL;
 
-       if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+       if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
                return __this_address;
-       if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+       if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
                return __this_address;
        /*
         * during growfs operations, the perag is not fully initialised,
@@ -643,6 +643,7 @@ xfs_agfl_write_verify(
 
 const struct xfs_buf_ops xfs_agfl_buf_ops = {
        .name = "xfs_agfl",
+       .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
        .verify_read = xfs_agfl_read_verify,
        .verify_write = xfs_agfl_write_verify,
        .verify_struct = xfs_agfl_verify,
@@ -2587,8 +2588,10 @@ xfs_agf_verify(
                        return __this_address;
        }
 
-       if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-             XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+       if (!xfs_verify_magic(bp, agf->agf_magicnum))
+               return __this_address;
+
+       if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
              be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
              be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
              be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2670,6 +2673,7 @@ xfs_agf_write_verify(
 
 const struct xfs_buf_ops xfs_agf_buf_ops = {
        .name = "xfs_agf",
+       .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
        .verify_read = xfs_agf_read_verify,
        .verify_write = xfs_agf_write_verify,
        .verify_struct = xfs_agf_verify,
index 4e59cc8a280221973279f262e2ee613ba61a5b3d..9fe949f6055ec32e89e08d3cf01608cdbb678f42 100644 (file)
@@ -297,48 +297,34 @@ xfs_allocbt_verify(
        struct xfs_perag        *pag = bp->b_pag;
        xfs_failaddr_t          fa;
        unsigned int            level;
+       xfs_btnum_t             btnum = XFS_BTNUM_BNOi;
+
+       if (!xfs_verify_magic(bp, block->bb_magic))
+               return __this_address;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               fa = xfs_btree_sblock_v5hdr_verify(bp);
+               if (fa)
+                       return fa;
+       }
 
        /*
-        * magic number and level verification
-        *
-        * During growfs operations, we can't verify the exact level or owner as
-        * the perag is not fully initialised and hence not attached to the
-        * buffer.  In this case, check against the maximum tree depth.
+        * The perag may not be attached during grow operations or fully
+        * initialized from the AGF during log recovery. Therefore we can only
+        * check against maximum tree depth from those contexts.
         *
-        * Similarly, during log recovery we will have a perag structure
-        * attached, but the agf information will not yet have been initialised
-        * from the on disk AGF. Again, we can only check against maximum limits
-        * in this case.
+        * Otherwise check against the per-tree limit. Peek at one of the
+        * verifier magic values to determine the type of tree we're verifying
+        * against.
         */
        level = be16_to_cpu(block->bb_level);
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
-               fa = xfs_btree_sblock_v5hdr_verify(bp);
-               if (fa)
-                       return fa;
-               /* fall through */
-       case cpu_to_be32(XFS_ABTB_MAGIC):
-               if (pag && pag->pagf_init) {
-                       if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
-                               return __this_address;
-               } else if (level >= mp->m_ag_maxlevels)
+       if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+               btnum = XFS_BTNUM_CNTi;
+       if (pag && pag->pagf_init) {
+               if (level >= pag->pagf_levels[btnum])
                        return __this_address;
-               break;
-       case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
-               fa = xfs_btree_sblock_v5hdr_verify(bp);
-               if (fa)
-                       return fa;
-               /* fall through */
-       case cpu_to_be32(XFS_ABTC_MAGIC):
-               if (pag && pag->pagf_init) {
-                       if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
-                               return __this_address;
-               } else if (level >= mp->m_ag_maxlevels)
-                       return __this_address;
-               break;
-       default:
+       } else if (level >= mp->m_ag_maxlevels)
                return __this_address;
-       }
 
        return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
 }
@@ -377,13 +363,23 @@ xfs_allocbt_write_verify(
 
 }
 
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
-       .name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+       .name = "xfs_bnobt",
+       .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+                  cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
        .verify_read = xfs_allocbt_read_verify,
        .verify_write = xfs_allocbt_write_verify,
        .verify_struct = xfs_allocbt_verify,
 };
 
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+       .name = "xfs_cntbt",
+       .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+                  cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+       .verify_read = xfs_allocbt_read_verify,
+       .verify_write = xfs_allocbt_write_verify,
+       .verify_struct = xfs_allocbt_verify,
+};
 
 STATIC int
 xfs_bnobt_keys_inorder(
@@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
        .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
        .key_diff               = xfs_bnobt_key_diff,
-       .buf_ops                = &xfs_allocbt_buf_ops,
+       .buf_ops                = &xfs_bnobt_buf_ops,
        .diff_two_keys          = xfs_bnobt_diff_two_keys,
        .keys_inorder           = xfs_bnobt_keys_inorder,
        .recs_inorder           = xfs_bnobt_recs_inorder,
@@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
        .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
        .key_diff               = xfs_cntbt_key_diff,
-       .buf_ops                = &xfs_allocbt_buf_ops,
+       .buf_ops                = &xfs_cntbt_buf_ops,
        .diff_two_keys          = xfs_cntbt_diff_two_keys,
        .keys_inorder           = xfs_cntbt_keys_inorder,
        .recs_inorder           = xfs_cntbt_recs_inorder,
index 844ed87b190077115c760204659179bca1da8c43..2dd9ee2a2e08f2b99fedf66075fe44d65e3d16d2 100644 (file)
@@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args)
        xfs_da_state_free(state);
        return retval;
 }
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+       const void      *name,
+       size_t          length)
+{
+       /*
+        * MAXNAMELEN includes the trailing null, but (name/length) leave it
+        * out, so use >= for the length check.
+        */
+       if (length >= MAXNAMELEN)
+               return false;
+
+       /* There shouldn't be any nulls here */
+       return !memchr(name, 0, length);
+}
index bdf52a333f3f9a2d7d4492d54b0ecf9a919402a5..2297d84676669ff25aaad45f86686e6671a1d182 100644 (file)
@@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_remove_args(struct xfs_da_args *args);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
                  int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
 
 #endif /* __XFS_ATTR_H__ */
index 2652d00842d6ba8c6479f816765c87dfc622d1cb..1f6e3965ff7425456ca64477a713573cb5e7943a 100644 (file)
@@ -245,25 +245,14 @@ xfs_attr3_leaf_verify(
        struct xfs_attr_leaf_entry      *entries;
        uint32_t                        end;    /* must be 32bit - see below */
        int                             i;
+       xfs_failaddr_t                  fa;
 
        xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
 
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-               if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
-                       return __this_address;
+       fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+       if (fa)
+               return fa;
 
-               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
-                       return __this_address;
-               if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-                       return __this_address;
-               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
-                       return __this_address;
-       } else {
-               if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
-                       return __this_address;
-       }
        /*
         * In recovery there is a transient state where count == 0 is valid
         * because we may have transitioned an empty shortform attr to a leaf
@@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify(
 
 const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
        .name = "xfs_attr3_leaf",
+       .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+                    cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
        .verify_read = xfs_attr3_leaf_read_verify,
        .verify_write = xfs_attr3_leaf_write_verify,
        .verify_struct = xfs_attr3_leaf_verify,
index d89363c6b5234d73cef58d4e9533a88f6de09c46..65ff600a8067875f3d898481e1ef2c271d55bdd9 100644 (file)
@@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok(
 static xfs_failaddr_t
 xfs_attr3_rmt_verify(
        struct xfs_mount        *mp,
+       struct xfs_buf          *bp,
        void                    *ptr,
        int                     fsbsize,
        xfs_daddr_t             bno)
@@ -87,7 +88,7 @@ xfs_attr3_rmt_verify(
 
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return __this_address;
-       if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+       if (!xfs_verify_magic(bp, rmt->rm_magic))
                return __this_address;
        if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
                return __this_address;
@@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify(
                        *failaddr = __this_address;
                        return -EFSBADCRC;
                }
-               *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+               *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
                if (*failaddr)
                        return -EFSCORRUPTED;
                len -= blksize;
@@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify(
        while (len > 0) {
                struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
 
-               fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+               fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
                if (fa) {
                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
                        return;
@@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify(
 
 const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
        .name = "xfs_attr3_rmt",
+       .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
        .verify_read = xfs_attr3_rmt_read_verify,
        .verify_write = xfs_attr3_rmt_write_verify,
        .verify_struct = xfs_attr3_rmt_verify_struct,
index 332eefa2700ba7c86e480533b4d0de45339f26f8..48502cb9990f184a55b780372adaef3bda406509 100644 (file)
@@ -577,42 +577,44 @@ __xfs_bmap_add_free(
  */
 
 /*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
  */
 STATIC int                             /* error */
 xfs_bmap_btree_to_extents(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_inode_t             *ip,    /* incore inode pointer */
-       xfs_btree_cur_t         *cur,   /* btree cursor */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       struct xfs_inode        *ip,    /* incore inode pointer */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
        int                     *logflagsp, /* inode logging flags */
        int                     whichfork)  /* data or attr fork */
 {
-       /* REFERENCED */
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_btree_block  *rblock = ifp->if_broot;
        struct xfs_btree_block  *cblock;/* child btree block */
        xfs_fsblock_t           cbno;   /* child block number */
        xfs_buf_t               *cbp;   /* child block's buffer */
        int                     error;  /* error return value */
-       struct xfs_ifork        *ifp;   /* inode fork data */
-       xfs_mount_t             *mp;    /* mount point structure */
        __be64                  *pp;    /* ptr to block address */
-       struct xfs_btree_block  *rblock;/* root btree block */
        struct xfs_owner_info   oinfo;
 
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
+       /* check if we actually need the extent format first: */
+       if (!xfs_bmap_wants_extents(ip, whichfork))
+               return 0;
+
+       ASSERT(cur);
        ASSERT(whichfork != XFS_COW_FORK);
        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-       rblock = ifp->if_broot;
        ASSERT(be16_to_cpu(rblock->bb_level) == 1);
        ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
        ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
        pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
        cbno = be64_to_cpu(*pp);
-       *logflagsp = 0;
 #ifdef DEBUG
        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
                        xfs_btree_check_lptr(cur, cbno, 1));
@@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents(
        ASSERT(ifp->if_broot == NULL);
        ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
        XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-       *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+       *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
        return 0;
 }
 
@@ -2029,7 +2031,7 @@ done:
 /*
  * Convert an unwritten allocation to a real allocation or vice versa.
  */
-STATIC int                             /* error */
+int                                    /* error */
 xfs_bmap_add_extent_unwritten_real(
        struct xfs_trans        *tp,
        xfs_inode_t             *ip,    /* incore inode pointer */
@@ -3685,17 +3687,6 @@ xfs_trim_extent(
        }
 }
 
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
-       struct xfs_bmbt_irec    *irec,
-       struct xfs_inode        *ip)
-
-{
-       xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
-                                             i_size_read(VFS_I(ip))));
-}
-
 /*
  * Trim the returned map to the required bounds
  */
@@ -4203,6 +4194,44 @@ xfs_bmapi_convert_unwritten(
        return 0;
 }
 
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       int                     fork)
+{
+       if (tp && tp->t_firstblock != NULLFSBLOCK)
+               return 0;
+       if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+               return 1;
+       return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even if error.  Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should.  Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+       struct xfs_bmalloca     *bma,
+       int                     whichfork,
+       int                     error)
+{
+       if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+           XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+               bma->logflags &= ~xfs_ilog_fext(whichfork);
+       else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+                XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+               bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+       if (bma->logflags)
+               xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+       if (bma->cur)
+               xfs_btree_del_cursor(bma->cur, error);
+}
+
 /*
  * Map file blocks to filesystem blocks, and allocate blocks or convert the
  * extent state if necessary.  Details behaviour is controlled by the flags
@@ -4247,9 +4276,7 @@ xfs_bmapi_write(
 
        ASSERT(*nmap >= 1);
        ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-       ASSERT(tp != NULL ||
-              (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
-                       (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+       ASSERT(tp != NULL);
        ASSERT(len > 0);
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4282,25 +4309,12 @@ xfs_bmapi_write(
 
        XFS_STATS_INC(mp, xs_blk_mapw);
 
-       if (!tp || tp->t_firstblock == NULLFSBLOCK) {
-               if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
-                       bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-               else
-                       bma.minleft = 1;
-       } else {
-               bma.minleft = 0;
-       }
-
        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(tp, ip, whichfork);
                if (error)
                        goto error0;
        }
 
-       n = 0;
-       end = bno + len;
-       obno = bno;
-
        if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
                eof = true;
        if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
@@ -4309,7 +4323,11 @@ xfs_bmapi_write(
        bma.ip = ip;
        bma.total = total;
        bma.datatype = 0;
+       bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
 
+       n = 0;
+       end = bno + len;
+       obno = bno;
        while (bno < end && n < *nmap) {
                bool                    need_alloc = false, wasdelay = false;
 
@@ -4323,26 +4341,7 @@ xfs_bmapi_write(
                        ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
                                 (flags & XFS_BMAPI_COWFORK)));
 
-                       if (flags & XFS_BMAPI_DELALLOC) {
-                               /*
-                                * For the COW fork we can reasonably get a
-                                * request for converting an extent that races
-                                * with other threads already having converted
-                                * part of it, as there converting COW to
-                                * regular blocks is not protected using the
-                                * IOLOCK.
-                                */
-                               ASSERT(flags & XFS_BMAPI_COWFORK);
-                               if (!(flags & XFS_BMAPI_COWFORK)) {
-                                       error = -EIO;
-                                       goto error0;
-                               }
-
-                               if (eof || bno >= end)
-                                       break;
-                       } else {
-                               need_alloc = true;
-                       }
+                       need_alloc = true;
                } else if (isnullstartblock(bma.got.br_startblock)) {
                        wasdelay = true;
                }
@@ -4351,8 +4350,7 @@ xfs_bmapi_write(
                 * First, deal with the hole before the allocated space
                 * that we found, if any.
                 */
-               if ((need_alloc || wasdelay) &&
-                   !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+               if (need_alloc || wasdelay) {
                        bma.eof = eof;
                        bma.conv = !!(flags & XFS_BMAPI_CONVERT);
                        bma.wasdel = wasdelay;
@@ -4420,49 +4418,130 @@ xfs_bmapi_write(
        }
        *nmap = n;
 
-       /*
-        * Transform from btree to extents, give it cur.
-        */
-       if (xfs_bmap_wants_extents(ip, whichfork)) {
-               int             tmp_logflags = 0;
-
-               ASSERT(bma.cur);
-               error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
-                       &tmp_logflags, whichfork);
-               bma.logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
-       }
+       error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+                       whichfork);
+       if (error)
+               goto error0;
 
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
               XFS_IFORK_NEXTENTS(ip, whichfork) >
                XFS_IFORK_MAXEXT(ip, whichfork));
-       error = 0;
+       xfs_bmapi_finish(&bma, whichfork, 0);
+       xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+               orig_nmap, *nmap);
+       return 0;
 error0:
+       xfs_bmapi_finish(&bma, whichfork, error);
+       return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       xfs_fileoff_t           offset_fsb,
+       struct xfs_bmbt_irec    *imap,
+       unsigned int            *seq)
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_bmalloca     bma = { NULL };
+       struct xfs_trans        *tp;
+       int                     error;
+
        /*
-        * Log everything.  Do this after conversion, there's no point in
-        * logging the extent records if we've converted to btree format.
+        * Space for the extent and indirect blocks was reserved when the
+        * delalloc extent was created, so no new reservation is needed here.
         */
-       if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-               bma.logflags &= ~xfs_ilog_fext(whichfork);
-       else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
-                XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-               bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+                               XFS_TRANS_RESERVE, &tp);
+       if (error)
+               return error;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
+
+       if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+           bma.got.br_startoff > offset_fsb) {
+               /*
+                * No extent found in the range we are trying to convert.  This
+                * should only happen for the COW fork, where another thread
+                * might have moved the extent to the data fork in the meantime.
+                */
+               WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+               error = -EAGAIN;
+               goto out_trans_cancel;
+       }
+
        /*
-        * Log whatever the flags say, even if error.  Otherwise we might miss
-        * detecting a case where the data is changed, there's an error,
-        * and it's not logged so we don't shutdown when we should.
+        * If we find a real extent here we raced with another thread converting
+        * the extent.  Just return the real extent at this offset.
         */
-       if (bma.logflags)
-               xfs_trans_log_inode(tp, ip, bma.logflags);
+       if (!isnullstartblock(bma.got.br_startblock)) {
+               *imap = bma.got;
+               *seq = READ_ONCE(ifp->if_seq);
+               goto out_trans_cancel;
+       }
+
+       bma.tp = tp;
+       bma.ip = ip;
+       bma.wasdel = true;
+       bma.offset = bma.got.br_startoff;
+       bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+       bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+       bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+       if (whichfork == XFS_COW_FORK)
+               bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
 
-       if (bma.cur) {
-               xfs_btree_del_cursor(bma.cur, error);
+       if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+               bma.prev.br_startoff = NULLFILEOFF;
+
+       error = xfs_bmapi_allocate(&bma);
+       if (error)
+               goto out_finish;
+
+       error = -ENOSPC;
+       if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+               goto out_finish;
+       error = -EFSCORRUPTED;
+       if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+               goto out_finish;
+
+       XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+       XFS_STATS_INC(mp, xs_xstrat_quick);
+
+       ASSERT(!isnullstartblock(bma.got.br_startblock));
+       *imap = bma.got;
+       *seq = READ_ONCE(ifp->if_seq);
+
+       if (whichfork == XFS_COW_FORK) {
+               error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+                               bma.length);
+               if (error)
+                       goto out_finish;
        }
-       if (!error)
-               xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
-                       orig_nmap, *nmap);
+
+       error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+                       whichfork);
+       if (error)
+               goto out_finish;
+
+       xfs_bmapi_finish(&bma, whichfork, 0);
+       error = xfs_trans_commit(tp);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+
+out_finish:
+       xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+       xfs_trans_cancel(tp);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
 
@@ -4536,13 +4615,7 @@ xfs_bmapi_remap(
        if (error)
                goto error0;
 
-       if (xfs_bmap_wants_extents(ip, whichfork)) {
-               int             tmp_logflags = 0;
-
-               error = xfs_bmap_btree_to_extents(tp, ip, cur,
-                       &tmp_logflags, whichfork);
-               logflags |= tmp_logflags;
-       }
+       error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
 
 error0:
        if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -5406,24 +5479,11 @@ nodelete:
                error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
                                &tmp_logflags, whichfork);
                logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
-       }
-       /*
-        * transform from btree to extents, give it cur
-        */
-       else if (xfs_bmap_wants_extents(ip, whichfork)) {
-               ASSERT(cur != NULL);
-               error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+       } else {
+               error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
                        whichfork);
-               logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
        }
-       /*
-        * transform from extents to local?
-        */
-       error = 0;
+
 error0:
        /*
         * Log everything.  Do this after conversion, there's no point in
index 09d3ea97cc15a207cd9b6445ddab1372fc012146..8f597f9abdbe952e8e92fb4fc12f568fa9886cba 100644 (file)
@@ -95,12 +95,6 @@ struct xfs_extent_free_item
 /* Map something in the CoW fork. */
 #define XFS_BMAPI_COWFORK      0x200
 
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC     0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY 0x800
-
 /* Skip online discard of freed extents */
 #define XFS_BMAPI_NODISCARD    0x1000
 
@@ -117,8 +111,6 @@ struct xfs_extent_free_item
        { XFS_BMAPI_ZERO,       "ZERO" }, \
        { XFS_BMAPI_REMAP,      "REMAP" }, \
        { XFS_BMAPI_COWFORK,    "COWFORK" }, \
-       { XFS_BMAPI_DELALLOC,   "DELALLOC" }, \
-       { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
        { XFS_BMAPI_NODISCARD,  "NODISCARD" }, \
        { XFS_BMAPI_NORMAP,     "NORMAP" }
 
@@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
 
 void   xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
                xfs_filblks_t len);
-void   xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
 int    xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 int    xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
 void   xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
@@ -228,6 +219,13 @@ int        xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
                xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
                struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
                int eof);
+int    xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+               xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+               unsigned int *seq);
+int    xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+               struct xfs_inode *ip, int whichfork,
+               struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+               struct xfs_bmbt_irec *new, int *logflagsp);
 
 static inline void
 xfs_bmap_add_free(
index cdb74d2e2a435bc446778dc2e1c9f7a4d7318506..aff82ed112c93c26f43bed5ada5fd4b82e4e3711 100644 (file)
@@ -416,8 +416,10 @@ xfs_bmbt_verify(
        xfs_failaddr_t          fa;
        unsigned int            level;
 
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+       if (!xfs_verify_magic(bp, block->bb_magic))
+               return __this_address;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
                /*
                 * XXX: need a better way of verifying the owner here. Right now
                 * just make sure there has been one set.
@@ -425,11 +427,6 @@ xfs_bmbt_verify(
                fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
                if (fa)
                        return fa;
-               /* fall through */
-       case cpu_to_be32(XFS_BMAP_MAGIC):
-               break;
-       default:
-               return __this_address;
        }
 
        /*
@@ -481,6 +478,8 @@ xfs_bmbt_write_verify(
 
 const struct xfs_buf_ops xfs_bmbt_buf_ops = {
        .name = "xfs_bmbt",
+       .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+                  cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
        .verify_read = xfs_bmbt_read_verify,
        .verify_write = xfs_bmbt_write_verify,
        .verify_struct = xfs_bmbt_verify,
index 376bee94b5dd0b92ecc94cab2a92247a08adc88e..e2737e2ac2aeb5e31a997ee3ed5f3800bf5ecfa7 100644 (file)
@@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state)
        kmem_zone_free(xfs_da_state_zone, state);
 }
 
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+       struct xfs_buf          *bp,
+       struct xfs_da3_blkinfo  *hdr3)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_da_blkinfo   *hdr = &hdr3->hdr;
+
+       if (!xfs_verify_magic16(bp, hdr->magic))
+               return __this_address;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+                       return __this_address;
+               if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+                       return __this_address;
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+                       return __this_address;
+       }
+
+       return NULL;
+}
+
 static xfs_failaddr_t
 xfs_da3_node_verify(
        struct xfs_buf          *bp)
@@ -124,27 +152,16 @@ xfs_da3_node_verify(
        struct xfs_da_intnode   *hdr = bp->b_addr;
        struct xfs_da3_icnode_hdr ichdr;
        const struct xfs_dir_ops *ops;
+       xfs_failaddr_t          fa;
 
        ops = xfs_dir_get_ops(mp, NULL);
 
        ops->node_hdr_from_disk(&ichdr, hdr);
 
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-               if (ichdr.magic != XFS_DA3_NODE_MAGIC)
-                       return __this_address;
+       fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+       if (fa)
+               return fa;
 
-               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
-                       return __this_address;
-               if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-                       return __this_address;
-               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
-                       return __this_address;
-       } else {
-               if (ichdr.magic != XFS_DA_NODE_MAGIC)
-                       return __this_address;
-       }
        if (ichdr.level == 0)
                return __this_address;
        if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -257,6 +274,8 @@ xfs_da3_node_verify_struct(
 
 const struct xfs_buf_ops xfs_da3_node_buf_ops = {
        .name = "xfs_da3_node",
+       .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+                    cpu_to_be16(XFS_DA3_NODE_MAGIC) },
        .verify_read = xfs_da3_node_read_verify,
        .verify_write = xfs_da3_node_write_verify,
        .verify_struct = xfs_da3_node_verify_struct,
index 5d5bf3bffc783a1f3711cdf9edca5c2e4ccc2e4b..ae654e06b2fb693627311c8e59235d1f04e79773 100644 (file)
@@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
        return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
 }
 
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+                                     struct xfs_da3_blkinfo *hdr3);
+
 #endif /* __XFS_DA_FORMAT_H__ */
index 229152cd1a246f34bf9237572a4680bd13093ab7..156ce95c9c4545de6b03cd638463e23f7fa4746e 100644 (file)
@@ -703,3 +703,20 @@ xfs_dir2_shrink_inode(
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
        return 0;
 }
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+       const void      *name,
+       size_t          length)
+{
+       /*
+        * MAXNAMELEN includes the trailing null, but (name/length) leave it
+        * out, so use >= for the length check.
+        */
+       if (length >= MAXNAMELEN)
+               return false;
+
+       /* There shouldn't be any slashes or nulls here */
+       return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
index c3e3f6b813d869cb2f7a78bebc3b2a5765ab58ed..f542447794928e47c9eba683690866e105506aaa 100644 (file)
@@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
 unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
 void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
                struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
 
 #endif /* __XFS_DIR2_H__ */
index 30ed5919da7235e8885afbe15adacffad9949a79..b7d6d78f4ce2f3ef263fd54d8523702045dc5c40 100644 (file)
@@ -53,18 +53,16 @@ xfs_dir3_block_verify(
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
 
+       if (!xfs_verify_magic(bp, hdr3->magic))
+               return __this_address;
+
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
-                       return __this_address;
                if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return __this_address;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return __this_address;
                if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
                        return __this_address;
-       } else {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
-                       return __this_address;
        }
        return __xfs_dir3_data_check(NULL, bp);
 }
@@ -112,6 +110,8 @@ xfs_dir3_block_write_verify(
 
 const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
        .name = "xfs_dir3_block",
+       .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+                  cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
        .verify_read = xfs_dir3_block_read_verify,
        .verify_write = xfs_dir3_block_write_verify,
        .verify_struct = xfs_dir3_block_verify,
index 01162c62ec8f8fe49ac05c68a9b8925b2b693946..b7b9ce002cb97838d2413ad579d499c582fda3d6 100644 (file)
@@ -252,18 +252,16 @@ xfs_dir3_data_verify(
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
 
+       if (!xfs_verify_magic(bp, hdr3->magic))
+               return __this_address;
+
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
-                       return __this_address;
                if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return __this_address;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return __this_address;
                if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
                        return __this_address;
-       } else {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
-                       return __this_address;
        }
        return __xfs_dir3_data_check(NULL, bp);
 }
@@ -339,6 +337,8 @@ xfs_dir3_data_write_verify(
 
 const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
        .name = "xfs_dir3_data",
+       .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+                  cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
        .verify_read = xfs_dir3_data_read_verify,
        .verify_write = xfs_dir3_data_write_verify,
        .verify_struct = xfs_dir3_data_verify,
@@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
 
 static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
        .name = "xfs_dir3_data_reada",
+       .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+                  cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
        .verify_read = xfs_dir3_data_reada_verify,
        .verify_write = xfs_dir3_data_write_verify,
 };
index 1728a3e6f5cf7381460ffce134f166661e164dc3..9a3767818c507b61377434b971c7309b9e768050 100644 (file)
@@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int(
  */
 static xfs_failaddr_t
 xfs_dir3_leaf_verify(
-       struct xfs_buf          *bp,
-       uint16_t                magic)
+       struct xfs_buf          *bp)
 {
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+       xfs_failaddr_t          fa;
 
-       ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-               uint16_t                magic3;
-
-               magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
-                                                        : XFS_DIR3_LEAFN_MAGIC;
-
-               if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
-                       return __this_address;
-               if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
-                       return __this_address;
-               if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-                       return __this_address;
-               if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
-                       return __this_address;
-       } else {
-               if (leaf->hdr.info.magic != cpu_to_be16(magic))
-                       return __this_address;
-       }
+       fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+       if (fa)
+               return fa;
 
        return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
 }
 
 static void
-__read_verify(
-       struct xfs_buf  *bp,
-       uint16_t        magic)
+xfs_dir3_leaf_read_verify(
+       struct xfs_buf  *bp)
 {
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        xfs_failaddr_t          fa;
@@ -185,23 +166,22 @@ __read_verify(
             !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
                xfs_verifier_error(bp, -EFSBADCRC, __this_address);
        else {
-               fa = xfs_dir3_leaf_verify(bp, magic);
+               fa = xfs_dir3_leaf_verify(bp);
                if (fa)
                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
        }
 }
 
 static void
-__write_verify(
-       struct xfs_buf  *bp,
-       uint16_t        magic)
+xfs_dir3_leaf_write_verify(
+       struct xfs_buf  *bp)
 {
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_buf_log_item *bip = bp->b_log_item;
        struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
        xfs_failaddr_t          fa;
 
-       fa = xfs_dir3_leaf_verify(bp, magic);
+       fa = xfs_dir3_leaf_verify(bp);
        if (fa) {
                xfs_verifier_error(bp, -EFSCORRUPTED, fa);
                return;
@@ -216,60 +196,22 @@ __write_verify(
        xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
 }
 
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
-       struct xfs_buf  *bp)
-{
-       return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
-       struct xfs_buf  *bp)
-{
-       __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
-       struct xfs_buf  *bp)
-{
-       __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
-       struct xfs_buf  *bp)
-{
-       return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
-       struct xfs_buf  *bp)
-{
-       __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
-       struct xfs_buf  *bp)
-{
-       __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
 const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
        .name = "xfs_dir3_leaf1",
-       .verify_read = xfs_dir3_leaf1_read_verify,
-       .verify_write = xfs_dir3_leaf1_write_verify,
-       .verify_struct = xfs_dir3_leaf1_verify,
+       .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+                    cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+       .verify_read = xfs_dir3_leaf_read_verify,
+       .verify_write = xfs_dir3_leaf_write_verify,
+       .verify_struct = xfs_dir3_leaf_verify,
 };
 
 const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
        .name = "xfs_dir3_leafn",
-       .verify_read = xfs_dir3_leafn_read_verify,
-       .verify_write = xfs_dir3_leafn_write_verify,
-       .verify_struct = xfs_dir3_leafn_verify,
+       .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+                    cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+       .verify_read = xfs_dir3_leaf_read_verify,
+       .verify_write = xfs_dir3_leaf_write_verify,
+       .verify_struct = xfs_dir3_leaf_verify,
 };
 
 int
index f1bb3434f51c79d17fbc951b6e108d8c33b6865e..3b03703c5c3dbb3e3058356f76b0220bd37f300e 100644 (file)
@@ -87,20 +87,18 @@ xfs_dir3_free_verify(
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_dir2_free_hdr *hdr = bp->b_addr;
 
+       if (!xfs_verify_magic(bp, hdr->magic))
+               return __this_address;
+
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
 
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
-                       return __this_address;
                if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return __this_address;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return __this_address;
                if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
                        return __this_address;
-       } else {
-               if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
-                       return __this_address;
        }
 
        /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -151,6 +149,8 @@ xfs_dir3_free_write_verify(
 
 const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
        .name = "xfs_dir3_free",
+       .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+                  cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
        .verify_read = xfs_dir3_free_read_verify,
        .verify_write = xfs_dir3_free_write_verify,
        .verify_struct = xfs_dir3_free_verify,
index d293f371dd54bc70583407a3d7638c64efbe586c..fb5bd9a804f6a863452a47e8cc2f3be50d5378ef 100644 (file)
@@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify(
 
 const struct xfs_buf_ops xfs_dquot_buf_ops = {
        .name = "xfs_dquot",
+       .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+                    cpu_to_be16(XFS_DQUOT_MAGIC) },
        .verify_read = xfs_dquot_buf_read_verify,
        .verify_write = xfs_dquot_buf_write_verify,
        .verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
 
 const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
        .name = "xfs_dquot_ra",
+       .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+                    cpu_to_be16(XFS_DQUOT_MAGIC) },
        .verify_read = xfs_dquot_buf_readahead_verify,
        .verify_write = xfs_dquot_buf_write_verify,
 };
index 66077a105cbb7408131a6b9d5580e6a0c12599ca..79e6c4fb1d8a8440ae744c2bc032658da0bb5da0 100644 (file)
@@ -54,7 +54,8 @@
 #define XFS_ERRTAG_BUF_LRU_REF                         31
 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR                  32
 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC                        33
-#define XFS_ERRTAG_MAX                                 34
+#define XFS_ERRTAG_IUNLINK_FALLBACK                    34
+#define XFS_ERRTAG_MAX                                 35
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
 #define XFS_RANDOM_BUF_LRU_REF                         2
 #define XFS_RANDOM_FORCE_SCRUB_REPAIR                  1
 #define XFS_RANDOM_FORCE_SUMMARY_RECALC                        1
+#define XFS_RANDOM_IUNLINK_FALLBACK                    (XFS_RANDOM_DEFAULT/10)
 
 #endif /* __XFS_ERRORTAG_H_ */
index d32152fc8a6c56bae799c8999e1862210659cd0b..fe9898875097f5cd8506f9664f636a393cce2e2e 100644 (file)
@@ -2508,7 +2508,7 @@ xfs_agi_verify(
        /*
         * Validate the magic number of the agi block.
         */
-       if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+       if (!xfs_verify_magic(bp, agi->agi_magicnum))
                return __this_address;
        if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
                return __this_address;
@@ -2582,6 +2582,7 @@ xfs_agi_write_verify(
 
 const struct xfs_buf_ops xfs_agi_buf_ops = {
        .name = "xfs_agi",
+       .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
        .verify_read = xfs_agi_read_verify,
        .verify_write = xfs_agi_write_verify,
        .verify_struct = xfs_agi_verify,
index 9b25e7a0df470b6e5552d7e841a74082681a8cf8..1080381ff243e68ee3b6c89fef1e2e78341d8994 100644 (file)
@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
        union xfs_btree_ptr     *new,
        int                     *stat)
 {
-       if (cur->bc_mp->m_inotbt_nores)
+       if (cur->bc_mp->m_finobt_nores)
                return xfs_inobt_alloc_block(cur, start, new, stat);
        return __xfs_inobt_alloc_block(cur, start, new, stat,
                        XFS_AG_RESV_METADATA);
@@ -154,7 +154,7 @@ xfs_finobt_free_block(
        struct xfs_btree_cur    *cur,
        struct xfs_buf          *bp)
 {
-       if (cur->bc_mp->m_inotbt_nores)
+       if (cur->bc_mp->m_finobt_nores)
                return xfs_inobt_free_block(cur, bp);
        return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
 }
@@ -260,6 +260,9 @@ xfs_inobt_verify(
        xfs_failaddr_t          fa;
        unsigned int            level;
 
+       if (!xfs_verify_magic(bp, block->bb_magic))
+               return __this_address;
+
        /*
         * During growfs operations, we can't verify the exact owner as the
         * perag is not fully initialised and hence not attached to the buffer.
@@ -270,18 +273,10 @@ xfs_inobt_verify(
         * but beware of the landmine (i.e. need to check pag->pagi_init) if we
         * ever do.
         */
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_IBT_CRC_MAGIC):
-       case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
                fa = xfs_btree_sblock_v5hdr_verify(bp);
                if (fa)
                        return fa;
-               /* fall through */
-       case cpu_to_be32(XFS_IBT_MAGIC):
-       case cpu_to_be32(XFS_FIBT_MAGIC):
-               break;
-       default:
-               return __this_address;
        }
 
        /* level verification */
@@ -328,6 +323,16 @@ xfs_inobt_write_verify(
 
 const struct xfs_buf_ops xfs_inobt_buf_ops = {
        .name = "xfs_inobt",
+       .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+       .verify_read = xfs_inobt_read_verify,
+       .verify_write = xfs_inobt_write_verify,
+       .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+       .name = "xfs_finobt",
+       .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+                  cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
        .verify_read = xfs_inobt_read_verify,
        .verify_write = xfs_inobt_write_verify,
        .verify_struct = xfs_inobt_verify,
@@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
        .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_finobt_init_ptr_from_cur,
        .key_diff               = xfs_inobt_key_diff,
-       .buf_ops                = &xfs_inobt_buf_ops,
+       .buf_ops                = &xfs_finobt_buf_ops,
        .diff_two_keys          = xfs_inobt_diff_two_keys,
        .keys_inorder           = xfs_inobt_keys_inorder,
        .recs_inorder           = xfs_inobt_recs_inorder,
index 771dd072015d50bd68901f8ac5a13d847534c5d2..bc690f2409faab3135fc1cf857263fab99faf2a8 100644 (file)
@@ -614,16 +614,15 @@ xfs_iext_realloc_root(
 }
 
 /*
- * Increment the sequence counter if we are on a COW fork.  This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed.  We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
  */
 static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
 {
-       if (state & BMAP_COWFORK)
-               WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+       WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
 }
 
 void
index 09d9c8cfa4a09f933a55f1122879809ecb3010af..e021d5133ccb42d7b51f916180420bb41421aa09 100644 (file)
@@ -97,10 +97,9 @@ xfs_inode_buf_verify(
 
                dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
                unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
-               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+               di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
                        xfs_dinode_good_version(mp, dip->di_version) &&
-                       (unlinked_ino == NULLAGINO ||
-                        xfs_verify_agino(mp, agno, unlinked_ino));
+                       xfs_verify_agino_or_null(mp, agno, unlinked_ino);
                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
                                                XFS_ERRTAG_ITOBP_INOTOBP))) {
                        if (readahead) {
@@ -147,12 +146,16 @@ xfs_inode_buf_write_verify(
 
 const struct xfs_buf_ops xfs_inode_buf_ops = {
        .name = "xfs_inode",
+       .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+                    cpu_to_be16(XFS_DINODE_MAGIC) },
        .verify_read = xfs_inode_buf_read_verify,
        .verify_write = xfs_inode_buf_write_verify,
 };
 
 const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
-       .name = "xxfs_inode_ra",
+       .name = "xfs_inode_ra",
+       .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+                    cpu_to_be16(XFS_DINODE_MAGIC) },
        .verify_read = xfs_inode_buf_readahead_verify,
        .verify_write = xfs_inode_buf_write_verify,
 };
index 60361d2d74a182808333aa65be8c05fb9f817bb5..00c62ce170d0eb55db2e5046f89d842beee05bf2 100644 (file)
@@ -14,7 +14,7 @@ struct xfs_dinode;
  */
 struct xfs_ifork {
        int                     if_bytes;       /* bytes in if_u1 */
-       unsigned int            if_seq;         /* cow fork mod counter */
+       unsigned int            if_seq;         /* fork mod counter */
        struct xfs_btree_block  *if_broot;      /* file's incore btree root */
        short                   if_broot_bytes; /* bytes allocated for root */
        unsigned char           if_flags;       /* per-fork flags */
index d9eab657b63e4212ee6e731771b32244f65d7876..6f47ab876d90f229713d2cc8b5c8fc54aee06c5b 100644 (file)
@@ -209,7 +209,7 @@ xfs_refcountbt_verify(
        xfs_failaddr_t          fa;
        unsigned int            level;
 
-       if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+       if (!xfs_verify_magic(bp, block->bb_magic))
                return __this_address;
 
        if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -264,6 +264,7 @@ xfs_refcountbt_write_verify(
 
 const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
        .name                   = "xfs_refcountbt",
+       .magic                  = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
        .verify_read            = xfs_refcountbt_read_verify,
        .verify_write           = xfs_refcountbt_write_verify,
        .verify_struct          = xfs_refcountbt_verify,
index f79cf040d7450fe0c61a3f27bedbe89f4660d5cb..5738e11055e6bbeaf9602b353555450b24908600 100644 (file)
@@ -310,7 +310,7 @@ xfs_rmapbt_verify(
         * from the on disk AGF. Again, we can only check against maximum limits
         * in this case.
         */
-       if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+       if (!xfs_verify_magic(bp, block->bb_magic))
                return __this_address;
 
        if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +365,7 @@ xfs_rmapbt_write_verify(
 
 const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
        .name                   = "xfs_rmapbt",
+       .magic                  = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
        .verify_read            = xfs_rmapbt_read_verify,
        .verify_write           = xfs_rmapbt_write_verify,
        .verify_struct          = xfs_rmapbt_verify,
index b5a82acd7dfe01d9225c345bbd15740fb4995e83..77a3a4085de3b7e56d3747b6b8f64df0fb13dba2 100644 (file)
@@ -225,10 +225,11 @@ xfs_validate_sb_common(
        struct xfs_buf          *bp,
        struct xfs_sb           *sbp)
 {
+       struct xfs_dsb          *dsb = XFS_BUF_TO_SBP(bp);
        uint32_t                agcount = 0;
        uint32_t                rem;
 
-       if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+       if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
                xfs_warn(mp, "bad magic number");
                return -EWRONGFS;
        }
@@ -781,12 +782,14 @@ out_error:
 
 const struct xfs_buf_ops xfs_sb_buf_ops = {
        .name = "xfs_sb",
+       .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
        .verify_read = xfs_sb_read_verify,
        .verify_write = xfs_sb_write_verify,
 };
 
 const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
        .name = "xfs_sb_quiet",
+       .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
        .verify_read = xfs_sb_quiet_read_verify,
        .verify_write = xfs_sb_write_verify,
 };
@@ -874,7 +877,7 @@ xfs_initialize_perag_data(
        uint64_t        bfreelst = 0;
        uint64_t        btree = 0;
        uint64_t        fdblocks;
-       int             error;
+       int             error = 0;
 
        for (index = 0; index < agcount; index++) {
                /*
index 1c5debe748f0aca5431fa45daf7fbc078270378e..4e909791aeac48a9ca82c6eb5564ca8e2cc7cadc 100644 (file)
@@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
 extern const struct xfs_buf_ops xfs_agi_buf_ops;
 extern const struct xfs_buf_ops xfs_agf_buf_ops;
 extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 extern const struct xfs_buf_ops xfs_agi_buf_ops;
 extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_dquot_buf_ops;
index 77d80106f989a74f26390f950763509c47cabca4..a0ccc253c43d0a4c5733c28086c2475c7be5a67b 100644 (file)
@@ -95,7 +95,7 @@ xfs_symlink_verify(
 
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return __this_address;
-       if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+       if (!xfs_verify_magic(bp, dsl->sl_magic))
                return __this_address;
        if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
                return __this_address;
@@ -159,6 +159,7 @@ xfs_symlink_write_verify(
 
 const struct xfs_buf_ops xfs_symlink_buf_ops = {
        .name = "xfs_symlink",
+       .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
        .verify_read = xfs_symlink_read_verify,
        .verify_write = xfs_symlink_write_verify,
        .verify_struct = xfs_symlink_verify,
index 3306fc42cfad40b004bce74114deb2eae4164620..de310712dd6d12946b9ae0771194caf03c5a39ab 100644 (file)
@@ -115,6 +115,19 @@ xfs_verify_agino(
        return agino >= first && agino <= last;
 }
 
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, unless it is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             agino)
+{
+       return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
 /*
  * Verify that an FS inode number pointer neither points outside the
  * filesystem nor points at static AG metadata.
@@ -204,3 +217,14 @@ xfs_verify_icount(
        xfs_icount_range(mp, &min, &max);
        return icount >= min && icount <= max;
 }
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+       struct xfs_mount        *mp,
+       xfs_fileoff_t           dabno)
+{
+       xfs_dablk_t             max_dablk = -1U;
+
+       return dabno <= max_dablk;
+}
index 8f02855a019a41c5d1c481e044af88102792308c..c5a25403b4db40cd624b1431785f2b2c9bf32e6d 100644 (file)
@@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
                xfs_agino_t *first, xfs_agino_t *last);
 bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
                xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+               xfs_agino_t agino);
 bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
 bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
 
 #endif /* __XFS_TYPES_H__ */
index 90955ab1e89599303e64effcfa237cb8f25e5e73..ddf06bfaa29d6c189673093f8f3c480895557bc5 100644 (file)
@@ -399,7 +399,7 @@ xchk_agf_xref_cntbt(
        if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
                return;
        if (!have) {
-               if (agf->agf_freeblks != be32_to_cpu(0))
+               if (agf->agf_freeblks != cpu_to_be32(0))
                        xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
                return;
        }
@@ -864,19 +864,17 @@ xchk_agi(
 
        /* Check inode pointers */
        agino = be32_to_cpu(agi->agi_newino);
-       if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+       if (!xfs_verify_agino_or_null(mp, agno, agino))
                xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
        agino = be32_to_cpu(agi->agi_dirino);
-       if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+       if (!xfs_verify_agino_or_null(mp, agno, agino))
                xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
        /* Check unlinked inode buckets */
        for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
                agino = be32_to_cpu(agi->agi_unlinked[i]);
-               if (agino == NULLAGINO)
-                       continue;
-               if (!xfs_verify_agino(mp, agno, agino))
+               if (!xfs_verify_agino_or_null(mp, agno, agino))
                        xchk_block_set_corrupt(sc, sc->sa.agi_bp);
        }
 
index 03d1e15ccebaa3364226a9985955d224cf9724a7..64e31f87d4907ada7d775ef3e3d6d729bdceeffb 100644 (file)
@@ -341,23 +341,19 @@ xrep_agf(
        struct xrep_find_ag_btree       fab[XREP_AGF_MAX] = {
                [XREP_AGF_BNOBT] = {
                        .rmap_owner = XFS_RMAP_OWN_AG,
-                       .buf_ops = &xfs_allocbt_buf_ops,
-                       .magic = XFS_ABTB_CRC_MAGIC,
+                       .buf_ops = &xfs_bnobt_buf_ops,
                },
                [XREP_AGF_CNTBT] = {
                        .rmap_owner = XFS_RMAP_OWN_AG,
-                       .buf_ops = &xfs_allocbt_buf_ops,
-                       .magic = XFS_ABTC_CRC_MAGIC,
+                       .buf_ops = &xfs_cntbt_buf_ops,
                },
                [XREP_AGF_RMAPBT] = {
                        .rmap_owner = XFS_RMAP_OWN_AG,
                        .buf_ops = &xfs_rmapbt_buf_ops,
-                       .magic = XFS_RMAP_CRC_MAGIC,
                },
                [XREP_AGF_REFCOUNTBT] = {
                        .rmap_owner = XFS_RMAP_OWN_REFC,
                        .buf_ops = &xfs_refcountbt_buf_ops,
-                       .magic = XFS_REFC_CRC_MAGIC,
                },
                [XREP_AGF_END] = {
                        .buf_ops = NULL,
@@ -875,12 +871,10 @@ xrep_agi(
                [XREP_AGI_INOBT] = {
                        .rmap_owner = XFS_RMAP_OWN_INOBT,
                        .buf_ops = &xfs_inobt_buf_ops,
-                       .magic = XFS_IBT_CRC_MAGIC,
                },
                [XREP_AGI_FINOBT] = {
                        .rmap_owner = XFS_RMAP_OWN_INOBT,
-                       .buf_ops = &xfs_inobt_buf_ops,
-                       .magic = XFS_FIBT_CRC_MAGIC,
+                       .buf_ops = &xfs_finobt_buf_ops,
                },
                [XREP_AGI_END] = {
                        .buf_ops = NULL
index 81d5e90547a1602a3c39fd4dfff6416b10637501..dce74ec570389a21204e40ddd14d4e1f619bebf9 100644 (file)
@@ -82,12 +82,23 @@ xchk_xattr_listent(
 
        sx = container_of(context, struct xchk_xattr, context);
 
+       if (xchk_should_terminate(sx->sc, &error)) {
+               context->seen_enough = 1;
+               return;
+       }
+
        if (flags & XFS_ATTR_INCOMPLETE) {
                /* Incomplete attr key, just mark the inode for preening. */
                xchk_ino_set_preen(sx->sc, context->dp->i_ino);
                return;
        }
 
+       /* Does this name make sense? */
+       if (!xfs_attr_namecheck(name, namelen)) {
+               xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+               return;
+       }
+
        args.flags = ATTR_KERNOTIME;
        if (flags & XFS_ATTR_ROOT)
                args.flags |= ATTR_ROOT;
index e1d11f3223e360d72bf7c3c175da8be56c3838e1..a703cd58a90e678854ac220f5661b9fb55b9ee8f 100644 (file)
@@ -281,6 +281,31 @@ xchk_bmap_extent_xref(
        xchk_ag_free(info->sc, &info->sc->sa);
 }
 
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by a xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+       struct xfs_inode        *ip,
+       struct xchk_bmap_info   *info,
+       struct xfs_bmbt_irec    *irec)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           off;
+
+       if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+               return;
+
+       if (!xfs_verify_dablk(mp, irec->br_startoff))
+               xchk_fblock_set_corrupt(info->sc, info->whichfork,
+                               irec->br_startoff);
+
+       off = irec->br_startoff + irec->br_blockcount - 1;
+       if (!xfs_verify_dablk(mp, off))
+               xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
 /* Scrub a single extent record. */
 STATIC int
 xchk_bmap_extent(
@@ -305,6 +330,8 @@ xchk_bmap_extent(
                xchk_fblock_set_corrupt(info->sc, info->whichfork,
                                irec->br_startoff);
 
+       xchk_bmap_dirattr_extent(ip, info, irec);
+
        /* There should never be a "hole" extent in either extent list. */
        if (irec->br_startblock == HOLESTARTBLOCK)
                xchk_fblock_set_corrupt(info->sc, info->whichfork,
index cd3e4d768a18ce2d6466c973dfe891174cc523cf..a38a22785a1a28e6a7a50b533c1103f5caf2ebd0 100644 (file)
@@ -129,6 +129,12 @@ xchk_dir_actor(
                goto out;
        }
 
+       /* Does this name make sense? */
+       if (!xfs_dir2_namecheck(name, namelen)) {
+               xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+               goto out;
+       }
+
        if (!strncmp(".", name, namelen)) {
                /* If this is "." then check that the inum matches the dir. */
                if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
index 882dc56c5c21e5c0f8d02ea705669eced2132664..700114f79a7d3085fbcea5cbdf90f76b3e74fce6 100644 (file)
@@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt(
 struct xchk_iallocbt {
        /* Number of inodes we see while scanning inobt. */
        unsigned long long      inodes;
+
+       /* Expected next startino, for big block filesystems. */
+       xfs_agino_t             next_startino;
+
+       /* Expected end of the current inode cluster. */
+       xfs_agino_t             next_cluster_ino;
 };
 
 /*
@@ -128,41 +134,57 @@ xchk_iallocbt_freecount(
        return hweight64(freemask);
 }
 
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record.  First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
 STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
        struct xchk_btree               *bs,
-       xfs_ino_t                       fsino,
-       xfs_agino_t                     chunkino,
-       xfs_agino_t                     clusterino,
        struct xfs_inobt_rec_incore     *irec,
-       struct xfs_buf                  *bp)
+       unsigned int                    irec_ino,
+       struct xfs_dinode               *dip)
 {
-       struct xfs_dinode               *dip;
        struct xfs_mount                *mp = bs->cur->bc_mp;
-       bool                            inode_is_free = false;
+       xfs_ino_t                       fsino;
+       xfs_agino_t                     agino;
+       bool                            irec_free;
+       bool                            ino_inuse;
        bool                            freemask_ok;
-       bool                            inuse;
        int                             error = 0;
 
        if (xchk_should_terminate(bs->sc, &error))
                return error;
 
-       dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+       /*
+        * Given an inobt record and the offset of an inode from the start of
+        * the record, compute which fs inode we're talking about.
+        */
+       agino = irec->ir_startino + irec_ino;
+       fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+       irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
        if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
-           (dip->di_version >= 3 &&
-            be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+           (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
                xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
                goto out;
        }
 
-       if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
-               inode_is_free = true;
-       error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
-                       fsino + clusterino, &inuse);
+       error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+                       &ino_inuse);
        if (error == -ENODATA) {
                /* Not cached, just read the disk buffer */
-               freemask_ok = inode_is_free ^ !!(dip->di_mode);
+               freemask_ok = irec_free ^ !!(dip->di_mode);
                if (!bs->sc->try_harder && !freemask_ok)
                        return -EDEADLOCK;
        } else if (error < 0) {
@@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask(
                goto out;
        } else {
                /* Inode is all there. */
-               freemask_ok = inode_is_free ^ inuse;
+               freemask_ok = irec_free ^ ino_inuse;
        }
        if (!freemask_ok)
                xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -182,86 +204,221 @@ out:
        return 0;
 }
 
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk.  If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
 STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
        struct xchk_btree               *bs,
-       struct xfs_inobt_rec_incore     *irec)
+       struct xfs_inobt_rec_incore     *irec,
+       unsigned int                    cluster_base)
 {
        struct xfs_imap                 imap;
        struct xfs_mount                *mp = bs->cur->bc_mp;
        struct xfs_dinode               *dip;
-       struct xfs_buf                  *bp;
-       xfs_ino_t                       fsino;
-       xfs_agino_t                     nr_inodes;
-       xfs_agino_t                     agino;
-       xfs_agino_t                     chunkino;
-       xfs_agino_t                     clusterino;
+       struct xfs_buf                  *cluster_bp;
+       unsigned int                    nr_inodes;
+       xfs_agnumber_t                  agno = bs->cur->bc_private.a.agno;
        xfs_agblock_t                   agbno;
-       uint16_t                        holemask;
+       unsigned int                    cluster_index;
+       uint16_t                        cluster_mask = 0;
        uint16_t                        ir_holemask;
        int                             error = 0;
 
-       /* Make sure the freemask matches the inode records. */
-       nr_inodes = mp->m_inodes_per_cluster;
-
-       for (agino = irec->ir_startino;
-            agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
-            agino += mp->m_inodes_per_cluster) {
-               fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
-               chunkino = agino - irec->ir_startino;
-               agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-
-               /* Compute the holemask mask for this cluster. */
-               for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
-                    clusterino += XFS_INODES_PER_HOLEMASK_BIT)
-                       holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
-                                       XFS_INODES_PER_HOLEMASK_BIT);
-
-               /* The whole cluster must be a hole or not a hole. */
-               ir_holemask = (irec->ir_holemask & holemask);
-               if (ir_holemask != holemask && ir_holemask != 0) {
+       nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+                       mp->m_inodes_per_cluster);
+
+       /* Map this inode cluster */
+       agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+       /* Compute a bitmask for this cluster that can be used for holemask. */
+       for (cluster_index = 0;
+            cluster_index < nr_inodes;
+            cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+               cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+                               XFS_INODES_PER_HOLEMASK_BIT);
+
+       /*
+        * Map the first inode of this cluster to a buffer and offset.
+        * Be careful about inobt records that don't align with the start of
+        * the inode buffer when block sizes are large enough to hold multiple
+        * inode chunks.  When this happens, cluster_base will be zero but
+        * ir_startino can be large enough to make im_boffset nonzero.
+        */
+       ir_holemask = (irec->ir_holemask & cluster_mask);
+       imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+       imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+       imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino);
+
+       if (imap.im_boffset != 0 && cluster_base != 0) {
+               ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return 0;
+       }
+
+       trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+                       imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+                       cluster_mask, ir_holemask,
+                       XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+                                         cluster_base));
+
+       /* The whole cluster must be a hole or not a hole. */
+       if (ir_holemask != cluster_mask && ir_holemask != 0) {
+               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return 0;
+       }
+
+       /* If any part of this is a hole, skip it. */
+       if (ir_holemask) {
+               xchk_xref_is_not_owned_by(bs->sc, agbno,
+                               mp->m_blocks_per_cluster,
+                               &XFS_RMAP_OINFO_INODES);
+               return 0;
+       }
+
+       xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
+                       &XFS_RMAP_OINFO_INODES);
+
+       /* Grab the inode cluster buffer. */
+       error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+                       0, 0);
+       if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+               return error;
+
+       /* Check free status of each inode within this cluster. */
+       for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+               struct xfs_dinode       *dip;
+
+               if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
                        xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-                       continue;
+                       break;
                }
 
-               /* If any part of this is a hole, skip it. */
-               if (ir_holemask) {
-                       xchk_xref_is_not_owned_by(bs->sc, agbno,
-                                       mp->m_blocks_per_cluster,
-                                       &XFS_RMAP_OINFO_INODES);
-                       continue;
+               dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+               error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+                               cluster_base + cluster_index, dip);
+               if (error)
+                       break;
+               imap.im_boffset += mp->m_sb.sb_inodesize;
+       }
+
+       xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+       return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+       struct xchk_btree               *bs,
+       struct xfs_inobt_rec_incore     *irec)
+{
+       unsigned int                    cluster_base;
+       int                             error = 0;
+
+       /*
+        * For the common case where this inobt record maps to multiple inode
+        * clusters this will call _check_cluster for each cluster.
+        *
+        * For the case that multiple inobt records map to a single cluster,
+        * this will call _check_cluster once.
+        */
+       for (cluster_base = 0;
+            cluster_base < XFS_INODES_PER_CHUNK;
+            cluster_base += bs->sc->mp->m_inodes_per_cluster) {
+               error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+               if (error)
+                       break;
+       }
+
+       return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly.  Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk.  This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+       struct xchk_btree               *bs,
+       struct xfs_inobt_rec_incore     *irec)
+{
+       struct xfs_mount                *mp = bs->sc->mp;
+       struct xchk_iallocbt            *iabt = bs->private;
+
+       /*
+        * finobt records have different positioning requirements than inobt
+        * records: each finobt record must have a corresponding inobt record.
+        * That is checked in the xref function, so for now we only catch the
+        * obvious case where the record isn't at all aligned properly.
+        *
+        * Note that if a fs block contains more than a single chunk of inodes,
+        * we will have finobt records only for those chunks containing free
+        * inodes, and therefore expect chunk alignment of finobt records.
+        * Otherwise, we expect that the finobt record is aligned to the
+        * cluster alignment as told by the superblock.
+        */
+       if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+               unsigned int    imask;
+
+               imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+                               mp->m_cluster_align_inodes) - 1;
+               if (irec->ir_startino & imask)
+                       xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return;
+       }
+
+       if (iabt->next_startino != NULLAGINO) {
+               /*
+                * We're midway through a cluster of inodes that is mapped by
+                * multiple inobt records.  Did we get the record for the next
+                * irec in the sequence?
+                */
+               if (irec->ir_startino != iabt->next_startino) {
+                       xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+                       return;
                }
 
-               xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
-                               &XFS_RMAP_OINFO_INODES);
+               iabt->next_startino += XFS_INODES_PER_CHUNK;
 
-               /* Grab the inode cluster buffer. */
-               imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
-                               agbno);
-               imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
-               imap.im_boffset = 0;
-
-               error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
-                               &dip, &bp, 0, 0);
-               if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
-                               &error))
-                       continue;
-
-               /* Which inodes are free? */
-               for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
-                       error = xchk_iallocbt_check_cluster_freemask(bs,
-                                       fsino, chunkino, clusterino, irec, bp);
-                       if (error) {
-                               xfs_trans_brelse(bs->cur->bc_tp, bp);
-                               return error;
-                       }
+               /* Are we done with the cluster? */
+               if (iabt->next_startino >= iabt->next_cluster_ino) {
+                       iabt->next_startino = NULLAGINO;
+                       iabt->next_cluster_ino = NULLAGINO;
                }
+               return;
+       }
+
+       /* inobt records must be aligned to cluster and inode alignment size. */
+       if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) {
+               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return;
+       }
 
-               xfs_trans_brelse(bs->cur->bc_tp, bp);
+       if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) {
+               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return;
        }
 
-       return error;
+       if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+               return;
+
+       /*
+        * If this is the start of an inode cluster that can be mapped by
+        * multiple inobt records, the next inobt record must follow exactly
+        * after this one.
+        */
+       iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+       iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster;
 }
 
 /* Scrub an inobt/finobt record. */
@@ -276,7 +433,6 @@ xchk_iallocbt_rec(
        uint64_t                        holes;
        xfs_agnumber_t                  agno = bs->cur->bc_private.a.agno;
        xfs_agino_t                     agino;
-       xfs_agblock_t                   agbno;
        xfs_extlen_t                    len;
        int                             holecount;
        int                             i;
@@ -303,11 +459,9 @@ xchk_iallocbt_rec(
                goto out;
        }
 
-       /* Make sure this record is aligned to cluster and inoalignmnt size. */
-       agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
-       if ((agbno & (mp->m_cluster_align - 1)) ||
-           (agbno & (mp->m_blocks_per_cluster - 1)))
-               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+       xchk_iallocbt_rec_alignment(bs, &irec);
+       if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+               goto out;
 
        iabt->inodes += irec.ir_count;
 
@@ -320,7 +474,7 @@ xchk_iallocbt_rec(
 
                if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
                        goto out;
-               goto check_freemask;
+               goto check_clusters;
        }
 
        /* Check each chunk of a sparse inode cluster. */
@@ -346,8 +500,8 @@ xchk_iallocbt_rec(
            holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
                xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
-check_freemask:
-       error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+       error = xchk_iallocbt_check_clusters(bs, &irec);
        if (error)
                goto out;
 
@@ -429,6 +583,8 @@ xchk_iallocbt(
        struct xfs_btree_cur    *cur;
        struct xchk_iallocbt    iabt = {
                .inodes         = 0,
+               .next_startino  = NULLAGINO,
+               .next_cluster_ino = NULLAGINO,
        };
        int                     error;
 
index 6acf1bfa0bfee57e6d5f105e47797c4126d9d88b..f28f4bad317b6792f73f45ac7ac118a1c40f1060 100644 (file)
@@ -743,7 +743,8 @@ xrep_findroot_block(
 
        /* Ensure the block magic matches the btree type we're looking for. */
        btblock = XFS_BUF_TO_BLOCK(bp);
-       if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+       ASSERT(fab->buf_ops->magic[1] != 0);
+       if (btblock->bb_magic != fab->buf_ops->magic[1])
                goto out;
 
        /*
index f2fc18bb760520003dde1b3530dca6c42368619c..d990314eb08b2b0b9a271ee07c2923c27d921b67 100644 (file)
@@ -42,9 +42,6 @@ struct xrep_find_ag_btree {
        /* in: buffer ops */
        const struct xfs_buf_ops        *buf_ops;
 
-       /* in: magic number of the btree */
-       uint32_t                        magic;
-
        /* out: the highest btree block found and the tree height */
        xfs_agblock_t                   root;
        unsigned int                    height;
index 665d4bbb17cc8c3b9718a4f1f24f100e31414412..dbe115b075f714007aef48b16e8b765629f6c284 100644 (file)
@@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space(
        startext = fsbno;
        endext = fsbno + len - 1;
        do_div(startext, sc->mp->m_sb.sb_rextsize);
-       if (do_div(endext, sc->mp->m_sb.sb_rextsize))
-               endext++;
-       extcount = endext - startext;
+       do_div(endext, sc->mp->m_sb.sb_rextsize);
+       extcount = endext - startext + 1;
        xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
        error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
                        &is_free);
index 8344b14031efa9ab0db697699cfbfabdcdace0a6..3c83e8b3b39c17e5b1e52110ed5bfa966c383e3a 100644 (file)
@@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error,
                  __entry->ret_ip)
 );
 
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                xfs_agino_t startino, xfs_daddr_t map_daddr,
+                unsigned short map_len, unsigned int chunk_ino,
+                unsigned int nr_inodes, uint16_t cluster_mask,
+                uint16_t holemask, unsigned int cluster_ino),
+       TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+               cluster_mask, holemask, cluster_ino),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, startino)
+               __field(xfs_daddr_t, map_daddr)
+               __field(unsigned short, map_len)
+               __field(unsigned int, chunk_ino)
+               __field(unsigned int, nr_inodes)
+               __field(unsigned int, cluster_ino)
+               __field(uint16_t, cluster_mask)
+               __field(uint16_t, holemask)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->startino = startino;
+               __entry->map_daddr = map_daddr;
+               __entry->map_len = map_len;
+               __entry->chunk_ino = chunk_ino;
+               __entry->nr_inodes = nr_inodes;
+               __entry->cluster_mask = cluster_mask;
+               __entry->holemask = holemask;
+               __entry->cluster_ino = cluster_ino;
+       ),
+       TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->startino,
+                 __entry->map_daddr,
+                 __entry->map_len,
+                 __entry->chunk_ino,
+                 __entry->nr_inodes,
+                 __entry->cluster_mask,
+                 __entry->holemask,
+                 __entry->cluster_ino)
+)
+
 /* repair tracepoints */
 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
 
index d9048bcea49c5203c6d89186637d63fb33f69c37..7b8bb6bde981028ad692fa07c87b0e4911bf3436 100644 (file)
@@ -28,7 +28,8 @@
  */
 struct xfs_writepage_ctx {
        struct xfs_bmbt_irec    imap;
-       unsigned int            io_type;
+       int                     fork;
+       unsigned int            data_seq;
        unsigned int            cow_seq;
        struct xfs_ioend        *ioend;
 };
@@ -255,30 +256,20 @@ xfs_end_io(
         */
        error = blk_status_to_errno(ioend->io_bio->bi_status);
        if (unlikely(error)) {
-               switch (ioend->io_type) {
-               case XFS_IO_COW:
+               if (ioend->io_fork == XFS_COW_FORK)
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
-                       break;
-               }
-
                goto done;
        }
 
        /*
-        * Success:  commit the COW or unwritten blocks if needed.
+        * Success: commit the COW or unwritten blocks if needed.
         */
-       switch (ioend->io_type) {
-       case XFS_IO_COW:
+       if (ioend->io_fork == XFS_COW_FORK)
                error = xfs_reflink_end_cow(ip, offset, size);
-               break;
-       case XFS_IO_UNWRITTEN:
-               /* writeback should never update isize */
+       else if (ioend->io_state == XFS_EXT_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);
-               break;
-       default:
+       else
                ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
-               break;
-       }
 
 done:
        if (ioend->io_append_trans)
@@ -293,7 +284,8 @@ xfs_end_bio(
        struct xfs_ioend        *ioend = bio->bi_private;
        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 
-       if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
+       if (ioend->io_fork == XFS_COW_FORK ||
+           ioend->io_state == XFS_EXT_UNWRITTEN)
                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
        else if (ioend->io_append_trans)
                queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -301,6 +293,75 @@ xfs_end_bio(
                xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 }
 
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+       struct xfs_writepage_ctx        *wpc,
+       struct xfs_inode                *ip,
+       xfs_fileoff_t                   offset_fsb)
+{
+       if (offset_fsb < wpc->imap.br_startoff ||
+           offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+               return false;
+       /*
+        * If this is a COW mapping, it is sufficient to check that the mapping
+        * covers the offset. Be careful to check this first because the caller
+        * can revalidate a COW mapping without updating the data seqno.
+        */
+       if (wpc->fork == XFS_COW_FORK)
+               return true;
+
+       /*
+        * This is not a COW mapping. Check the sequence number of the data fork
+        * because concurrent changes could have invalidated the extent. Check
+        * the COW fork because concurrent changes since the last time we
+        * checked (and found nothing at this offset) could have added
+        * overlapping blocks.
+        */
+       if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+               return false;
+       if (xfs_inode_has_cow_data(ip) &&
+           wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+               return false;
+       return true;
+}
+
+/*
+ * Pass in a delalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+       struct xfs_writepage_ctx *wpc,
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           offset_fsb)
+{
+       int                     error;
+
+       /*
+        * Attempt to allocate whatever delalloc extent currently backs
+        * offset_fsb and put the result into wpc->imap.  Allocate in a loop
+        * because it may take several attempts to allocate real blocks for a
+        * contiguous delalloc extent if free space is sufficiently fragmented.
+        */
+       do {
+               error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+                               &wpc->imap, wpc->fork == XFS_COW_FORK ?
+                                       &wpc->cow_seq : &wpc->data_seq);
+               if (error)
+                       return error;
+       } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+       return 0;
+}
+
 STATIC int
 xfs_map_blocks(
        struct xfs_writepage_ctx *wpc,
@@ -310,26 +371,16 @@ xfs_map_blocks(
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 count = i_blocksize(inode);
-       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_fileoff_t           cow_fsb = NULLFILEOFF;
        struct xfs_bmbt_irec    imap;
-       int                     whichfork = XFS_DATA_FORK;
        struct xfs_iext_cursor  icur;
-       bool                    imap_valid;
+       int                     retries = 0;
        int                     error = 0;
 
-       /*
-        * We have to make sure the cached mapping is within EOF to protect
-        * against eofblocks trimming on file release leaving us with a stale
-        * mapping. Otherwise, a page for a subsequent file extending buffered
-        * write could get picked up by this writeback cycle and written to the
-        * wrong blocks.
-        *
-        * Note that what we really want here is a generic mapping invalidation
-        * mechanism to protect us from arbitrary extent modifying contexts, not
-        * just eofblocks.
-        */
-       xfs_trim_extent_eof(&wpc->imap, ip);
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
 
        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +397,19 @@ xfs_map_blocks(
         * against concurrent updates and provides a memory barrier on the way
         * out that ensures that we always see the current value.
         */
-       imap_valid = offset_fsb >= wpc->imap.br_startoff &&
-                    offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
-       if (imap_valid &&
-           (!xfs_inode_has_cow_data(ip) ||
-            wpc->io_type == XFS_IO_COW ||
-            wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+       if (xfs_imap_valid(wpc, ip, offset_fsb))
                return 0;
 
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
-
        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.  If we return without a valid map, it means we
         * landed in a hole and we skip the block.
         */
+retry:
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               (ip->i_df.if_flags & XFS_IFEXTENTS));
-       ASSERT(offset <= mp->m_super->s_maxbytes);
-
-       if (offset > mp->m_super->s_maxbytes - count)
-               count = mp->m_super->s_maxbytes - offset;
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 
        /*
         * Check if this is offset is covered by a COW extents, and if yes use
@@ -382,30 +421,16 @@ xfs_map_blocks(
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
-               /*
-                * Truncate can race with writeback since writeback doesn't
-                * take the iolock and truncate decreases the file size before
-                * it starts truncating the pages between new_size and old_size.
-                * Therefore, we can end up in the situation where writeback
-                * gets a CoW fork mapping but the truncate makes the mapping
-                * invalid and we end up in here trying to get a new mapping.
-                * bail out here so that we simply never get a valid mapping
-                * and so we drop the write altogether.  The page truncation
-                * will kill the contents anyway.
-                */
-               if (offset > i_size_read(inode)) {
-                       wpc->io_type = XFS_IO_HOLE;
-                       return 0;
-               }
-               whichfork = XFS_COW_FORK;
-               wpc->io_type = XFS_IO_COW;
+
+               wpc->fork = XFS_COW_FORK;
                goto allocate_blocks;
        }
 
        /*
-        * Map valid and no COW extent in the way?  We're done.
+        * No COW extent overlap. Revalidate now that we may have updated
+        * ->cow_seq. If the data mapping is still valid, we're done.
         */
-       if (imap_valid) {
+       if (xfs_imap_valid(wpc, ip, offset_fsb)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }
@@ -417,51 +442,65 @@ xfs_map_blocks(
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
+       wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
+       wpc->fork = XFS_DATA_FORK;
+
+       /* landed in a hole or beyond EOF? */
        if (imap.br_startoff > offset_fsb) {
-               /* landed in a hole or beyond EOF */
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
-               wpc->io_type = XFS_IO_HOLE;
-       } else {
-               /*
-                * Truncate to the next COW extent if there is one.  This is the
-                * only opportunity to do this because we can skip COW fork
-                * lookups for the subsequent blocks in the mapping; however,
-                * the requirement to treat the COW range separately remains.
-                */
-               if (cow_fsb != NULLFILEOFF &&
-                   cow_fsb < imap.br_startoff + imap.br_blockcount)
-                       imap.br_blockcount = cow_fsb - imap.br_startoff;
-
-               if (isnullstartblock(imap.br_startblock)) {
-                       /* got a delalloc extent */
-                       wpc->io_type = XFS_IO_DELALLOC;
-                       goto allocate_blocks;
-               }
-
-               if (imap.br_state == XFS_EXT_UNWRITTEN)
-                       wpc->io_type = XFS_IO_UNWRITTEN;
-               else
-                       wpc->io_type = XFS_IO_OVERWRITE;
+               imap.br_state = XFS_EXT_NORM;
        }
 
+       /*
+        * Truncate to the next COW extent if there is one.  This is the only
+        * opportunity to do this because we can skip COW fork lookups for the
+        * subsequent blocks in the mapping; however, the requirement to treat
+        * the COW range separately remains.
+        */
+       if (cow_fsb != NULLFILEOFF &&
+           cow_fsb < imap.br_startoff + imap.br_blockcount)
+               imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+       /* got a delalloc extent? */
+       if (imap.br_startblock != HOLESTARTBLOCK &&
+           isnullstartblock(imap.br_startblock))
+               goto allocate_blocks;
+
        wpc->imap = imap;
-       xfs_trim_extent_eof(&wpc->imap, ip);
-       trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+       trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
        return 0;
 allocate_blocks:
-       error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
-                       &wpc->cow_seq);
-       if (error)
+       error = xfs_convert_blocks(wpc, ip, offset_fsb);
+       if (error) {
+               /*
+                * If we failed to find the extent in the COW fork we might have
+                * raced with a COW to data fork conversion or truncate.
+                * Restart the lookup to catch the extent in the data fork for
+                * the former case, but prevent additional retries to avoid
+                * looping forever for the latter case.
+                */
+               if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+                       goto retry;
+               ASSERT(error != -EAGAIN);
                return error;
-       ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
-              imap.br_startoff + imap.br_blockcount <= cow_fsb);
-       wpc->imap = imap;
-       xfs_trim_extent_eof(&wpc->imap, ip);
-       trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+       }
+
+       /*
+        * Due to merging the return real extent might be larger than the
+        * original delalloc one.  Trim the return extent to the next COW
+        * boundary again to force a re-lookup.
+        */
+       if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+           cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+               wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+       ASSERT(wpc->imap.br_startoff <= offset_fsb);
+       ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+       trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
        return 0;
 }
 
@@ -486,7 +525,7 @@ xfs_submit_ioend(
        int                     status)
 {
        /* Convert CoW extents to regular */
-       if (!status && ioend->io_type == XFS_IO_COW) {
+       if (!status && ioend->io_fork == XFS_COW_FORK) {
                /*
                 * Yuk. This can do memory allocation, but is not a
                 * transactional operation so everything is done in GFP_KERNEL
@@ -504,7 +543,8 @@ xfs_submit_ioend(
 
        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
-           ioend->io_type != XFS_IO_UNWRITTEN &&
+           (ioend->io_fork == XFS_COW_FORK ||
+            ioend->io_state != XFS_EXT_UNWRITTEN) &&
            xfs_ioend_is_append(ioend) &&
            !ioend->io_append_trans)
                status = xfs_setfilesize_trans_alloc(ioend);
@@ -533,7 +573,8 @@ xfs_submit_ioend(
 static struct xfs_ioend *
 xfs_alloc_ioend(
        struct inode            *inode,
-       unsigned int            type,
+       int                     fork,
+       xfs_exntst_t            state,
        xfs_off_t               offset,
        struct block_device     *bdev,
        sector_t                sector)
@@ -547,7 +588,8 @@ xfs_alloc_ioend(
 
        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
        INIT_LIST_HEAD(&ioend->io_list);
-       ioend->io_type = type;
+       ioend->io_fork = fork;
+       ioend->io_state = state;
        ioend->io_inode = inode;
        ioend->io_size = 0;
        ioend->io_offset = offset;
@@ -608,13 +650,15 @@ xfs_add_to_ioend(
        sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
                ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
 
-       if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+       if (!wpc->ioend ||
+           wpc->fork != wpc->ioend->io_fork ||
+           wpc->imap.br_state != wpc->ioend->io_state ||
            sector != bio_end_sector(wpc->ioend->io_bio) ||
            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
                if (wpc->ioend)
                        list_add(&wpc->ioend->io_list, iolist);
-               wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
-                               bdev, sector);
+               wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+                               wpc->imap.br_state, offset, bdev, sector);
        }
 
        if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
@@ -723,7 +767,7 @@ xfs_writepage_map(
                error = xfs_map_blocks(wpc, inode, file_offset);
                if (error)
                        break;
-               if (wpc->io_type == XFS_IO_HOLE)
+               if (wpc->imap.br_startblock == HOLESTARTBLOCK)
                        continue;
                xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
                                 &submit_list);
@@ -918,9 +962,7 @@ xfs_vm_writepage(
        struct page             *page,
        struct writeback_control *wbc)
 {
-       struct xfs_writepage_ctx wpc = {
-               .io_type = XFS_IO_HOLE,
-       };
+       struct xfs_writepage_ctx wpc = { };
        int                     ret;
 
        ret = xfs_do_writepage(page, wbc, &wpc);
@@ -934,9 +976,7 @@ xfs_vm_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
 {
-       struct xfs_writepage_ctx wpc = {
-               .io_type = XFS_IO_HOLE,
-       };
+       struct xfs_writepage_ctx wpc = { };
        int                     ret;
 
        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -983,7 +1023,7 @@ xfs_vm_bmap(
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
-       if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+       if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_iomap_ops);
 }
index e5c23948a8ab3dde1007113eeaab24edcb48e4c5..6c2615b83c5d863ea8db349ea4ffa8be72d66e61 100644 (file)
@@ -8,33 +8,13 @@
 
 extern struct bio_set xfs_ioend_bioset;
 
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- *
- * This enum is used in string mapping in xfs_trace.h; please keep the
- * TRACE_DEFINE_ENUMs for it up to date.
- */
-enum {
-       XFS_IO_HOLE,            /* covers region without any block allocation */
-       XFS_IO_DELALLOC,        /* covers delalloc region */
-       XFS_IO_UNWRITTEN,       /* covers allocated but uninitialized data */
-       XFS_IO_OVERWRITE,       /* covers already allocated extent */
-       XFS_IO_COW,             /* covers copy-on-write extent */
-};
-
-#define XFS_IO_TYPES \
-       { XFS_IO_HOLE,                  "hole" },       \
-       { XFS_IO_DELALLOC,              "delalloc" },   \
-       { XFS_IO_UNWRITTEN,             "unwritten" },  \
-       { XFS_IO_OVERWRITE,             "overwrite" },  \
-       { XFS_IO_COW,                   "CoW" }
-
 /*
  * Structure for buffered I/O completions.
  */
 struct xfs_ioend {
        struct list_head        io_list;        /* next ioend in chain */
-       unsigned int            io_type;        /* delalloc / unwritten */
+       int                     io_fork;        /* inode fork written back */
+       xfs_exntst_t            io_state;       /* extent state */
        struct inode            *io_inode;      /* file being written to */
        size_t                  io_size;        /* size of the extent */
        xfs_off_t               io_offset;      /* offset in the file */
index a58034049995b4c6a7db190164ea886ec5113dff..3d213a7394c5b747dfb5cffc17dfb3d44d66cf03 100644 (file)
@@ -555,6 +555,7 @@ xfs_attr_put_listent(
        attrlist_ent_t *aep;
        int arraytop;
 
+       ASSERT(!context->seen_enough);
        ASSERT(!(context->flags & ATTR_KERNOVAL));
        ASSERT(context->count >= 0);
        ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
index 1ee8c5539fa4f2e999808acc021d63be0b4963b0..2db43ff4f8b59d303b70a0dbf537a45bbf897706 100644 (file)
@@ -1162,16 +1162,13 @@ xfs_zero_file_space(
         * by virtue of the hole punch.
         */
        error = xfs_free_file_space(ip, offset, len);
-       if (error)
-               goto out;
+       if (error || xfs_is_always_cow_inode(ip))
+               return error;
 
-       error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+       return xfs_alloc_file_space(ip, round_down(offset, blksize),
                                     round_up(offset + len, blksize) -
                                     round_down(offset, blksize),
                                     XFS_BMAPI_PREALLOC);
-out:
-       return error;
-
 }
 
 static int
index 4f5f2ff3f70f944130f94a674f09f464d2b6c970..548344e2512833bbb82f141fe34aefed88a6729e 100644 (file)
@@ -776,29 +776,24 @@ _xfs_buf_read(
 }
 
 /*
- * Set buffer ops on an unchecked buffer and validate it, if possible.
+ * Reverify a buffer found in cache without an attached ->b_ops.
  *
- * If the caller passed in an ops structure and the buffer doesn't have ops
- * assigned, set the ops and use them to verify the contents.  If the contents
- * cannot be verified, we'll clear XBF_DONE.  We assume the buffer has no
- * recorded errors and is already in XBF_DONE state.
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
  *
- * Under normal operations, every in-core buffer must have buffer ops assigned
- * to them when the buffer is read in from disk so that we can validate the
- * metadata.
- *
- * However, there are two scenarios where one can encounter in-core buffers
- * that don't have buffer ops.  The first is during log recovery of buffers on
- * a V4 filesystem, though these buffers are purged at the end of recovery.
- *
- * The other is online repair, which tries to match arbitrary metadata blocks
- * with btree types in order to find the root.  If online repair doesn't match
- * the buffer with /any/ btree type, the buffer remains in memory in DONE state
- * with no ops, and a subsequent read_buf call from elsewhere will not set the
- * ops.  This function helps us fix this situation.
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type.  If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
  */
 int
-xfs_buf_ensure_ops(
+xfs_buf_reverify(
        struct xfs_buf          *bp,
        const struct xfs_buf_ops *ops)
 {
@@ -840,7 +835,7 @@ xfs_buf_read_map(
                return bp;
        }
 
-       xfs_buf_ensure_ops(bp, ops);
+       xfs_buf_reverify(bp, ops);
 
        if (flags & XBF_ASYNC) {
                /*
@@ -2209,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
 
        atomic_set(&bp->b_lru_ref, lru_ref);
 }
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+       struct xfs_buf          *bp,
+       __be32                  dmagic)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       int                     idx;
+
+       idx = xfs_sb_version_hascrc(&mp->m_sb);
+       if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+               return false;
+       return dmagic == bp->b_ops->magic[idx];
+}
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+       struct xfs_buf          *bp,
+       __be16                  dmagic)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       int                     idx;
+
+       idx = xfs_sb_version_hascrc(&mp->m_sb);
+       if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+               return false;
+       return dmagic == bp->b_ops->magic16[idx];
+}
index b9f5511ea998a22927f141ecf446f63e3c99f60c..d0b96e071cec197a39ea7cf4c67f777f1bebb046 100644 (file)
@@ -125,6 +125,10 @@ struct xfs_buf_map {
 
 struct xfs_buf_ops {
        char *name;
+       union {
+               __be32 magic[2];        /* v4 and v5 on disk magic values */
+               __be16 magic16[2];      /* v4 and v5 on disk magic values */
+       };
        void (*verify_read)(struct xfs_buf *);
        void (*verify_write)(struct xfs_buf *);
        xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
 #define xfs_getsize_buftarg(buftarg)   block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)  bdev_read_only((buftarg)->bt_bdev)
 
-int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
 
 #endif /* __XFS_BUF_H__ */
index 9866f542e77b18f20689531821856be972c39bdf..a1e177f66404d28184fd99a5711d2c6f0b5005c2 100644 (file)
@@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = {
        XFS_RANDOM_BUF_LRU_REF,
        XFS_RANDOM_FORCE_SCRUB_REPAIR,
        XFS_RANDOM_FORCE_SUMMARY_RECALC,
+       XFS_RANDOM_IUNLINK_FALLBACK,
 };
 
 struct xfs_errortag_attr {
@@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin,  XFS_ERRTAG_LOG_ITEM_PIN);
 XFS_ERRORTAG_ATTR_RW(buf_lru_ref,      XFS_ERRTAG_BUF_LRU_REF);
 XFS_ERRORTAG_ATTR_RW(force_repair,     XFS_ERRTAG_FORCE_SCRUB_REPAIR);
 XFS_ERRORTAG_ATTR_RW(bad_summary,      XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
 
 static struct attribute *xfs_errortag_attrs[] = {
        XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = {
        XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
        XFS_ERRORTAG_ATTR_LIST(force_repair),
        XFS_ERRORTAG_ATTR_LIST(bad_summary),
+       XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
        NULL,
 };
 
@@ -357,7 +360,8 @@ xfs_buf_verifier_error(
        fa = failaddr ? failaddr : __return_address;
        __xfs_buf_ioerror(bp, error, fa);
 
-       xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+       xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+                 "Metadata %s detected at %pS, %s block 0x%llx %s",
                  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
                  fa, bp->b_ops->name, bp->b_bn, name);
 
index 246d3e989c6c92770ac0dff3af1bcb5dbdcb23c0..602aa7d62b66e09a8133a385805571fe20cf68a5 100644 (file)
@@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
 #define                XFS_PTAG_SHUTDOWN_IOERROR       0x00000020
 #define                XFS_PTAG_SHUTDOWN_LOGERROR      0x00000040
 #define                XFS_PTAG_FSBLOCK_ZERO           0x00000080
+#define                XFS_PTAG_VERIFIER_ERROR         0x00000100
 
 #endif /* __XFS_ERROR_H__ */
index e47425071e654473f4b34e7899015cecce19ef5e..770cc2edf777f4bb3ef6089986d5d49f75788ee4 100644 (file)
@@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
                 * We can't properly handle unaligned direct I/O to reflink
                 * files yet, as we can't unshare a partial block.
                 */
-               if (xfs_is_reflink_inode(ip)) {
+               if (xfs_is_cow_inode(ip)) {
                        trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
                        return -EREMCHG;
                }
@@ -872,14 +872,27 @@ xfs_file_fallocate(
                                goto out_unlock;
                }
 
-               if (mode & FALLOC_FL_ZERO_RANGE)
+               if (mode & FALLOC_FL_ZERO_RANGE) {
                        error = xfs_zero_file_space(ip, offset, len);
-               else {
-                       if (mode & FALLOC_FL_UNSHARE_RANGE) {
-                               error = xfs_reflink_unshare(ip, offset, len);
-                               if (error)
-                                       goto out_unlock;
+               } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+                       error = xfs_reflink_unshare(ip, offset, len);
+                       if (error)
+                               goto out_unlock;
+
+                       if (!xfs_is_always_cow_inode(ip)) {
+                               error = xfs_alloc_file_space(ip, offset, len,
+                                               XFS_BMAPI_PREALLOC);
                        }
+               } else {
+                       /*
+                        * If always_cow mode we can't use preallocations and
+                        * thus should not create them.
+                        */
+                       if (xfs_is_always_cow_inode(ip)) {
+                               error = -EOPNOTSUPP;
+                               goto out_unlock;
+                       }
+
                        error = xfs_alloc_file_space(ip, offset, len,
                                                     XFS_BMAPI_PREALLOC);
                }
@@ -1068,10 +1081,10 @@ xfs_file_llseek(
        default:
                return generic_file_llseek(file, offset, whence);
        case SEEK_HOLE:
-               offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+               offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
                break;
        case SEEK_DATA:
-               offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+               offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
                break;
        }
 
index f3ef70c542e1bc3392b391fefe904b787a0920a5..584648582ba717be54e61f7829085025acbc2ea3 100644 (file)
@@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks(
        int                     error = 0;
        int                     err2;
 
+       mp->m_finobt_nores = false;
        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
                pag = xfs_perag_get(mp, agno);
                err2 = xfs_ag_resv_init(pag, NULL);
index 5169e84ae38255a9438d2f7425c534b4954b9c9f..d0d37738412009355957661fe7e47a18c7522de4 100644 (file)
@@ -16,7 +16,7 @@ xfs_param_t xfs_params = {
                          /*    MIN             DFLT            MAX     */
        .sgid_inherit   = {     0,              0,              1       },
        .symlink_mode   = {     0,              0,              1       },
-       .panic_mask     = {     0,              0,              255     },
+       .panic_mask     = {     0,              0,              256     },
        .error_level    = {     0,              3,              11      },
        .syncd_timer    = {     1*100,          30*100,         7200*100},
        .stats_clear    = {     0,              0,              1       },
index ae667ba74a1c3a4975197aad3e0a2ef4073aa7fa..f643a92951794e3f73951a0abc4b9a3d4b16bdbc 100644 (file)
@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
        if (error)
                goto out_trans_cancel;
 
-       error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+       error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
        if (error)
                goto out_trans_cancel;
 
@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
         * now remains allocated and sits on the unlinked list until the fs is
         * repaired.
         */
-       if (unlikely(mp->m_inotbt_nores)) {
+       if (unlikely(mp->m_finobt_nores)) {
                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
                                XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
                                &tp);
@@ -1907,86 +1907,510 @@ xfs_inactive(
 }
 
 /*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk.  That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk.  The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+       struct rhash_head       iu_rhash_head;
+       xfs_agino_t             iu_agino;               /* X */
+       xfs_agino_t             iu_next_unlinked;       /* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+       struct rhashtable_compare_arg   *arg,
+       const void                      *obj)
+{
+       const xfs_agino_t               *key = arg->key;
+       const struct xfs_iunlink        *iu = obj;
+
+       if (iu->iu_next_unlinked != *key)
+               return 1;
+       return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+       .min_size               = XFS_AGI_UNLINKED_BUCKETS,
+       .key_len                = sizeof(xfs_agino_t),
+       .key_offset             = offsetof(struct xfs_iunlink,
+                                          iu_next_unlinked),
+       .head_offset            = offsetof(struct xfs_iunlink, iu_rhash_head),
+       .automatic_shrinking    = true,
+       .obj_cmpfn              = xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+       struct xfs_perag        *pag,
+       xfs_agino_t             agino)
+{
+       struct xfs_iunlink      *iu;
+
+       iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+                       xfs_iunlink_hash_params);
+       return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+       struct xfs_perag        *pag,
+       struct xfs_iunlink      *iu)
+{
+       int                     error;
+
+       error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+                       &iu->iu_rhash_head, xfs_iunlink_hash_params);
+       /*
+        * Fail loudly if there already was an entry because that's a sign of
+        * corruption of in-memory data.  Also fail loudly if we see an error
+        * code we didn't anticipate from the rhashtable code.  Currently we
+        * only anticipate ENOMEM.
+        */
+       if (error) {
+               WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+               kmem_free(iu);
+       }
+       /*
+        * Absorb any runtime errors that aren't a result of corruption because
+        * this is a cache and we can always fall back to bucket list scanning.
+        */
+       if (error != 0 && error != -EEXIST)
+               error = 0;
+       return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+       struct xfs_perag        *pag,
+       xfs_agino_t             prev_agino,
+       xfs_agino_t             this_agino)
+{
+       struct xfs_iunlink      *iu;
+
+       if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+               return 0;
+
+       iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+       iu->iu_agino = prev_agino;
+       iu->iu_next_unlinked = this_agino;
+
+       return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+       struct xfs_perag        *pag,
+       xfs_agino_t             agino,
+       xfs_agino_t             next_unlinked)
+{
+       struct xfs_iunlink      *iu;
+       int                     error;
+
+       /* Look up the old entry; if there wasn't one then exit. */
+       iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+                       xfs_iunlink_hash_params);
+       if (!iu)
+               return 0;
+
+       /*
+        * Remove the entry.  This shouldn't ever return an error, but if we
+        * couldn't remove the old entry we don't want to add it again to the
+        * hash table, and if the entry disappeared on us then someone's
+        * violated the locking rules and we need to fail loudly.  Either way
+        * we cannot remove the inode because internal state is or would have
+        * been corrupt.
+        */
+       error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+                       &iu->iu_rhash_head, xfs_iunlink_hash_params);
+       if (error)
+               return error;
+
+       /* If there is no new next entry just free our item and return. */
+       if (next_unlinked == NULLAGINO) {
+               kmem_free(iu);
+               return 0;
+       }
+
+       /* Update the entry and re-add it to the hash table. */
+       iu->iu_next_unlinked = next_unlinked;
+       return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+       struct xfs_perag        *pag)
+{
+       return rhashtable_init(&pag->pagi_unlinked_hash,
+                       &xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+       void                    *ptr,
+       void                    *arg)
+{
+       struct xfs_iunlink      *iu = ptr;
+       bool                    *freed_anything = arg;
+
+       *freed_anything = true;
+       kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+       struct xfs_perag        *pag)
+{
+       bool                    freed_anything = false;
+
+       rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+                       xfs_iunlink_free_item, &freed_anything);
+
+       ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results.  The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       struct xfs_buf          *agibp,
+       unsigned int            bucket_index,
+       xfs_agino_t             new_agino)
+{
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agibp);
+       xfs_agino_t             old_value;
+       int                     offset;
+
+       ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+       old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+       trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+                       old_value, new_agino);
+
+       /*
+        * We should never find the head of the list already set to the value
+        * passed in because either we're adding or removing ourselves from the
+        * head of the list.
+        */
+       if (old_value == new_agino)
+               return -EFSCORRUPTED;
+
+       agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+       offset = offsetof(struct xfs_agi, agi_unlinked) +
+                       (sizeof(xfs_agino_t) * bucket_index);
+       xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+       return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             agino,
+       struct xfs_buf          *ibp,
+       struct xfs_dinode       *dip,
+       struct xfs_imap         *imap,
+       xfs_agino_t             next_agino)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       int                     offset;
+
+       ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+       trace_xfs_iunlink_update_dinode(mp, agno, agino,
+                       be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+       dip->di_next_unlinked = cpu_to_be32(next_agino);
+       offset = imap->im_boffset +
+                       offsetof(struct xfs_dinode, di_next_unlinked);
+
+       /* need to recalc the inode CRC if appropriate */
+       xfs_dinode_calc_crc(mp, dip);
+       xfs_trans_inode_buf(tp, ibp);
+       xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+       xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             next_agino,
+       xfs_agino_t             *old_next_agino)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_dinode       *dip;
+       struct xfs_buf          *ibp;
+       xfs_agino_t             old_value;
+       int                     error;
+
+       ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+       if (error)
+               return error;
+
+       /* Make sure the old pointer isn't garbage. */
+       old_value = be32_to_cpu(dip->di_next_unlinked);
+       if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+               error = -EFSCORRUPTED;
+               goto out;
+       }
+
+       /*
+        * Since we're updating a linked list, we should never find that the
+        * current pointer is the same as the new value, unless we're
+        * terminating the list.
+        */
+       *old_next_agino = old_value;
+       if (old_value == next_agino) {
+               if (next_agino != NULLAGINO)
+                       error = -EFSCORRUPTED;
+               goto out;
+       }
+
+       /* Ok, update the new pointer. */
+       xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+                       ibp, dip, &ip->i_imap, next_agino);
+       return 0;
+out:
+       xfs_trans_brelse(tp, ibp);
+       return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
  *
  * We place the on-disk inode on a list in the AGI.  It will be pulled from this
  * list when the inode is freed.
  */
 STATIC int
 xfs_iunlink(
-       struct xfs_trans *tp,
-       struct xfs_inode *ip)
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip)
 {
-       xfs_mount_t     *mp = tp->t_mountp;
-       xfs_agi_t       *agi;
-       xfs_dinode_t    *dip;
-       xfs_buf_t       *agibp;
-       xfs_buf_t       *ibp;
-       xfs_agino_t     agino;
-       short           bucket_index;
-       int             offset;
-       int             error;
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_agi          *agi;
+       struct xfs_buf          *agibp;
+       xfs_agino_t             next_agino;
+       xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+       short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+       int                     error;
 
+       ASSERT(VFS_I(ip)->i_nlink == 0);
        ASSERT(VFS_I(ip)->i_mode != 0);
+       trace_xfs_iunlink(ip);
 
-       /*
-        * Get the agi buffer first.  It ensures lock ordering
-        * on the list.
-        */
-       error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+       /* Get the agi buffer first.  It ensures lock ordering on the list. */
+       error = xfs_read_agi(mp, tp, agno, &agibp);
        if (error)
                return error;
        agi = XFS_BUF_TO_AGI(agibp);
 
        /*
-        * Get the index into the agi hash table for the
-        * list this inode will go on.
+        * Get the index into the agi hash table for the list this inode will
+        * go on.  Make sure the pointer isn't garbage and that this inode
+        * isn't already on the list.
         */
-       agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-       ASSERT(agino != 0);
-       bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-       ASSERT(agi->agi_unlinked[bucket_index]);
-       ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+       next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+       if (next_agino == agino ||
+           !xfs_verify_agino_or_null(mp, agno, next_agino))
+               return -EFSCORRUPTED;
+
+       if (next_agino != NULLAGINO) {
+               struct xfs_perag        *pag;
+               xfs_agino_t             old_agino;
+
+               /*
+                * There is already another inode in the bucket, so point this
+                * inode to the current head of the list.
+                */
+               error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+                               &old_agino);
+               if (error)
+                       return error;
+               ASSERT(old_agino == NULLAGINO);
 
-       if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
                /*
-                * There is already another inode in the bucket we need
-                * to add ourselves to.  Add us at the front of the list.
-                * Here we put the head pointer into our next pointer,
-                * and then we fall through to point the head at us.
+                * agino has been unlinked, add a backref from the next inode
+                * back to agino.
                 */
-               error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-                                      0, 0);
+               pag = xfs_perag_get(mp, agno);
+               error = xfs_iunlink_add_backref(pag, agino, next_agino);
+               xfs_perag_put(pag);
                if (error)
                        return error;
+       }
+
+       /* Point the head of the list to point to this inode. */
+       return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
 
-               ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
-               dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
-               offset = ip->i_imap.im_boffset +
-                       offsetof(xfs_dinode_t, di_next_unlinked);
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             agino,
+       struct xfs_imap         *imap,
+       struct xfs_dinode       **dipp,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       int                     error;
 
-               /* need to recalc the inode CRC if appropriate */
-               xfs_dinode_calc_crc(mp, dip);
+       imap->im_blkno = 0;
+       error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+       if (error) {
+               xfs_warn(mp, "%s: xfs_imap returned error %d.",
+                               __func__, error);
+               return error;
+       }
 
-               xfs_trans_inode_buf(tp, ibp);
-               xfs_trans_log_buf(tp, ibp, offset,
-                                 (offset + sizeof(xfs_agino_t) - 1));
-               xfs_inobp_check(mp, ibp);
+       error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+       if (error) {
+               xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+                               __func__, error);
+               return error;
+       }
+
+       return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino.  Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             head_agino,
+       xfs_agino_t             target_agino,
+       xfs_agino_t             *agino,
+       struct xfs_imap         *imap,
+       struct xfs_dinode       **dipp,
+       struct xfs_buf          **bpp,
+       struct xfs_perag        *pag)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       xfs_agino_t             next_agino;
+       int                     error;
+
+       ASSERT(head_agino != target_agino);
+       *bpp = NULL;
+
+       /* See if our backref cache can find it faster. */
+       *agino = xfs_iunlink_lookup_backref(pag, target_agino);
+       if (*agino != NULLAGINO) {
+               error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+               if (error)
+                       return error;
+
+               if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+                       return 0;
+
+               /*
+                * If we get here the cache contents were corrupt, so drop the
+                * buffer and fall back to walking the bucket list.
+                */
+               xfs_trans_brelse(tp, *bpp);
+               *bpp = NULL;
+               WARN_ON_ONCE(1);
+       }
+
+       trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+       /* Otherwise, walk the entire bucket until we find it. */
+       next_agino = head_agino;
+       while (next_agino != target_agino) {
+               xfs_agino_t     unlinked_agino;
+
+               if (*bpp)
+                       xfs_trans_brelse(tp, *bpp);
+
+               *agino = next_agino;
+               error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+                               bpp);
+               if (error)
+                       return error;
+
+               unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+               /*
+                * Make sure this pointer is valid and isn't an obvious
+                * infinite loop.
+                */
+               if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+                   next_agino == unlinked_agino) {
+                       XFS_CORRUPTION_ERROR(__func__,
+                                       XFS_ERRLEVEL_LOW, mp,
+                                       *dipp, sizeof(**dipp));
+                       error = -EFSCORRUPTED;
+                       return error;
+               }
+               next_agino = unlinked_agino;
        }
 
-       /*
-        * Point the bucket head pointer at the inode being inserted.
-        */
-       ASSERT(agino != 0);
-       agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
-       offset = offsetof(xfs_agi_t, agi_unlinked) +
-               (sizeof(xfs_agino_t) * bucket_index);
-       xfs_trans_log_buf(tp, agibp, offset,
-                         (offset + sizeof(xfs_agino_t) - 1));
        return 0;
 }
 
@@ -1995,181 +2419,106 @@ xfs_iunlink(
  */
 STATIC int
 xfs_iunlink_remove(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip)
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip)
 {
-       xfs_ino_t       next_ino;
-       xfs_mount_t     *mp;
-       xfs_agi_t       *agi;
-       xfs_dinode_t    *dip;
-       xfs_buf_t       *agibp;
-       xfs_buf_t       *ibp;
-       xfs_agnumber_t  agno;
-       xfs_agino_t     agino;
-       xfs_agino_t     next_agino;
-       xfs_buf_t       *last_ibp;
-       xfs_dinode_t    *last_dip = NULL;
-       short           bucket_index;
-       int             offset, last_offset = 0;
-       int             error;
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_agi          *agi;
+       struct xfs_buf          *agibp;
+       struct xfs_buf          *last_ibp;
+       struct xfs_dinode       *last_dip = NULL;
+       struct xfs_perag        *pag = NULL;
+       xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+       xfs_agino_t             next_agino;
+       xfs_agino_t             head_agino;
+       short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+       int                     error;
 
-       mp = tp->t_mountp;
-       agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+       trace_xfs_iunlink_remove(ip);
 
-       /*
-        * Get the agi buffer first.  It ensures lock ordering
-        * on the list.
-        */
+       /* Get the agi buffer first.  It ensures lock ordering on the list. */
        error = xfs_read_agi(mp, tp, agno, &agibp);
        if (error)
                return error;
-
        agi = XFS_BUF_TO_AGI(agibp);
 
        /*
-        * Get the index into the agi hash table for the
-        * list this inode will go on.
+        * Get the index into the agi hash table for the list this inode will
+        * go on.  Make sure the head pointer isn't garbage.
         */
-       agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-       if (!xfs_verify_agino(mp, agno, agino))
-               return -EFSCORRUPTED;
-       bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-       if (!xfs_verify_agino(mp, agno,
-                       be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+       head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+       if (!xfs_verify_agino(mp, agno, head_agino)) {
                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
                                agi, sizeof(*agi));
                return -EFSCORRUPTED;
        }
 
-       if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
-               /*
-                * We're at the head of the list.  Get the inode's on-disk
-                * buffer to see if there is anyone after us on the list.
-                * Only modify our next pointer if it is not already NULLAGINO.
-                * This saves us the overhead of dealing with the buffer when
-                * there is no need to change it.
-                */
-               error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-                                      0, 0);
-               if (error) {
-                       xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
-                               __func__, error);
-                       return error;
-               }
-               next_agino = be32_to_cpu(dip->di_next_unlinked);
-               ASSERT(next_agino != 0);
-               if (next_agino != NULLAGINO) {
-                       dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-                       offset = ip->i_imap.im_boffset +
-                               offsetof(xfs_dinode_t, di_next_unlinked);
-
-                       /* need to recalc the inode CRC if appropriate */
-                       xfs_dinode_calc_crc(mp, dip);
-
-                       xfs_trans_inode_buf(tp, ibp);
-                       xfs_trans_log_buf(tp, ibp, offset,
-                                         (offset + sizeof(xfs_agino_t) - 1));
-                       xfs_inobp_check(mp, ibp);
-               } else {
-                       xfs_trans_brelse(tp, ibp);
-               }
-               /*
-                * Point the bucket head pointer at the next inode.
-                */
-               ASSERT(next_agino != 0);
-               ASSERT(next_agino != agino);
-               agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
-               offset = offsetof(xfs_agi_t, agi_unlinked) +
-                       (sizeof(xfs_agino_t) * bucket_index);
-               xfs_trans_log_buf(tp, agibp, offset,
-                                 (offset + sizeof(xfs_agino_t) - 1));
-       } else {
-               /*
-                * We need to search the list for the inode being freed.
-                */
-               next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-               last_ibp = NULL;
-               while (next_agino != agino) {
-                       struct xfs_imap imap;
+       /*
+        * Set our inode's next_unlinked pointer to NULL and then return
+        * the old pointer value so that we can update whatever was previous
+        * to us in the list to point to whatever was next in the list.
+        */
+       error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+       if (error)
+               return error;
 
-                       if (last_ibp)
-                               xfs_trans_brelse(tp, last_ibp);
+       /*
+        * If there was a backref pointing from the next inode back to this
+        * one, remove it because we've removed this inode from the list.
+        *
+        * Later, if this inode was in the middle of the list we'll update
+        * this inode's backref to point from the next inode.
+        */
+       if (next_agino != NULLAGINO) {
+               pag = xfs_perag_get(mp, agno);
+               error = xfs_iunlink_change_backref(pag, next_agino,
+                               NULLAGINO);
+               if (error)
+                       goto out;
+       }
 
-                       imap.im_blkno = 0;
-                       next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+       if (head_agino == agino) {
+               /* Point the head of the list to the next unlinked inode. */
+               error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+                               next_agino);
+               if (error)
+                       goto out;
+       } else {
+               struct xfs_imap imap;
+               xfs_agino_t     prev_agino;
 
-                       error = xfs_imap(mp, tp, next_ino, &imap, 0);
-                       if (error) {
-                               xfs_warn(mp,
-       "%s: xfs_imap returned error %d.",
-                                        __func__, error);
-                               return error;
-                       }
+               if (!pag)
+                       pag = xfs_perag_get(mp, agno);
 
-                       error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
-                                              &last_ibp, 0, 0);
-                       if (error) {
-                               xfs_warn(mp,
-       "%s: xfs_imap_to_bp returned error %d.",
-                                       __func__, error);
-                               return error;
-                       }
+               /* We need to search the list for the inode being freed. */
+               error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+                               &prev_agino, &imap, &last_dip, &last_ibp,
+                               pag);
+               if (error)
+                       goto out;
 
-                       last_offset = imap.im_boffset;
-                       next_agino = be32_to_cpu(last_dip->di_next_unlinked);
-                       if (!xfs_verify_agino(mp, agno, next_agino)) {
-                               XFS_CORRUPTION_ERROR(__func__,
-                                               XFS_ERRLEVEL_LOW, mp,
-                                               last_dip, sizeof(*last_dip));
-                               return -EFSCORRUPTED;
-                       }
-               }
+               /* Point the previous inode on the list to the next inode. */
+               xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+                               last_dip, &imap, next_agino);
 
                /*
-                * Now last_ibp points to the buffer previous to us on the
-                * unlinked list.  Pull us from the list.
+                * Now we deal with the backref for this inode.  If this inode
+                * pointed at a real inode, change the backref that pointed to
+                * us to point to our old next.  If this inode was the end of
+                * the list, delete the backref that pointed to us.  Note that
+                * change_backref takes care of deleting the backref if
+                * next_agino is NULLAGINO.
                 */
-               error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-                                      0, 0);
-               if (error) {
-                       xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
-                               __func__, error);
-                       return error;
-               }
-               next_agino = be32_to_cpu(dip->di_next_unlinked);
-               ASSERT(next_agino != 0);
-               ASSERT(next_agino != agino);
-               if (next_agino != NULLAGINO) {
-                       dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-                       offset = ip->i_imap.im_boffset +
-                               offsetof(xfs_dinode_t, di_next_unlinked);
-
-                       /* need to recalc the inode CRC if appropriate */
-                       xfs_dinode_calc_crc(mp, dip);
-
-                       xfs_trans_inode_buf(tp, ibp);
-                       xfs_trans_log_buf(tp, ibp, offset,
-                                         (offset + sizeof(xfs_agino_t) - 1));
-                       xfs_inobp_check(mp, ibp);
-               } else {
-                       xfs_trans_brelse(tp, ibp);
-               }
-               /*
-                * Point the previous inode on the list to the next inode.
-                */
-               last_dip->di_next_unlinked = cpu_to_be32(next_agino);
-               ASSERT(next_agino != 0);
-               offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
-               /* need to recalc the inode CRC if appropriate */
-               xfs_dinode_calc_crc(mp, last_dip);
-
-               xfs_trans_inode_buf(tp, last_ibp);
-               xfs_trans_log_buf(tp, last_ibp, offset,
-                                 (offset + sizeof(xfs_agino_t) - 1));
-               xfs_inobp_check(mp, last_ibp);
+               error = xfs_iunlink_change_backref(pag, agino, next_agino);
+               if (error)
+                       goto out;
        }
-       return 0;
+
+out:
+       if (pag)
+               xfs_perag_put(pag);
+       return error;
 }
 
 /*
@@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout(
 
        /*
         * Prepare the tmpfile inode as if it were created through the VFS.
-        * Otherwise, the link increment paths will complain about nlink 0->1.
-        * Drop the link count as done by d_tmpfile(), complete the inode setup
-        * and flag it as linkable.
+        * Complete the inode setup and flag it as linkable.  nlink is already
+        * zero, so we can skip the drop_nlink.
         */
-       drop_nlink(VFS_I(tmpfile));
        xfs_setup_iops(tmpfile);
        xfs_finish_inode_setup(tmpfile);
        VFS_I(tmpfile)->i_state |= I_LINKABLE;
index be201452015582e7728aaf66c43e7076502594db..e62074a5257ce3a5e4932743129b7fdedafc6b6a 100644 (file)
@@ -500,4 +500,7 @@ extern struct kmem_zone     *xfs_inode_zone;
 
 bool xfs_inode_verify_forks(struct xfs_inode *ip);
 
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
 #endif /* __XFS_INODE_H__ */
index 27c93b5f029df92b17c22456c0bfe7a5a3fa085c..63d323916bba9e42dc3f37d81359b16a6821784b 100644 (file)
 #define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
                                                << mp->m_writeio_log)
 
-void
+static int
+xfs_alert_fsblock_zero(
+       xfs_inode_t     *ip,
+       xfs_bmbt_irec_t *imap)
+{
+       xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+                       "Access to block zero in inode %llu "
+                       "start_block: %llx start_off: %llx "
+                       "blkcnt: %llx extent-state: %x",
+               (unsigned long long)ip->i_ino,
+               (unsigned long long)imap->br_startblock,
+               (unsigned long long)imap->br_startoff,
+               (unsigned long long)imap->br_blockcount,
+               imap->br_state);
+       return -EFSCORRUPTED;
+}
+
+int
 xfs_bmbt_to_iomap(
        struct xfs_inode        *ip,
        struct iomap            *iomap,
-       struct xfs_bmbt_irec    *imap)
+       struct xfs_bmbt_irec    *imap,
+       bool                    shared)
 {
        struct xfs_mount        *mp = ip->i_mount;
 
+       if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+               return xfs_alert_fsblock_zero(ip, imap);
+
        if (imap->br_startblock == HOLESTARTBLOCK) {
                iomap->addr = IOMAP_NULL_ADDR;
                iomap->type = IOMAP_HOLE;
-       } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+       } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+                  isnullstartblock(imap->br_startblock)) {
                iomap->addr = IOMAP_NULL_ADDR;
                iomap->type = IOMAP_DELALLOC;
        } else {
@@ -60,6 +82,13 @@ xfs_bmbt_to_iomap(
        iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
        iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
        iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+       if (xfs_ipincount(ip) &&
+           (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+               iomap->flags |= IOMAP_F_DIRTY;
+       if (shared)
+               iomap->flags |= IOMAP_F_SHARED;
+       return 0;
 }
 
 static void
@@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb(
        return 0;
 }
 
-STATIC int
-xfs_alert_fsblock_zero(
-       xfs_inode_t     *ip,
-       xfs_bmbt_irec_t *imap)
-{
-       xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
-                       "Access to block zero in inode %llu "
-                       "start_block: %llx start_off: %llx "
-                       "blkcnt: %llx extent-state: %x",
-               (unsigned long long)ip->i_ino,
-               (unsigned long long)imap->br_startblock,
-               (unsigned long long)imap->br_startoff,
-               (unsigned long long)imap->br_blockcount,
-               imap->br_state);
-       return -EFSCORRUPTED;
-}
-
 int
 xfs_iomap_write_direct(
        xfs_inode_t     *ip,
@@ -383,12 +395,13 @@ xfs_quota_calc_throttle(
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
        struct xfs_inode        *ip,
+       int                     whichfork,
        loff_t                  offset,
        loff_t                  count,
        struct xfs_iext_cursor  *icur)
 {
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        struct xfs_bmbt_irec    prev;
        int                     shift = 0;
@@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay(
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           maxbytes_fsb =
                XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
        xfs_fileoff_t           end_fsb;
-       int                     error = 0, eof = 0;
-       struct xfs_bmbt_irec    got;
-       struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    imap, cmap;
+       struct xfs_iext_cursor  icur, ccur;
        xfs_fsblock_t           prealloc_blocks = 0;
+       bool                    eof = false, cow_eof = false, shared = false;
+       int                     whichfork = XFS_DATA_FORK;
+       int                     error = 0;
 
        ASSERT(!XFS_IS_REALTIME_INODE(ip));
        ASSERT(!xfs_get_extsz_hint(ip));
@@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay(
 
        XFS_STATS_INC(mp, xs_blk_mapw);
 
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+       if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
                if (error)
                        goto out_unlock;
@@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay(
 
        end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
 
-       eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+       /*
+        * Search the data fork fork first to look up our source mapping.  We
+        * always need the data fork map, as we have to return it to the
+        * iomap code so that the higher level write code can read data in to
+        * perform read-modify-write cycles for unaligned writes.
+        */
+       eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
        if (eof)
-               got.br_startoff = end_fsb; /* fake hole until the end */
+               imap.br_startoff = end_fsb; /* fake hole until the end */
+
+       /* We never need to allocate blocks for zeroing a hole. */
+       if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+               xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+               goto out_unlock;
+       }
 
-       if (got.br_startoff <= offset_fsb) {
+       /*
+        * Search the COW fork extent list even if we did not find a data fork
+        * extent.  This serves two purposes: first this implements the
+        * speculative preallocation using cowextsize, so that we also unshare
+        * block adjacent to shared blocks instead of just the shared blocks
+        * themselves.  Second the lookup in the extent list is generally faster
+        * than going out to the shared extent tree.
+        */
+       if (xfs_is_cow_inode(ip)) {
+               if (!ip->i_cowfp) {
+                       ASSERT(!xfs_is_reflink_inode(ip));
+                       xfs_ifork_init_cow(ip);
+               }
+               cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+                               &ccur, &cmap);
+               if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+                       trace_xfs_reflink_cow_found(ip, &cmap);
+                       whichfork = XFS_COW_FORK;
+                       goto done;
+               }
+       }
+
+       if (imap.br_startoff <= offset_fsb) {
                /*
                 * For reflink files we may need a delalloc reservation when
                 * overwriting shared extents.   This includes zeroing of
                 * existing extents that contain data.
                 */
-               if (xfs_is_reflink_inode(ip) &&
-                   ((flags & IOMAP_WRITE) ||
-                    got.br_state != XFS_EXT_UNWRITTEN)) {
-                       xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
-                       error = xfs_reflink_reserve_cow(ip, &got);
-                       if (error)
-                               goto out_unlock;
+               if (!xfs_is_cow_inode(ip) ||
+                   ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+                       trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+                                       &imap);
+                       goto done;
                }
 
-               trace_xfs_iomap_found(ip, offset, count, 0, &got);
-               goto done;
-       }
+               xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
 
-       if (flags & IOMAP_ZERO) {
-               xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
-               goto out_unlock;
+               /* Trim the mapping to the nearest shared extent boundary. */
+               error = xfs_inode_need_cow(ip, &imap, &shared);
+               if (error)
+                       goto out_unlock;
+
+               /* Not shared?  Just report the (potentially capped) extent. */
+               if (!shared) {
+                       trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+                                       &imap);
+                       goto done;
+               }
+
+               /*
+                * Fork all the shared blocks from our write offset until the
+                * end of the extent.
+                */
+               whichfork = XFS_COW_FORK;
+               end_fsb = imap.br_startoff + imap.br_blockcount;
+       } else {
+               /*
+                * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+                * pages to keep the chunks of work done where somewhat
+                * symmetric with the work writeback does.  This is a completely
+                * arbitrary number pulled out of thin air.
+                *
+                * Note that the values needs to be less than 32-bits wide until
+                * the lower level functions are updated.
+                */
+               count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+               end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+               if (xfs_is_always_cow_inode(ip))
+                       whichfork = XFS_COW_FORK;
        }
 
        error = xfs_qm_dqattach_locked(ip, false);
        if (error)
                goto out_unlock;
 
-       /*
-        * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
-        * to keep the chunks of work done where somewhat symmetric with the
-        * work writeback does. This is a completely arbitrary number pulled
-        * out of thin air as a best guess for initial testing.
-        *
-        * Note that the values needs to be less than 32-bits wide until
-        * the lower level functions are updated.
-        */
-       count = min_t(loff_t, count, 1024 * PAGE_SIZE);
-       end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
        if (eof) {
-               prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
-                               &icur);
+               prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+                               count, &icur);
                if (prealloc_blocks) {
                        xfs_extlen_t    align;
                        xfs_off_t       end_offset;
@@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay(
        }
 
 retry:
-       error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
-                       end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
-                       eof);
+       error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+                       end_fsb - offset_fsb, prealloc_blocks,
+                       whichfork == XFS_DATA_FORK ? &imap : &cmap,
+                       whichfork == XFS_DATA_FORK ? &icur : &ccur,
+                       whichfork == XFS_DATA_FORK ? eof : cow_eof);
        switch (error) {
        case 0:
                break;
@@ -647,186 +711,22 @@ retry:
         * them out if the write happens to fail.
         */
        iomap->flags |= IOMAP_F_NEW;
-       trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+       trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+                       whichfork == XFS_DATA_FORK ? &imap : &cmap);
 done:
-       if (isnullstartblock(got.br_startblock))
-               got.br_startblock = DELAYSTARTBLOCK;
-
-       if (!got.br_startblock) {
-               error = xfs_alert_fsblock_zero(ip, &got);
-               if (error)
+       if (whichfork == XFS_COW_FORK) {
+               if (imap.br_startoff > offset_fsb) {
+                       xfs_trim_extent(&cmap, offset_fsb,
+                                       imap.br_startoff - offset_fsb);
+                       error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
                        goto out_unlock;
-       }
-
-       xfs_bmbt_to_iomap(ip, iomap, &got);
-
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return error;
-}
-
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
-       xfs_inode_t     *ip,
-       int             whichfork,
-       xfs_off_t       offset,
-       xfs_bmbt_irec_t *imap,
-       unsigned int    *cow_seq)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
-       xfs_fileoff_t   offset_fsb, last_block;
-       xfs_fileoff_t   end_fsb, map_start_fsb;
-       xfs_filblks_t   count_fsb;
-       xfs_trans_t     *tp;
-       int             nimaps;
-       int             error = 0;
-       int             flags = XFS_BMAPI_DELALLOC;
-       int             nres;
-
-       if (whichfork == XFS_COW_FORK)
-               flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
-       /*
-        * Make sure that the dquots are there.
-        */
-       error = xfs_qm_dqattach(ip);
-       if (error)
-               return error;
-
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       count_fsb = imap->br_blockcount;
-       map_start_fsb = imap->br_startoff;
-
-       XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
-       while (count_fsb != 0) {
-               /*
-                * Set up a transaction with which to allocate the
-                * backing store for the file.  Do allocations in a
-                * loop until we get some space in the range we are
-                * interested in.  The other space that might be allocated
-                * is in the delayed allocation extent on which we sit
-                * but before our buffer starts.
-                */
-               nimaps = 0;
-               while (nimaps == 0) {
-                       nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                       /*
-                        * We have already reserved space for the extent and any
-                        * indirect blocks when creating the delalloc extent,
-                        * there is no need to reserve space in this transaction
-                        * again.
-                        */
-                       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
-                                       0, XFS_TRANS_RESERVE, &tp);
-                       if (error)
-                               return error;
-
-                       xfs_ilock(ip, XFS_ILOCK_EXCL);
-                       xfs_trans_ijoin(tp, ip, 0);
-
-                       /*
-                        * it is possible that the extents have changed since
-                        * we did the read call as we dropped the ilock for a
-                        * while. We have to be careful about truncates or hole
-                        * punchs here - we are not allowed to allocate
-                        * non-delalloc blocks here.
-                        *
-                        * The only protection against truncation is the pages
-                        * for the range we are being asked to convert are
-                        * locked and hence a truncate will block on them
-                        * first.
-                        *
-                        * As a result, if we go beyond the range we really
-                        * need and hit an delalloc extent boundary followed by
-                        * a hole while we have excess blocks in the map, we
-                        * will fill the hole incorrectly and overrun the
-                        * transaction reservation.
-                        *
-                        * Using a single map prevents this as we are forced to
-                        * check each map we look for overlap with the desired
-                        * range and abort as soon as we find it. Also, given
-                        * that we only return a single map, having one beyond
-                        * what we can return is probably a bit silly.
-                        *
-                        * We also need to check that we don't go beyond EOF;
-                        * this is a truncate optimisation as a truncate sets
-                        * the new file size before block on the pages we
-                        * currently have locked under writeback. Because they
-                        * are about to be tossed, we don't need to write them
-                        * back....
-                        */
-                       nimaps = 1;
-                       end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
-                       error = xfs_bmap_last_offset(ip, &last_block,
-                                                       XFS_DATA_FORK);
-                       if (error)
-                               goto trans_cancel;
-
-                       last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
-                       if ((map_start_fsb + count_fsb) > last_block) {
-                               count_fsb = last_block - map_start_fsb;
-                               if (count_fsb == 0) {
-                                       error = -EAGAIN;
-                                       goto trans_cancel;
-                               }
-                       }
-
-                       /*
-                        * From this point onwards we overwrite the imap
-                        * pointer that the caller gave to us.
-                        */
-                       error = xfs_bmapi_write(tp, ip, map_start_fsb,
-                                               count_fsb, flags, nres, imap,
-                                               &nimaps);
-                       if (error)
-                               goto trans_cancel;
-
-                       error = xfs_trans_commit(tp);
-                       if (error)
-                               goto error0;
-
-                       if (whichfork == XFS_COW_FORK)
-                               *cow_seq = READ_ONCE(ifp->if_seq);
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               }
-
-               /*
-                * See if we were able to allocate an extent that
-                * covers at least part of the callers request
-                */
-               if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
-                       return xfs_alert_fsblock_zero(ip, imap);
-
-               if ((offset_fsb >= imap->br_startoff) &&
-                   (offset_fsb < (imap->br_startoff +
-                                  imap->br_blockcount))) {
-                       XFS_STATS_INC(mp, xs_xstrat_quick);
-                       return 0;
                }
-
-               /*
-                * So far we have not mapped the requested part of the
-                * file, just surrounding data, try again.
-                */
-               count_fsb -= imap->br_blockcount;
-               map_start_fsb = imap->br_startoff + imap->br_blockcount;
+               /* ensure we only report blocks we have a reservation for */
+               xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+               shared = true;
        }
-
-trans_cancel:
-       xfs_trans_cancel(tp);
-error0:
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
@@ -975,7 +875,7 @@ xfs_ilock_for_iomap(
         * COW writes may allocate delalloc space or convert unwritten COW
         * extents, so we need to make sure to take the lock exclusively here.
         */
-       if (xfs_is_reflink_inode(ip) && is_write) {
+       if (xfs_is_cow_inode(ip) && is_write) {
                /*
                 * FIXME: It could still overwrite on unshared extents and not
                 * need allocation.
@@ -1009,7 +909,7 @@ relock:
         * check, so if we got ILOCK_SHARED for a write and but we're now a
         * reflink inode we have to switch to ILOCK_EXCL and relock.
         */
-       if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+       if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
                xfs_iunlock(ip, mode);
                mode = XFS_ILOCK_EXCL;
                goto relock;
@@ -1081,23 +981,33 @@ xfs_file_iomap_begin(
         * Break shared extents if necessary. Checks for non-blocking IO have
         * been done up front, so we don't need to do them here.
         */
-       if (xfs_is_reflink_inode(ip)) {
+       if (xfs_is_cow_inode(ip)) {
+               struct xfs_bmbt_irec    cmap;
+               bool                    directio = (flags & IOMAP_DIRECT);
+
                /* if zeroing doesn't need COW allocation, then we are done. */
                if ((flags & IOMAP_ZERO) &&
                    !needs_cow_for_zeroing(&imap, nimaps))
                        goto out_found;
 
-               if (flags & IOMAP_DIRECT) {
-                       /* may drop and re-acquire the ilock */
-                       error = xfs_reflink_allocate_cow(ip, &imap, &shared,
-                                       &lockmode);
-                       if (error)
-                               goto out_unlock;
-               } else {
-                       error = xfs_reflink_reserve_cow(ip, &imap);
-                       if (error)
-                               goto out_unlock;
-               }
+               /* may drop and re-acquire the ilock */
+               cmap = imap;
+               error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+                               directio);
+               if (error)
+                       goto out_unlock;
+
+               /*
+                * For buffered writes we need to report the address of the
+                * previous block (if there was any) so that the higher level
+                * write code can perform read-modify-write operations; we
+                * won't need the CoW fork mapping until writeback.  For direct
+                * I/O, which must be block aligned, we need to report the
+                * newly allocated address.  If the data fork has a hole, copy
+                * the COW fork mapping to avoid allocating to the data fork.
+                */
+               if (directio || imap.br_startblock == HOLESTARTBLOCK)
+                       imap = cmap;
 
                end_fsb = imap.br_startoff + imap.br_blockcount;
                length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1139,23 +1049,15 @@ xfs_file_iomap_begin(
                return error;
 
        iomap->flags |= IOMAP_F_NEW;
-       trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+       trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
 
 out_finish:
-       if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
-                               & ~XFS_ILOG_TIMESTAMP))
-               iomap->flags |= IOMAP_F_DIRTY;
-
-       xfs_bmbt_to_iomap(ip, iomap, &imap);
-
-       if (shared)
-               iomap->flags |= IOMAP_F_SHARED;
-       return 0;
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
 
 out_found:
        ASSERT(nimaps);
        xfs_iunlock(ip, lockmode);
-       trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+       trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
        goto out_finish;
 
 out_unlock:
@@ -1240,6 +1142,92 @@ const struct iomap_ops xfs_iomap_ops = {
        .iomap_end              = xfs_file_iomap_end,
 };
 
+static int
+xfs_seek_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  length,
+       unsigned                flags,
+       struct iomap            *iomap)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + length);
+       xfs_fileoff_t           cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+       struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    imap, cmap;
+       int                     error = 0;
+       unsigned                lockmode;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       lockmode = xfs_ilock_data_map_shared(ip);
+       if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+               if (error)
+                       goto out_unlock;
+       }
+
+       if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+               /*
+                * If we found a data extent we are done.
+                */
+               if (imap.br_startoff <= offset_fsb)
+                       goto done;
+               data_fsb = imap.br_startoff;
+       } else {
+               /*
+                * Fake a hole until the end of the file.
+                */
+               data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+                              XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+       }
+
+       /*
+        * If a COW fork extent covers the hole, report it - capped to the next
+        * data fork extent:
+        */
+       if (xfs_inode_has_cow_data(ip) &&
+           xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+               cow_fsb = cmap.br_startoff;
+       if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+               if (data_fsb < cow_fsb + cmap.br_blockcount)
+                       end_fsb = min(end_fsb, data_fsb);
+               xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+               error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+               /*
+                * This is a COW extent, so we must probe the page cache
+                * because there could be dirty page cache being backed
+                * by this extent.
+                */
+               iomap->type = IOMAP_UNWRITTEN;
+               goto out_unlock;
+       }
+
+       /*
+        * Else report a hole, capped to the next found data or COW extent.
+        */
+       if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+               imap.br_blockcount = cow_fsb - offset_fsb;
+       else
+               imap.br_blockcount = data_fsb - offset_fsb;
+       imap.br_startoff = offset_fsb;
+       imap.br_startblock = HOLESTARTBLOCK;
+       imap.br_state = XFS_EXT_NORM;
+done:
+       xfs_trim_extent(&imap, offset_fsb, end_fsb);
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+       xfs_iunlock(ip, lockmode);
+       return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+       .iomap_begin            = xfs_seek_iomap_begin,
+};
+
 static int
 xfs_xattr_iomap_begin(
        struct inode            *inode,
@@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin(
 out_unlock:
        xfs_iunlock(ip, lockmode);
 
-       if (!error) {
-               ASSERT(nimaps);
-               xfs_bmbt_to_iomap(ip, iomap, &imap);
-       }
-
-       return error;
+       if (error)
+               return error;
+       ASSERT(nimaps);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
 }
 
 const struct iomap_ops xfs_xattr_iomap_ops = {
index c6170548831bec4da8b7cea7a1847e60dcdc7e82..5c2f6aa6d78ffa810bdaeae1ed06cb85f465d1b6 100644 (file)
@@ -13,12 +13,10 @@ struct xfs_bmbt_irec;
 
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
                        struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
-                       struct xfs_bmbt_irec *, unsigned int *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
-               struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+               struct xfs_bmbt_irec *, bool shared);
 xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
 static inline xfs_filblks_t
@@ -42,6 +40,7 @@ xfs_aligned_fsb_count(
 }
 
 extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
 
 #endif /* __XFS_IOMAP_H__*/
index f48ffd7a8d3e491d76defe66961194a635276115..74047bd0c1aeb44709ceae3ef779921778c4be0e 100644 (file)
@@ -191,9 +191,18 @@ xfs_generic_create(
 
        xfs_setup_iops(ip);
 
-       if (tmpfile)
+       if (tmpfile) {
+               /*
+                * The VFS requires that any inode fed to d_tmpfile must have
+                * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+                * However, we created the temp file with nlink == 0 because
+                * we're not allowed to put an inode with nlink > 0 on the
+                * unlinked list.  Therefore we have to set nlink to 1 so that
+                * d_tmpfile can immediately set it back to zero.
+                */
+               set_nlink(inode, 1);
                d_tmpfile(dentry, inode);
-       else
+       } else
                d_instantiate(dentry, inode);
 
        xfs_finish_inode_setup(ip);
@@ -522,6 +531,10 @@ xfs_vn_getattr(
                }
        }
 
+       /*
+        * Note: If you add another clause to set an attribute flag, please
+        * update attributes_mask below.
+        */
        if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +542,10 @@ xfs_vn_getattr(
        if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
                stat->attributes |= STATX_ATTR_NODUMP;
 
+       stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+                                 STATX_ATTR_APPEND |
+                                 STATX_ATTR_NODUMP);
+
        switch (inode->i_mode & S_IFMT) {
        case S_IFBLK:
        case S_IFCHR:
index 9fe88d125f0a2b78b9c433618df44be9fcecfa98..3371d1ff27c444d1a0eecfe26b519d6514682fa2 100644 (file)
@@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_BTREE_BUF:
                switch (magic32) {
                case XFS_ABTB_CRC_MAGIC:
-               case XFS_ABTC_CRC_MAGIC:
                case XFS_ABTB_MAGIC:
+                       bp->b_ops = &xfs_bnobt_buf_ops;
+                       break;
+               case XFS_ABTC_CRC_MAGIC:
                case XFS_ABTC_MAGIC:
-                       bp->b_ops = &xfs_allocbt_buf_ops;
+                       bp->b_ops = &xfs_cntbt_buf_ops;
                        break;
                case XFS_IBT_CRC_MAGIC:
-               case XFS_FIBT_CRC_MAGIC:
                case XFS_IBT_MAGIC:
-               case XFS_FIBT_MAGIC:
                        bp->b_ops = &xfs_inobt_buf_ops;
                        break;
+               case XFS_FIBT_CRC_MAGIC:
+               case XFS_FIBT_MAGIC:
+                       bp->b_ops = &xfs_finobt_buf_ops;
+                       break;
                case XFS_BMAP_CRC_MAGIC:
                case XFS_BMAP_MAGIC:
                        bp->b_ops = &xfs_bmbt_buf_ops;
@@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2(
         * Make sure the place we're flushing out to really looks
         * like an inode!
         */
-       if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+       if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
                xfs_alert(mp,
        "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
                        __func__, dip, bp, in_f->ilf_ino);
index b4d8c318be3cef2c8c7750942f87883b4d5a00ba..fd63b0b1307c502ab45db45b37dd5da4cfa1d7f7 100644 (file)
@@ -149,6 +149,7 @@ xfs_free_perag(
                spin_unlock(&mp->m_perag_lock);
                ASSERT(pag);
                ASSERT(atomic_read(&pag->pag_ref) == 0);
+               xfs_iunlink_destroy(pag);
                xfs_buf_hash_destroy(pag);
                mutex_destroy(&pag->pag_ici_reclaim_lock);
                call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +228,9 @@ xfs_initialize_perag(
                /* first new pag is fully initialized */
                if (first_initialised == NULLAGNUMBER)
                        first_initialised = index;
+               error = xfs_iunlink_init(pag);
+               if (error)
+                       goto out_hash_destroy;
        }
 
        index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@ out_unwind_new_pags:
                if (!pag)
                        break;
                xfs_buf_hash_destroy(pag);
+               xfs_iunlink_destroy(pag);
                mutex_destroy(&pag->pag_ici_reclaim_lock);
                kmem_free(pag);
        }
index 7daafe064af84daeec805b971ad1705cee67653a..110f927cf943dbc9cdb9858c37cb51ad92c7d07d 100644 (file)
@@ -138,7 +138,7 @@ typedef struct xfs_mount {
        struct mutex            m_growlock;     /* growfs mutex */
        int                     m_fixedfsid[2]; /* unchanged for life of FS */
        uint64_t                m_flags;        /* global mount flags */
-       bool                    m_inotbt_nores; /* no per-AG finobt resv. */
+       bool                    m_finobt_nores; /* no per-AG finobt resv. */
        int                     m_ialloc_inos;  /* inodes in inode allocation */
        int                     m_ialloc_blks;  /* blocks in inode allocation */
        int                     m_ialloc_min_blks;/* min blocks in sparse inode
@@ -194,6 +194,7 @@ typedef struct xfs_mount {
         */
        uint32_t                m_generation;
 
+       bool                    m_always_cow;
        bool                    m_fail_unmount;
 #ifdef DEBUG
        /*
@@ -396,6 +397,13 @@ typedef struct xfs_perag {
 
        /* reference count */
        uint8_t                 pagf_refcount_level;
+
+       /*
+        * Unlinked inode information.  This incore information reflects
+        * data stored in the AGI, so callers must hold the AGI buffer lock
+        * or have some other means to control concurrency.
+        */
+       struct rhashtable       pagi_unlinked_hash;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *
index d3e04d20d8d45007279ee85ab5b04833d3f9355a..c8ba98fae30aefa7013ebfa168fed652a955f3e7 100644 (file)
@@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,      56);
        XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,        20);
        XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header,          16);
+
+       /*
+        * The v5 superblock format extended several v4 header structures with
+        * additional data. While new fields are only accessible on v5
+        * superblocks, it's important that the v5 structures place original v4
+        * fields/headers in the correct location on-disk. For example, we must
+        * be able to find magic values at the same location in certain blocks
+        * regardless of superblock version.
+        *
+        * The following checks ensure that various v5 data structures place the
+        * subset of v4 metadata associated with the same type of block at the
+        * start of the on-disk block. If there is no data structure definition
+        * for certain types of v4 blocks, traverse down to the first field of
+        * common metadata (e.g., magic value) and make sure it is at offset
+        * zero.
+        */
+       XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr,    0);
+       XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr,  0);
+       XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic,   0);
+       XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic,   0);
+       XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
 }
 
 #endif /* __XFS_ONDISK_H */
index f44c3599527d07441fc6eb689c9d442e29add600..bde2c9f56a46ab883fdfd5cb932d958838c867eb 100644 (file)
@@ -185,7 +185,7 @@ xfs_fs_map_blocks(
        }
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-       xfs_bmbt_to_iomap(ip, iomap, &imap);
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
        *device_generation = mp->m_generation;
        return error;
 out_unlock:
index c5b4fa004ca4fd6ac3d25b7926284b67e7cf9282..680ae7662a78ef260fd4897b244b69898a239c5a 100644 (file)
@@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
        int                     error = 0;
 
        /* Holes, unwritten, and delalloc extents cannot be shared */
-       if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+       if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
                *shared = false;
                return 0;
        }
@@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared(
        }
 }
 
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
        struct xfs_inode        *ip,
-       struct xfs_bmbt_irec    *imap)
+       struct xfs_bmbt_irec    *imap,
+       bool                    *shared)
 {
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-       struct xfs_bmbt_irec    got;
-       int                     error = 0;
-       bool                    eof = false;
-       struct xfs_iext_cursor  icur;
-       bool                    shared;
-
-       /*
-        * Search the COW fork extent list first.  This serves two purposes:
-        * first this implement the speculative preallocation using cowextisze,
-        * so that we also unshared block adjacent to shared blocks instead
-        * of just the shared blocks themselves.  Second the lookup in the
-        * extent list is generally faster than going out to the shared extent
-        * tree.
-        */
-
-       if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
-               eof = true;
-       if (!eof && got.br_startoff <= imap->br_startoff) {
-               trace_xfs_reflink_cow_found(ip, imap);
-               xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+       /* We can't update any real extents in always COW mode. */
+       if (xfs_is_always_cow_inode(ip) &&
+           !isnullstartblock(imap->br_startblock)) {
+               *shared = true;
                return 0;
        }
 
        /* Trim the mapping to the nearest shared extent boundary. */
-       error = xfs_reflink_trim_around_shared(ip, imap, &shared);
-       if (error)
-               return error;
-
-       /* Not shared?  Just report the (potentially capped) extent. */
-       if (!shared)
-               return 0;
-
-       /*
-        * Fork all the shared blocks from our write offset until the end of
-        * the extent.
-        */
-       error = xfs_qm_dqattach_locked(ip, false);
-       if (error)
-               return error;
-
-       error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
-                       imap->br_blockcount, 0, &got, &icur, eof);
-       if (error == -ENOSPC || error == -EDQUOT)
-               trace_xfs_reflink_cow_enospc(ip, imap);
-       if (error)
-               return error;
-
-       xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-       trace_xfs_reflink_cow_alloc(ip, &got);
-       return 0;
+       return xfs_reflink_trim_around_shared(ip, imap, shared);
 }
 
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
-       struct xfs_inode                *ip,
-       struct xfs_bmbt_irec            *imap,
-       xfs_fileoff_t                   offset_fsb,
-       xfs_filblks_t                   count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           offset_fsb,
+       xfs_filblks_t           count_fsb)
 {
-       int                             nimaps = 1;
+       struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    got;
+       struct xfs_btree_cur    *dummy_cur = NULL;
+       int                     dummy_logflags;
+       int                     error = 0;
 
-       if (imap->br_state == XFS_EXT_NORM)
+       if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
                return 0;
 
-       xfs_trim_extent(imap, offset_fsb, count_fsb);
-       trace_xfs_reflink_convert_cow(ip, imap);
-       if (imap->br_blockcount == 0)
-               return 0;
-       return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
-                       XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
-                       &nimaps);
+       do {
+               if (got.br_startoff >= offset_fsb + count_fsb)
+                       break;
+               if (got.br_state == XFS_EXT_NORM)
+                       continue;
+               if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+                       return -EIO;
+
+               xfs_trim_extent(&got, offset_fsb, count_fsb);
+               if (!got.br_blockcount)
+                       continue;
+
+               got.br_state = XFS_EXT_NORM;
+               error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+                               XFS_COW_FORK, &icur, &dummy_cur, &got,
+                               &dummy_logflags);
+               if (error)
+                       return error;
+       } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+       return error;
 }
 
 /* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -334,15 +300,12 @@ xfs_reflink_convert_cow(
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_filblks_t           count_fsb = end_fsb - offset_fsb;
-       struct xfs_bmbt_irec    imap;
-       int                     nimaps = 1, error = 0;
+       int                     error;
 
        ASSERT(count != 0);
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
-                       XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
-                       XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+       error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
@@ -375,7 +338,7 @@ xfs_find_trim_cow_extent(
        if (got.br_startoff > offset_fsb) {
                xfs_trim_extent(imap, imap->br_startoff,
                                got.br_startoff - imap->br_startoff);
-               return xfs_reflink_trim_around_shared(ip, imap, shared);
+               return xfs_inode_need_cow(ip, imap, shared);
        }
 
        *shared = true;
@@ -397,7 +360,8 @@ xfs_reflink_allocate_cow(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *imap,
        bool                    *shared,
-       uint                    *lockmode)
+       uint                    *lockmode,
+       bool                    convert_now)
 {
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           offset_fsb = imap->br_startoff;
@@ -409,7 +373,10 @@ xfs_reflink_allocate_cow(
        xfs_extlen_t            resblks = 0;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT(xfs_is_reflink_inode(ip));
+       if (!ip->i_cowfp) {
+               ASSERT(!xfs_is_reflink_inode(ip));
+               xfs_ifork_init_cow(ip);
+       }
 
        error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
        if (error || !*shared)
@@ -471,7 +438,16 @@ xfs_reflink_allocate_cow(
        if (nimaps == 0)
                return -ENOSPC;
 convert:
-       return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+       xfs_trim_extent(imap, offset_fsb, count_fsb);
+       /*
+        * COW fork extents are supposed to remain unwritten until we're ready
+        * to initiate a disk write.  For direct I/O we are going to write the
+        * data and need the conversion, but for buffered writes we're done.
+        */
+       if (!convert_now || imap->br_state == XFS_EXT_NORM)
+               return 0;
+       trace_xfs_reflink_convert_cow(ip, imap);
+       return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
 
 out_unreserve:
        xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range(
        int                     error;
 
        trace_xfs_reflink_cancel_cow_range(ip, offset, count);
-       ASSERT(xfs_is_reflink_inode(ip));
+       ASSERT(ip->i_cowfp);
 
        offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
        if (count == NULLFILEOFF)
@@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks(
                        break;
                ASSERT(nimaps == 1);
 
-               trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+               trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
                                &imap);
 
                /* Translate imap into the destination file. */
index 6d73daef1f132398d0b2ee02ed319c067bd02b6f..28a43b7f581d00fa5623e2b5f0b0730ddb6b0e1e 100644 (file)
@@ -6,16 +6,28 @@
 #ifndef __XFS_REFLINK_H
 #define __XFS_REFLINK_H 1
 
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+       return ip->i_mount->m_always_cow &&
+               xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+       return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
 extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
                xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
                xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
                struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+               bool *shared);
 
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
-               struct xfs_bmbt_irec *imap);
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
-               struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+               struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+               bool convert_now);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count);
 
index c9097cb0b955ea9219185a97551808befc7da495..f093ea244849eb96d31eeaea0f292796f893c778 100644 (file)
@@ -1594,6 +1594,13 @@ xfs_mount_alloc(
        INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
        INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
        mp->m_kobj.kobject.kset = xfs_kset;
+       /*
+        * We don't create the finobt per-ag space reservation until after log
+        * recovery, so we must set this to true so that an ifree transaction
+        * started during log recovery will not depend on space reservations
+        * for finobt expansion.
+        */
+       mp->m_finobt_nores = true;
        return mp;
 }
 
@@ -1729,11 +1736,18 @@ xfs_fs_fill_super(
                }
        }
 
-       if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
-               xfs_alert(mp,
+       if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+               if (mp->m_sb.sb_rblocks) {
+                       xfs_alert(mp,
        "reflink not compatible with realtime device!");
-               error = -EINVAL;
-               goto out_filestream_unmount;
+                       error = -EINVAL;
+                       goto out_filestream_unmount;
+               }
+
+               if (xfs_globals.always_cow) {
+                       xfs_info(mp, "using DEBUG-only always_cow mode.");
+                       mp->m_always_cow = true;
+               }
        }
 
        if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
index 168488130a1906e34e1181da624891e20b73436a..ad7f9be130872c9e0664780115ba496bfd49e885 100644 (file)
@@ -85,6 +85,7 @@ struct xfs_globals {
        int     log_recovery_delay;     /* log recovery delay (secs) */
        int     mount_delay;            /* mount setup delay (secs) */
        bool    bug_on_assert;          /* BUG() the kernel on assert failure */
+       bool    always_cow;             /* use COW fork for all overwrites */
 };
 extern struct xfs_globals      xfs_globals;
 
index cd6a994a72500ac48755549db33d5510e3c36fdc..cabda13f3c64168a7a33d01e37bf895f9e4a07a4 100644 (file)
@@ -183,10 +183,34 @@ mount_delay_show(
 }
 XFS_SYSFS_ATTR_RW(mount_delay);
 
+static ssize_t
+always_cow_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       ssize_t         ret;
+
+       ret = kstrtobool(buf, &xfs_globals.always_cow);
+       if (ret < 0)
+               return ret;
+       return count;
+}
+
+static ssize_t
+always_cow_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
 static struct attribute *xfs_dbg_attrs[] = {
        ATTR_LIST(bug_on_assert),
        ATTR_LIST(log_recovery_delay),
        ATTR_LIST(mount_delay),
+       ATTR_LIST(always_cow),
        NULL,
 };
 
index 6fcc893dfc91358e7da174e4bf4126112cb2a1e2..47fb07d86efdc05c705ea1379abf1eb259500d99 100644 (file)
@@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name,  \
 DEFINE_READPAGE_EVENT(xfs_vm_readpage);
 DEFINE_READPAGE_EVENT(xfs_vm_readpages);
 
-TRACE_DEFINE_ENUM(XFS_IO_HOLE);
-TRACE_DEFINE_ENUM(XFS_IO_DELALLOC);
-TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN);
-TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE);
-TRACE_DEFINE_ENUM(XFS_IO_COW);
-
 DECLARE_EVENT_CLASS(xfs_imap_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-                int type, struct xfs_bmbt_irec *irec),
-       TP_ARGS(ip, offset, count, type, irec),
+                int whichfork, struct xfs_bmbt_irec *irec),
+       TP_ARGS(ip, offset, count, whichfork, irec),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(loff_t, size)
                __field(loff_t, offset)
                __field(size_t, count)
-               __field(int, type)
+               __field(int, whichfork)
                __field(xfs_fileoff_t, startoff)
                __field(xfs_fsblock_t, startblock)
                __field(xfs_filblks_t, blockcount)
@@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
                __entry->size = ip->i_d.di_size;
                __entry->offset = offset;
                __entry->count = count;
-               __entry->type = type;
+               __entry->whichfork = whichfork;
                __entry->startoff = irec ? irec->br_startoff : 0;
                __entry->startblock = irec ? irec->br_startblock : 0;
                __entry->blockcount = irec ? irec->br_blockcount : 0;
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
-                 "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+                 "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
                  __entry->offset,
                  __entry->count,
-                 __print_symbolic(__entry->type, XFS_IO_TYPES),
+                 __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
                  __entry->startoff,
                  (int64_t)__entry->startblock,
                  __entry->blockcount)
 )
 
-#define DEFINE_IOMAP_EVENT(name)       \
+#define DEFINE_IMAP_EVENT(name)        \
 DEFINE_EVENT(xfs_imap_class, name,     \
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-                int type, struct xfs_bmbt_irec *irec),         \
-       TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+                int whichfork, struct xfs_bmbt_irec *irec),            \
+       TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
 DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
 DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
 DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
 TRACE_EVENT(xfs_reflink_remap_blocks_loop,
        TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
                 xfs_filblks_t len, struct xfs_inode *dest,
@@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
 
 /* copy on write */
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
 
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
 DEFINE_TRANS_EVENT(xfs_trans_add_item);
 DEFINE_TRANS_EVENT(xfs_trans_free_items);
 
+TRACE_EVENT(xfs_iunlink_update_bucket,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+                xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+       TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(unsigned int, bucket)
+               __field(xfs_agino_t, old_ptr)
+               __field(xfs_agino_t, new_ptr)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->bucket = bucket;
+               __entry->old_ptr = old_ptr;
+               __entry->new_ptr = new_ptr;
+       ),
+       TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->bucket,
+                 __entry->old_ptr,
+                 __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+                xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+       TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, agino)
+               __field(xfs_agino_t, old_ptr)
+               __field(xfs_agino_t, new_ptr)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agino = agino;
+               __entry->old_ptr = old_ptr;
+               __entry->new_ptr = new_ptr;
+       ),
+       TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agino,
+                 __entry->old_ptr,
+                 __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+       TP_PROTO(struct xfs_inode *ip),
+       TP_ARGS(ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, agino)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+               __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+       ),
+       TP_printk("dev %d:%d agno %u agino %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+       TP_PROTO(struct xfs_inode *ip), \
+       TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
index 11cff449d055ce2c943046616bbc0bd96231980f..e1c7d55b32c37b1db0b15c131937eda73c89396d 100644 (file)
@@ -17,7 +17,6 @@
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_inode.h"
-#include "xfs_defer.h"
 
 /*
  * This routine is called to allocate a "bmap update done"
index 629f1479c9d234492d3a7d431dd21bb30f4db393..7d65ebf1e847a9c07c0fbb8178b26892ad3390e2 100644 (file)
@@ -277,7 +277,7 @@ xfs_trans_read_buf_map(
                 * release this buffer when it kills the transaction.
                 */
                ASSERT(bp->b_ops != NULL);
-               error = xfs_buf_ensure_ops(bp, ops);
+               error = xfs_buf_reverify(bp, ops);
                if (error) {
                        xfs_buf_ioerror_alert(bp, __func__);
 
index 0710434eb24004db47b3b45863096b5ad7d1ed04..8ee7a3f8bb20bca0504adcda1eb0776bf4abce42 100644 (file)
@@ -18,7 +18,6 @@
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_trace.h"
-#include "xfs_defer.h"
 
 /*
  * This routine is called to allocate an "extent free done"
index 6c947ff4faf6ecb022fcb0aad0496226372eb977..8d734728dd1be9f6de8662e79b118692802bb12a 100644 (file)
@@ -16,7 +16,6 @@
 #include "xfs_refcount_item.h"
 #include "xfs_alloc.h"
 #include "xfs_refcount.h"
-#include "xfs_defer.h"
 
 /*
  * This routine is called to allocate a "refcount update done"
index a42890931ecd4a2690d3e52eb52e9843e63717d8..5c7936b1be13d3c3f52663438d522faebdec60ca 100644 (file)
@@ -16,7 +16,6 @@
 #include "xfs_rmap_item.h"
 #include "xfs_alloc.h"
 #include "xfs_rmap.h"
-#include "xfs_defer.h"
 
 /* Set the map extent flags for this reverse mapping. */
 static void
index 63ee1d5bf1d77a33d7f760f0f10d722266488cb1..9a63016009a1394f41beaff8323a5568b6ceab22 100644 (file)
@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
        char *offset;
        int arraytop;
 
+       if (context->count < 0 || context->seen_enough)
+               return;
+
        if (!context->alist)
                goto compute_size;