Merge tag 'v6.6-rc4.vfs.fixes' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 26 Sep 2023 15:50:30 +0000 (08:50 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 26 Sep 2023 15:50:30 +0000 (08:50 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Sep 2023 15:50:30 +0000 (08:50 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Sep 2023 15:50:30 +0000 (08:50 -0700)
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst

index deac4e973ddc12d326ee9e6874ed90456206f5db..4d05b9862451ea8237c583b5f276b1834ee4adb9 100644 (file)
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -949,3 +949,99 @@ mmap_lock held.  All in-tree users have been audited and do not seem to
  depend on the mmap_lock being held, but out of tree users should verify
  for themselves.  If they do need it, they can return VM_FAULT_RETRY to
  be called with the mmap_lock held.
+
+---
+
+**mandatory**
+
+The order of opening block devices and matching or creating superblocks has
+changed.
+
+The old logic opened block devices first and then tried to find a
+suitable superblock to reuse based on the block device pointer.
+
+The new logic tries to find a suitable superblock first based on the device
+number, and opens the block device afterwards.
+
+Since opening block devices cannot happen under s_umount because of lock
+ordering requirements, s_umount is now dropped while opening block devices and
+reacquired before calling fill_super().
+
+In the old logic concurrent mounters would find the superblock on the list of
+superblocks for the filesystem type. Since the first opener of the block device
+would hold s_umount they would wait until the superblock became either born or
+was discarded due to initialization failure.
+
+Since the new logic drops s_umount, concurrent mounters could grab s_umount and
+would spin. Instead they are now made to wait using an explicit wait-wake
+mechanism without having to hold s_umount.
+
+---
+
+**mandatory**
+
+The holder of a block device is now the superblock.
+
+The holder of a block device used to be the file_system_type which wasn't
+particularly useful. It wasn't possible to go from block device to owning
+superblock without matching on the device pointer stored in the superblock.
+This mechanism would only work for a single device so the block layer couldn't
+find the owning superblock of any additional devices.
+
+In the old mechanism reusing or creating a superblock for a racing mount(2) and
+umount(2) relied on the file_system_type as the holder. This was severely
+underdocumented, however:
+
+(1) Any concurrent mounter that managed to grab an active reference on an
+    existing superblock was made to wait until the superblock either became
+    ready or until the superblock was removed from the list of superblocks of
+    the filesystem type. If the superblock is ready the caller would simply
+    reuse it.
+
+(2) If the mounter came after deactivate_locked_super() but before
+    the superblock had been removed from the list of superblocks of the
+    filesystem type the mounter would wait until the superblock was shut down,
+    reuse the block device and allocate a new superblock.
+
+(3) If the mounter came after deactivate_locked_super() and after
+    the superblock had been removed from the list of superblocks of the
+    filesystem type the mounter would reuse the block device and allocate a new
+    superblock (the bd_holder pointer may still be set to the filesystem type).
+
+Because the holder of the block device was the file_system_type any concurrent
+mounter could open the block devices of any superblock of the same
+file_system_type without risking seeing EBUSY because the block device was
+still in use by another superblock.
+
+Making the superblock the owner of the block device changes this as the holder
+is now a unique superblock and thus block devices associated with it cannot be
+reused by concurrent mounters. So a concurrent mounter in (2) could suddenly
+see EBUSY when trying to open a block device whose holder was a different
+superblock.
+
+The new logic thus waits until the superblock and the devices are shut down in
+->kill_sb(). Removal of the superblock from the list of superblocks of the
+filesystem type is now moved to a later point when the devices are closed:
+
+(1) Any concurrent mounter managing to grab an active reference on an existing
+    superblock is made to wait until the superblock is either ready or until
+    the superblock and all devices are shut down in ->kill_sb(). If the
+    superblock is ready the caller will simply reuse it.
+
+(2) If the mounter comes after deactivate_locked_super() but before
+    the superblock has been removed from the list of superblocks of the
+    filesystem type the mounter is made to wait until the superblock and the
+    devices are shut down in ->kill_sb() and the superblock is removed from the
+    list of superblocks of the filesystem type. The mounter will allocate a new
+    superblock and grab ownership of the block device (the bd_holder pointer of
+    the block device will be set to the newly allocated superblock).
+
+(3) This case is now collapsed into (2) as the superblock is left on the list
+    of superblocks of the filesystem type until all devices are shut down in
+    ->kill_sb(). In other words, if the superblock isn't on the list of
+    superblocks of the filesystem type anymore then it has given up ownership of
+    all associated block devices (the bd_holder pointer is NULL).
+
+As this is a VFS level change it has no practical consequences for filesystems
+other than that all of them must use one of the provided kill_litter_super(),
+kill_anon_super(), or kill_block_super() helpers.
diff --git a/fs/aio.c b/fs/aio.c

index a4c2a6bac72ce9976b6b0ea20b1d9213cc4c2af8..f8589caef9c10ec829bc6470cab5ce159915114c 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -80,7 +80,7 @@ struct aio_ring {
  struct kioctx_table {
         struct rcu_head         rcu;
         unsigned                nr;
-       struct kioctx __rcu     *table[];
+       struct kioctx __rcu     *table[] __counted_by(nr);
  };
  
  struct kioctx_cpu {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c

index 969ce991b0b03f6b49f3b92cdcfc4ca515a8f315..c1af01b2c42d708ede9b2842b1cc0bfec5153838 100644 (file)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1535,10 +1535,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
  
         if (wbc->pages_skipped) {
                 /*
-                * writeback is not making progress due to locked
-                * buffers. Skip this inode for now.
+                * Writeback is not making progress due to locked buffers.
+                * Skip this inode for now. Although having skipped pages
+                * is odd for clean inodes, it can happen for some
+                * filesystems so handle that gracefully.
                  */
-               redirty_tail_locked(inode, wb);
+               if (inode->i_state & I_DIRTY_ALL)
+                       redirty_tail_locked(inode, wb);
+               else
+                       inode_cgwb_move_to_attached(inode, wb);
                 return;
         }
  
diff --git a/fs/libfs.c b/fs/libfs.c

index a4eb1275788627161d1a1f0d5f5b6bc3557a5df1..37f2d34ee090bd1230426045d8565bf72a82579b 100644 (file)
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1903,6 +1903,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
                  * We don't know how much we wrote, so just return the number of
                  * bytes which were direct-written
                  */
+               iocb->ki_pos -= buffered_written;
                 if (direct_written)
                         return direct_written;
                 return err;
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c

index cfec5e0c7f66ae923ab13dc18e88d4cba86b7478..5661a363005ee09d5cf1f453658bd5c0415dd43f 100644 (file)
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -1562,6 +1562,7 @@ load_root:
  put_inode_out:
         iput(inode);
  out:
+       ntfs3_put_sbi(sbi);
         kfree(boot2);
         return err;
  }
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c

index d1761ec5866aa6342db3d6091bd3cc06998d5ca6..ada3fcc9c6d5015ac65929ecf6738720b51615d6 100644 (file)
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -337,7 +337,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
  {
         struct iattr attr = {
                 .ia_valid =
-                    ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+                    ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME,
                 .ia_atime = stat->atime,
                 .ia_mtime = stat->mtime,
         };
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c

index 4193633c4c7a771a880acb08d28b561080d2d8b1..693971d20280e2b8f1fdfb1325d992a86b34ec95 100644 (file)
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -391,6 +391,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
         if (!ovl_should_sync(OVL_FS(inode->i_sb)))
                 ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
  
+       /*
+        * Overlayfs doesn't support deferred completions, don't copy
+        * this property in case it is set by the issuer.
+        */
+       ifl &= ~IOCB_DIO_CALLER_COMP;
+
         old_cred = ovl_override_creds(file_inode(file)->i_sb);
         if (is_sync_kiocb(iocb)) {
                 file_start_write(real.file);
diff --git a/fs/pipe.c b/fs/pipe.c

index 6c1a9b1db9076c8c1402b7368a25ca7d9a894c70..139190165a1c2231ebb90fb364ee8bdb299b2b16 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -537,7 +537,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
                                 break;
                         }
                         ret += copied;
-                       buf->offset = 0;
                         buf->len = copied;
  
                         if (!iov_iter_count(from))
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h

index b81749492ef98e61bd22218c585864d9a9986953..7d12b8c5b2fa8c73929f7c9bd79d5085e7b1835f 100644 (file)
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2699,7 +2699,7 @@ struct reiserfs_iget_args {
  #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
  
  #define journal_trans_half(blocksize) \
-       ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32))
+       ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
  
  /* journal.c see journal.c for all the comments here */
  
@@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc {
         __le32 j_len;
  
         __le32 j_mount_id;      /* mount id of this trans */
-       __le32 j_realblock[1];  /* real locations for each block */
+       __le32 j_realblock[];   /* real locations for each block */
  };
  
  #define get_desc_trans_id(d)   le32_to_cpu((d)->j_trans_id)
@@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc {
  struct reiserfs_journal_commit {
         __le32 j_trans_id;      /* must match j_trans_id from the desc block */
         __le32 j_len;           /* ditto */
-       __le32 j_realblock[1];  /* real locations for each block */
+       __le32 j_realblock[];   /* real locations for each block */
  };
  
  #define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 26 Sep 2023 15:50:30 +0000 (08:50 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 26 Sep 2023 15:50:30 +0000 (08:50 -0700)
Documentation/filesystems/porting.rst		patch \| blob \| history
fs/aio.c		patch \| blob \| history
fs/fs-writeback.c		patch \| blob \| history
fs/libfs.c		patch \| blob \| history
fs/ntfs3/super.c		patch \| blob \| history
fs/overlayfs/copy_up.c		patch \| blob \| history
fs/overlayfs/file.c		patch \| blob \| history
fs/pipe.c		patch \| blob \| history
fs/reiserfs/reiserfs.h		patch \| blob \| history