Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...

[sfrench/cifs-2.6.git] / fs / ext4 / inode.c
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 79b298d397b43b1fadabe82fdeebe2cf3e0cb735..f7140ca66e3bf2751eb8103a37f249b49521b6ff 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -684,6 +684,24 @@ out_sem:
                 ret = check_block_validity(inode, map);
                 if (ret != 0)
                         return ret;
+
+               /*
+                * Inodes with freshly allocated blocks whose contents will be
+                * visible after transaction commit must be on transaction's
+                * ordered data list.
+                */
+               if (map->m_flags & EXT4_MAP_NEW &&
+                   !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
+                   !(flags & EXT4_GET_BLOCKS_ZERO) &&
+                   !IS_NOQUOTA(inode) &&
+                   ext4_should_order_data(inode)) {
+                       if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+                               ret = ext4_jbd2_inode_add_wait(handle, inode);
+                       else
+                               ret = ext4_jbd2_inode_add_write(handle, inode);
+                       if (ret)
+                               return ret;
+               }
         }
         return retval;
  }
@@ -1289,15 +1307,6 @@ static int ext4_write_end(struct file *file,
         int i_size_changed = 0;
  
         trace_ext4_write_end(inode, pos, len, copied);
-       if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
-               ret = ext4_jbd2_file_inode(handle, inode);
-               if (ret) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto errout;
-               }
-       }
-
         if (ext4_has_inline_data(inode)) {
                 ret = ext4_write_inline_data_end(inode, pos, len,
                                                  copied, page);
@@ -2313,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
          * the data was copied into the page cache.
          */
         get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
+                          EXT4_GET_BLOCKS_METADATA_NOFAIL |
+                          EXT4_GET_BLOCKS_IO_SUBMIT;
         dioread_nolock = ext4_should_dioread_nolock(inode);
         if (dioread_nolock)
                 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2602,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
         struct blk_plug plug;
         bool give_up_on_write = false;
  
+       percpu_down_read(&sbi->s_journal_flag_rwsem);
         trace_ext4_writepages(inode, wbc);
  
-       if (dax_mapping(mapping))
-               return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-                                                  wbc);
+       if (dax_mapping(mapping)) {
+               ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+                                                 wbc);
+               goto out_writepages;
+       }
  
         /*
          * No pages to write? This is mainly a kludge to avoid starting
@@ -2776,6 +2789,7 @@ retry:
  out_writepages:
         trace_ext4_writepages_result(inode, wbc, ret,
                                      nr_to_write - wbc->nr_to_write);
+       percpu_up_read(&sbi->s_journal_flag_rwsem);
         return ret;
  }
  
@@ -3215,75 +3229,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
  }
  
  #ifdef CONFIG_FS_DAX
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
-                           struct buffer_head *bh_result, int create)
+/*
+ * Get block function for DAX IO and mmap faults. It takes care of converting
+ * unwritten extents to written ones and initializes new / converted blocks
+ * to zeros.
+ */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh_result, int create)
  {
-       int ret, err;
-       int credits;
-       struct ext4_map_blocks map;
-       handle_t *handle = NULL;
-       int flags = 0;
-
-       ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       map.m_lblk = iblock;
-       map.m_len = bh_result->b_size >> inode->i_blkbits;
-       credits = ext4_chunk_trans_blocks(inode, map.m_len);
-       if (create) {
-               flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       return ret;
-               }
-       }
+       int ret;
  
-       ret = ext4_map_blocks(handle, inode, &map, flags);
-       if (create) {
-               err = ext4_journal_stop(handle);
-               if (ret >= 0 && err < 0)
-                       ret = err;
-       }
-       if (ret <= 0)
-               goto out;
-       if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-               int err2;
+       ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
+       if (!create)
+               return _ext4_get_block(inode, iblock, bh_result, 0);
  
-               /*
-                * We are protected by i_mmap_sem so we know block cannot go
-                * away from under us even though we dropped i_data_sem.
-                * Convert extent to written and write zeros there.
-                *
-                * Note: We may get here even when create == 0.
-                */
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
+       ret = ext4_get_block_trans(inode, iblock, bh_result,
+                                  EXT4_GET_BLOCKS_PRE_IO |
+                                  EXT4_GET_BLOCKS_CREATE_ZERO);
+       if (ret < 0)
+               return ret;
  
-               err = ext4_map_blocks(handle, inode, &map,
-                     EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
-               if (err < 0)
-                       ret = err;
-               err2 = ext4_journal_stop(handle);
-               if (err2 < 0 && ret > 0)
-                       ret = err2;
-       }
-out:
-       WARN_ON_ONCE(ret == 0 && create);
-       if (ret > 0) {
-               map_bh(bh_result, inode->i_sb, map.m_pblk);
+       if (buffer_unwritten(bh_result)) {
                 /*
-                * At least for now we have to clear BH_New so that DAX code
-                * doesn't attempt to zero blocks again in a racy way.
+                * We are protected by i_mmap_sem or i_mutex so we know the
+                * block cannot go away from under us even though we dropped
+                * i_data_sem. Convert extent to written and write zeros there.
                  */
-               map.m_flags &= ~EXT4_MAP_NEW;
-               ext4_update_bh_state(bh_result, map.m_flags);
-               bh_result->b_size = map.m_len << inode->i_blkbits;
-               ret = 0;
+               ret = ext4_get_block_trans(inode, iblock, bh_result,
+                                          EXT4_GET_BLOCKS_CONVERT |
+                                          EXT4_GET_BLOCKS_CREATE_ZERO);
+               if (ret < 0)
+                       return ret;
         }
-       return ret;
+       /*
+        * At least for now we have to clear BH_New so that DAX code
+        * doesn't attempt to zero blocks again in a racy way.
+        */
+       clear_buffer_new(bh_result);
+       return 0;
+}
+#else
+/* Stub for !CONFIG_FS_DAX builds; it must never be called, hence BUG(). */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh_result, int create)
+{
+       BUG();
+       return 0;
  }
  #endif
  
@@ -3316,7 +3307,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  }
  
  /*
- * For ext4 extent files, ext4 will do direct-io write to holes,
+ * Handling of direct IO writes.
+ *
+ * For ext4 extent files, ext4 will do direct-io write even to holes,
   * preallocated extents, and those write extend the file, no need to
   * fall back to buffered IO.
   *
@@ -3334,10 +3327,11 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
   * if the machine crashes during the write.
   *
   */
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
  {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
+       struct ext4_inode_info *ei = EXT4_I(inode);
         ssize_t ret;
         loff_t offset = iocb->ki_pos;
         size_t count = iov_iter_count(iter);
@@ -3345,10 +3339,25 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
         get_block_t *get_block_func = NULL;
         int dio_flags = 0;
         loff_t final_size = offset + count;
+       int orphan = 0;
+       handle_t *handle;
  
-       /* Use the old path for reads and writes beyond i_size. */
-       if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
-               return ext4_ind_direct_IO(iocb, iter);
+       if (final_size > inode->i_size) {
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+               ret = ext4_orphan_add(handle, inode);
+               if (ret) {
+                       ext4_journal_stop(handle);
+                       goto out;
+               }
+               orphan = 1;
+               ei->i_disksize = inode->i_size;
+               ext4_journal_stop(handle);
+       }
  
         BUG_ON(iocb->private == NULL);
  
@@ -3357,8 +3366,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
          * conversion. This also disallows race between truncate() and
          * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
          */
-       if (iov_iter_rw(iter) == WRITE)
-               inode_dio_begin(inode);
+       inode_dio_begin(inode);
  
         /* If we do a overwrite dio, i_mutex locking can be released */
         overwrite = *((int *)iocb->private);
@@ -3367,7 +3375,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                 inode_unlock(inode);
  
         /*
-        * We could direct write to holes and fallocate.
+        * For extent mapped files we could direct write to holes and fallocate.
          *
          * Allocated blocks to fill the hole are marked as unwritten to prevent
          * parallel buffered read to expose the stale data before DIO complete
@@ -3389,7 +3397,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
         iocb->private = NULL;
         if (overwrite)
                 get_block_func = ext4_dio_get_block_overwrite;
-       else if (is_sync_kiocb(iocb)) {
+       else if (IS_DAX(inode)) {
+               /*
+                * We can avoid zeroing for aligned DAX writes beyond EOF. Other
+                * writes need zeroing either because they can race with page
+                * faults or because they use partial blocks.
+                */
+               if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
+                   ext4_aligned_io(inode, offset, count))
+                       get_block_func = ext4_dio_get_block;
+               else
+                       get_block_func = ext4_dax_get_block;
+               dio_flags = DIO_LOCKING;
+       } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+                  round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+               get_block_func = ext4_dio_get_block;
+               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
+       } else if (is_sync_kiocb(iocb)) {
                 get_block_func = ext4_dio_get_block_unwritten_sync;
                 dio_flags = DIO_LOCKING;
         } else {
@@ -3399,10 +3423,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
         BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
  #endif
-       if (IS_DAX(inode))
+       if (IS_DAX(inode)) {
                 ret = dax_do_io(iocb, inode, iter, get_block_func,
                                 ext4_end_io_dio, dio_flags);
-       else
+       } else
                 ret = __blockdev_direct_IO(iocb, inode,
                                            inode->i_sb->s_bdev, iter,
                                            get_block_func,
@@ -3422,12 +3446,86 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
         }
  
-       if (iov_iter_rw(iter) == WRITE)
-               inode_dio_end(inode);
+       inode_dio_end(inode);
         /* take i_mutex locking again if we do a ovewrite dio */
         if (overwrite)
                 inode_lock(inode);
  
+       if (ret < 0 && final_size > inode->i_size)
+               ext4_truncate_failed_write(inode);
+
+       /* Handle extending of i_size after direct IO write */
+       if (orphan) {
+               int err;
+
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       /* This is really bad luck. We've written the data
+                        * but cannot extend i_size. Bail out and pretend
+                        * the write failed... */
+                       ret = PTR_ERR(handle);
+                       if (inode->i_nlink)
+                               ext4_orphan_del(NULL, inode);
+
+                       goto out;
+               }
+               if (inode->i_nlink)
+                       ext4_orphan_del(handle, inode);
+               if (ret > 0) {
+                       loff_t end = offset + ret;
+                       if (end > inode->i_size) {
+                               ei->i_disksize = end;
+                               i_size_write(inode, end);
+                               /*
+                                * We're going to return a positive `ret'
+                                * here due to non-zero-length I/O, so there's
+                                * no way of reporting error returns from
+                                * ext4_mark_inode_dirty() to userspace.  So
+                                * ignore it.
+                                */
+                               ext4_mark_inode_dirty(handle, inode);
+                       }
+               }
+               err = ext4_journal_stop(handle);
+               if (ret == 0)
+                       ret = err;
+       }
+out:
+       return ret;
+}
+
+static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+       int unlocked = 0;
+       struct inode *inode = iocb->ki_filp->f_mapping->host;
+       ssize_t ret;
+
+       if (ext4_should_dioread_nolock(inode)) {
+               /*
+                * Nolock dioread optimization may be dynamically disabled
+                * via ext4_inode_block_unlocked_dio(). Check inode's state
+                * while holding extra i_dio_count ref.
+                */
+               inode_dio_begin(inode);
+               smp_mb();
+               if (unlikely(ext4_test_inode_state(inode,
+                                                   EXT4_STATE_DIOREAD_LOCK)))
+                       inode_dio_end(inode);
+               else
+                       unlocked = 1;
+       }
+       if (IS_DAX(inode)) {
+               ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block,
+                               NULL, unlocked ? 0 : DIO_LOCKING);
+       } else {
+               ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+                                          iter, ext4_dio_get_block,
+                                          NULL, NULL,
+                                          unlocked ? 0 : DIO_LOCKING);
+       }
+       if (unlocked)
+               inode_dio_end(inode);
         return ret;
  }
  
@@ -3455,10 +3553,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                 return 0;
  
         trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               ret = ext4_ext_direct_IO(iocb, iter);
+       if (iov_iter_rw(iter) == READ)
+               ret = ext4_direct_IO_read(iocb, iter);
         else
-               ret = ext4_ind_direct_IO(iocb, iter);
+               ret = ext4_direct_IO_write(iocb, iter);
         trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
         return ret;
  }
@@ -3534,10 +3632,7 @@ void ext4_set_aops(struct inode *inode)
  {
         switch (ext4_inode_journal_mode(inode)) {
         case EXT4_INODE_ORDERED_DATA_MODE:
-               ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
-               break;
         case EXT4_INODE_WRITEBACK_DATA_MODE:
-               ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
                 break;
         case EXT4_INODE_JOURNAL_DATA_MODE:
                 inode->i_mapping->a_ops = &ext4_journalled_aops;
@@ -3630,8 +3725,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
         } else {
                 err = 0;
                 mark_buffer_dirty(bh);
-               if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
-                       err = ext4_jbd2_file_inode(handle, inode);
+               if (ext4_should_order_data(inode))
+                       err = ext4_jbd2_inode_add_write(handle, inode);
         }
  
  unlock:
@@ -5429,6 +5524,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         journal_t *journal;
         handle_t *handle;
         int err;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  
         /*
          * We have to be very careful here: changing a data block's
@@ -5445,22 +5541,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                 return 0;
         if (is_journal_aborted(journal))
                 return -EROFS;
-       /* We have to allocate physical blocks for delalloc blocks
-        * before flushing journal. otherwise delalloc blocks can not
-        * be allocated any more. even more truncate on delalloc blocks
-        * could trigger BUG by flushing delalloc blocks in journal.
-        * There is no delalloc block in non-journal data mode.
-        */
-       if (val && test_opt(inode->i_sb, DELALLOC)) {
-               err = ext4_alloc_da_blocks(inode);
-               if (err < 0)
-                       return err;
-       }
  
         /* Wait for all existing dio workers */
         ext4_inode_block_unlocked_dio(inode);
         inode_dio_wait(inode);
  
+       /*
+        * Before flushing the journal and switching the inode's aops, we have
+        * to flush all dirty data the inode has: there can be outstanding
+        * delayed allocations, and there can be unwritten extents (created by
+        * fallocate or by buffered writes in dioread_nolock mode) covered by
+        * dirty data, which can be converted only after that data is flushed.
+        * Journalled aops don't know how to handle either of these cases.
+        */
+       if (val) {
+               down_write(&EXT4_I(inode)->i_mmap_sem);
+               err = filemap_write_and_wait(inode->i_mapping);
+               if (err < 0) {
+                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       ext4_inode_resume_unlocked_dio(inode);
+                       return err;
+               }
+       }
+
+       percpu_down_write(&sbi->s_journal_flag_rwsem);
         jbd2_journal_lock_updates(journal);
  
         /*
@@ -5477,6 +5581,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                 err = jbd2_journal_flush(journal);
                 if (err < 0) {
                         jbd2_journal_unlock_updates(journal);
+                       percpu_up_write(&sbi->s_journal_flag_rwsem);
                         ext4_inode_resume_unlocked_dio(inode);
                         return err;
                 }
@@ -5485,6 +5590,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         ext4_set_aops(inode);
  
         jbd2_journal_unlock_updates(journal);
+       percpu_up_write(&sbi->s_journal_flag_rwsem);
+
+       if (val)
+               up_write(&EXT4_I(inode)->i_mmap_sem);
         ext4_inode_resume_unlocked_dio(inode);
  
         /* Finally we can mark the inode as dirty. */