// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (C) 2016-2019 Christoph Hellwig.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
return test_bit(block + blks_per_folio, ifs->state);
}
+static unsigned ifs_find_dirty_range(struct folio *folio,
+ struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned start_blk =
+ offset_in_folio(folio, *range_start) >> inode->i_blkbits;
+ unsigned end_blk = min_not_zero(
+ offset_in_folio(folio, range_end) >> inode->i_blkbits,
+ i_blocks_per_folio(inode, folio));
+ unsigned nblks = 1;
+
+ while (!ifs_block_is_dirty(folio, ifs, start_blk))
+ if (++start_blk == end_blk)
+ return 0;
+
+ while (start_blk + nblks < end_blk) {
+ if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
+ break;
+ nblks++;
+ }
+
+ *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
+ return nblks << inode->i_blkbits;
+}
+
+static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
+ u64 range_end)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (*range_start >= range_end)
+ return 0;
+
+ if (ifs)
+ return ifs_find_dirty_range(folio, ifs, range_start, range_end);
+ return range_end - *range_start;
+}
+
static void ifs_clear_range_dirty(struct folio *folio,
struct iomap_folio_state *ifs, size_t off, size_t len)
{
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
- size_t len, int error)
+ size_t len)
{
struct iomap_folio_state *ifs = folio->private;
- if (error) {
- folio_set_error(folio);
- mapping_set_error(inode->i_mapping, error);
- }
-
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
struct folio_iter fi;
u32 folio_count = 0;
+ if (error) {
+ mapping_set_error(inode->i_mapping, error);
+ if (!bio_flagged(bio, BIO_QUIET)) {
+ pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino,
+ ioend->io_offset, ioend->io_sector);
+ }
+ }
+
/* walk all folios in bio, ending page IO on them */
bio_for_each_folio_all(fi, bio) {
- iomap_finish_folio_write(inode, fi.folio, fi.length, error);
+ if (error)
+ folio_set_error(fi.folio);
+ iomap_finish_folio_write(inode, fi.folio, fi.length);
folio_count++;
}
- if (unlikely(error && !bio_flagged(bio, BIO_QUIET))) {
- printk_ratelimited(KERN_ERR
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
- inode->i_sb->s_id, inode->i_ino,
- ioend->io_offset, ioend->io_sector);
- }
bio_put(bio); /* frees the ioend */
return folio_count;
}
* Submit the final bio for an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback
- * and unlocked them. In this situation, we need to fail the bio instead of
- * submitting it. This typically only happens on a filesystem shutdown.
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel ioend directly in that case, so call the bio end I/O handler
+ * with the error status here to run the normal I/O completion handler to clear
+ * the writeback bit and let the file system proess the errors.
*/
-static int
-iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
- int error)
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
+ if (!wpc->ioend)
+ return error;
+
+ /*
+ * Let the file systems prepare the I/O submission and hook in an I/O
+ * comletion handler. This also needs to happen in case after a
+ * failure happened so that the file system end I/O handler gets called
+ * to clean up.
+ */
if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(ioend, error);
+ error = wpc->ops->prepare_ioend(wpc->ioend, error);
+
if (error) {
- /*
- * If we're failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- ioend->io_bio.bi_status = errno_to_blk_status(error);
- bio_endio(&ioend->io_bio);
- return error;
+ wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&wpc->ioend->io_bio);
+ } else {
+ submit_bio(&wpc->ioend->io_bio);
}
- submit_bio(&ioend->io_bio);
- return 0;
+ wpc->ioend = NULL;
+ return error;
}
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
ioend = iomap_ioend_from_bio(bio);
INIT_LIST_HEAD(&ioend->io_list);
/*
* Test to see if we have an existing ioend structure that we could append to
* first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly. Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
*/
-static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, loff_t pos, struct list_head *iolist)
+ struct inode *inode, loff_t pos, unsigned len)
{
struct iomap_folio_state *ifs = folio->private;
- unsigned len = i_blocksize(inode);
size_t poff = offset_in_folio(folio, pos);
+ int error;
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
new_ioend:
- if (wpc->ioend)
- list_add(&wpc->ioend->io_list, iolist);
+ error = iomap_submit_ioend(wpc, 0);
+ if (error)
+ return error;
wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
}
atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, &folio->page, len);
+ return 0;
+}
+
+static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, u64 pos, unsigned dirty_len,
+ unsigned *count)
+{
+ int error;
+
+ do {
+ unsigned map_len;
+
+ error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
+ if (error)
+ break;
+ trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
+
+ map_len = min_t(u64, dirty_len,
+ wpc->iomap.offset + wpc->iomap.length - pos);
+ WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+ switch (wpc->iomap.type) {
+ case IOMAP_INLINE:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ case IOMAP_HOLE:
+ break;
+ default:
+ error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
+ map_len);
+ if (!error)
+ (*count)++;
+ break;
+ }
+ dirty_len -= map_len;
+ pos += map_len;
+ } while (dirty_len && !error);
+
+ /*
+ * We cannot cancel the ioend directly here on error. We may have
+ * already set other pages under writeback and hence we have to run I/O
+ * completion to mark the error state of the pages under writeback
+ * appropriately.
+ *
+ * Just let the file system know what portion of the folio failed to
+ * map.
+ */
+ if (error && wpc->ops->discard_folio)
+ wpc->ops->discard_folio(folio, pos);
+ return error;
}
/*
* beyond i_size.
*/
folio_zero_segment(folio, poff, folio_size(folio));
- *end_pos = isize;
+ *end_pos = round_up(isize, i_blocksize(inode));
}
return true;
}
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * the forward progress guarantees we need to provide. The current ioend we're
- * adding blocks to is cached in the writepage context, and if the new block
- * doesn't append to the cached ioend, it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct folio *folio)
{
struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
- struct iomap_ioend *ioend, *next;
- unsigned len = i_blocksize(inode);
- unsigned nblocks = i_blocks_per_folio(inode, folio);
u64 pos = folio_pos(folio);
u64 end_pos = pos + folio_size(folio);
- int error = 0, count = 0, i;
- LIST_HEAD(submit_list);
+ unsigned count = 0;
+ int error = 0;
+ u32 rlen;
+
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ WARN_ON_ONCE(folio_test_dirty(folio));
+ WARN_ON_ONCE(folio_test_writeback(folio));
trace_iomap_writepage(inode, pos, folio_size(folio));
}
WARN_ON_ONCE(end_pos <= pos);
- if (!ifs && nblocks > 1) {
- ifs = ifs_alloc(inode, folio, 0);
- iomap_set_range_dirty(folio, 0, end_pos - pos);
- }
+ if (i_blocks_per_folio(inode, folio) > 1) {
+ if (!ifs) {
+ ifs = ifs_alloc(inode, folio, 0);
+ iomap_set_range_dirty(folio, 0, end_pos - pos);
+ }
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
+ /*
+ * Keep the I/O completion handler from clearing the writeback
+ * bit until we have submitted all blocks by adding a bias to
+ * ifs->write_bytes_pending, which is dropped after submitting
+ * all blocks.
+ */
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+ atomic_inc(&ifs->write_bytes_pending);
+ }
/*
- * Walk through the folio to find areas to write back. If we
- * run off the end of the current map or find the current map
- * invalid, grab a new one.
+ * Set the writeback bit ASAP, as the I/O completion for the single
+ * block per folio case happen hit as soon as we're submitting the bio.
*/
- for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
- if (ifs && !ifs_block_is_dirty(folio, ifs, i))
- continue;
+ folio_start_writeback(folio);
- error = wpc->ops->map_blocks(wpc, inode, pos);
+ /*
+ * Walk through the folio to find dirty areas to write back.
+ */
+ while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+ error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
+ pos, rlen, &count);
if (error)
break;
- trace_iomap_writepage_map(inode, &wpc->iomap);
- if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) {
- error = -EIO;
- break;
- }
- if (wpc->iomap.type == IOMAP_HOLE)
- continue;
- iomap_add_to_ioend(wpc, wbc, folio, inode, pos, &submit_list);
- count++;
+ pos += rlen;
}
+
if (count)
wpc->nr_folios++;
- WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
- WARN_ON_ONCE(!folio_test_locked(folio));
- WARN_ON_ONCE(folio_test_writeback(folio));
- WARN_ON_ONCE(folio_test_dirty(folio));
-
- /*
- * We cannot cancel the ioend directly here on error. We may have
- * already set other pages under writeback and hence we have to run I/O
- * completion to mark the error state of the pages under writeback
- * appropriately.
- */
- if (unlikely(error)) {
- /*
- * Let the filesystem know what portion of the current page
- * failed to map.
- */
- if (wpc->ops->discard_folio)
- wpc->ops->discard_folio(folio, pos);
- }
-
/*
* We can have dirty bits set past end of file in page_mkwrite path
* while mapping the last partial folio. Hence it's better to clear
iomap_clear_range_dirty(folio, 0, folio_size(folio));
/*
- * If the page hasn't been added to the ioend, it won't be affected by
- * I/O completion and we must unlock it now.
+ * Usually the writeback bit is cleared by the I/O completion handler.
+ * But we may end up either not actually writing any blocks, or (when
+ * there are multiple blocks in a folio) all I/O might have finished
+ * already at this point. In that case we need to clear the writeback
+ * bit ourselves right after unlocking the page.
*/
- if (error && !count) {
- folio_unlock(folio);
- goto done;
- }
-
- folio_start_writeback(folio);
folio_unlock(folio);
-
- /*
- * Preserve the original error if there was one; catch
- * submission errors here and propagate into subsequent ioend
- * submissions.
- */
- list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
- int error2;
-
- list_del_init(&ioend->io_list);
- error2 = iomap_submit_ioend(wpc, ioend, error);
- if (error2 && !error)
- error = error2;
+ if (ifs) {
+ if (atomic_dec_and_test(&ifs->write_bytes_pending))
+ folio_end_writeback(folio);
+ } else {
+ if (!count)
+ folio_end_writeback(folio);
}
-
- /*
- * We can end up here with no error and nothing to write only if we race
- * with a partial page truncate on a sub-page block sized filesystem.
- */
- if (!count)
- folio_end_writeback(folio);
-done:
mapping_set_error(inode->i_mapping, error);
return error;
}
wpc->ops = ops;
ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
- if (!wpc->ioend)
- return ret;
- return iomap_submit_ioend(wpc, wpc->ioend, ret);
+ return iomap_submit_ioend(wpc, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);