if (wait_page->bit_nr != key->bit_nr)
return 0;
- /* Stop walking if it's locked */
+ /*
+ * Stop walking if it's locked.
+ * Is this safe if put_and_wait_on_page_locked() is in use?
+ * Yes: the waker must hold a reference to this page, and if PG_locked
+ * has now already been set by another task, that task must also hold
+ * a reference to the *same usage* of this page; so there is no need
+ * to walk on to wake even the put_and_wait_on_page_locked() callers.
+ */
if (test_bit(key->bit_nr, &key->page->flags))
return -1;
	return autoremove_wake_function(wait, mode, sync, key);
}
+/*
+ * A choice of three behaviors for wait_on_page_bit_common():
+ */
+enum behavior {
+ EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
+ * __lock_page() waiting on then setting PG_locked.
+ */
+ SHARED, /* Hold ref to page and check the bit when woken, like
+ * wait_on_page_writeback() waiting on PG_writeback.
+ */
+ DROP, /* Drop ref to page before wait, no check when woken,
+ * like put_and_wait_on_page_locked() on PG_locked.
+ */
+};
+
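For orientation, each behavior corresponds to one of the call sites updated later in this patch; a minimal sketch (assuming a page reference is already held where required):

	wait_on_page_bit(page, PG_writeback);	/* SHARED: recheck the bit when woken */
	__lock_page(page);			/* EXCLUSIVE: take PG_locked when woken */
	put_and_wait_on_page_locked(page);	/* DROP: reference dropped before sleeping */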
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
- struct page *page, int bit_nr, int state, bool lock)
+ struct page *page, int bit_nr, int state, enum behavior behavior)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
+ bool bit_is_set;
bool thrashing = false;
+ bool delayacct = false;
unsigned long pflags;
int ret = 0;
if (bit_nr == PG_locked &&
!PageUptodate(page) && PageWorkingset(page)) {
- if (!PageSwapBacked(page))
+ if (!PageSwapBacked(page)) {
delayacct_thrashing_start();
+ delayacct = true;
+ }
psi_memstall_enter(&pflags);
thrashing = true;
}
init_wait(wait);
- wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
+ wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
spin_unlock_irq(&q->lock);
- if (likely(test_bit(bit_nr, &page->flags))) {
+ bit_is_set = test_bit(bit_nr, &page->flags);
+ if (behavior == DROP)
+ put_page(page);
+
+ if (likely(bit_is_set))
io_schedule();
- }
- if (lock) {
+ if (behavior == EXCLUSIVE) {
if (!test_and_set_bit_lock(bit_nr, &page->flags))
break;
- } else {
+ } else if (behavior == SHARED) {
if (!test_bit(bit_nr, &page->flags))
break;
}
ret = -EINTR;
break;
}
+
+ if (behavior == DROP) {
+ /*
+ * We can no longer safely access page->flags:
+ * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
+ * there is a risk of waiting forever on a page reused
+ * for something that keeps it locked indefinitely.
+ * But best check for -EINTR above before breaking.
+ */
+ break;
+ }
}
finish_wait(q, wait);
if (thrashing) {
- if (!PageSwapBacked(page))
+ if (delayacct)
delayacct_thrashing_end();
psi_memstall_leave(&pflags);
}
void wait_on_page_bit(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+ wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit);
int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
+ return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
+/**
+ * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
+ * @page: The page to wait for.
+ *
+ * The caller should hold a reference on @page. They expect the page to
+ * become unlocked relatively soon, but do not wish to hold up migration
+ * (for example) by holding the reference while waiting for the page to
+ * come unlocked. After this function returns, the caller should not
+ * dereference @page.
+ */
+void put_and_wait_on_page_locked(struct page *page)
+{
+ wait_queue_head_t *q;
+
+ page = compound_head(page);
+ q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+}
+
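A hedged sketch of the intended caller pattern, modeled on a migration_entry_wait()-style user; the locals, page-table lock and out label here are illustrative assumptions, not part of this hunk:

	page = migration_entry_to_page(entry);
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	put_and_wait_on_page_locked(page);	/* ref dropped inside, before sleeping */
	return;
out:
	pte_unmap_unlock(ptep, ptl);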
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
* @page: Page defining the wait queue of interest
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+ EXCLUSIVE);
}
EXPORT_SYMBOL(__lock_page);
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
+ return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+ EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
VM_BUG_ON_PAGE(page->index != offset, page);
}
- if (page && (fgp_flags & FGP_ACCESSED))
+ if (fgp_flags & FGP_ACCESSED)
mark_page_accessed(page);
no_page:
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
/* pipes can't handle partially uptodate pages */
- if (unlikely(iter->type & ITER_PIPE))
+ if (unlikely(iov_iter_is_pipe(iter)))
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
goto next;
head = compound_head(page);
+
+ /*
+ * Check for a locked page first, as a speculative
+ * reference may adversely influence page migration.
+ */
+ if (PageLocked(head))
+ goto next;
if (!page_cache_get_speculative(head))
goto next;
}
EXPORT_SYMBOL(read_cache_page_gfp);
+/*
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
+ * LFS limits. If pos is under the limit it becomes a short access. If it
+ * exceeds the limit we return -EFBIG.
+ */
+static int generic_access_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t max_size = inode->i_sb->s_maxbytes;
+
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = MAX_NON_LFS;
+
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
+ *count = min(*count, max_size - pos);
+ return 0;
+}
+
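A hedged worked example of the clamp semantics (values illustrative; without O_LARGEFILE, max_size is MAX_NON_LFS, i.e. 2^31 - 1):

	loff_t count = 4096;

	/* pos is 100 bytes below the limit: count is clamped to 100 */
	ret = generic_access_check_limits(file, MAX_NON_LFS - 100, &count);

	/* pos at the limit: returns -EFBIG, count left untouched */
	ret = generic_access_check_limits(file, MAX_NON_LFS, &count);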
+static int generic_write_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+ loff_t limit = rlimit(RLIMIT_FSIZE);
+
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ *count = min(*count, limit - pos);
+ }
+
+ return generic_access_check_limits(file, pos, count);
+}
+
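The same for the write-side helper, assuming RLIMIT_FSIZE is 8192 (illustrative): a write straddling the limit becomes a short write, a write starting at it sends SIGXFSZ and fails:

	loff_t count = 1000;

	ret = generic_write_check_limits(file, 8000, &count);	/* 0: count == 192 */
	count = 1000;
	ret = generic_write_check_limits(file, 8192, &count);	/* SIGXFSZ, -EFBIG */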
/*
* Performs necessary checks before doing a write
*
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- unsigned long limit = rlimit(RLIMIT_FSIZE);
- loff_t pos;
+ loff_t count;
+ int ret;
if (!iov_iter_count(from))
return 0;
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
- pos = iocb->ki_pos;
-
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
- if (limit != RLIM_INFINITY) {
- if (iocb->ki_pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- iov_iter_truncate(from, limit - (unsigned long)pos);
- }
+ count = iov_iter_count(from);
+ ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+ if (ret)
+ return ret;
+
+ iov_iter_truncate(from, count);
+ return iov_iter_count(from);
+}
+EXPORT_SYMBOL(generic_write_checks);
+
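For reference, the usual caller shape (a sketch of the pattern in generic_file_write_iter(); locking and error paths trimmed): a positive return is the possibly shortened byte count, zero or negative means bail out:

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = __generic_file_write_iter(iocb, from);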
+/*
+ * Performs necessary checks before doing a clone.
+ *
+ * Can adjust the amount of bytes to clone via the @req_count argument.
+ * Returns an appropriate error code that the caller should return, or
+ * zero if the clone should be allowed.
+ */
+int generic_remap_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *req_count, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_in->f_mapping->host;
+ struct inode *inode_out = file_out->f_mapping->host;
+ uint64_t count = *req_count;
+ uint64_t bcount;
+ loff_t size_in, size_out;
+ loff_t bs = inode_out->i_sb->s_blocksize;
+ int ret;
+
+ /* The start of both ranges must be aligned to an fs block. */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EINVAL;
+
+ size_in = i_size_read(inode_in);
+ size_out = i_size_read(inode_out);
+
+ /* Dedupe requires both ranges to be within EOF. */
+ if ((remap_flags & REMAP_FILE_DEDUP) &&
+ (pos_in >= size_in || pos_in + count > size_in ||
+ pos_out >= size_out || pos_out + count > size_out))
+ return -EINVAL;
+
+ /* Ensure the infile range is within the infile. */
+ if (pos_in >= size_in)
+ return -EINVAL;
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_access_check_limits(file_in, pos_in, &count);
+ if (ret)
+ return ret;
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
/*
- * LFS rule
+ * If the user wanted us to link to the infile's EOF, round up to the
+ * next block boundary for this check.
+ *
+ * Otherwise, make sure the count is also block-aligned, having
+ * already confirmed the starting offsets' block alignment.
*/
- if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
- !(file->f_flags & O_LARGEFILE))) {
- if (pos >= MAX_NON_LFS)
- return -EFBIG;
- iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
+ if (pos_in + count == size_in) {
+ bcount = ALIGN(size_in, bs) - pos_in;
+ } else {
+ if (!IS_ALIGNED(count, bs))
+ count = ALIGN_DOWN(count, bs);
+ bcount = count;
}
+ /* Don't allow overlapped cloning within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + bcount > pos_in &&
+ pos_out < pos_in + bcount)
+ return -EINVAL;
+
/*
- * Are we about to exceed the fs block limit ?
- *
- * If we have written data it becomes a short write. If we have
- * exceeded without writing data we send a signal and return EFBIG.
- * Linus frestrict idea will clean these up nicely..
+ * We shortened the request but the caller can't deal with that, so
+ * bounce the request back to userspace.
*/
- if (unlikely(pos >= inode->i_sb->s_maxbytes))
- return -EFBIG;
+ if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+ return -EINVAL;
- iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
- return iov_iter_count(from);
+ *req_count = count;
+ return 0;
}
-EXPORT_SYMBOL(generic_write_checks);
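A hedged worked example of the EOF rounding and trimming above (all numbers illustrative, bs == 4096):

	/*
	 * size_in == 10000, pos_in == 0, count == 10000: the request runs
	 * to EOF, so bcount = ALIGN(10000, 4096) == 12288 for the overlap
	 * check, while count itself stays 10000.
	 *
	 * A mid-file request of count == 10000 is instead trimmed to
	 * ALIGN_DOWN(10000, 4096) == 8192, then rejected with -EINVAL
	 * unless the caller passed REMAP_FILE_CAN_SHORTEN.
	 */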
int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,