mm, fault_around: do not take a reference to a locked page
diff --git a/mm/filemap.c b/mm/filemap.c
index 218d0b2ec82d1534dcb66b4744f886d7d0262d55..29655fb47a2c4b2cf61e1c61e5b06d6f51e537a8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -981,7 +981,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
        if (wait_page->bit_nr != key->bit_nr)
                return 0;
 
-       /* Stop walking if it's locked */
+       /*
+        * Stop walking if it's locked.
+        * Is this safe if put_and_wait_on_page_locked() is in use?
+        * Yes: the waker must hold a reference to this page, and if PG_locked
+        * has now already been set by another task, that task must also hold
+        * a reference to the *same usage* of this page; so there is no need
+        * to walk on to wake even the put_and_wait_on_page_locked() callers.
+        */
        if (test_bit(key->bit_nr, &key->page->flags))
                return -1;
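
The safety argument above leans on the waker still holding its own page
reference for the duration of the wake-up walk.  A minimal sketch of that
waker side, modeled loosely on unlock_page() (the real function uses the
clear_bit_unlock_is_negative_byte() fast path, omitted here):

	void example_unlock_page(struct page *page)
	{
		page = compound_head(page);
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		/* Release PG_locked with release semantics... */
		clear_bit_unlock(PG_locked, &page->flags);
		/* ...then wake; our reference still pins this usage of the page. */
		wake_up_page(page, PG_locked);
	}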
 
@@ -1049,25 +1056,44 @@ static void wake_up_page(struct page *page, int bit)
        wake_up_page_bit(page, bit);
 }
 
+/*
+ * A choice of three behaviors for wait_on_page_bit_common():
+ */
+enum behavior {
+       EXCLUSIVE,      /* Hold ref to page and take the bit when woken, like
+                        * __lock_page() waiting on then setting PG_locked.
+                        */
+       SHARED,         /* Hold ref to page and check the bit when woken, like
+                        * wait_on_page_writeback() waiting on PG_writeback.
+                        */
+       DROP,           /* Drop ref to page before wait, no check when woken,
+                        * like put_and_wait_on_page_locked() on PG_locked.
+                        */
+};
+
 static inline int wait_on_page_bit_common(wait_queue_head_t *q,
-               struct page *page, int bit_nr, int state, bool lock)
+       struct page *page, int bit_nr, int state, enum behavior behavior)
 {
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
+       bool bit_is_set;
        bool thrashing = false;
+       bool delayacct = false;
        unsigned long pflags;
        int ret = 0;
 
        if (bit_nr == PG_locked &&
            !PageUptodate(page) && PageWorkingset(page)) {
-               if (!PageSwapBacked(page))
+               if (!PageSwapBacked(page)) {
                        delayacct_thrashing_start();
+                       delayacct = true;
+               }
                psi_memstall_enter(&pflags);
                thrashing = true;
        }
 
        init_wait(wait);
-       wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
+       wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
        wait->func = wake_page_function;
        wait_page.page = page;
        wait_page.bit_nr = bit_nr;
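
For orientation, a hedged sketch of how each behavior maps onto a public
entry point (the EXCLUSIVE and DROP mappings appear later in this patch;
wait_on_page_writeback() is the SHARED user named in the enum's comments):

	lock_page(page);                    /* EXCLUSIVE: wait, then take PG_locked */
	wait_on_page_writeback(page);       /* SHARED: wait for PG_writeback to clear */
	put_and_wait_on_page_locked(page);  /* DROP: give up the reference, then wait */
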
@@ -1084,14 +1110,17 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 
                spin_unlock_irq(&q->lock);
 
-               if (likely(test_bit(bit_nr, &page->flags))) {
+               bit_is_set = test_bit(bit_nr, &page->flags);
+               if (behavior == DROP)
+                       put_page(page);
+
+               if (likely(bit_is_set))
                        io_schedule();
-               }
 
-               if (lock) {
+               if (behavior == EXCLUSIVE) {
                        if (!test_and_set_bit_lock(bit_nr, &page->flags))
                                break;
-               } else {
+               } else if (behavior == SHARED) {
                        if (!test_bit(bit_nr, &page->flags))
                                break;
                }
@@ -1100,12 +1129,23 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
                        ret = -EINTR;
                        break;
                }
+
+               if (behavior == DROP) {
+                       /*
+                        * We can no longer safely access page->flags:
+                        * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
+                        * there is a risk of waiting forever on a page reused
+                        * for something that keeps it locked indefinitely.
+                        * But best check for -EINTR above before breaking.
+                        */
+                       break;
+               }
        }
 
        finish_wait(q, wait);
 
        if (thrashing) {
-               if (!PageSwapBacked(page))
+               if (delayacct)
                        delayacct_thrashing_end();
                psi_memstall_leave(&pflags);
        }
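
Latching the PageSwapBacked() test into the local delayacct flag matters for
the new DROP case: once the reference has been dropped, page->flags must not
be read again, so the old epilogue pattern would now risk a use-after-free:

	/* UNSAFE for behavior == DROP: our page reference is already gone. */
	if (!PageSwapBacked(page))	/* reads page->flags after put_page() */
		delayacct_thrashing_end();
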
@@ -1124,17 +1164,36 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 void wait_on_page_bit(struct page *page, int bit_nr)
 {
        wait_queue_head_t *q = page_waitqueue(page);
-       wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+       wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
 int wait_on_page_bit_killable(struct page *page, int bit_nr)
 {
        wait_queue_head_t *q = page_waitqueue(page);
-       return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
+       return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
 }
 EXPORT_SYMBOL(wait_on_page_bit_killable);
 
+/**
+ * put_and_wait_on_page_locked - Drop a reference and wait for the page to be unlocked
+ * @page: The page to wait for.
+ *
+ * The caller should hold a reference on @page.  They expect the page to
+ * become unlocked relatively soon, but do not wish to hold up migration
+ * (for example) by holding the reference while waiting for the page to
+ * come unlocked.  After this function returns, the caller should not
+ * dereference @page.
+ */
+void put_and_wait_on_page_locked(struct page *page)
+{
+       wait_queue_head_t *q;
+
+       page = compound_head(page);
+       q = page_waitqueue(page);
+       wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+}
+
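A hedged sketch of the intended calling convention (example_wait_for_unlock()
is hypothetical, not part of this patch): the reference is always consumed,
so the caller must not dereference the page afterwards:

	/* Hypothetical caller, for illustration only. */
	static void example_wait_for_unlock(struct page *page)
	{
		if (!get_page_unless_zero(page))
			return;		/* page was already being freed */
		if (PageLocked(page))
			put_and_wait_on_page_locked(page); /* consumes the ref */
		else
			put_page(page);
		/* do not touch *page from here on */
	}
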
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
  * @page: Page defining the wait queue of interest
@@ -1264,7 +1323,8 @@ void __lock_page(struct page *__page)
 {
        struct page *page = compound_head(__page);
        wait_queue_head_t *q = page_waitqueue(page);
-       wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
+       wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+                               EXCLUSIVE);
 }
 EXPORT_SYMBOL(__lock_page);
 
@@ -1272,7 +1332,8 @@ int __lock_page_killable(struct page *__page)
 {
        struct page *page = compound_head(__page);
        wait_queue_head_t *q = page_waitqueue(page);
-       return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
+       return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+                                       EXCLUSIVE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
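
For context, callers normally reach __lock_page() through the inline
lock_page() wrapper in <linux/pagemap.h>, which is roughly:

	static inline void lock_page(struct page *page)
	{
		might_sleep();
		if (!trylock_page(page))
			__lock_page(page);	/* EXCLUSIVE wait on PG_locked */
	}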
 
@@ -1540,7 +1601,7 @@ repeat:
                VM_BUG_ON_PAGE(page->index != offset, page);
        }
 
-       if (page && (fgp_flags & FGP_ACCESSED))
+       if (fgp_flags & FGP_ACCESSED)
                mark_page_accessed(page);
 
 no_page:
@@ -2049,7 +2110,7 @@ find_page:
                                        !mapping->a_ops->is_partially_uptodate)
                                goto page_not_up_to_date;
                        /* pipes can't handle partially uptodate pages */
-                       if (unlikely(iter->type & ITER_PIPE))
+                       if (unlikely(iov_iter_is_pipe(iter)))
                                goto page_not_up_to_date;
                        if (!trylock_page(page))
                                goto page_not_up_to_date;
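
The helper replacing the open-coded type test is defined in <linux/uio.h>,
approximately:

	static inline bool iov_iter_is_pipe(const struct iov_iter *i)
	{
		return iov_iter_type(i) == ITER_PIPE;
	}
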
@@ -2553,6 +2614,13 @@ void filemap_map_pages(struct vm_fault *vmf,
                        goto next;
 
                head = compound_head(page);
+
+               /*
+                * Check for a locked page first, as a speculative
+                * reference may adversely influence page migration.
+                */
+               if (PageLocked(head))
+                       goto next;
                if (!page_cache_get_speculative(head))
                        goto next;
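
This hunk is the change the subject line refers to.  The problem being
avoided: migration holds PG_locked and, paraphrasing the check in
migrate_page_move_mapping() (not verbatim), refuses to move the page until
its refcount matches the expected count:

	if (page_count(page) != expected_count)
		return -EAGAIN;		/* migration backs off and retries */

A speculative reference taken by fault-around keeps that check failing, so
skipping already-locked pages here lets migration complete instead of
retrying indefinitely against repeated faults.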
 
@@ -2824,6 +2892,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
 }
 EXPORT_SYMBOL(read_cache_page_gfp);
 
+/*
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
+ * LFS limits.  If pos is under the limit it becomes a short access.  If it
+ * exceeds the limit we return -EFBIG.
+ */
+static int generic_access_check_limits(struct file *file, loff_t pos,
+                                      loff_t *count)
+{
+       struct inode *inode = file->f_mapping->host;
+       loff_t max_size = inode->i_sb->s_maxbytes;
+
+       if (!(file->f_flags & O_LARGEFILE))
+               max_size = MAX_NON_LFS;
+
+       if (unlikely(pos >= max_size))
+               return -EFBIG;
+       *count = min(*count, max_size - pos);
+       return 0;
+}
+
+static int generic_write_check_limits(struct file *file, loff_t pos,
+                                     loff_t *count)
+{
+       loff_t limit = rlimit(RLIMIT_FSIZE);
+
+       if (limit != RLIM_INFINITY) {
+               if (pos >= limit) {
+                       send_sig(SIGXFSZ, current, 0);
+                       return -EFBIG;
+               }
+               *count = min(*count, limit - pos);
+       }
+
+       return generic_access_check_limits(file, pos, count);
+}
+
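A hedged worked example of the clamping (hypothetical values, assuming
RLIMIT_FSIZE is unlimited): a 100-byte write near the non-LFS limit on a
file opened without O_LARGEFILE is shortened rather than rejected:

	loff_t count = 100;
	int err = generic_write_check_limits(file, MAX_NON_LFS - 10, &count);
	/* err == 0 and count is now 10: the write becomes a short write. */
	/* At pos >= MAX_NON_LFS the same call would return -EFBIG instead. */
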
 /*
  * Performs necessary checks before doing a write
  *
@@ -2835,8 +2939,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-       unsigned long limit = rlimit(RLIMIT_FSIZE);
-       loff_t pos;
+       loff_t count;
+       int ret;
 
        if (!iov_iter_count(from))
                return 0;
@@ -2845,43 +2949,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
        if (iocb->ki_flags & IOCB_APPEND)
                iocb->ki_pos = i_size_read(inode);
 
-       pos = iocb->ki_pos;
-
        if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
                return -EINVAL;
 
-       if (limit != RLIM_INFINITY) {
-               if (iocb->ki_pos >= limit) {
-                       send_sig(SIGXFSZ, current, 0);
-                       return -EFBIG;
-               }
-               iov_iter_truncate(from, limit - (unsigned long)pos);
-       }
+       count = iov_iter_count(from);
+       ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+       if (ret)
+               return ret;
+
+       iov_iter_truncate(from, count);
+       return iov_iter_count(from);
+}
+EXPORT_SYMBOL(generic_write_checks);
+
+/*
+ * Performs necessary checks before doing a clone.
+ *
+ * Can adjust the number of bytes to clone via @req_count.
+ * Returns an appropriate error code that the caller should return,
+ * or zero if the clone should be allowed.
+ */
+int generic_remap_checks(struct file *file_in, loff_t pos_in,
+                        struct file *file_out, loff_t pos_out,
+                        loff_t *req_count, unsigned int remap_flags)
+{
+       struct inode *inode_in = file_in->f_mapping->host;
+       struct inode *inode_out = file_out->f_mapping->host;
+       uint64_t count = *req_count;
+       uint64_t bcount;
+       loff_t size_in, size_out;
+       loff_t bs = inode_out->i_sb->s_blocksize;
+       int ret;
+
+       /* The start of both ranges must be aligned to an fs block. */
+       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+               return -EINVAL;
+
+       /* Ensure offsets don't wrap. */
+       if (pos_in + count < pos_in || pos_out + count < pos_out)
+               return -EINVAL;
+
+       size_in = i_size_read(inode_in);
+       size_out = i_size_read(inode_out);
+
+       /* Dedupe requires both ranges to be within EOF. */
+       if ((remap_flags & REMAP_FILE_DEDUP) &&
+           (pos_in >= size_in || pos_in + count > size_in ||
+            pos_out >= size_out || pos_out + count > size_out))
+               return -EINVAL;
+
+       /* Ensure the infile range is within the infile. */
+       if (pos_in >= size_in)
+               return -EINVAL;
+       count = min(count, size_in - (uint64_t)pos_in);
+
+       ret = generic_access_check_limits(file_in, pos_in, &count);
+       if (ret)
+               return ret;
+
+       ret = generic_write_check_limits(file_out, pos_out, &count);
+       if (ret)
+               return ret;
 
        /*
-        * LFS rule
+        * If the user wanted us to link to the infile's EOF, round up to the
+        * next block boundary for this check.
+        *
+        * Otherwise, make sure the count is also block-aligned, having
+        * already confirmed the starting offsets' block alignment.
         */
-       if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
-                               !(file->f_flags & O_LARGEFILE))) {
-               if (pos >= MAX_NON_LFS)
-                       return -EFBIG;
-               iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
+       if (pos_in + count == size_in) {
+               bcount = ALIGN(size_in, bs) - pos_in;
+       } else {
+               if (!IS_ALIGNED(count, bs))
+                       count = ALIGN_DOWN(count, bs);
+               bcount = count;
        }
 
+       /* Don't allow overlapped cloning within the same file. */
+       if (inode_in == inode_out &&
+           pos_out + bcount > pos_in &&
+           pos_out < pos_in + bcount)
+               return -EINVAL;
+
        /*
-        * Are we about to exceed the fs block limit ?
-        *
-        * If we have written data it becomes a short write.  If we have
-        * exceeded without writing data we send a signal and return EFBIG.
-        * Linus frestrict idea will clean these up nicely..
+        * We shortened the request but the caller can't deal with that, so
+        * bounce the request back to userspace.
         */
-       if (unlikely(pos >= inode->i_sb->s_maxbytes))
-               return -EFBIG;
+       if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+               return -EINVAL;
 
-       iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
-       return iov_iter_count(from);
+       *req_count = count;
+       return 0;
 }
-EXPORT_SYMBOL(generic_write_checks);
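
A hedged worked example of the alignment logic above, assuming 4096-byte
blocks, size_in = 10000 and pos_in = 4096.  With count = 5904 the request
reaches the infile's EOF, so the overlap check uses a rounded-up bcount;
with count = 5000 it does not, so count itself is rounded down:

	bcount = ALIGN(10000, 4096) - 4096;	/* EOF case: 12288 - 4096 = 8192 */
	count  = ALIGN_DOWN(5000, 4096);	/* non-EOF case: 4096 */

Callers that cannot tolerate the shortened count get -EINVAL unless they
passed REMAP_FILE_CAN_SHORTEN.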
 
 int pagecache_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,