userfaultfd: non-cooperative: closing the uffd without triggering SIGBUS
[sfrench/cifs-2.6.git] / fs / userfaultfd.c
index b0d5897bc4e6d0e019c79f65b6d41df1d3b0d050..ef4b48d1ea4270f59f46e29ae898a48cdb0c8aa6 100644 (file)
@@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
                goto out;
        WRITE_ONCE(uwq->waken, true);
        /*
-        * The implicit smp_mb__before_spinlock in try_to_wake_up()
-        * renders uwq->waken visible to other CPUs before the task is
-        * waken.
+        * The Program-Order guarantees provided by the scheduler
+        * ensure uwq->waken is visible before the task is woken.
         */
        ret = wake_up_state(wq->private, mode);
-       if (ret)
+       if (ret) {
                /*
                 * Wake only once, autoremove behavior.
                 *
-                * After the effect of list_del_init is visible to the
-                * other CPUs, the waitqueue may disappear from under
-                * us, see the !list_empty_careful() in
-                * handle_userfault(). try_to_wake_up() has an
-                * implicit smp_mb__before_spinlock, and the
-                * wq->private is read before calling the extern
-                * function "wake_up_state" (which in turns calls
-                * try_to_wake_up). While the spin_lock;spin_unlock;
-                * wouldn't be enough, the smp_mb__before_spinlock is
-                * enough to avoid an explicit smp_mb() here.
+                * After the effect of list_del_init is visible to the other
+                * CPUs, the waitqueue may disappear from under us, see the
+                * !list_empty_careful() in handle_userfault().
+                *
+                * try_to_wake_up() has an implicit smp_mb(), and the
+                * wq->private is read before calling the extern function
+                * "wake_up_state" (which in turn calls try_to_wake_up).
                 */
                list_del_init(&wq->entry);
+       }
 out:
        return ret;
 }
@@ -181,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
 
 static inline struct uffd_msg userfault_msg(unsigned long address,
                                            unsigned int flags,
-                                           unsigned long reason)
+                                           unsigned long reason,
+                                           unsigned int features)
 {
        struct uffd_msg msg;
        msg_init(&msg);
@@ -205,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
                 * write protect fault.
                 */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+       if (features & UFFD_FEATURE_THREAD_ID)
+               msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
        return msg;
 }
 
@@ -373,13 +373,34 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
        VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
        VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
 
+       if (ctx->features & UFFD_FEATURE_SIGBUS)
+               goto out;
+
        /*
         * If it's already released don't get it. This avoids to loop
         * in __get_user_pages if userfaultfd_release waits on the
         * caller of handle_userfault to release the mmap_sem.
         */
-       if (unlikely(ACCESS_ONCE(ctx->released)))
+       if (unlikely(ACCESS_ONCE(ctx->released))) {
+               /*
+                * Don't return VM_FAULT_SIGBUS in this case, so a
+                * non-cooperative manager can close the uffd after the
+                * last UFFDIO_COPY, without risking to trigger an
+                * involuntary SIGBUS if the process was starting the
+                * userfaultfd while the userfaultfd was still armed
+                * (but after the last UFFDIO_COPY). If the uffd
+                * wasn't already closed when the userfault reached
+                * this point, that would normally be solved by
+                * userfaultfd_must_wait returning 'false'.
+                *
+                * If we were to return VM_FAULT_SIGBUS here, the
+                * non-cooperative manager would instead be forced to
+                * always call UFFDIO_UNREGISTER before it can safely
+                * close the uffd.
+                */
+               ret = VM_FAULT_NOPAGE;
                goto out;
+       }
 
        /*
         * Check that we can return VM_FAULT_RETRY.
@@ -422,7 +443,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
        init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
        uwq.wq.private = current;
-       uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
+       uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+                       ctx->features);
        uwq.ctx = ctx;
        uwq.waken = false;
 
@@ -1197,7 +1219,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        struct uffdio_register __user *user_uffdio_register;
        unsigned long vm_flags, new_flags;
        bool found;
-       bool non_anon_pages;
+       bool basic_ioctls;
        unsigned long start, end, vma_end;
 
        user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1263,7 +1285,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
         * Search for not compatible vmas.
         */
        found = false;
-       non_anon_pages = false;
+       basic_ioctls = false;
        for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
                cond_resched();
 
@@ -1302,8 +1324,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                /*
                 * Note vmas containing huge pages
                 */
-               if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
-                       non_anon_pages = true;
+               if (is_vm_hugetlb_page(cur))
+                       basic_ioctls = true;
 
                found = true;
        }
@@ -1374,7 +1396,7 @@ out_unlock:
                 * userland which ioctls methods are guaranteed to
                 * succeed on this range.
                 */
-               if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+               if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
                             UFFD_API_RANGE_IOCTLS,
                             &user_uffdio_register->ioctls))
                        ret = -EFAULT;