1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  fs/userfaultfd.c
4  *
5  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
6  *  Copyright (C) 2008-2009 Red Hat, Inc.
7  *  Copyright (C) 2015  Red Hat, Inc.
8  *
9  *  Some part derived from fs/eventfd.c (anon inode setup) and
10  *  mm/ksm.c (mm hashing).
11  */
12
13 #include <linux/list.h>
14 #include <linux/hashtable.h>
15 #include <linux/sched/signal.h>
16 #include <linux/sched/mm.h>
17 #include <linux/mm.h>
18 #include <linux/mm_inline.h>
19 #include <linux/mmu_notifier.h>
20 #include <linux/poll.h>
21 #include <linux/slab.h>
22 #include <linux/seq_file.h>
23 #include <linux/file.h>
24 #include <linux/bug.h>
25 #include <linux/anon_inodes.h>
26 #include <linux/syscalls.h>
27 #include <linux/userfaultfd_k.h>
28 #include <linux/mempolicy.h>
29 #include <linux/ioctl.h>
30 #include <linux/security.h>
31 #include <linux/hugetlb.h>
32 #include <linux/swapops.h>
33 #include <linux/miscdevice.h>
34
35 static int sysctl_unprivileged_userfaultfd __read_mostly;
36
37 #ifdef CONFIG_SYSCTL
38 static struct ctl_table vm_userfaultfd_table[] = {
39         {
40                 .procname       = "unprivileged_userfaultfd",
41                 .data           = &sysctl_unprivileged_userfaultfd,
42                 .maxlen         = sizeof(sysctl_unprivileged_userfaultfd),
43                 .mode           = 0644,
44                 .proc_handler   = proc_dointvec_minmax,
45                 .extra1         = SYSCTL_ZERO,
46                 .extra2         = SYSCTL_ONE,
47         },
48 };
49 #endif
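
/*
 * A minimal usage sketch, assuming vm_userfaultfd_table ends up registered
 * under the "vm" sysctl directory, so the knob above is visible as
 * /proc/sys/vm/unprivileged_userfaultfd and can be toggled with e.g.:
 *
 *	sysctl -w vm.unprivileged_userfaultfd=1
 */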
50
51 static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
52
53 /*
54  * Start with fault_pending_wqh and fault_wqh so they're more likely
55  * to be in the same cacheline.
56  *
57  * Locking order:
58  *      fd_wqh.lock
59  *              fault_pending_wqh.lock
60  *                      fault_wqh.lock
61  *              event_wqh.lock
62  *
63  * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
64  * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
65  * also taken in IRQ context.
66  */
67 struct userfaultfd_ctx {
68         /* waitqueue head for the pending (i.e. not read) userfaults */
69         wait_queue_head_t fault_pending_wqh;
70         /* waitqueue head for the userfaults */
71         wait_queue_head_t fault_wqh;
72         /* waitqueue head for the pseudo fd to wakeup poll/read */
73         wait_queue_head_t fd_wqh;
74         /* waitqueue head for events */
75         wait_queue_head_t event_wqh;
76         /* a refile sequence protected by fault_pending_wqh lock */
77         seqcount_spinlock_t refile_seq;
78         /* pseudo fd refcounting */
79         refcount_t refcount;
80         /* userfaultfd syscall flags */
81         unsigned int flags;
82         /* features requested from the userspace */
83         unsigned int features;
84         /* released */
85         bool released;
86         /* memory mappings are changing because of non-cooperative event */
87         atomic_t mmap_changing;
88         /* mm with one ore more vmas attached to this userfaultfd_ctx */
89         struct mm_struct *mm;
90 };
91
92 struct userfaultfd_fork_ctx {
93         struct userfaultfd_ctx *orig;
94         struct userfaultfd_ctx *new;
95         struct list_head list;
96 };
97
98 struct userfaultfd_unmap_ctx {
99         struct userfaultfd_ctx *ctx;
100         unsigned long start;
101         unsigned long end;
102         struct list_head list;
103 };
104
105 struct userfaultfd_wait_queue {
106         struct uffd_msg msg;
107         wait_queue_entry_t wq;
108         struct userfaultfd_ctx *ctx;
109         bool waken;
110 };
111
112 struct userfaultfd_wake_range {
113         unsigned long start;
114         unsigned long len;
115 };
116
117 /* internal indication that UFFD_API ioctl was successfully executed */
118 #define UFFD_FEATURE_INITIALIZED                (1u << 31)
119
120 static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
121 {
122         return ctx->features & UFFD_FEATURE_INITIALIZED;
123 }
124
125 static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
126 {
127         return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
128 }
129
130 /*
131  * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
132  * meaningful when userfaultfd_wp()==true on the vma and when it's
133  * anonymous.
134  */
135 bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
136 {
137         struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
138
139         if (!ctx)
140                 return false;
141
142         return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
143 }
144
145 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
146                                      vm_flags_t flags)
147 {
148         const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
149
150         vm_flags_reset(vma, flags);
151         /*
152          * For shared mappings, we want to enable writenotify while
153          * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
154          * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
155          */
156         if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
157                 vma_set_page_prot(vma);
158 }
159
160 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
161                                      int wake_flags, void *key)
162 {
163         struct userfaultfd_wake_range *range = key;
164         int ret;
165         struct userfaultfd_wait_queue *uwq;
166         unsigned long start, len;
167
168         uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
169         ret = 0;
170         /* len == 0 means wake all */
171         start = range->start;
172         len = range->len;
173         if (len && (start > uwq->msg.arg.pagefault.address ||
174                     start + len <= uwq->msg.arg.pagefault.address))
175                 goto out;
176         WRITE_ONCE(uwq->waken, true);
177         /*
178          * The Program-Order guarantees provided by the scheduler
179          * ensure uwq->waken is visible before the task is woken.
180          */
181         ret = wake_up_state(wq->private, mode);
182         if (ret) {
183                 /*
184                  * Wake only once, autoremove behavior.
185                  *
186                  * After the effect of list_del_init is visible to the other
187                  * CPUs, the waitqueue may disappear from under us, see the
188                  * !list_empty_careful() in handle_userfault().
189                  *
190                  * try_to_wake_up() has an implicit smp_mb(), and the
191                  * wq->private is read before calling the extern function
192                  * "wake_up_state" (which in turn calls try_to_wake_up).
193                  */
194                 list_del_init(&wq->entry);
195         }
196 out:
197         return ret;
198 }
199
200 /**
201  * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
202  * context.
203  * @ctx: [in] Pointer to the userfaultfd context.
204  */
205 static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
206 {
207         refcount_inc(&ctx->refcount);
208 }
209
210 /**
211  * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
212  * context.
213  * @ctx: [in] Pointer to userfaultfd context.
214  *
215  * The userfaultfd context reference must have been previously acquired either
216  * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
217  */
218 static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
219 {
220         if (refcount_dec_and_test(&ctx->refcount)) {
221                 VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
222                 VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
223                 VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
224                 VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
225                 VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
226                 VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
227                 VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
228                 VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
229                 mmdrop(ctx->mm);
230                 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
231         }
232 }
233
234 static inline void msg_init(struct uffd_msg *msg)
235 {
236         BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
237         /*
238          * Must use memset to zero out the padding or kernel data is
239          * leaked to userland.
240          */
241         memset(msg, 0, sizeof(struct uffd_msg));
242 }
243
244 static inline struct uffd_msg userfault_msg(unsigned long address,
245                                             unsigned long real_address,
246                                             unsigned int flags,
247                                             unsigned long reason,
248                                             unsigned int features)
249 {
250         struct uffd_msg msg;
251
252         msg_init(&msg);
253         msg.event = UFFD_EVENT_PAGEFAULT;
254
255         msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
256                                     real_address : address;
257
258         /*
259          * These flags indicate why the userfault occurred:
260          * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
261          * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
262          * - Neither of these flags being set indicates a MISSING fault.
263          *
264          * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
265          * fault. Otherwise, it was a read fault.
266          */
267         if (flags & FAULT_FLAG_WRITE)
268                 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
269         if (reason & VM_UFFD_WP)
270                 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
271         if (reason & VM_UFFD_MINOR)
272                 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
273         if (features & UFFD_FEATURE_THREAD_ID)
274                 msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
275         return msg;
276 }
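
/*
 * A minimal userspace sketch (assuming a struct uffd_msg 'msg' was just
 * read() from the userfaultfd) of how the flags filled in above are
 * consumed; when neither WP nor MINOR is set the fault is MISSING:
 *
 *	if (msg.event == UFFD_EVENT_PAGEFAULT) {
 *		__u64 flags = msg.arg.pagefault.flags;
 *		bool wr    = flags & UFFD_PAGEFAULT_FLAG_WRITE;
 *		bool wp    = flags & UFFD_PAGEFAULT_FLAG_WP;
 *		bool minor = flags & UFFD_PAGEFAULT_FLAG_MINOR;
 *		resolve_fault(msg.arg.pagefault.address, wr, wp, minor);
 *	}
 *
 * resolve_fault() above is a hypothetical application helper.
 */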
277
278 #ifdef CONFIG_HUGETLB_PAGE
279 /*
280  * Same functionality as userfaultfd_must_wait below with modifications for
281  * hugepmd ranges.
282  */
283 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
284                                               struct vm_fault *vmf,
285                                               unsigned long reason)
286 {
287         struct vm_area_struct *vma = vmf->vma;
288         pte_t *ptep, pte;
289         bool ret = true;
290
291         assert_fault_locked(vmf);
292
293         ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
294         if (!ptep)
295                 goto out;
296
297         ret = false;
298         pte = huge_ptep_get(ptep);
299
300         /*
301          * Lockless access: we're in a wait_event so it's ok if it
302          * changes under us.  PTE markers should be handled the same as none
303          * ptes here.
304          */
305         if (huge_pte_none_mostly(pte))
306                 ret = true;
307         if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
308                 ret = true;
309 out:
310         return ret;
311 }
312 #else
313 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
314                                               struct vm_fault *vmf,
315                                               unsigned long reason)
316 {
317         return false;   /* should never get here */
318 }
319 #endif /* CONFIG_HUGETLB_PAGE */
320
321 /*
322  * Verify the pagetables are still not ok after having registered into
323  * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
324  * userfault that has already been resolved, if userfaultfd_read and
325  * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
326  * threads.
327  */
328 static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
329                                          struct vm_fault *vmf,
330                                          unsigned long reason)
331 {
332         struct mm_struct *mm = ctx->mm;
333         unsigned long address = vmf->address;
334         pgd_t *pgd;
335         p4d_t *p4d;
336         pud_t *pud;
337         pmd_t *pmd, _pmd;
338         pte_t *pte;
339         pte_t ptent;
340         bool ret = true;
341
342         assert_fault_locked(vmf);
343
344         pgd = pgd_offset(mm, address);
345         if (!pgd_present(*pgd))
346                 goto out;
347         p4d = p4d_offset(pgd, address);
348         if (!p4d_present(*p4d))
349                 goto out;
350         pud = pud_offset(p4d, address);
351         if (!pud_present(*pud))
352                 goto out;
353         pmd = pmd_offset(pud, address);
354 again:
355         _pmd = pmdp_get_lockless(pmd);
356         if (pmd_none(_pmd))
357                 goto out;
358
359         ret = false;
360         if (!pmd_present(_pmd) || pmd_devmap(_pmd))
361                 goto out;
362
363         if (pmd_trans_huge(_pmd)) {
364                 if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
365                         ret = true;
366                 goto out;
367         }
368
369         pte = pte_offset_map(pmd, address);
370         if (!pte) {
371                 ret = true;
372                 goto again;
373         }
374         /*
375          * Lockless access: we're in a wait_event so it's ok if it
376          * changes under us.  PTE markers should be handled the same as none
377          * ptes here.
378          */
379         ptent = ptep_get(pte);
380         if (pte_none_mostly(ptent))
381                 ret = true;
382         if (!pte_write(ptent) && (reason & VM_UFFD_WP))
383                 ret = true;
384         pte_unmap(pte);
385
386 out:
387         return ret;
388 }
389
390 static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
391 {
392         if (flags & FAULT_FLAG_INTERRUPTIBLE)
393                 return TASK_INTERRUPTIBLE;
394
395         if (flags & FAULT_FLAG_KILLABLE)
396                 return TASK_KILLABLE;
397
398         return TASK_UNINTERRUPTIBLE;
399 }
400
401 /*
402  * The locking rules involved in returning VM_FAULT_RETRY depending on
403  * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
404  * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
405  * recommendation in __lock_page_or_retry is not an understatement.
406  *
407  * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
408  * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
409  * not set.
410  *
411  * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
412  * set, VM_FAULT_RETRY can still be returned if and only if there are
413  * fatal_signal_pending()s, and the mmap_lock must be released before
414  * returning it.
415  */
416 vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
417 {
418         struct vm_area_struct *vma = vmf->vma;
419         struct mm_struct *mm = vma->vm_mm;
420         struct userfaultfd_ctx *ctx;
421         struct userfaultfd_wait_queue uwq;
422         vm_fault_t ret = VM_FAULT_SIGBUS;
423         bool must_wait;
424         unsigned int blocking_state;
425
426         /*
427          * We don't do userfault handling for the final child pid update.
428          *
429          * We also don't do userfault handling during
430          * coredumping. hugetlbfs has the special
431          * hugetlb_follow_page_mask() to skip missing pages in the
432          * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
433          * the no_page_table() helper in follow_page_mask(), but the
434          * shmem_vm_ops->fault method is invoked even during
435          * coredumping and it ends up here.
436          */
437         if (current->flags & (PF_EXITING|PF_DUMPCORE))
438                 goto out;
439
440         assert_fault_locked(vmf);
441
442         ctx = vma->vm_userfaultfd_ctx.ctx;
443         if (!ctx)
444                 goto out;
445
446         BUG_ON(ctx->mm != mm);
447
448         /* Any unrecognized flag is a bug. */
449         VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
450         /* 0 or > 1 flags set is a bug; we expect exactly 1. */
451         VM_BUG_ON(!reason || (reason & (reason - 1)));
452
453         if (ctx->features & UFFD_FEATURE_SIGBUS)
454                 goto out;
455         if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
456                 goto out;
457
458         /*
459          * If it's already released don't get it. This avoids looping
460          * in __get_user_pages if userfaultfd_release waits on the
461          * caller of handle_userfault to release the mmap_lock.
462          */
463         if (unlikely(READ_ONCE(ctx->released))) {
464                 /*
465                  * Don't return VM_FAULT_SIGBUS in this case, so a
466                  * non-cooperative manager can close the uffd after the
467                  * last UFFDIO_COPY, without risking triggering an
468                  * involuntary SIGBUS if the process was starting the
469                  * userfaultfd while the userfaultfd was still armed
470                  * (but after the last UFFDIO_COPY). If the uffd
471                  * wasn't already closed when the userfault reached
472                  * this point, that would normally be solved by
473                  * userfaultfd_must_wait returning 'false'.
474                  *
475                  * If we were to return VM_FAULT_SIGBUS here, the
476                  * non-cooperative manager would instead be forced to
477                  * always call UFFDIO_UNREGISTER before it can safely
478                  * close the uffd.
479                  */
480                 ret = VM_FAULT_NOPAGE;
481                 goto out;
482         }
483
484         /*
485          * Check that we can return VM_FAULT_RETRY.
486          *
487          * NOTE: it should become possible to return VM_FAULT_RETRY
488          * even if FAULT_FLAG_TRIED is set without leading to gup()
489          * -EBUSY failures, if the userfaultfd is to be extended for
490          * VM_UFFD_WP tracking and we intend to arm the userfault
491          * without first stopping userland access to the memory. For
492          * VM_UFFD_MISSING userfaults this is enough for now.
493          */
494         if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
495                 /*
496                  * Validate the invariant that nowait must allow retry
497                  * to be sure not to return SIGBUS erroneously on
498                  * nowait invocations.
499                  */
500                 BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
501 #ifdef CONFIG_DEBUG_VM
502                 if (printk_ratelimit()) {
503                         printk(KERN_WARNING
504                                "FAULT_FLAG_ALLOW_RETRY missing %x\n",
505                                vmf->flags);
506                         dump_stack();
507                 }
508 #endif
509                 goto out;
510         }
511
512         /*
513          * Handle nowait, not much to do other than tell it to retry
514          * and wait.
515          */
516         ret = VM_FAULT_RETRY;
517         if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
518                 goto out;
519
520         /* take the reference before dropping the mmap_lock */
521         userfaultfd_ctx_get(ctx);
522
523         init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
524         uwq.wq.private = current;
525         uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
526                                 reason, ctx->features);
527         uwq.ctx = ctx;
528         uwq.waken = false;
529
530         blocking_state = userfaultfd_get_blocking_state(vmf->flags);
531
532         /*
533          * Take the vma lock now, in order to safely call
534          * userfaultfd_huge_must_wait() later. Since acquiring the
535          * (sleepable) vma lock can modify the current task state, that
536          * must be before explicitly calling set_current_state().
537          */
538         if (is_vm_hugetlb_page(vma))
539                 hugetlb_vma_lock_read(vma);
540
541         spin_lock_irq(&ctx->fault_pending_wqh.lock);
542         /*
543          * After the __add_wait_queue the uwq is visible to userland
544          * through poll/read().
545          */
546         __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
547         /*
548          * The smp_mb() after __set_current_state prevents the reads
549          * following the spin_unlock from happening before the list_add in
550          * __add_wait_queue.
551          */
552         set_current_state(blocking_state);
553         spin_unlock_irq(&ctx->fault_pending_wqh.lock);
554
555         if (!is_vm_hugetlb_page(vma))
556                 must_wait = userfaultfd_must_wait(ctx, vmf, reason);
557         else
558                 must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
559         if (is_vm_hugetlb_page(vma))
560                 hugetlb_vma_unlock_read(vma);
561         release_fault_lock(vmf);
562
563         if (likely(must_wait && !READ_ONCE(ctx->released))) {
564                 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
565                 schedule();
566         }
567
568         __set_current_state(TASK_RUNNING);
569
570         /*
571          * Here we race with the list_del; list_add in
572          * userfaultfd_ctx_read(), however because we don't ever run
573          * list_del_init() to refile across the two lists, the prev
574          * and next pointers will never point to self. list_add also
575          * would never let either of the two pointers point to
576          * self. So list_empty_careful won't risk seeing both pointers
577          * pointing to self at any time during the list refile. The
578          * only case where list_del_init() is called is the full
579          * removal in the wake function and there we don't re-list_add
580          * and it's fine not to block on the spinlock. The uwq on this
581          * kernel stack can be released after the list_del_init.
582          */
583         if (!list_empty_careful(&uwq.wq.entry)) {
584                 spin_lock_irq(&ctx->fault_pending_wqh.lock);
585                 /*
586                  * No need of list_del_init(), the uwq on the stack
587                  * will be freed shortly anyway.
588                  */
589                 list_del(&uwq.wq.entry);
590                 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
591         }
592
593         /*
594          * ctx may go away after this if the userfault pseudo fd is
595          * already released.
596          */
597         userfaultfd_ctx_put(ctx);
598
599 out:
600         return ret;
601 }
602
603 static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
604                                               struct userfaultfd_wait_queue *ewq)
605 {
606         struct userfaultfd_ctx *release_new_ctx;
607
608         if (WARN_ON_ONCE(current->flags & PF_EXITING))
609                 goto out;
610
611         ewq->ctx = ctx;
612         init_waitqueue_entry(&ewq->wq, current);
613         release_new_ctx = NULL;
614
615         spin_lock_irq(&ctx->event_wqh.lock);
616         /*
617          * After the __add_wait_queue the uwq is visible to userland
618          * through poll/read().
619          */
620         __add_wait_queue(&ctx->event_wqh, &ewq->wq);
621         for (;;) {
622                 set_current_state(TASK_KILLABLE);
623                 if (ewq->msg.event == 0)
624                         break;
625                 if (READ_ONCE(ctx->released) ||
626                     fatal_signal_pending(current)) {
627                         /*
628                          * &ewq->wq may be queued in fork_event, but
629                          * __remove_wait_queue ignores the head
630                          * parameter. It would be a problem if it
631                          * didn't.
632                          */
633                         __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
634                         if (ewq->msg.event == UFFD_EVENT_FORK) {
635                                 struct userfaultfd_ctx *new;
636
637                                 new = (struct userfaultfd_ctx *)
638                                         (unsigned long)
639                                         ewq->msg.arg.reserved.reserved1;
640                                 release_new_ctx = new;
641                         }
642                         break;
643                 }
644
645                 spin_unlock_irq(&ctx->event_wqh.lock);
646
647                 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
648                 schedule();
649
650                 spin_lock_irq(&ctx->event_wqh.lock);
651         }
652         __set_current_state(TASK_RUNNING);
653         spin_unlock_irq(&ctx->event_wqh.lock);
654
655         if (release_new_ctx) {
656                 struct vm_area_struct *vma;
657                 struct mm_struct *mm = release_new_ctx->mm;
658                 VMA_ITERATOR(vmi, mm, 0);
659
660                 /* the various vma->vm_userfaultfd_ctx still point to it */
661                 mmap_write_lock(mm);
662                 for_each_vma(vmi, vma) {
663                         if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
664                                 vma_start_write(vma);
665                                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
666                                 userfaultfd_set_vm_flags(vma,
667                                                          vma->vm_flags & ~__VM_UFFD_FLAGS);
668                         }
669                 }
670                 mmap_write_unlock(mm);
671
672                 userfaultfd_ctx_put(release_new_ctx);
673         }
674
675         /*
676          * ctx may go away after this if the userfault pseudo fd is
677          * already released.
678          */
679 out:
680         atomic_dec(&ctx->mmap_changing);
681         VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
682         userfaultfd_ctx_put(ctx);
683 }
684
685 static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
686                                        struct userfaultfd_wait_queue *ewq)
687 {
688         ewq->msg.event = 0;
689         wake_up_locked(&ctx->event_wqh);
690         __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
691 }
692
693 int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
694 {
695         struct userfaultfd_ctx *ctx = NULL, *octx;
696         struct userfaultfd_fork_ctx *fctx;
697
698         octx = vma->vm_userfaultfd_ctx.ctx;
699         if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
700                 vma_start_write(vma);
701                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
702                 userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
703                 return 0;
704         }
705
706         list_for_each_entry(fctx, fcs, list)
707                 if (fctx->orig == octx) {
708                         ctx = fctx->new;
709                         break;
710                 }
711
712         if (!ctx) {
713                 fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
714                 if (!fctx)
715                         return -ENOMEM;
716
717                 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
718                 if (!ctx) {
719                         kfree(fctx);
720                         return -ENOMEM;
721                 }
722
723                 refcount_set(&ctx->refcount, 1);
724                 ctx->flags = octx->flags;
725                 ctx->features = octx->features;
726                 ctx->released = false;
727                 atomic_set(&ctx->mmap_changing, 0);
728                 ctx->mm = vma->vm_mm;
729                 mmgrab(ctx->mm);
730
731                 userfaultfd_ctx_get(octx);
732                 atomic_inc(&octx->mmap_changing);
733                 fctx->orig = octx;
734                 fctx->new = ctx;
735                 list_add_tail(&fctx->list, fcs);
736         }
737
738         vma->vm_userfaultfd_ctx.ctx = ctx;
739         return 0;
740 }
741
742 static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
743 {
744         struct userfaultfd_ctx *ctx = fctx->orig;
745         struct userfaultfd_wait_queue ewq;
746
747         msg_init(&ewq.msg);
748
749         ewq.msg.event = UFFD_EVENT_FORK;
750         ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
751
752         userfaultfd_event_wait_completion(ctx, &ewq);
753 }
754
755 void dup_userfaultfd_complete(struct list_head *fcs)
756 {
757         struct userfaultfd_fork_ctx *fctx, *n;
758
759         list_for_each_entry_safe(fctx, n, fcs, list) {
760                 dup_fctx(fctx);
761                 list_del(&fctx->list);
762                 kfree(fctx);
763         }
764 }
765
766 void mremap_userfaultfd_prep(struct vm_area_struct *vma,
767                              struct vm_userfaultfd_ctx *vm_ctx)
768 {
769         struct userfaultfd_ctx *ctx;
770
771         ctx = vma->vm_userfaultfd_ctx.ctx;
772
773         if (!ctx)
774                 return;
775
776         if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
777                 vm_ctx->ctx = ctx;
778                 userfaultfd_ctx_get(ctx);
779                 atomic_inc(&ctx->mmap_changing);
780         } else {
781                 /* Drop uffd context if remap feature not enabled */
782                 vma_start_write(vma);
783                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
784                 userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
785         }
786 }
787
788 void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
789                                  unsigned long from, unsigned long to,
790                                  unsigned long len)
791 {
792         struct userfaultfd_ctx *ctx = vm_ctx->ctx;
793         struct userfaultfd_wait_queue ewq;
794
795         if (!ctx)
796                 return;
797
798         if (to & ~PAGE_MASK) {
799                 userfaultfd_ctx_put(ctx);
800                 return;
801         }
802
803         msg_init(&ewq.msg);
804
805         ewq.msg.event = UFFD_EVENT_REMAP;
806         ewq.msg.arg.remap.from = from;
807         ewq.msg.arg.remap.to = to;
808         ewq.msg.arg.remap.len = len;
809
810         userfaultfd_event_wait_completion(ctx, &ewq);
811 }
812
813 bool userfaultfd_remove(struct vm_area_struct *vma,
814                         unsigned long start, unsigned long end)
815 {
816         struct mm_struct *mm = vma->vm_mm;
817         struct userfaultfd_ctx *ctx;
818         struct userfaultfd_wait_queue ewq;
819
820         ctx = vma->vm_userfaultfd_ctx.ctx;
821         if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
822                 return true;
823
824         userfaultfd_ctx_get(ctx);
825         atomic_inc(&ctx->mmap_changing);
826         mmap_read_unlock(mm);
827
828         msg_init(&ewq.msg);
829
830         ewq.msg.event = UFFD_EVENT_REMOVE;
831         ewq.msg.arg.remove.start = start;
832         ewq.msg.arg.remove.end = end;
833
834         userfaultfd_event_wait_completion(ctx, &ewq);
835
836         return false;
837 }
838
839 static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
840                           unsigned long start, unsigned long end)
841 {
842         struct userfaultfd_unmap_ctx *unmap_ctx;
843
844         list_for_each_entry(unmap_ctx, unmaps, list)
845                 if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
846                     unmap_ctx->end == end)
847                         return true;
848
849         return false;
850 }
851
852 int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
853                            unsigned long end, struct list_head *unmaps)
854 {
855         struct userfaultfd_unmap_ctx *unmap_ctx;
856         struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
857
858         if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
859             has_unmap_ctx(ctx, unmaps, start, end))
860                 return 0;
861
862         unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
863         if (!unmap_ctx)
864                 return -ENOMEM;
865
866         userfaultfd_ctx_get(ctx);
867         atomic_inc(&ctx->mmap_changing);
868         unmap_ctx->ctx = ctx;
869         unmap_ctx->start = start;
870         unmap_ctx->end = end;
871         list_add_tail(&unmap_ctx->list, unmaps);
872
873         return 0;
874 }
875
876 void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
877 {
878         struct userfaultfd_unmap_ctx *ctx, *n;
879         struct userfaultfd_wait_queue ewq;
880
881         list_for_each_entry_safe(ctx, n, uf, list) {
882                 msg_init(&ewq.msg);
883
884                 ewq.msg.event = UFFD_EVENT_UNMAP;
885                 ewq.msg.arg.remove.start = ctx->start;
886                 ewq.msg.arg.remove.end = ctx->end;
887
888                 userfaultfd_event_wait_completion(ctx->ctx, &ewq);
889
890                 list_del(&ctx->list);
891                 kfree(ctx);
892         }
893 }
894
895 static int userfaultfd_release(struct inode *inode, struct file *file)
896 {
897         struct userfaultfd_ctx *ctx = file->private_data;
898         struct mm_struct *mm = ctx->mm;
899         struct vm_area_struct *vma, *prev;
900         /* len == 0 means wake all */
901         struct userfaultfd_wake_range range = { .len = 0, };
902         unsigned long new_flags;
903         VMA_ITERATOR(vmi, mm, 0);
904
905         WRITE_ONCE(ctx->released, true);
906
907         if (!mmget_not_zero(mm))
908                 goto wakeup;
909
910         /*
911          * Flush page faults out of all CPUs. NOTE: all page faults
912          * must be retried without returning VM_FAULT_SIGBUS if
913          * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
914          * changes while handle_userfault released the mmap_lock. So
915          * it's critical that released is set to true (above), before
916          * taking the mmap_lock for writing.
917          */
918         mmap_write_lock(mm);
919         prev = NULL;
920         for_each_vma(vmi, vma) {
921                 cond_resched();
922                 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
923                        !!(vma->vm_flags & __VM_UFFD_FLAGS));
924                 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
925                         prev = vma;
926                         continue;
927                 }
928                 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
929                 vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
930                                             vma->vm_end, new_flags,
931                                             NULL_VM_UFFD_CTX);
932
933                 vma_start_write(vma);
934                 userfaultfd_set_vm_flags(vma, new_flags);
935                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
936
937                 prev = vma;
938         }
939         mmap_write_unlock(mm);
940         mmput(mm);
941 wakeup:
942         /*
943          * After no new page faults can wait on this fault_*wqh, flush
944          * the last page faults that may have been already waiting on
945          * the fault_*wqh.
946          */
947         spin_lock_irq(&ctx->fault_pending_wqh.lock);
948         __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
949         __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
950         spin_unlock_irq(&ctx->fault_pending_wqh.lock);
951
952         /* Flush pending events that may still wait on event_wqh */
953         wake_up_all(&ctx->event_wqh);
954
955         wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
956         userfaultfd_ctx_put(ctx);
957         return 0;
958 }
959
960 /* fault_pending_wqh.lock must be held by the caller */
961 static inline struct userfaultfd_wait_queue *find_userfault_in(
962                 wait_queue_head_t *wqh)
963 {
964         wait_queue_entry_t *wq;
965         struct userfaultfd_wait_queue *uwq;
966
967         lockdep_assert_held(&wqh->lock);
968
969         uwq = NULL;
970         if (!waitqueue_active(wqh))
971                 goto out;
972         /* walk in reverse to provide FIFO behavior to read userfaults */
973         wq = list_last_entry(&wqh->head, typeof(*wq), entry);
974         uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
975 out:
976         return uwq;
977 }
978
979 static inline struct userfaultfd_wait_queue *find_userfault(
980                 struct userfaultfd_ctx *ctx)
981 {
982         return find_userfault_in(&ctx->fault_pending_wqh);
983 }
984
985 static inline struct userfaultfd_wait_queue *find_userfault_evt(
986                 struct userfaultfd_ctx *ctx)
987 {
988         return find_userfault_in(&ctx->event_wqh);
989 }
990
991 static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
992 {
993         struct userfaultfd_ctx *ctx = file->private_data;
994         __poll_t ret;
995
996         poll_wait(file, &ctx->fd_wqh, wait);
997
998         if (!userfaultfd_is_initialized(ctx))
999                 return EPOLLERR;
1000
1001         /*
1002          * poll() never guarantees that read won't block.
1003          * userfaults can be woken before they're read().
1004          */
1005         if (unlikely(!(file->f_flags & O_NONBLOCK)))
1006                 return EPOLLERR;
1007         /*
1008          * Lockless access to see if there are pending faults.
1009          * __pollwait()'s last action is the add_wait_queue but
1010          * the spin_unlock would allow the waitqueue_active to
1011          * pass above the actual list_add inside
1012          * add_wait_queue critical section. So use a full
1013          * memory barrier to serialize the list_add write of
1014          * add_wait_queue() with the waitqueue_active read
1015          * below.
1016          */
1017         ret = 0;
1018         smp_mb();
1019         if (waitqueue_active(&ctx->fault_pending_wqh))
1020                 ret = EPOLLIN;
1021         else if (waitqueue_active(&ctx->event_wqh))
1022                 ret = EPOLLIN;
1023
1024         return ret;
1025 }
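
/*
 * A minimal poll() usage sketch (assuming 'uffd' was created with
 * O_NONBLOCK and UFFDIO_API has already been negotiated, otherwise the
 * checks above return EPOLLERR):
 *
 *	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		read(uffd, &msg, sizeof(msg));
 */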
1026
1027 static const struct file_operations userfaultfd_fops;
1028
1029 static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1030                                   struct inode *inode,
1031                                   struct uffd_msg *msg)
1032 {
1033         int fd;
1034
1035         fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
1036                         O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
1037         if (fd < 0)
1038                 return fd;
1039
1040         msg->arg.reserved.reserved1 = 0;
1041         msg->arg.fork.ufd = fd;
1042         return 0;
1043 }
1044
1045 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1046                                     struct uffd_msg *msg, struct inode *inode)
1047 {
1048         ssize_t ret;
1049         DECLARE_WAITQUEUE(wait, current);
1050         struct userfaultfd_wait_queue *uwq;
1051         /*
1052          * Handling fork event requires sleeping operations, so
1053          * we drop the event_wqh lock, then do these ops, then
1054          * lock it back and wake up the waiter. While the lock is
1055          * dropped the ewq may go away so we keep track of it
1056          * carefully.
1057          */
1058         LIST_HEAD(fork_event);
1059         struct userfaultfd_ctx *fork_nctx = NULL;
1060
1061         /* always take the fd_wqh lock before the fault_pending_wqh lock */
1062         spin_lock_irq(&ctx->fd_wqh.lock);
1063         __add_wait_queue(&ctx->fd_wqh, &wait);
1064         for (;;) {
1065                 set_current_state(TASK_INTERRUPTIBLE);
1066                 spin_lock(&ctx->fault_pending_wqh.lock);
1067                 uwq = find_userfault(ctx);
1068                 if (uwq) {
1069                         /*
1070                          * Use a seqcount to repeat the lockless check
1071                          * in wake_userfault() to avoid missing
1072                          * wakeups because during the refile both
1073                          * waitqueue could become empty if this is the
1074                          * waitqueues could become empty if this is the
1075                          */
1076                         write_seqcount_begin(&ctx->refile_seq);
1077
1078                         /*
1079                          * The fault_pending_wqh.lock prevents the uwq
1080                          * from disappearing from under us.
1081                          *
1082                          * Refile this userfault from
1083                          * fault_pending_wqh to fault_wqh, it's not
1084                          * pending anymore after we read it.
1085                          *
1086                          * Use list_del() by hand (as
1087                          * userfaultfd_wake_function also uses
1088                          * list_del_init() by hand) to be sure nobody
1089                          * changes __remove_wait_queue() to use
1090                          * list_del_init() in turn breaking the
1091                          * !list_empty_careful() check in
1092                          * handle_userfault(). The uwq->wq.head list
1093                          * must never be empty at any time during the
1094                          * refile, or the waitqueue could disappear
1095                          * from under us. The "wait_queue_head_t"
1096                          * parameter of __remove_wait_queue() is unused
1097                          * anyway.
1098                          */
1099                         list_del(&uwq->wq.entry);
1100                         add_wait_queue(&ctx->fault_wqh, &uwq->wq);
1101
1102                         write_seqcount_end(&ctx->refile_seq);
1103
1104                         /* careful to always initialize msg if ret == 0 */
1105                         *msg = uwq->msg;
1106                         spin_unlock(&ctx->fault_pending_wqh.lock);
1107                         ret = 0;
1108                         break;
1109                 }
1110                 spin_unlock(&ctx->fault_pending_wqh.lock);
1111
1112                 spin_lock(&ctx->event_wqh.lock);
1113                 uwq = find_userfault_evt(ctx);
1114                 if (uwq) {
1115                         *msg = uwq->msg;
1116
1117                         if (uwq->msg.event == UFFD_EVENT_FORK) {
1118                                 fork_nctx = (struct userfaultfd_ctx *)
1119                                         (unsigned long)
1120                                         uwq->msg.arg.reserved.reserved1;
1121                                 list_move(&uwq->wq.entry, &fork_event);
1122                                 /*
1123                                  * fork_nctx can be freed as soon as
1124                                  * we drop the lock, unless we take a
1125                                  * reference on it.
1126                                  */
1127                                 userfaultfd_ctx_get(fork_nctx);
1128                                 spin_unlock(&ctx->event_wqh.lock);
1129                                 ret = 0;
1130                                 break;
1131                         }
1132
1133                         userfaultfd_event_complete(ctx, uwq);
1134                         spin_unlock(&ctx->event_wqh.lock);
1135                         ret = 0;
1136                         break;
1137                 }
1138                 spin_unlock(&ctx->event_wqh.lock);
1139
1140                 if (signal_pending(current)) {
1141                         ret = -ERESTARTSYS;
1142                         break;
1143                 }
1144                 if (no_wait) {
1145                         ret = -EAGAIN;
1146                         break;
1147                 }
1148                 spin_unlock_irq(&ctx->fd_wqh.lock);
1149                 schedule();
1150                 spin_lock_irq(&ctx->fd_wqh.lock);
1151         }
1152         __remove_wait_queue(&ctx->fd_wqh, &wait);
1153         __set_current_state(TASK_RUNNING);
1154         spin_unlock_irq(&ctx->fd_wqh.lock);
1155
1156         if (!ret && msg->event == UFFD_EVENT_FORK) {
1157                 ret = resolve_userfault_fork(fork_nctx, inode, msg);
1158                 spin_lock_irq(&ctx->event_wqh.lock);
1159                 if (!list_empty(&fork_event)) {
1160                         /*
1161                          * The fork thread didn't abort, so we can
1162                          * drop the temporary refcount.
1163                          */
1164                         userfaultfd_ctx_put(fork_nctx);
1165
1166                         uwq = list_first_entry(&fork_event,
1167                                                typeof(*uwq),
1168                                                wq.entry);
1169                         /*
1170                          * If fork_event list wasn't empty and in turn
1171                          * the event wasn't already released by fork
1172                          * (the event is allocated on fork kernel
1173                          * stack), put the event back in its place in
1174                          * the event_wqh. fork_event head will be freed
1175                          * as soon as we return so the event cannot
1176                          * stay queued there no matter the current
1177                          * "ret" value.
1178                          */
1179                         list_del(&uwq->wq.entry);
1180                         __add_wait_queue(&ctx->event_wqh, &uwq->wq);
1181
1182                         /*
1183                          * Leave the event in the waitqueue and report
1184                          * error to userland if we failed to resolve
1185                          * the userfault fork.
1186                          */
1187                         if (likely(!ret))
1188                                 userfaultfd_event_complete(ctx, uwq);
1189                 } else {
1190                         /*
1191                          * Here the fork thread aborted and the
1192                          * refcount from the fork thread on fork_nctx
1193                          * has already been released. We still hold
1194                          * the reference we took before releasing the
1195                          * lock above. If resolve_userfault_fork
1196                          * failed we have to drop it because the
1197                          * fork_nctx has to be freed in that case. If
1198                          * it succeeded we'll hold it because the new
1199                          * uffd references it.
1200                          */
1201                         if (ret)
1202                                 userfaultfd_ctx_put(fork_nctx);
1203                 }
1204                 spin_unlock_irq(&ctx->event_wqh.lock);
1205         }
1206
1207         return ret;
1208 }
1209
1210 static ssize_t userfaultfd_read(struct file *file, char __user *buf,
1211                                 size_t count, loff_t *ppos)
1212 {
1213         struct userfaultfd_ctx *ctx = file->private_data;
1214         ssize_t _ret, ret = 0;
1215         struct uffd_msg msg;
1216         int no_wait = file->f_flags & O_NONBLOCK;
1217         struct inode *inode = file_inode(file);
1218
1219         if (!userfaultfd_is_initialized(ctx))
1220                 return -EINVAL;
1221
1222         for (;;) {
1223                 if (count < sizeof(msg))
1224                         return ret ? ret : -EINVAL;
1225                 _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
1226                 if (_ret < 0)
1227                         return ret ? ret : _ret;
1228                 if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
1229                         return ret ? ret : -EFAULT;
1230                 ret += sizeof(msg);
1231                 buf += sizeof(msg);
1232                 count -= sizeof(msg);
1233                 /*
1234                  * Allow reading more than one fault at a time, but only
1235                  * block while waiting for the very first one.
1236                  */
1237                 no_wait = O_NONBLOCK;
1238         }
1239 }
1240
1241 static void __wake_userfault(struct userfaultfd_ctx *ctx,
1242                              struct userfaultfd_wake_range *range)
1243 {
1244         spin_lock_irq(&ctx->fault_pending_wqh.lock);
1245         /* wake all in the range and autoremove */
1246         if (waitqueue_active(&ctx->fault_pending_wqh))
1247                 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
1248                                      range);
1249         if (waitqueue_active(&ctx->fault_wqh))
1250                 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
1251         spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1252 }
1253
1254 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1255                                            struct userfaultfd_wake_range *range)
1256 {
1257         unsigned seq;
1258         bool need_wakeup;
1259
1260         /*
1261          * To be sure waitqueue_active() is not reordered by the CPU
1262          * before the pagetable update, use an explicit SMP memory
1263          * barrier here. PT lock release or mmap_read_unlock(mm) still
1264          * have release semantics that can allow the
1265          * waitqueue_active() to be reordered before the pte update.
1266          */
1267         smp_mb();
1268
1269         /*
1270          * Use waitqueue_active because it's very frequent to
1271          * change the address space atomically even if there are no
1272          * userfaults yet. So we take the spinlock only when we're
1273          * sure we have userfaults to wake.
1274          */
1275         do {
1276                 seq = read_seqcount_begin(&ctx->refile_seq);
1277                 need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
1278                         waitqueue_active(&ctx->fault_wqh);
1279                 cond_resched();
1280         } while (read_seqcount_retry(&ctx->refile_seq, seq));
1281         if (need_wakeup)
1282                 __wake_userfault(ctx, range);
1283 }
1284
1285 static __always_inline int validate_unaligned_range(
1286         struct mm_struct *mm, __u64 start, __u64 len)
1287 {
1288         __u64 task_size = mm->task_size;
1289
1290         if (len & ~PAGE_MASK)
1291                 return -EINVAL;
1292         if (!len)
1293                 return -EINVAL;
1294         if (start < mmap_min_addr)
1295                 return -EINVAL;
1296         if (start >= task_size)
1297                 return -EINVAL;
1298         if (len > task_size - start)
1299                 return -EINVAL;
1300         if (start + len <= start)
1301                 return -EINVAL;
1302         return 0;
1303 }
1304
1305 static __always_inline int validate_range(struct mm_struct *mm,
1306                                           __u64 start, __u64 len)
1307 {
1308         if (start & ~PAGE_MASK)
1309                 return -EINVAL;
1310
1311         return validate_unaligned_range(mm, start, len);
1312 }
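
/*
 * For example (assuming 4KiB pages and mmap_min_addr well below the
 * range), start = 0x100000 with len = 0x2000 passes the checks above,
 * while an unaligned start, a zero len, or a range reaching past
 * mm->task_size is rejected with -EINVAL.
 */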
1313
1314 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1315                                 unsigned long arg)
1316 {
1317         struct mm_struct *mm = ctx->mm;
1318         struct vm_area_struct *vma, *prev, *cur;
1319         int ret;
1320         struct uffdio_register uffdio_register;
1321         struct uffdio_register __user *user_uffdio_register;
1322         unsigned long vm_flags, new_flags;
1323         bool found;
1324         bool basic_ioctls;
1325         unsigned long start, end, vma_end;
1326         struct vma_iterator vmi;
1327         bool wp_async = userfaultfd_wp_async_ctx(ctx);
1328
1329         user_uffdio_register = (struct uffdio_register __user *) arg;
1330
1331         ret = -EFAULT;
1332         if (copy_from_user(&uffdio_register, user_uffdio_register,
1333                            sizeof(uffdio_register)-sizeof(__u64)))
1334                 goto out;
1335
1336         ret = -EINVAL;
1337         if (!uffdio_register.mode)
1338                 goto out;
1339         if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1340                 goto out;
1341         vm_flags = 0;
1342         if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1343                 vm_flags |= VM_UFFD_MISSING;
1344         if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1345 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
1346                 goto out;
1347 #endif
1348                 vm_flags |= VM_UFFD_WP;
1349         }
1350         if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1351 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1352                 goto out;
1353 #endif
1354                 vm_flags |= VM_UFFD_MINOR;
1355         }
1356
1357         ret = validate_range(mm, uffdio_register.range.start,
1358                              uffdio_register.range.len);
1359         if (ret)
1360                 goto out;
1361
1362         start = uffdio_register.range.start;
1363         end = start + uffdio_register.range.len;
1364
1365         ret = -ENOMEM;
1366         if (!mmget_not_zero(mm))
1367                 goto out;
1368
1369         ret = -EINVAL;
1370         mmap_write_lock(mm);
1371         vma_iter_init(&vmi, mm, start);
1372         vma = vma_find(&vmi, end);
1373         if (!vma)
1374                 goto out_unlock;
1375
1376         /*
1377          * If the first vma contains huge pages, make sure start address
1378          * is aligned to huge page size.
1379          */
1380         if (is_vm_hugetlb_page(vma)) {
1381                 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1382
1383                 if (start & (vma_hpagesize - 1))
1384                         goto out_unlock;
1385         }
1386
1387         /*
1388          * Search for incompatible vmas.
1389          */
1390         found = false;
1391         basic_ioctls = false;
1392         cur = vma;
1393         do {
1394                 cond_resched();
1395
1396                 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1397                        !!(cur->vm_flags & __VM_UFFD_FLAGS));
1398
1399                 /* check for incompatible vmas */
1400                 ret = -EINVAL;
1401                 if (!vma_can_userfault(cur, vm_flags, wp_async))
1402                         goto out_unlock;
1403
1404                 /*
1405                  * UFFDIO_COPY will fill file holes even without
1406                  * PROT_WRITE. This check enforces that if this is a
1407                  * MAP_SHARED, the process has write permission to the backing
1408                  * file. If VM_MAYWRITE is set it also enforces that on a
1409                  * MAP_SHARED vma: no F_SEAL_WRITE is set and no further
1410                  * F_SEAL_WRITE can be added until the vma is destroyed.
1411                  */
1412                 ret = -EPERM;
1413                 if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1414                         goto out_unlock;
1415
1416                 /*
1417                  * If this vma contains the ending address and is backed by
1418                  * huge pages, check the alignment.
1419                  */
1420                 if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
1421                     end > cur->vm_start) {
1422                         unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
1423
1424                         ret = -EINVAL;
1425
1426                         if (end & (vma_hpagesize - 1))
1427                                 goto out_unlock;
1428                 }
1429                 if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1430                         goto out_unlock;
1431
1432                 /*
1433                  * Check that this vma isn't already owned by a
1434                  * different userfaultfd. We can't allow more than one
1435                  * userfaultfd to own a single vma simultaneously or we
1436                  * wouldn't know which one to deliver the userfaults to.
1437                  */
1438                 ret = -EBUSY;
1439                 if (cur->vm_userfaultfd_ctx.ctx &&
1440                     cur->vm_userfaultfd_ctx.ctx != ctx)
1441                         goto out_unlock;
1442
1443                 /*
1444                  * Note vmas containing huge pages
1445                  */
1446                 if (is_vm_hugetlb_page(cur))
1447                         basic_ioctls = true;
1448
1449                 found = true;
1450         } for_each_vma_range(vmi, cur, end);
1451         BUG_ON(!found);
1452
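        /*
         * The whole range has been validated: now walk it again and
         * apply the userfaultfd context and flags to each vma,
         * splitting or merging vmas as needed.
         */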
1453         vma_iter_set(&vmi, start);
1454         prev = vma_prev(&vmi);
1455         if (vma->vm_start < start)
1456                 prev = vma;
1457
1458         ret = 0;
1459         for_each_vma_range(vmi, vma, end) {
1460                 cond_resched();
1461
1462                 BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
1463                 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
1464                        vma->vm_userfaultfd_ctx.ctx != ctx);
1465                 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1466
1467                 /*
1468                  * Nothing to do: this vma is already registered into this
1469                  * userfaultfd and with the right tracking mode too.
1470                  */
1471                 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1472                     (vma->vm_flags & vm_flags) == vm_flags)
1473                         goto skip;
1474
1475                 if (vma->vm_start > start)
1476                         start = vma->vm_start;
1477                 vma_end = min(end, vma->vm_end);
1478
1479                 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
1480                 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
1481                                             new_flags,
1482                                             (struct vm_userfaultfd_ctx){ctx});
1483                 if (IS_ERR(vma)) {
1484                         ret = PTR_ERR(vma);
1485                         break;
1486                 }
1487
1488                 /*
1489                  * In the vma_merge() successful mprotect-like case 8:
1490                  * the next vma was merged into the current one and
1491                  * the current one has not been updated yet.
1492                  */
1493                 vma_start_write(vma);
1494                 userfaultfd_set_vm_flags(vma, new_flags);
1495                 vma->vm_userfaultfd_ctx.ctx = ctx;
1496
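                /*
                 * Hugetlb PMD sharing is incompatible with uffd
                 * wr-protect and minor fault tracking, so unshare any
                 * shared PMDs in this vma.
                 */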
1497                 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
1498                         hugetlb_unshare_all_pmds(vma);
1499
1500         skip:
1501                 prev = vma;
1502                 start = vma->vm_end;
1503         }
1504
1505 out_unlock:
1506         mmap_write_unlock(mm);
1507         mmput(mm);
1508         if (!ret) {
1509                 __u64 ioctls_out;
1510
1511                 ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1512                     UFFD_API_RANGE_IOCTLS;
1513
1514                 /*
1515                  * Declare the WP ioctl only if the WP mode is
1516                  * specified and all checks passed for the range.
1517                  */
1518                 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1519                         ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1520
1521                 /* CONTINUE ioctl is only supported for MINOR ranges. */
1522                 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1523                         ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1524
1525                 /*
1526                  * Now that we have scanned all vmas we can already tell
1527                  * userland which ioctl methods are guaranteed to
1528                  * succeed on this range.
1529                  */
1530                 if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1531                         ret = -EFAULT;
1532         }
1533 out:
1534         return ret;
1535 }
1536
1537 static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1538                                   unsigned long arg)
1539 {
1540         struct mm_struct *mm = ctx->mm;
1541         struct vm_area_struct *vma, *prev, *cur;
1542         int ret;
1543         struct uffdio_range uffdio_unregister;
1544         unsigned long new_flags;
1545         bool found;
1546         unsigned long start, end, vma_end;
1547         const void __user *buf = (void __user *)arg;
1548         struct vma_iterator vmi;
1549         bool wp_async = userfaultfd_wp_async_ctx(ctx);
1550
1551         ret = -EFAULT;
1552         if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
1553                 goto out;
1554
1555         ret = validate_range(mm, uffdio_unregister.start,
1556                              uffdio_unregister.len);
1557         if (ret)
1558                 goto out;
1559
1560         start = uffdio_unregister.start;
1561         end = start + uffdio_unregister.len;
1562
1563         ret = -ENOMEM;
1564         if (!mmget_not_zero(mm))
1565                 goto out;
1566
1567         mmap_write_lock(mm);
1568         ret = -EINVAL;
1569         vma_iter_init(&vmi, mm, start);
1570         vma = vma_find(&vmi, end);
1571         if (!vma)
1572                 goto out_unlock;
1573
1574         /*
1575          * If the first vma contains huge pages, make sure start address
1576          * is aligned to huge page size.
1577          */
1578         if (is_vm_hugetlb_page(vma)) {
1579                 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1580
1581                 if (start & (vma_hpagesize - 1))
1582                         goto out_unlock;
1583         }
1584
1585         /*
1586          * Search for incompatible vmas.
1587          */
1588         found = false;
1589         cur = vma;
1590         do {
1591                 cond_resched();
1592
1593                 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1594                        !!(cur->vm_flags & __VM_UFFD_FLAGS));
1595
1596                 /*
1597                  * Check for incompatible vmas. Not strictly required
1598                  * here, as incompatible vmas cannot have a
1599                  * userfaultfd_ctx registered on them, but this
1600                  * provides stricter behavior so that unregistration
1601                  * errors are noticed.
1602                  */
1603                 if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
1604                         goto out_unlock;
1605
1606                 found = true;
1607         } for_each_vma_range(vmi, cur, end);
1608         BUG_ON(!found);
1609
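        /*
         * The range is valid: walk it again and strip the userfaultfd
         * context and uffd flags from every registered vma.
         */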
1610         vma_iter_set(&vmi, start);
1611         prev = vma_prev(&vmi);
1612         if (vma->vm_start < start)
1613                 prev = vma;
1614
1615         ret = 0;
1616         for_each_vma_range(vmi, vma, end) {
1617                 cond_resched();
1618
1619                 BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
1620
1621                 /*
1622                  * Nothing to do: this vma is not registered with any
1623                  * userfaultfd, so there is nothing to unregister here.
1624                  */
1625                 if (!vma->vm_userfaultfd_ctx.ctx)
1626                         goto skip;
1627
1628                 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1629
1630                 if (vma->vm_start > start)
1631                         start = vma->vm_start;
1632                 vma_end = min(end, vma->vm_end);
1633
1634                 if (userfaultfd_missing(vma)) {
1635                         /*
1636                          * Wake any concurrent pending userfault while
1637                          * we unregister, so they will not hang
1638                          * permanently and userland does not need to call
1639                          * UFFDIO_WAKE explicitly.
1640                          */
1641                         struct userfaultfd_wake_range range;
1642                         range.start = start;
1643                         range.len = vma_end - start;
1644                         wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
1645                 }
1646
1647                 /* Reset ptes for the whole vma range if wr-protected */
1648                 if (userfaultfd_wp(vma))
1649                         uffd_wp_range(vma, start, vma_end - start, false);
1650
1651                 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
1652                 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
1653                                             new_flags, NULL_VM_UFFD_CTX);
1654                 if (IS_ERR(vma)) {
1655                         ret = PTR_ERR(vma);
1656                         break;
1657                 }
1658
1659                 /*
1660                  * In the vma_merge() successful mprotect-like case 8:
1661                  * the next vma was merged into the current one and
1662                  * the current one has not been updated yet.
1663                  */
1664                 vma_start_write(vma);
1665                 userfaultfd_set_vm_flags(vma, new_flags);
1666                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1667
1668         skip:
1669                 prev = vma;
1670                 start = vma->vm_end;
1671         }
1672
1673 out_unlock:
1674         mmap_write_unlock(mm);
1675         mmput(mm);
1676 out:
1677         return ret;
1678 }
1679
1680 /*
1681  * userfaultfd_wake may be used in combination with the
1682  * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1683  */
1684 static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1685                             unsigned long arg)
1686 {
1687         int ret;
1688         struct uffdio_range uffdio_wake;
1689         struct userfaultfd_wake_range range;
1690         const void __user *buf = (void __user *)arg;
1691
1692         ret = -EFAULT;
1693         if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1694                 goto out;
1695
1696         ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1697         if (ret)
1698                 goto out;
1699
1700         range.start = uffdio_wake.start;
1701         range.len = uffdio_wake.len;
1702
1703         /*
1704          * len == 0 means wake all and we don't want to wake all here,
1705          * so check it again to be sure.
1706          */
1707         VM_BUG_ON(!range.len);
1708
1709         wake_userfault(ctx, &range);
1710         ret = 0;
1711
1712 out:
1713         return ret;
1714 }
1715
1716 static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1717                             unsigned long arg)
1718 {
1719         __s64 ret;
1720         struct uffdio_copy uffdio_copy;
1721         struct uffdio_copy __user *user_uffdio_copy;
1722         struct userfaultfd_wake_range range;
1723         uffd_flags_t flags = 0;
1724
1725         user_uffdio_copy = (struct uffdio_copy __user *) arg;
1726
1727         ret = -EAGAIN;
1728         if (atomic_read(&ctx->mmap_changing))
1729                 goto out;
1730
1731         ret = -EFAULT;
1732         if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1733                            /* don't copy the "copy" output field */
1734                            sizeof(uffdio_copy)-sizeof(__s64)))
1735                 goto out;
1736
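        /*
         * Only the destination needs to be page aligned; the source of
         * the copy may be at any alignment.
         */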
1737         ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
1738                                        uffdio_copy.len);
1739         if (ret)
1740                 goto out;
1741         ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1742         if (ret)
1743                 goto out;
1744
1745         ret = -EINVAL;
1746         if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1747                 goto out;
1748         if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1749                 flags |= MFILL_ATOMIC_WP;
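        /*
         * The owning process may have exited already: mmget_not_zero()
         * fails once mm_users has dropped to zero, and -ESRCH is
         * reported in that case.
         */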
1750         if (mmget_not_zero(ctx->mm)) {
1751                 ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
1752                                         uffdio_copy.len, &ctx->mmap_changing,
1753                                         flags);
1754                 mmput(ctx->mm);
1755         } else {
1756                 return -ESRCH;
1757         }
1758         if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1759                 return -EFAULT;
1760         if (ret < 0)
1761                 goto out;
1762         BUG_ON(!ret);
1763         /* len == 0 would wake all */
1764         range.len = ret;
1765         if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1766                 range.start = uffdio_copy.dst;
1767                 wake_userfault(ctx, &range);
1768         }
1769         ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1770 out:
1771         return ret;
1772 }
1773
1774 static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1775                                 unsigned long arg)
1776 {
1777         __s64 ret;
1778         struct uffdio_zeropage uffdio_zeropage;
1779         struct uffdio_zeropage __user *user_uffdio_zeropage;
1780         struct userfaultfd_wake_range range;
1781
1782         user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1783
1784         ret = -EAGAIN;
1785         if (atomic_read(&ctx->mmap_changing))
1786                 goto out;
1787
1788         ret = -EFAULT;
1789         if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1790                            /* don't copy the "zeropage" output field */
1791                            sizeof(uffdio_zeropage)-sizeof(__s64)))
1792                 goto out;
1793
1794         ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1795                              uffdio_zeropage.range.len);
1796         if (ret)
1797                 goto out;
1798         ret = -EINVAL;
1799         if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1800                 goto out;
1801
1802         if (mmget_not_zero(ctx->mm)) {
1803                 ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
1804                                            uffdio_zeropage.range.len,
1805                                            &ctx->mmap_changing);
1806                 mmput(ctx->mm);
1807         } else {
1808                 return -ESRCH;
1809         }
1810         if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1811                 return -EFAULT;
1812         if (ret < 0)
1813                 goto out;
1814         /* len == 0 would wake all */
1815         BUG_ON(!ret);
1816         range.len = ret;
1817         if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1818                 range.start = uffdio_zeropage.range.start;
1819                 wake_userfault(ctx, &range);
1820         }
1821         ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1822 out:
1823         return ret;
1824 }
1825
1826 static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1827                                     unsigned long arg)
1828 {
1829         int ret;
1830         struct uffdio_writeprotect uffdio_wp;
1831         struct uffdio_writeprotect __user *user_uffdio_wp;
1832         struct userfaultfd_wake_range range;
1833         bool mode_wp, mode_dontwake;
1834
1835         if (atomic_read(&ctx->mmap_changing))
1836                 return -EAGAIN;
1837
1838         user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1839
1840         if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1841                            sizeof(struct uffdio_writeprotect)))
1842                 return -EFAULT;
1843
1844         ret = validate_range(ctx->mm, uffdio_wp.range.start,
1845                              uffdio_wp.range.len);
1846         if (ret)
1847                 return ret;
1848
1849         if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1850                                UFFDIO_WRITEPROTECT_MODE_WP))
1851                 return -EINVAL;
1852
1853         mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1854         mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1855
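        /*
         * DONTWAKE only makes sense when resolving faults (i.e. when
         * removing write protection), so it cannot be combined with WP.
         */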
1856         if (mode_wp && mode_dontwake)
1857                 return -EINVAL;
1858
1859         if (mmget_not_zero(ctx->mm)) {
1860                 ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
1861                                           uffdio_wp.range.len, mode_wp,
1862                                           &ctx->mmap_changing);
1863                 mmput(ctx->mm);
1864         } else {
1865                 return -ESRCH;
1866         }
1867
1868         if (ret)
1869                 return ret;
1870
1871         if (!mode_wp && !mode_dontwake) {
1872                 range.start = uffdio_wp.range.start;
1873                 range.len = uffdio_wp.range.len;
1874                 wake_userfault(ctx, &range);
1875         }
1876         return ret;
1877 }
1878
1879 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1880 {
1881         __s64 ret;
1882         struct uffdio_continue uffdio_continue;
1883         struct uffdio_continue __user *user_uffdio_continue;
1884         struct userfaultfd_wake_range range;
1885         uffd_flags_t flags = 0;
1886
1887         user_uffdio_continue = (struct uffdio_continue __user *)arg;
1888
1889         ret = -EAGAIN;
1890         if (atomic_read(&ctx->mmap_changing))
1891                 goto out;
1892
1893         ret = -EFAULT;
1894         if (copy_from_user(&uffdio_continue, user_uffdio_continue,
1895                            /* don't copy the output fields */
1896                            sizeof(uffdio_continue) - (sizeof(__s64))))
1897                 goto out;
1898
1899         ret = validate_range(ctx->mm, uffdio_continue.range.start,
1900                              uffdio_continue.range.len);
1901         if (ret)
1902                 goto out;
1903
1904         ret = -EINVAL;
1905         if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1906                                      UFFDIO_CONTINUE_MODE_WP))
1907                 goto out;
1908         if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1909                 flags |= MFILL_ATOMIC_WP;
1910
1911         if (mmget_not_zero(ctx->mm)) {
1912                 ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
1913                                             uffdio_continue.range.len,
1914                                             &ctx->mmap_changing, flags);
1915                 mmput(ctx->mm);
1916         } else {
1917                 return -ESRCH;
1918         }
1919
1920         if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1921                 return -EFAULT;
1922         if (ret < 0)
1923                 goto out;
1924
1925         /* len == 0 would wake all */
1926         BUG_ON(!ret);
1927         range.len = ret;
1928         if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1929                 range.start = uffdio_continue.range.start;
1930                 wake_userfault(ctx, &range);
1931         }
1932         ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1933
1934 out:
1935         return ret;
1936 }
1937
1938 static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
1939 {
1940         __s64 ret;
1941         struct uffdio_poison uffdio_poison;
1942         struct uffdio_poison __user *user_uffdio_poison;
1943         struct userfaultfd_wake_range range;
1944
1945         user_uffdio_poison = (struct uffdio_poison __user *)arg;
1946
1947         ret = -EAGAIN;
1948         if (atomic_read(&ctx->mmap_changing))
1949                 goto out;
1950
1951         ret = -EFAULT;
1952         if (copy_from_user(&uffdio_poison, user_uffdio_poison,
1953                            /* don't copy the output fields */
1954                            sizeof(uffdio_poison) - (sizeof(__s64))))
1955                 goto out;
1956
1957         ret = validate_range(ctx->mm, uffdio_poison.range.start,
1958                              uffdio_poison.range.len);
1959         if (ret)
1960                 goto out;
1961
1962         ret = -EINVAL;
1963         if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
1964                 goto out;
1965
1966         if (mmget_not_zero(ctx->mm)) {
1967                 ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
1968                                           uffdio_poison.range.len,
1969                                           &ctx->mmap_changing, 0);
1970                 mmput(ctx->mm);
1971         } else {
1972                 return -ESRCH;
1973         }
1974
1975         if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
1976                 return -EFAULT;
1977         if (ret < 0)
1978                 goto out;
1979
1980         /* len == 0 would wake all */
1981         BUG_ON(!ret);
1982         range.len = ret;
1983         if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
1984                 range.start = uffdio_poison.range.start;
1985                 wake_userfault(ctx, &range);
1986         }
1987         ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
1988
1989 out:
1990         return ret;
1991 }
1992
1993 bool userfaultfd_wp_async(struct vm_area_struct *vma)
1994 {
1995         return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
1996 }
1997
1998 static inline unsigned int uffd_ctx_features(__u64 user_features)
1999 {
2000         /*
2001          * For the current set of features the bits just coincide. Set
2002          * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
2003          */
2004         return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
2005 }
2006
2007 static int userfaultfd_move(struct userfaultfd_ctx *ctx,
2008                             unsigned long arg)
2009 {
2010         __s64 ret;
2011         struct uffdio_move uffdio_move;
2012         struct uffdio_move __user *user_uffdio_move;
2013         struct userfaultfd_wake_range range;
2014         struct mm_struct *mm = ctx->mm;
2015
2016         user_uffdio_move = (struct uffdio_move __user *) arg;
2017
2018         if (atomic_read(&ctx->mmap_changing))
2019                 return -EAGAIN;
2020
2021         if (copy_from_user(&uffdio_move, user_uffdio_move,
2022                            /* don't copy the "move" output field */
2023                            sizeof(uffdio_move)-sizeof(__s64)))
2024                 return -EFAULT;
2025
2026         /* Do not allow cross-mm moves. */
2027         if (mm != current->mm)
2028                 return -EINVAL;
2029
2030         ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
2031         if (ret)
2032                 return ret;
2033
2034         ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
2035         if (ret)
2036                 return ret;
2037
2038         if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
2039                                   UFFDIO_MOVE_MODE_DONTWAKE))
2040                 return -EINVAL;
2041
2042         if (mmget_not_zero(mm)) {
2043                 mmap_read_lock(mm);
2044
2045                 /* Re-check after taking mmap_lock */
2046                 if (likely(!atomic_read(&ctx->mmap_changing)))
2047                         ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
2048                                          uffdio_move.len, uffdio_move.mode);
2049                 else
2050                         ret = -EINVAL;
2051
2052                 mmap_read_unlock(mm);
2053                 mmput(mm);
2054         } else {
2055                 return -ESRCH;
2056         }
2057
2058         if (unlikely(put_user(ret, &user_uffdio_move->move)))
2059                 return -EFAULT;
2060         if (ret < 0)
2061                 goto out;
2062
2063         /* len == 0 would wake all */
2064         VM_WARN_ON(!ret);
2065         range.len = ret;
2066         if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
2067                 range.start = uffdio_move.dst;
2068                 wake_userfault(ctx, &range);
2069         }
2070         ret = range.len == uffdio_move.len ? 0 : -EAGAIN;
2071
2072 out:
2073         return ret;
2074 }
2075
2076 /*
2077  * userland asks for a certain API version and we return which bits
2078  * and ioctl commands are implemented in this kernel for that API
2079  * version, or -EINVAL if the version is unknown.
2080  */
2081 static int userfaultfd_api(struct userfaultfd_ctx *ctx,
2082                            unsigned long arg)
2083 {
2084         struct uffdio_api uffdio_api;
2085         void __user *buf = (void __user *)arg;
2086         unsigned int ctx_features;
2087         int ret;
2088         __u64 features;
2089
2090         ret = -EFAULT;
2091         if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
2092                 goto out;
2093         features = uffdio_api.features;
2094         ret = -EINVAL;
2095         if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
2096                 goto err_out;
2097         ret = -EPERM;
2098         if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
2099                 goto err_out;
2100
2101         /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
2102         if (features & UFFD_FEATURE_WP_ASYNC)
2103                 features |= UFFD_FEATURE_WP_UNPOPULATED;
2104
2105         /* report all available features and ioctls to userland */
2106         uffdio_api.features = UFFD_API_FEATURES;
2107 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
2108         uffdio_api.features &=
2109                 ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
2110 #endif
2111 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
2112         uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
2113 #endif
2114 #ifndef CONFIG_PTE_MARKER_UFFD_WP
2115         uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
2116         uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
2117         uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
2118 #endif
2119         uffdio_api.ioctls = UFFD_API_IOCTLS;
2120         ret = -EFAULT;
2121         if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2122                 goto out;
2123
2124         /* only enable the requested features for this uffd context */
2125         ctx_features = uffd_ctx_features(features);
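        /*
         * The API handshake can only succeed once per context: the
         * cmpxchg() below fails if an earlier or concurrent UFFDIO_API
         * already set the features.
         */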
2126         ret = -EINVAL;
2127         if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
2128                 goto err_out;
2129
2130         ret = 0;
2131 out:
2132         return ret;
2133 err_out:
2134         memset(&uffdio_api, 0, sizeof(uffdio_api));
2135         if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2136                 ret = -EFAULT;
2137         goto out;
2138 }
2139
2140 static long userfaultfd_ioctl(struct file *file, unsigned cmd,
2141                               unsigned long arg)
2142 {
2143         int ret = -EINVAL;
2144         struct userfaultfd_ctx *ctx = file->private_data;
2145
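        /*
         * All ioctls other than UFFDIO_API require a completed
         * UFFDIO_API handshake first.
         */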
2146         if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
2147                 return -EINVAL;
2148
2149         switch(cmd) {
2150         case UFFDIO_API:
2151                 ret = userfaultfd_api(ctx, arg);
2152                 break;
2153         case UFFDIO_REGISTER:
2154                 ret = userfaultfd_register(ctx, arg);
2155                 break;
2156         case UFFDIO_UNREGISTER:
2157                 ret = userfaultfd_unregister(ctx, arg);
2158                 break;
2159         case UFFDIO_WAKE:
2160                 ret = userfaultfd_wake(ctx, arg);
2161                 break;
2162         case UFFDIO_COPY:
2163                 ret = userfaultfd_copy(ctx, arg);
2164                 break;
2165         case UFFDIO_ZEROPAGE:
2166                 ret = userfaultfd_zeropage(ctx, arg);
2167                 break;
2168         case UFFDIO_MOVE:
2169                 ret = userfaultfd_move(ctx, arg);
2170                 break;
2171         case UFFDIO_WRITEPROTECT:
2172                 ret = userfaultfd_writeprotect(ctx, arg);
2173                 break;
2174         case UFFDIO_CONTINUE:
2175                 ret = userfaultfd_continue(ctx, arg);
2176                 break;
2177         case UFFDIO_POISON:
2178                 ret = userfaultfd_poison(ctx, arg);
2179                 break;
2180         }
2181         return ret;
2182 }
2183
2184 #ifdef CONFIG_PROC_FS
2185 static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
2186 {
2187         struct userfaultfd_ctx *ctx = f->private_data;
2188         wait_queue_entry_t *wq;
2189         unsigned long pending = 0, total = 0;
2190
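        /* Count the pending (not yet read) and total queued userfaults. */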
2191         spin_lock_irq(&ctx->fault_pending_wqh.lock);
2192         list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
2193                 pending++;
2194                 total++;
2195         }
2196         list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
2197                 total++;
2198         }
2199         spin_unlock_irq(&ctx->fault_pending_wqh.lock);
2200
2201         /*
2202          * If more protocols are added, they will all be shown
2203          * separated by a space, like this:
2204          *      protocols: aa:... bb:...
2205          */
2206         seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
2207                    pending, total, UFFD_API, ctx->features,
2208                    UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
2209 }
2210 #endif
2211
2212 static const struct file_operations userfaultfd_fops = {
2213 #ifdef CONFIG_PROC_FS
2214         .show_fdinfo    = userfaultfd_show_fdinfo,
2215 #endif
2216         .release        = userfaultfd_release,
2217         .poll           = userfaultfd_poll,
2218         .read           = userfaultfd_read,
2219         .unlocked_ioctl = userfaultfd_ioctl,
2220         .compat_ioctl   = compat_ptr_ioctl,
2221         .llseek         = noop_llseek,
2222 };
2223
2224 static void init_once_userfaultfd_ctx(void *mem)
2225 {
2226         struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
2227
2228         init_waitqueue_head(&ctx->fault_pending_wqh);
2229         init_waitqueue_head(&ctx->fault_wqh);
2230         init_waitqueue_head(&ctx->event_wqh);
2231         init_waitqueue_head(&ctx->fd_wqh);
2232         seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
2233 }
2234
2235 static int new_userfaultfd(int flags)
2236 {
2237         struct userfaultfd_ctx *ctx;
2238         int fd;
2239
2240         BUG_ON(!current->mm);
2241
2242         /* Check the UFFD_* constants for consistency.  */
2243         BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
2244         BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
2245         BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
2246
2247         if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
2248                 return -EINVAL;
2249
2250         ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
2251         if (!ctx)
2252                 return -ENOMEM;
2253
2254         refcount_set(&ctx->refcount, 1);
2255         ctx->flags = flags;
2256         ctx->features = 0;
2257         ctx->released = false;
2258         atomic_set(&ctx->mmap_changing, 0);
2259         ctx->mm = current->mm;
2260         /* prevent the mm struct from being freed */
2261         mmgrab(ctx->mm);
2262
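        /*
         * Create the "[userfaultfd]" file descriptor; on failure undo
         * the mmgrab() and free the context.
         */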
2263         fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
2264                         O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
2265         if (fd < 0) {
2266                 mmdrop(ctx->mm);
2267                 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
2268         }
2269         return fd;
2270 }
2271
2272 static inline bool userfaultfd_syscall_allowed(int flags)
2273 {
2274         /* Userspace-only page faults are always allowed */
2275         if (flags & UFFD_USER_MODE_ONLY)
2276                 return true;
2277
2278         /*
2279          * The user is requesting a userfaultfd which can handle kernel faults.
2280          * Privileged users are always allowed to do this.
2281          */
2282         if (capable(CAP_SYS_PTRACE))
2283                 return true;
2284
2285         /* Otherwise, access to kernel fault handling is sysctl controlled. */
2286         return sysctl_unprivileged_userfaultfd;
2287 }
2288
2289 SYSCALL_DEFINE1(userfaultfd, int, flags)
2290 {
2291         if (!userfaultfd_syscall_allowed(flags))
2292                 return -EPERM;
2293
2294         return new_userfaultfd(flags);
2295 }
2296
2297 static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
2298 {
2299         if (cmd != USERFAULTFD_IOC_NEW)
2300                 return -EINVAL;
2301
2302         return new_userfaultfd(flags);
2303 }
2304
2305 static const struct file_operations userfaultfd_dev_fops = {
2306         .unlocked_ioctl = userfaultfd_dev_ioctl,
2307         .compat_ioctl = userfaultfd_dev_ioctl,
2308         .owner = THIS_MODULE,
2309         .llseek = noop_llseek,
2310 };
2311
2312 static struct miscdevice userfaultfd_misc = {
2313         .minor = MISC_DYNAMIC_MINOR,
2314         .name = "userfaultfd",
2315         .fops = &userfaultfd_dev_fops
2316 };
2317
2318 static int __init userfaultfd_init(void)
2319 {
2320         int ret;
2321
2322         ret = misc_register(&userfaultfd_misc);
2323         if (ret)
2324                 return ret;
2325
2326         userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
2327                                                 sizeof(struct userfaultfd_ctx),
2328                                                 0,
2329                                                 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2330                                                 init_once_userfaultfd_ctx);
2331 #ifdef CONFIG_SYSCTL
2332         register_sysctl_init("vm", vm_userfaultfd_table);
2333 #endif
2334         return 0;
2335 }
2336 __initcall(userfaultfd_init);