1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  fs/userfaultfd.c
4  *
5  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
6  *  Copyright (C) 2008-2009 Red Hat, Inc.
7  *  Copyright (C) 2015  Red Hat, Inc.
8  *
9  *  Some part derived from fs/eventfd.c (anon inode setup) and
10  *  mm/ksm.c (mm hashing).
11  */
12
13 #include <linux/list.h>
14 #include <linux/hashtable.h>
15 #include <linux/sched/signal.h>
16 #include <linux/sched/mm.h>
17 #include <linux/mm.h>
18 #include <linux/mm_inline.h>
19 #include <linux/mmu_notifier.h>
20 #include <linux/poll.h>
21 #include <linux/slab.h>
22 #include <linux/seq_file.h>
23 #include <linux/file.h>
24 #include <linux/bug.h>
25 #include <linux/anon_inodes.h>
26 #include <linux/syscalls.h>
27 #include <linux/userfaultfd_k.h>
28 #include <linux/mempolicy.h>
29 #include <linux/ioctl.h>
30 #include <linux/security.h>
31 #include <linux/hugetlb.h>
32 #include <linux/swapops.h>
33 #include <linux/miscdevice.h>
34
35 static int sysctl_unprivileged_userfaultfd __read_mostly;
36
37 #ifdef CONFIG_SYSCTL
38 static struct ctl_table vm_userfaultfd_table[] = {
39         {
40                 .procname       = "unprivileged_userfaultfd",
41                 .data           = &sysctl_unprivileged_userfaultfd,
42                 .maxlen         = sizeof(sysctl_unprivileged_userfaultfd),
43                 .mode           = 0644,
44                 .proc_handler   = proc_dointvec_minmax,
45                 .extra1         = SYSCTL_ZERO,
46                 .extra2         = SYSCTL_ONE,
47         },
48 };
49 #endif
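
/*
 * A minimal usage sketch, assuming vm_userfaultfd_table ends up registered
 * under the "vm" sysctl directory, so the knob above is visible as
 * /proc/sys/vm/unprivileged_userfaultfd and can be toggled with e.g.:
 *
 *	sysctl -w vm.unprivileged_userfaultfd=1
 */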
50
51 static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
52
53 /*
54  * Start with fault_pending_wqh and fault_wqh so they're more likely
55  * to be in the same cacheline.
56  *
57  * Locking order:
58  *      fd_wqh.lock
59  *              fault_pending_wqh.lock
60  *                      fault_wqh.lock
61  *              event_wqh.lock
62  *
63  * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
64  * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
65  * also taken in IRQ context.
66  */
67 struct userfaultfd_ctx {
68         /* waitqueue head for the pending (i.e. not read) userfaults */
69         wait_queue_head_t fault_pending_wqh;
70         /* waitqueue head for the userfaults */
71         wait_queue_head_t fault_wqh;
72         /* waitqueue head for the pseudo fd to wakeup poll/read */
73         wait_queue_head_t fd_wqh;
74         /* waitqueue head for events */
75         wait_queue_head_t event_wqh;
76         /* a refile sequence protected by fault_pending_wqh lock */
77         seqcount_spinlock_t refile_seq;
78         /* pseudo fd refcounting */
79         refcount_t refcount;
80         /* userfaultfd syscall flags */
81         unsigned int flags;
82         /* features requested from the userspace */
83         unsigned int features;
84         /* released */
85         bool released;
86         /* memory mappings are changing because of non-cooperative event */
87         atomic_t mmap_changing;
88         /* mm with one ore more vmas attached to this userfaultfd_ctx */
89         struct mm_struct *mm;
90 };
91
92 struct userfaultfd_fork_ctx {
93         struct userfaultfd_ctx *orig;
94         struct userfaultfd_ctx *new;
95         struct list_head list;
96 };
97
98 struct userfaultfd_unmap_ctx {
99         struct userfaultfd_ctx *ctx;
100         unsigned long start;
101         unsigned long end;
102         struct list_head list;
103 };
104
105 struct userfaultfd_wait_queue {
106         struct uffd_msg msg;
107         wait_queue_entry_t wq;
108         struct userfaultfd_ctx *ctx;
109         bool waken;
110 };
111
112 struct userfaultfd_wake_range {
113         unsigned long start;
114         unsigned long len;
115 };
116
117 /* internal indication that UFFD_API ioctl was successfully executed */
118 #define UFFD_FEATURE_INITIALIZED                (1u << 31)
119
120 static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
121 {
122         return ctx->features & UFFD_FEATURE_INITIALIZED;
123 }
124
125 static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
126 {
127         return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
128 }
129
130 /*
131  * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
132  * meaningful when userfaultfd_wp()==true on the vma and when it's
133  * anonymous.
134  */
135 bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
136 {
137         struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
138
139         if (!ctx)
140                 return false;
141
142         return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
143 }
144
145 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
146                                      vm_flags_t flags)
147 {
148         const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
149
150         vm_flags_reset(vma, flags);
151         /*
152          * For shared mappings, we want to enable writenotify while
153          * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
154          * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
155          */
156         if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
157                 vma_set_page_prot(vma);
158 }
159
160 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
161                                      int wake_flags, void *key)
162 {
163         struct userfaultfd_wake_range *range = key;
164         int ret;
165         struct userfaultfd_wait_queue *uwq;
166         unsigned long start, len;
167
168         uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
169         ret = 0;
170         /* len == 0 means wake all */
171         start = range->start;
172         len = range->len;
173         if (len && (start > uwq->msg.arg.pagefault.address ||
174                     start + len <= uwq->msg.arg.pagefault.address))
175                 goto out;
176         WRITE_ONCE(uwq->waken, true);
177         /*
178          * The Program-Order guarantees provided by the scheduler
179          * ensure uwq->waken is visible before the task is woken.
180          */
181         ret = wake_up_state(wq->private, mode);
182         if (ret) {
183                 /*
184                  * Wake only once, autoremove behavior.
185                  *
186                  * After the effect of list_del_init is visible to the other
187                  * CPUs, the waitqueue may disappear from under us, see the
188                  * !list_empty_careful() in handle_userfault().
189                  *
190                  * try_to_wake_up() has an implicit smp_mb(), and the
191                  * wq->private is read before calling the extern function
192                  * "wake_up_state" (which in turn calls try_to_wake_up).
193                  */
194                 list_del_init(&wq->entry);
195         }
196 out:
197         return ret;
198 }
199
200 /**
201  * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
202  * context.
203  * @ctx: [in] Pointer to the userfaultfd context.
204  */
205 static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
206 {
207         refcount_inc(&ctx->refcount);
208 }
209
210 /**
211  * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
212  * context.
213  * @ctx: [in] Pointer to userfaultfd context.
214  *
215  * The userfaultfd context reference must have been previously acquired either
216  * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
217  */
218 static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
219 {
220         if (refcount_dec_and_test(&ctx->refcount)) {
221                 VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
222                 VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
223                 VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
224                 VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
225                 VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
226                 VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
227                 VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
228                 VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
229                 mmdrop(ctx->mm);
230                 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
231         }
232 }
233
234 static inline void msg_init(struct uffd_msg *msg)
235 {
236         BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
237         /*
238          * Must use memset to zero out the padding or kernel data is
239          * leaked to userland.
240          */
241         memset(msg, 0, sizeof(struct uffd_msg));
242 }
243
244 static inline struct uffd_msg userfault_msg(unsigned long address,
245                                             unsigned long real_address,
246                                             unsigned int flags,
247                                             unsigned long reason,
248                                             unsigned int features)
249 {
250         struct uffd_msg msg;
251
252         msg_init(&msg);
253         msg.event = UFFD_EVENT_PAGEFAULT;
254
255         msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
256                                     real_address : address;
257
258         /*
259          * These flags indicate why the userfault occurred:
260          * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
261          * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
262          * - Neither of these flags being set indicates a MISSING fault.
263          *
264          * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
265          * fault. Otherwise, it was a read fault.
266          */
267         if (flags & FAULT_FLAG_WRITE)
268                 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
269         if (reason & VM_UFFD_WP)
270                 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
271         if (reason & VM_UFFD_MINOR)
272                 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
273         if (features & UFFD_FEATURE_THREAD_ID)
274                 msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
275         return msg;
276 }
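
/*
 * A minimal userspace sketch (assuming a struct uffd_msg 'msg' was just
 * read() from the userfaultfd) of how the flags filled in above are
 * consumed; when neither WP nor MINOR is set the fault is MISSING:
 *
 *	if (msg.event == UFFD_EVENT_PAGEFAULT) {
 *		__u64 flags = msg.arg.pagefault.flags;
 *		bool wr    = flags & UFFD_PAGEFAULT_FLAG_WRITE;
 *		bool wp    = flags & UFFD_PAGEFAULT_FLAG_WP;
 *		bool minor = flags & UFFD_PAGEFAULT_FLAG_MINOR;
 *		resolve_fault(msg.arg.pagefault.address, wr, wp, minor);
 *	}
 *
 * resolve_fault() above is a hypothetical application helper.
 */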
277
278 #ifdef CONFIG_HUGETLB_PAGE
279 /*
280  * Same functionality as userfaultfd_must_wait below with modifications for
281  * hugepmd ranges.
282  */
283 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
284                                               struct vm_fault *vmf,
285                                               unsigned long reason)
286 {
287         struct vm_area_struct *vma = vmf->vma;
288         pte_t *ptep, pte;
289         bool ret = true;
290
291         assert_fault_locked(vmf);
292
293         ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
294         if (!ptep)
295                 goto out;
296
297         ret = false;
298         pte = huge_ptep_get(ptep);
299
300         /*
301          * Lockless access: we're in a wait_event so it's ok if it
302          * changes under us.  PTE markers should be handled the same as none
303          * ptes here.
304          */
305         if (huge_pte_none_mostly(pte))
306                 ret = true;
307         if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
308                 ret = true;
309 out:
310         return ret;
311 }
312 #else
313 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
314                                               struct vm_fault *vmf,
315                                               unsigned long reason)
316 {
317         return false;   /* should never get here */
318 }
319 #endif /* CONFIG_HUGETLB_PAGE */
320
321 /*
322  * Verify the pagetables are still not ok after having registered into
323  * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
324  * userfault that has already been resolved, if userfaultfd_read and
325  * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
326  * threads.
327  */
328 static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
329                                          struct vm_fault *vmf,
330                                          unsigned long reason)
331 {
332         struct mm_struct *mm = ctx->mm;
333         unsigned long address = vmf->address;
334         pgd_t *pgd;
335         p4d_t *p4d;
336         pud_t *pud;
337         pmd_t *pmd, _pmd;
338         pte_t *pte;
339         pte_t ptent;
340         bool ret = true;
341
342         assert_fault_locked(vmf);
343
344         pgd = pgd_offset(mm, address);
345         if (!pgd_present(*pgd))
346                 goto out;
347         p4d = p4d_offset(pgd, address);
348         if (!p4d_present(*p4d))
349                 goto out;
350         pud = pud_offset(p4d, address);
351         if (!pud_present(*pud))
352                 goto out;
353         pmd = pmd_offset(pud, address);
354 again:
355         _pmd = pmdp_get_lockless(pmd);
356         if (pmd_none(_pmd))
357                 goto out;
358
359         ret = false;
360         if (!pmd_present(_pmd) || pmd_devmap(_pmd))
361                 goto out;
362
363         if (pmd_trans_huge(_pmd)) {
364                 if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
365                         ret = true;
366                 goto out;
367         }
368
369         pte = pte_offset_map(pmd, address);
370         if (!pte) {
371                 ret = true;
372                 goto again;
373         }
374         /*
375          * Lockless access: we're in a wait_event so it's ok if it
376          * changes under us.  PTE markers should be handled the same as none
377          * ptes here.
378          */
379         ptent = ptep_get(pte);
380         if (pte_none_mostly(ptent))
381                 ret = true;
382         if (!pte_write(ptent) && (reason & VM_UFFD_WP))
383                 ret = true;
384         pte_unmap(pte);
385
386 out:
387         return ret;
388 }
389
390 static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
391 {
392         if (flags & FAULT_FLAG_INTERRUPTIBLE)
393                 return TASK_INTERRUPTIBLE;
394
395         if (flags & FAULT_FLAG_KILLABLE)
396                 return TASK_KILLABLE;
397
398         return TASK_UNINTERRUPTIBLE;
399 }
400
401 /*
402  * The locking rules involved in returning VM_FAULT_RETRY depending on
403  * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
404  * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
405  * recommendation in __lock_page_or_retry is not an understatement.
406  *
407  * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
408  * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
409  * not set.
410  *
411  * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
412  * set, VM_FAULT_RETRY can still be returned if and only if there are
413  * fatal_signal_pending()s, and the mmap_lock must be released before
414  * returning it.
415  */
416 vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
417 {
418         struct vm_area_struct *vma = vmf->vma;
419         struct mm_struct *mm = vma->vm_mm;
420         struct userfaultfd_ctx *ctx;
421         struct userfaultfd_wait_queue uwq;
422         vm_fault_t ret = VM_FAULT_SIGBUS;
423         bool must_wait;
424         unsigned int blocking_state;
425
426         /*
427          * We don't do userfault handling for the final child pid update.
428          *
429          * We also don't do userfault handling during
430          * coredumping. hugetlbfs has the special
431          * hugetlb_follow_page_mask() to skip missing pages in the
432          * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
433          * the no_page_table() helper in follow_page_mask(), but the
434          * shmem_vm_ops->fault method is invoked even during
435          * coredumping and it ends up here.
436          */
437         if (current->flags & (PF_EXITING|PF_DUMPCORE))
438                 goto out;
439
440         assert_fault_locked(vmf);
441
442         ctx = vma->vm_userfaultfd_ctx.ctx;
443         if (!ctx)
444                 goto out;
445
446         BUG_ON(ctx->mm != mm);
447
448         /* Any unrecognized flag is a bug. */
449         VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
450         /* 0 or > 1 flags set is a bug; we expect exactly 1. */
451         VM_BUG_ON(!reason || (reason & (reason - 1)));
452
453         if (ctx->features & UFFD_FEATURE_SIGBUS)
454                 goto out;
455         if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
456                 goto out;
457
458         /*
459          * If it's already released don't get it. This avoids looping
460          * in __get_user_pages if userfaultfd_release waits on the
461          * caller of handle_userfault to release the mmap_lock.
462          */
463         if (unlikely(READ_ONCE(ctx->released))) {
464                 /*
465                  * Don't return VM_FAULT_SIGBUS in this case, so a
466                  * non-cooperative manager can close the uffd after the
467                  * last UFFDIO_COPY, without risking triggering an
468                  * involuntary SIGBUS if the process was starting the
469                  * userfaultfd while the userfaultfd was still armed
470                  * (but after the last UFFDIO_COPY). If the uffd
471                  * wasn't already closed when the userfault reached
472                  * this point, that would normally be solved by
473                  * userfaultfd_must_wait returning 'false'.
474                  *
475                  * If we were to return VM_FAULT_SIGBUS here, the
476                  * non-cooperative manager would instead be forced to
477                  * always call UFFDIO_UNREGISTER before it can safely
478                  * close the uffd.
479                  */
480                 ret = VM_FAULT_NOPAGE;
481                 goto out;
482         }
483
484         /*
485          * Check that we can return VM_FAULT_RETRY.
486          *
487          * NOTE: it should become possible to return VM_FAULT_RETRY
488          * even if FAULT_FLAG_TRIED is set without leading to gup()
489          * -EBUSY failures, if the userfaultfd is to be extended for
490          * VM_UFFD_WP tracking and we intend to arm the userfault
491          * without first stopping userland access to the memory. For
492          * VM_UFFD_MISSING userfaults this is enough for now.
493          */
494         if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
495                 /*
496                  * Validate the invariant that nowait must allow retry
497                  * to be sure not to return SIGBUS erroneously on
498                  * nowait invocations.
499                  */
500                 BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
501 #ifdef CONFIG_DEBUG_VM
502                 if (printk_ratelimit()) {
503                         printk(KERN_WARNING
504                                "FAULT_FLAG_ALLOW_RETRY missing %x\n",
505                                vmf->flags);
506                         dump_stack();
507                 }
508 #endif
509                 goto out;
510         }
511
512         /*
513          * Handle nowait, not much to do other than tell it to retry
514          * and wait.
515          */
516         ret = VM_FAULT_RETRY;
517         if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
518                 goto out;
519
520         /* take the reference before dropping the mmap_lock */
521         userfaultfd_ctx_get(ctx);
522
523         init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
524         uwq.wq.private = current;
525         uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
526                                 reason, ctx->features);
527         uwq.ctx = ctx;
528         uwq.waken = false;
529
530         blocking_state = userfaultfd_get_blocking_state(vmf->flags);
531
532         /*
533          * Take the vma lock now, in order to safely call
534          * userfaultfd_huge_must_wait() later. Since acquiring the
535          * (sleepable) vma lock can modify the current task state, that
536          * must be before explicitly calling set_current_state().
537          */
538         if (is_vm_hugetlb_page(vma))
539                 hugetlb_vma_lock_read(vma);
540
541         spin_lock_irq(&ctx->fault_pending_wqh.lock);
542         /*
543          * After the __add_wait_queue the uwq is visible to userland
544          * through poll/read().
545          */
546         __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
547         /*
548          * The smp_mb() after __set_current_state prevents the reads
549          * following the spin_unlock from happening before the list_add in
550          * __add_wait_queue.
551          */
552         set_current_state(blocking_state);
553         spin_unlock_irq(&ctx->fault_pending_wqh.lock);
554
555         if (!is_vm_hugetlb_page(vma))
556                 must_wait = userfaultfd_must_wait(ctx, vmf, reason);
557         else
558                 must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
559         if (is_vm_hugetlb_page(vma))
560                 hugetlb_vma_unlock_read(vma);
561         release_fault_lock(vmf);
562
563         if (likely(must_wait && !READ_ONCE(ctx->released))) {
564                 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
565                 schedule();
566         }
567
568         __set_current_state(TASK_RUNNING);
569
570         /*
571          * Here we race with the list_del; list_add in
572          * userfaultfd_ctx_read(), however because we don't ever run
573          * list_del_init() to refile across the two lists, the prev
574          * and next pointers will never point to self. list_add also
575          * would never let either of the two pointers point to
576          * self. So list_empty_careful won't risk seeing both pointers
577          * pointing to self at any time during the list refile. The
578          * only case where list_del_init() is called is the full
579          * removal in the wake function and there we don't re-list_add
580          * and it's fine not to block on the spinlock. The uwq on this
581          * kernel stack can be released after the list_del_init.
582          */
583         if (!list_empty_careful(&uwq.wq.entry)) {
584                 spin_lock_irq(&ctx->fault_pending_wqh.lock);
585                 /*
586                  * No need of list_del_init(), the uwq on the stack
587                  * will be freed shortly anyway.
588                  */
589                 list_del(&uwq.wq.entry);
590                 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
591         }
592
593         /*
594          * ctx may go away after this if the userfault pseudo fd is
595          * already released.
596          */
597         userfaultfd_ctx_put(ctx);
598
599 out:
600         return ret;
601 }
602
603 static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
604                                               struct userfaultfd_wait_queue *ewq)
605 {
606         struct userfaultfd_ctx *release_new_ctx;
607
608         if (WARN_ON_ONCE(current->flags & PF_EXITING))
609                 goto out;
610
611         ewq->ctx = ctx;
612         init_waitqueue_entry(&ewq->wq, current);
613         release_new_ctx = NULL;
614
615         spin_lock_irq(&ctx->event_wqh.lock);
616         /*
617          * After the __add_wait_queue the uwq is visible to userland
618          * through poll/read().
619          */
620         __add_wait_queue(&ctx->event_wqh, &ewq->wq);
621         for (;;) {
622                 set_current_state(TASK_KILLABLE);
623                 if (ewq->msg.event == 0)
624                         break;
625                 if (READ_ONCE(ctx->released) ||
626                     fatal_signal_pending(current)) {
627                         /*
628                          * &ewq->wq may be queued in fork_event, but
629                          * __remove_wait_queue ignores the head
630                          * parameter. It would be a problem if it
631                          * didn't.
632                          */
633                         __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
634                         if (ewq->msg.event == UFFD_EVENT_FORK) {
635                                 struct userfaultfd_ctx *new;
636
637                                 new = (struct userfaultfd_ctx *)
638                                         (unsigned long)
639                                         ewq->msg.arg.reserved.reserved1;
640                                 release_new_ctx = new;
641                         }
642                         break;
643                 }
644
645                 spin_unlock_irq(&ctx->event_wqh.lock);
646
647                 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
648                 schedule();
649
650                 spin_lock_irq(&ctx->event_wqh.lock);
651         }
652         __set_current_state(TASK_RUNNING);
653         spin_unlock_irq(&ctx->event_wqh.lock);
654
655         if (release_new_ctx) {
656                 struct vm_area_struct *vma;
657                 struct mm_struct *mm = release_new_ctx->mm;
658                 VMA_ITERATOR(vmi, mm, 0);
659
660                 /* the various vma->vm_userfaultfd_ctx still point to it */
661                 mmap_write_lock(mm);
662                 for_each_vma(vmi, vma) {
663                         if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
664                                 vma_start_write(vma);
665                                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
666                                 userfaultfd_set_vm_flags(vma,
667                                                          vma->vm_flags & ~__VM_UFFD_FLAGS);
668                         }
669                 }
670                 mmap_write_unlock(mm);
671
672                 userfaultfd_ctx_put(release_new_ctx);
673         }
674
675         /*
676          * ctx may go away after this if the userfault pseudo fd is
677          * already released.
678          */
679 out:
680         atomic_dec(&ctx->mmap_changing);
681         VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
682         userfaultfd_ctx_put(ctx);
683 }
684
685 static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
686                                        struct userfaultfd_wait_queue *ewq)
687 {
688         ewq->msg.event = 0;
689         wake_up_locked(&ctx->event_wqh);
690         __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
691 }
692
693 int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
694 {
695         struct userfaultfd_ctx *ctx = NULL, *octx;
696         struct userfaultfd_fork_ctx *fctx;
697
698         octx = vma->vm_userfaultfd_ctx.ctx;
699         if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
700                 vma_start_write(vma);
701                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
702                 userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
703                 return 0;
704         }
705
706         list_for_each_entry(fctx, fcs, list)
707                 if (fctx->orig == octx) {
708                         ctx = fctx->new;
709                         break;
710                 }
711
712         if (!ctx) {
713                 fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
714                 if (!fctx)
715                         return -ENOMEM;
716
717                 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
718                 if (!ctx) {
719                         kfree(fctx);
720                         return -ENOMEM;
721                 }
722
723                 refcount_set(&ctx->refcount, 1);
724                 ctx->flags = octx->flags;
725                 ctx->features = octx->features;
726                 ctx->released = false;
727                 atomic_set(&ctx->mmap_changing, 0);
728                 ctx->mm = vma->vm_mm;
729                 mmgrab(ctx->mm);
730
731                 userfaultfd_ctx_get(octx);
732                 atomic_inc(&octx->mmap_changing);
733                 fctx->orig = octx;
734                 fctx->new = ctx;
735                 list_add_tail(&fctx->list, fcs);
736         }
737
738         vma->vm_userfaultfd_ctx.ctx = ctx;
739         return 0;
740 }
741
742 static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
743 {
744         struct userfaultfd_ctx *ctx = fctx->orig;
745         struct userfaultfd_wait_queue ewq;
746
747         msg_init(&ewq.msg);
748
749         ewq.msg.event = UFFD_EVENT_FORK;
750         ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
751
752         userfaultfd_event_wait_completion(ctx, &ewq);
753 }
754
755 void dup_userfaultfd_complete(struct list_head *fcs)
756 {
757         struct userfaultfd_fork_ctx *fctx, *n;
758
759         list_for_each_entry_safe(fctx, n, fcs, list) {
760                 dup_fctx(fctx);
761                 list_del(&fctx->list);
762                 kfree(fctx);
763         }
764 }
765
766 void mremap_userfaultfd_prep(struct vm_area_struct *vma,
767                              struct vm_userfaultfd_ctx *vm_ctx)
768 {
769         struct userfaultfd_ctx *ctx;
770
771         ctx = vma->vm_userfaultfd_ctx.ctx;
772
773         if (!ctx)
774                 return;
775
776         if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
777                 vm_ctx->ctx = ctx;
778                 userfaultfd_ctx_get(ctx);
779                 atomic_inc(&ctx->mmap_changing);
780         } else {
781                 /* Drop uffd context if remap feature not enabled */
782                 vma_start_write(vma);
783                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
784                 userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
785         }
786 }
787
788 void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
789                                  unsigned long from, unsigned long to,
790                                  unsigned long len)
791 {
792         struct userfaultfd_ctx *ctx = vm_ctx->ctx;
793         struct userfaultfd_wait_queue ewq;
794
795         if (!ctx)
796                 return;
797
798         if (to & ~PAGE_MASK) {
799                 userfaultfd_ctx_put(ctx);
800                 return;
801         }
802
803         msg_init(&ewq.msg);
804
805         ewq.msg.event = UFFD_EVENT_REMAP;
806         ewq.msg.arg.remap.from = from;
807         ewq.msg.arg.remap.to = to;
808         ewq.msg.arg.remap.len = len;
809
810         userfaultfd_event_wait_completion(ctx, &ewq);
811 }
812
813 bool userfaultfd_remove(struct vm_area_struct *vma,
814                         unsigned long start, unsigned long end)
815 {
816         struct mm_struct *mm = vma->vm_mm;
817         struct userfaultfd_ctx *ctx;
818         struct userfaultfd_wait_queue ewq;
819
820         ctx = vma->vm_userfaultfd_ctx.ctx;
821         if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
822                 return true;
823
824         userfaultfd_ctx_get(ctx);
825         atomic_inc(&ctx->mmap_changing);
826         mmap_read_unlock(mm);
827
828         msg_init(&ewq.msg);
829
830         ewq.msg.event = UFFD_EVENT_REMOVE;
831         ewq.msg.arg.remove.start = start;
832         ewq.msg.arg.remove.end = end;
833
834         userfaultfd_event_wait_completion(ctx, &ewq);
835
836         return false;
837 }
838
839 static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
840                           unsigned long start, unsigned long end)
841 {
842         struct userfaultfd_unmap_ctx *unmap_ctx;
843
844         list_for_each_entry(unmap_ctx, unmaps, list)
845                 if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
846                     unmap_ctx->end == end)
847                         return true;
848
849         return false;
850 }
851
852 int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
853                            unsigned long end, struct list_head *unmaps)
854 {
855         struct userfaultfd_unmap_ctx *unmap_ctx;
856         struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
857
858         if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
859             has_unmap_ctx(ctx, unmaps, start, end))
860                 return 0;
861
862         unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
863         if (!unmap_ctx)
864                 return -ENOMEM;
865
866         userfaultfd_ctx_get(ctx);
867         atomic_inc(&ctx->mmap_changing);
868         unmap_ctx->ctx = ctx;
869         unmap_ctx->start = start;
870         unmap_ctx->end = end;
871         list_add_tail(&unmap_ctx->list, unmaps);
872
873         return 0;
874 }
875
876 void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
877 {
878         struct userfaultfd_unmap_ctx *ctx, *n;
879         struct userfaultfd_wait_queue ewq;
880
881         list_for_each_entry_safe(ctx, n, uf, list) {
882                 msg_init(&ewq.msg);
883
884                 ewq.msg.event = UFFD_EVENT_UNMAP;
885                 ewq.msg.arg.remove.start = ctx->start;
886                 ewq.msg.arg.remove.end = ctx->end;
887
888                 userfaultfd_event_wait_completion(ctx->ctx, &ewq);
889
890                 list_del(&ctx->list);
891                 kfree(ctx);
892         }
893 }
894
895 static int userfaultfd_release(struct inode *inode, struct file *file)
896 {
897         struct userfaultfd_ctx *ctx = file->private_data;
898         struct mm_struct *mm = ctx->mm;
899         struct vm_area_struct *vma, *prev;
900         /* len == 0 means wake all */
901         struct userfaultfd_wake_range range = { .len = 0, };
902         unsigned long new_flags;
903         VMA_ITERATOR(vmi, mm, 0);
904
905         WRITE_ONCE(ctx->released, true);
906
907         if (!mmget_not_zero(mm))
908                 goto wakeup;
909
910         /*
911          * Flush page faults out of all CPUs. NOTE: all page faults
912          * must be retried without returning VM_FAULT_SIGBUS if
913          * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
914          * changes while handle_userfault released the mmap_lock. So
915          * it's critical that released is set to true (above), before
916          * taking the mmap_lock for writing.
917          */
918         mmap_write_lock(mm);
919         prev = NULL;
920         for_each_vma(vmi, vma) {
921                 cond_resched();
922                 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
923                        !!(vma->vm_flags & __VM_UFFD_FLAGS));
924                 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
925                         prev = vma;
926                         continue;
927                 }
928                 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
929                 vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
930                                             vma->vm_end, new_flags,
931                                             NULL_VM_UFFD_CTX);
932
933                 vma_start_write(vma);
934                 userfaultfd_set_vm_flags(vma, new_flags);
935                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
936
937                 prev = vma;
938         }
939         mmap_write_unlock(mm);
940         mmput(mm);
941 wakeup:
942         /*
943          * After no new page faults can wait on this fault_*wqh, flush
944          * the last page faults that may have been already waiting on
945          * the fault_*wqh.
946          */
947         spin_lock_irq(&ctx->fault_pending_wqh.lock);
948         __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
949         __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
950         spin_unlock_irq(&ctx->fault_pending_wqh.lock);
951
952         /* Flush pending events that may still wait on event_wqh */
953         wake_up_all(&ctx->event_wqh);
954
955         wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
956         userfaultfd_ctx_put(ctx);
957         return 0;
958 }
959
960 /* fault_pending_wqh.lock must be held by the caller */
961 static inline struct userfaultfd_wait_queue *find_userfault_in(
962                 wait_queue_head_t *wqh)
963 {
964         wait_queue_entry_t *wq;
965         struct userfaultfd_wait_queue *uwq;
966
967         lockdep_assert_held(&wqh->lock);
968
969         uwq = NULL;
970         if (!waitqueue_active(wqh))
971                 goto out;
972         /* walk in reverse to provide FIFO behavior to read userfaults */
973         wq = list_last_entry(&wqh->head, typeof(*wq), entry);
974         uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
975 out:
976         return uwq;
977 }
978
979 static inline struct userfaultfd_wait_queue *find_userfault(
980                 struct userfaultfd_ctx *ctx)
981 {
982         return find_userfault_in(&ctx->fault_pending_wqh);
983 }
984
985 static inline struct userfaultfd_wait_queue *find_userfault_evt(
986                 struct userfaultfd_ctx *ctx)
987 {
988         return find_userfault_in(&ctx->event_wqh);
989 }
990
991 static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
992 {
993         struct userfaultfd_ctx *ctx = file->private_data;
994         __poll_t ret;
995
996         poll_wait(file, &ctx->fd_wqh, wait);
997
998         if (!userfaultfd_is_initialized(ctx))
999                 return EPOLLERR;
1000
1001         /*
1002          * poll() never guarantees that read won't block.
1003          * userfaults can be woken before they're read().
1004          */
1005         if (unlikely(!(file->f_flags & O_NONBLOCK)))
1006                 return EPOLLERR;
1007         /*
1008          * Lockless access to see if there are pending faults.
1009          * __pollwait()'s last action is the add_wait_queue but
1010          * the spin_unlock would allow the waitqueue_active to
1011          * pass above the actual list_add inside
1012          * add_wait_queue critical section. So use a full
1013          * memory barrier to serialize the list_add write of
1014          * add_wait_queue() with the waitqueue_active read
1015          * below.
1016          */
1017         ret = 0;
1018         smp_mb();
1019         if (waitqueue_active(&ctx->fault_pending_wqh))
1020                 ret = EPOLLIN;
1021         else if (waitqueue_active(&ctx->event_wqh))
1022                 ret = EPOLLIN;
1023
1024         return ret;
1025 }
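
/*
 * A minimal poll() usage sketch (assuming 'uffd' was created with
 * O_NONBLOCK and UFFDIO_API has already been negotiated, otherwise the
 * checks above return EPOLLERR):
 *
 *	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		read(uffd, &msg, sizeof(msg));
 */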
1026
1027 static const struct file_operations userfaultfd_fops;
1028
1029 static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1030                                   struct inode *inode,
1031                                   struct uffd_msg *msg)
1032 {
1033         int fd;
1034
1035         fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
1036                         O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
1037         if (fd < 0)
1038                 return fd;
1039
1040         msg->arg.reserved.reserved1 = 0;
1041         msg->arg.fork.ufd = fd;
1042         return 0;
1043 }
1044
1045 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1046                                     struct uffd_msg *msg, struct inode *inode)
1047 {
1048         ssize_t ret;
1049         DECLARE_WAITQUEUE(wait, current);
1050         struct userfaultfd_wait_queue *uwq;
1051         /*
1052          * Handling fork event requires sleeping operations, so
1053          * we drop the event_wqh lock, then do these ops, then
1054          * lock it back and wake up the waiter. While the lock is
1055          * dropped the ewq may go away so we keep track of it
1056          * carefully.
1057          */
1058         LIST_HEAD(fork_event);
1059         struct userfaultfd_ctx *fork_nctx = NULL;
1060
1061         /* always take the fd_wqh lock before the fault_pending_wqh lock */
1062         spin_lock_irq(&ctx->fd_wqh.lock);
1063         __add_wait_queue(&ctx->fd_wqh, &wait);
1064         for (;;) {
1065                 set_current_state(TASK_INTERRUPTIBLE);
1066                 spin_lock(&ctx->fault_pending_wqh.lock);
1067                 uwq = find_userfault(ctx);
1068                 if (uwq) {
1069                         /*
1070                          * Use a seqcount to repeat the lockless check
1071                          * in wake_userfault() to avoid missing
1072                          * wakeups because during the refile both
1073                          * waitqueue could become empty if this is the
1074                          * waitqueues could become empty if this is the
1075                          */
1076                         write_seqcount_begin(&ctx->refile_seq);
1077
1078                         /*
1079                          * The fault_pending_wqh.lock prevents the uwq
1080                          * from disappearing from under us.
1081                          *
1082                          * Refile this userfault from
1083                          * fault_pending_wqh to fault_wqh, it's not
1084                          * pending anymore after we read it.
1085                          *
1086                          * Use list_del() by hand (as
1087                          * userfaultfd_wake_function also uses
1088                          * list_del_init() by hand) to be sure nobody
1089                          * changes __remove_wait_queue() to use
1090                          * list_del_init() in turn breaking the
1091                          * !list_empty_careful() check in
1092                          * handle_userfault(). The uwq->wq.head list
1093                          * must never be empty at any time during the
1094                          * refile, or the waitqueue could disappear
1095                          * from under us. The "wait_queue_head_t"
1096                          * parameter of __remove_wait_queue() is unused
1097                          * anyway.
1098                          */
1099                         list_del(&uwq->wq.entry);
1100                         add_wait_queue(&ctx->fault_wqh, &uwq->wq);
1101
1102                         write_seqcount_end(&ctx->refile_seq);
1103
1104                         /* careful to always initialize msg if ret == 0 */
1105                         *msg = uwq->msg;
1106                         spin_unlock(&ctx->fault_pending_wqh.lock);
1107                         ret = 0;
1108                         break;
1109                 }
1110                 spin_unlock(&ctx->fault_pending_wqh.lock);
1111
1112                 spin_lock(&ctx->event_wqh.lock);
1113                 uwq = find_userfault_evt(ctx);
1114                 if (uwq) {
1115                         *msg = uwq->msg;
1116
1117                         if (uwq->msg.event == UFFD_EVENT_FORK) {
1118                                 fork_nctx = (struct userfaultfd_ctx *)
1119                                         (unsigned long)
1120                                         uwq->msg.arg.reserved.reserved1;
1121                                 list_move(&uwq->wq.entry, &fork_event);
1122                                 /*
1123                                  * fork_nctx can be freed as soon as
1124                                  * we drop the lock, unless we take a
1125                                  * reference on it.
1126                                  */
1127                                 userfaultfd_ctx_get(fork_nctx);
1128                                 spin_unlock(&ctx->event_wqh.lock);
1129                                 ret = 0;
1130                                 break;
1131                         }
1132
1133                         userfaultfd_event_complete(ctx, uwq);
1134                         spin_unlock(&ctx->event_wqh.lock);
1135                         ret = 0;
1136                         break;
1137                 }
1138                 spin_unlock(&ctx->event_wqh.lock);
1139
1140                 if (signal_pending(current)) {
1141                         ret = -ERESTARTSYS;
1142                         break;
1143                 }
1144                 if (no_wait) {
1145                         ret = -EAGAIN;
1146                         break;
1147                 }
1148                 spin_unlock_irq(&ctx->fd_wqh.lock);
1149                 schedule();
1150                 spin_lock_irq(&ctx->fd_wqh.lock);
1151         }
1152         __remove_wait_queue(&ctx->fd_wqh, &wait);
1153         __set_current_state(TASK_RUNNING);
1154         spin_unlock_irq(&ctx->fd_wqh.lock);
1155
1156         if (!ret && msg->event == UFFD_EVENT_FORK) {
1157                 ret = resolve_userfault_fork(fork_nctx, inode, msg);
1158                 spin_lock_irq(&ctx->event_wqh.lock);
1159                 if (!list_empty(&fork_event)) {
1160                         /*
1161                          * The fork thread didn't abort, so we can
1162                          * drop the temporary refcount.
1163                          */
1164                         userfaultfd_ctx_put(fork_nctx);
1165
1166                         uwq = list_first_entry(&fork_event,
1167                                                typeof(*uwq),
1168                                                wq.entry);
1169                         /*
1170                          * If fork_event list wasn't empty and in turn
1171                          * the event wasn't already released by fork
1172                          * (the event is allocated on fork kernel
1173                          * stack), put the event back in its place in
1174                          * the event_wqh. fork_event head will be freed
1175                          * as soon as we return so the event cannot
1176                          * stay queued there no matter the current
1177                          * "ret" value.
1178                          */
1179                         list_del(&uwq->wq.entry);
1180                         __add_wait_queue(&ctx->event_wqh, &uwq->wq);
1181
1182                         /*
1183                          * Leave the event in the waitqueue and report
1184                          * error to userland if we failed to resolve
1185                          * the userfault fork.
1186                          */
1187                         if (likely(!ret))
1188                                 userfaultfd_event_complete(ctx, uwq);
1189                 } else {
1190                         /*
1191                          * Here the fork thread aborted and the
1192                          * refcount from the fork thread on fork_nctx
1193                          * has already been released. We still hold
1194                          * the reference we took before releasing the
1195                          * lock above. If resolve_userfault_fork
1196                          * failed we have to drop it because the
1197                          * fork_nctx has to be freed in that case. If
1198                          * it succeeded we'll hold it because the new
1199                          * uffd references it.
1200                          */
1201                         if (ret)
1202                                 userfaultfd_ctx_put(fork_nctx);
1203                 }
1204                 spin_unlock_irq(&ctx->event_wqh.lock);
1205         }
1206
1207         return ret;
1208 }
1209
1210 static ssize_t userfaultfd_read(struct file *file, char __user *buf,
1211                                 size_t count, loff_t *ppos)
1212 {
1213         struct userfaultfd_ctx *ctx = file->private_data;
1214         ssize_t _ret, ret = 0;
1215         struct uffd_msg msg;
1216         int no_wait = file->f_flags & O_NONBLOCK;
1217         struct inode *inode = file_inode(file);
1218
1219         if (!userfaultfd_is_initialized(ctx))
1220                 return -EINVAL;
1221
1222         for (;;) {
1223                 if (count < sizeof(msg))
1224                         return ret ? ret : -EINVAL;
1225                 _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
1226                 if (_ret < 0)
1227                         return ret ? ret : _ret;
1228                 if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
1229                         return ret ? ret : -EFAULT;
1230                 ret += sizeof(msg);
1231                 buf += sizeof(msg);
1232                 count -= sizeof(msg);
1233                 /*
1234                  * Allow reading more than one fault at a time, but only
1235                  * block while waiting for the very first one.
1236                  */
1237                 no_wait = O_NONBLOCK;
1238         }
1239 }
1240
1241 static void __wake_userfault(struct userfaultfd_ctx *ctx,
1242                              struct userfaultfd_wake_range *range)
1243 {
1244         spin_lock_irq(&ctx->fault_pending_wqh.lock);
1245         /* wake all in the range and autoremove */
1246         if (waitqueue_active(&ctx->fault_pending_wqh))
1247                 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
1248                                      range);
1249         if (waitqueue_active(&ctx->fault_wqh))
1250                 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
1251         spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1252 }
1253
1254 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1255                                            struct userfaultfd_wake_range *range)
1256 {
1257         unsigned seq;
1258         bool need_wakeup;
1259
1260         /*
1261          * To be sure waitqueue_active() is not reordered by the CPU
1262          * before the pagetable update, use an explicit SMP memory
1263          * barrier here. PT lock release or mmap_read_unlock(mm) still
1264          * have release semantics that can allow the
1265          * waitqueue_active() to be reordered before the pte update.
1266          */
1267         smp_mb();
1268
1269         /*
1270          * Use waitqueue_active because it's very frequent to
1271          * change the address space atomically even if there are no
1272          * userfaults yet. So we take the spinlock only when we're
1273          * sure we have userfaults to wake.
1274          */
1275         do {
1276                 seq = read_seqcount_begin(&ctx->refile_seq);
1277                 need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
1278                         waitqueue_active(&ctx->fault_wqh);
1279                 cond_resched();
1280         } while (read_seqcount_retry(&ctx->refile_seq, seq));
1281         if (need_wakeup)
1282                 __wake_userfault(ctx, range);
1283 }
1284
1285 static __always_inline int validate_unaligned_range(
1286         struct mm_struct *mm, __u64 start, __u64 len)
1287 {
1288         __u64 task_size = mm->task_size;
1289
1290         if (len & ~PAGE_MASK)
1291                 return -EINVAL;
1292         if (!len)
1293                 return -EINVAL;
1294         if (start < mmap_min_addr)
1295                 return -EINVAL;
1296         if (start >= task_size)
1297                 return -EINVAL;
1298         if (len > task_size - start)
1299                 return -EINVAL;
1300         if (start + len <= start)
1301                 return -EINVAL;
1302         return 0;
1303 }
1304
1305 static __always_inline int validate_range(struct mm_struct *mm,
1306                                           __u64 start, __u64 len)
1307 {
1308         if (start & ~PAGE_MASK)
1309                 return -EINVAL;
1310
1311         return validate_unaligned_range(mm, start, len);
1312 }
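
/*
 * For example (assuming 4KiB pages and mmap_min_addr well below the
 * range), start = 0x100000 with len = 0x2000 passes the checks above,
 * while an unaligned start, a zero len, or a range reaching past
 * mm->task_size is rejected with -EINVAL.
 */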
1313
1314 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1315                                 unsigned long arg)
1316 {
1317         struct mm_struct *mm = ctx->mm;
1318         struct vm_area_struct *vma, *prev, *cur;
1319         int ret;
1320         struct uffdio_register uffdio_register;
1321         struct uffdio_register __user *user_uffdio_register;
1322         unsigned long vm_flags, new_flags;
1323         bool found;
1324         bool basic_ioctls;
1325         unsigned long start, end, vma_end;
1326         struct vma_iterator vmi;
1327         bool wp_async = userfaultfd_wp_async_ctx(ctx);
1328
1329         user_uffdio_register = (struct uffdio_register __user *) arg;
1330
1331         ret = -EFAULT;
1332         if (copy_from_user(&uffdio_register, user_uffdio_register,
1333                            sizeof(uffdio_register)-sizeof(__u64)))
1334                 goto out;
1335
1336         ret = -EINVAL;
1337         if (!uffdio_register.mode)
1338                 goto out;
1339         if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1340                 goto out;
1341         vm_flags = 0;
1342         if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1343                 vm_flags |= VM_UFFD_MISSING;
1344         if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1345 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
1346                 goto out;
1347 #endif
1348                 vm_flags |= VM_UFFD_WP;
1349         }
1350         if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1351 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1352                 goto out;
1353 #endif
1354                 vm_flags |= VM_UFFD_MINOR;
1355         }
1356
1357         ret = validate_range(mm, uffdio_register.range.start,
1358                              uffdio_register.range.len);
1359         if (ret)
1360                 goto out;
1361
1362         start = uffdio_register.range.start;
1363         end = start + uffdio_register.range.len;
1364
1365         ret = -ENOMEM;
1366         if (!mmget_not_zero(mm))
1367                 goto out;
1368
1369         ret = -EINVAL;
1370         mmap_write_lock(mm);
1371         vma_iter_init(&vmi, mm, start);
1372         vma = vma_find(&vmi, end);
1373         if (!vma)
1374                 goto out_unlock;
1375
1376         /*
1377          * If the first vma contains huge pages, make sure start address
1378          * is aligned to huge page size.
1379          */
1380         if (is_vm_hugetlb_page(vma)) {
1381                 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1382
1383                 if (start & (vma_hpagesize - 1))
1384                         goto out_unlock;
1385         }
1386
1387         /*
1388          * Search for incompatible vmas.
1389          */
1390         found = false;
1391         basic_ioctls = false;
1392         cur = vma;
1393         do {
1394                 cond_resched();
1395
1396                 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1397                        !!(cur->vm_flags & __VM_UFFD_FLAGS));
1398
1399                 /* check for incompatible vmas */
1400                 ret = -EINVAL;
1401                 if (!vma_can_userfault(cur, vm_flags, wp_async))
1402                         goto out_unlock;
1403
1404                 /*
1405                  * UFFDIO_COPY will fill file holes even without
1406                  * PROT_WRITE. This check enforces that if this is a
1407                  * MAP_SHARED, the process has write permission to the backing
1408                  * file. If VM_MAYWRITE is set it also enforces that on a
1409                  * MAP_SHARED vma: no F_SEAL_WRITE is set and no further
1410                  * F_SEAL_WRITE can be added until the vma is destroyed.
1411                  */
1412                 ret = -EPERM;
1413                 if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1414                         goto out_unlock;
1415
1416                 /*
1417                  * If this vma contains the ending address and is backed by
1418                  * huge pages, check the alignment.
1419                  */
1420                 if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
1421                     end > cur->vm_start) {
1422                         unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
1423
1424                         ret = -EINVAL;
1425
1426                         if (end & (vma_hpagesize - 1))
1427                                 goto out_unlock;
1428                 }
1429                 if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1430                         goto out_unlock;
1431
1432                 /*
1433                  * Check that this vma isn't already owned by a
1434                  * different userfaultfd. We can't allow more than one
1435                  * userfaultfd to own a single vma simultaneously or we
1436                  * wouldn't know which one to deliver the userfaults to.
1437                  */
1438                 ret = -EBUSY;
1439                 if (cur->vm_userfaultfd_ctx.ctx &&
1440                     cur->vm_userfaultfd_ctx.ctx != ctx)
1441                         goto out_unlock;
1442
1443                 /*
1444                  * Note vmas containing huge pages
1445                  */
1446                 if (is_vm_hugetlb_page(cur))
1447                         basic_ioctls = true;
1448
1449                 found = true;
1450         } for_each_vma_range(vmi, cur, end);
1451         BUG_ON(!found);
1452
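        /*
         * The whole range has been validated: now walk it again and
         * apply the userfaultfd context and flags to each vma,
         * splitting or merging vmas as needed.
         */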
1453         vma_iter_set(&vmi, start);
1454         prev = vma_prev(&vmi);
1455         if (vma->vm_start < start)
1456                 prev = vma;
1457
1458         ret = 0;
1459         for_each_vma_range(vmi, vma, end) {
1460                 cond_resched();
1461
1462                 BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
1463                 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
1464                        vma->vm_userfaultfd_ctx.ctx != ctx);
1465                 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1466
1467                 /*
1468                  * Nothing to do: this vma is already registered into this
1469                  * userfaultfd and with the right tracking mode too.
1470                  */
1471                 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1472                     (vma->vm_flags & vm_flags) == vm_flags)
1473                         goto skip;
1474
1475                 if (vma->vm_start > start)
1476                         start = vma->vm_start;
1477                 vma_end = min(end, vma->vm_end);
1478
1479                 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
1480                 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
1481                                             new_flags,
1482                                             (struct vm_userfaultfd_ctx){ctx});
1483                 if (IS_ERR(vma)) {
1484                         ret = PTR_ERR(vma);
1485                         break;
1486                 }
1487
1488                 /*
1489                  * In the vma_merge() successful mprotect-like case 8:
1490                  * the next vma was merged into the current one and
1491                  * the current one has not been updated yet.
1492                  */
1493                 vma_start_write(vma);
1494                 userfaultfd_set_vm_flags(vma, new_flags);
1495                 vma->vm_userfaultfd_ctx.ctx = ctx;
1496
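                /*
                 * Hugetlb PMD sharing is incompatible with uffd
                 * wr-protect and minor fault tracking, so unshare any
                 * shared PMDs in this vma.
                 */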
1497                 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
1498                         hugetlb_unshare_all_pmds(vma);
1499
1500         skip:
1501                 prev = vma;
1502                 start = vma->vm_end;
1503         }
1504
1505 out_unlock:
1506         mmap_write_unlock(mm);
1507         mmput(mm);
1508         if (!ret) {
1509                 __u64 ioctls_out;
1510
1511                 ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1512                     UFFD_API_RANGE_IOCTLS;
1513
1514                 /*
1515                  * Declare the WP ioctl only if the WP mode is
1516                  * specified and all checks passed for the range.
1517                  */
1518                 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1519                         ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1520
1521                 /* CONTINUE ioctl is only supported for MINOR ranges. */
1522                 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1523                         ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1524
1525                 /*
1526                  * Now that we have scanned all vmas we can already tell
1527                  * userland which ioctl methods are guaranteed to
1528                  * succeed on this range.
1529                  */
1530                 if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1531                         ret = -EFAULT;
1532         }
1533 out:
1534         return ret;
1535 }
1536
1537 static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1538                                   unsigned long arg)
1539 {
1540         struct mm_struct *mm = ctx->mm;
1541         struct vm_area_struct *vma, *prev, *cur;
1542         int ret;
1543         struct uffdio_range uffdio_unregister;
1544         unsigned long new_flags;
1545         bool found;
1546         unsigned long start, end, vma_end;
1547         const void __user *buf = (void __user *)arg;
1548         struct vma_iterator vmi;
1549         bool wp_async = userfaultfd_wp_async_ctx(ctx);
1550
1551         ret = -EFAULT;
1552         if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
1553                 goto out;
1554
1555         ret = validate_range(mm, uffdio_unregister.start,
1556                              uffdio_unregister.len);
1557         if (ret)
1558                 goto out;
1559
1560         start = uffdio_unregister.start;
1561         end = start + uffdio_unregister.len;
1562
1563         ret = -ENOMEM;
1564         if (!mmget_not_zero(mm))
1565                 goto out;
1566
1567         mmap_write_lock(mm);
1568         ret = -EINVAL;
1569         vma_iter_init(&vmi, mm, start);
1570         vma = vma_find(&vmi, end);
1571         if (!vma)
1572                 goto out_unlock;
1573
1574         /*
1575          * If the first vma contains huge pages, make sure start address
1576          * is aligned to huge page size.
1577          */
1578         if (is_vm_hugetlb_page(vma)) {
1579                 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1580
1581                 if (start & (vma_hpagesize - 1))
1582                         goto out_unlock;
1583         }
1584
1585         /*
1586          * Search for incompatible vmas.
1587          */
1588         found = false;
1589         cur = vma;
1590         do {
1591                 cond_resched();
1592
1593                 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1594                        !!(cur->vm_flags & __VM_UFFD_FLAGS));
1595
1596                 /*
1597                  * Check for incompatible vmas. Not strictly required
1598                  * here, as incompatible vmas cannot have a
1599                  * userfaultfd_ctx registered on them, but this
1600                  * provides stricter behavior so that unregistration
1601                  * errors are noticed.
1602                  */
1603                 if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
1604                         goto out_unlock;
1605
1606                 found = true;
1607         } for_each_vma_range(vmi, cur, end);
1608         BUG_ON(!found);
1609
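        /*
         * The range is valid: walk it again and strip the userfaultfd
         * context and uffd flags from every registered vma.
         */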
1610         vma_iter_set(&vmi, start);
1611         prev = vma_prev(&vmi);
1612         if (vma->vm_start < start)
1613                 prev = vma;
1614
1615         ret = 0;
1616         for_each_vma_range(vmi, vma, end) {
1617                 cond_resched();
1618
1619                 BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
1620
1621                 /*
1622                  * Nothing to do: this vma is not registered with any
1623                  * userfaultfd, so there is nothing to unregister here.
1624                  */
1625                 if (!vma->vm_userfaultfd_ctx.ctx)
1626                         goto skip;
1627
1628                 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1629
1630                 if (vma->vm_start > start)
1631                         start = vma->vm_start;
1632                 vma_end = min(end, vma->vm_end);
1633
1634                 if (userfaultfd_missing(vma)) {
1635                         /*
1636                          * Wake any concurrent pending userfault while
1637                          * we unregister, so they will not hang
1638                          * permanently and userland does not need to call
1639                          * UFFDIO_WAKE explicitly.
1640                          */
1641                         struct userfaultfd_wake_range range;
1642                         range.start = start;
1643                         range.len = vma_end - start;
1644                         wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
1645                 }
1646
1647                 /* Reset ptes for the whole vma range if wr-protected */
1648                 if (userfaultfd_wp(vma))
1649                         uffd_wp_range(vma, start, vma_end - start, false);
1650
1651                 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
1652                 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
1653                                             new_flags, NULL_VM_UFFD_CTX);
1654                 if (IS_ERR(vma)) {
1655                         ret = PTR_ERR(vma);
1656                         break;
1657                 }
1658
1659                 /*
1660                  * In the vma_merge() successful mprotect-like case 8:
1661                  * the next vma was merged into the current one and
1662                  * the current one has not been updated yet.
1663                  */
1664                 vma_start_write(vma);
1665                 userfaultfd_set_vm_flags(vma, new_flags);
1666                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1667
1668         skip:
1669                 prev = vma;
1670                 start = vma->vm_end;
1671         }
1672
1673 out_unlock:
1674         mmap_write_unlock(mm);
1675         mmput(mm);
1676 out:
1677         return ret;
1678 }
1679
1680 /*
1681  * userfaultfd_wake may be used in combination with the
1682  * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1683  */
1684 static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1685                             unsigned long arg)
1686 {
1687         int ret;
1688         struct uffdio_range uffdio_wake;
1689         struct userfaultfd_wake_range range;
1690         const void __user *buf = (void __user *)arg;
1691
1692         ret = -EFAULT;
1693         if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1694                 goto out;
1695
1696         ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1697         if (ret)
1698                 goto out;
1699
1700         range.start = uffdio_wake.start;
1701         range.len = uffdio_wake.len;
1702
1703         /*
1704          * len == 0 means wake all and we don't want to wake all here,
1705          * so check it again to be sure.
1706          */
1707         VM_BUG_ON(!range.len);
1708
1709         wake_userfault(ctx, &range);
1710         ret = 0;
1711
1712 out:
1713         return ret;
1714 }
1715
1716 static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1717                             unsigned long arg)
1718 {
1719         __s64 ret;
1720         struct uffdio_copy uffdio_copy;
1721         struct uffdio_copy __user *user_uffdio_copy;
1722         struct userfaultfd_wake_range range;
1723         uffd_flags_t flags = 0;
1724
1725         user_uffdio_copy = (struct uffdio_copy __user *) arg;
1726
1727         ret = -EAGAIN;
1728         if (atomic_read(&ctx->mmap_changing))
1729                 goto out;
1730
1731         ret = -EFAULT;
1732         if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1733                            /* don't copy the "copy" output field */
1734                            sizeof(uffdio_copy)-sizeof(__s64)))
1735                 goto out;
1736
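        /*
         * Only the destination needs to be page aligned; the source of
         * the copy may be at any alignment.
         */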
1737         ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
1738                                        uffdio_copy.len);
1739         if (ret)
1740                 goto out;
1741         ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1742         if (ret)
1743                 goto out;
1744
1745         ret = -EINVAL;
1746         if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1747                 goto out;
1748         if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1749                 flags |= MFILL_ATOMIC_WP;
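        /*
         * The owning process may have exited already: mmget_not_zero()
         * fails once mm_users has dropped to zero, and -ESRCH is
         * reported in that case.
         */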
1750         if (mmget_not_zero(ctx->mm)) {
1751                 ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
1752                                         uffdio_copy.len, &ctx->mmap_changing,
1753                                         flags);
1754                 mmput(ctx->mm);
1755         } else {
1756                 return -ESRCH;
1757         }
1758         if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1759                 return -EFAULT;
1760         if (ret < 0)
1761                 goto out;
1762         BUG_ON(!ret);
1763         /* len == 0 would wake all */
1764         range.len = ret;
1765         if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1766                 range.start = uffdio_copy.dst;
1767                 wake_userfault(ctx, &range);
1768         }
1769         ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1770 out:
1771         return ret;
1772 }
1773
1774 static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1775                                 unsigned long arg)
1776 {
1777         __s64 ret;
1778         struct uffdio_zeropage uffdio_zeropage;
1779         struct uffdio_zeropage __user *user_uffdio_zeropage;
1780         struct userfaultfd_wake_range range;
1781
1782         user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1783
1784         ret = -EAGAIN;
1785         if (atomic_read(&ctx->mmap_changing))
1786                 goto out;
1787
1788         ret = -EFAULT;
1789         if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1790                            /* don't copy the "zeropage" output field */
1791                            sizeof(uffdio_zeropage)-sizeof(__s64)))
1792                 goto out;
1793
1794         ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1795                              uffdio_zeropage.range.len);
1796         if (ret)
1797                 goto out;
1798         ret = -EINVAL;
1799         if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1800                 goto out;
1801
1802         if (mmget_not_zero(ctx->mm)) {
1803                 ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
1804                                            uffdio_zeropage.range.len,
1805                                            &ctx->mmap_changing);
1806                 mmput(ctx->mm);
1807         } else {
1808                 return -ESRCH;
1809         }
1810         if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1811                 return -EFAULT;
1812         if (ret < 0)
1813                 goto out;
1814         /* len == 0 would wake all */
1815         BUG_ON(!ret);
1816         range.len = ret;
1817         if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1818                 range.start = uffdio_zeropage.range.start;
1819                 wake_userfault(ctx, &range);
1820         }
1821         ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1822 out:
1823         return ret;
1824 }
1825
1826 static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1827                                     unsigned long arg)
1828 {
1829         int ret;
1830         struct uffdio_writeprotect uffdio_wp;
1831         struct uffdio_writeprotect __user *user_uffdio_wp;
1832         struct userfaultfd_wake_range range;
1833         bool mode_wp, mode_dontwake;
1834
1835         if (atomic_read(&ctx->mmap_changing))
1836                 return -EAGAIN;
1837
1838         user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1839
1840         if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1841                            sizeof(struct uffdio_writeprotect)))
1842                 return -EFAULT;
1843
1844         ret = validate_range(ctx->mm, uffdio_wp.range.start,
1845                              uffdio_wp.range.len);
1846         if (ret)
1847                 return ret;
1848
1849         if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1850                                UFFDIO_WRITEPROTECT_MODE_WP))
1851                 return -EINVAL;
1852
1853         mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1854         mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1855
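        /*
         * DONTWAKE only makes sense when resolving faults (i.e. when
         * removing write protection), so it cannot be combined with WP.
         */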
1856         if (mode_wp && mode_dontwake)
1857                 return -EINVAL;
1858
1859         if (mmget_not_zero(ctx->mm)) {
1860                 ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
1861                                           uffdio_wp.range.len, mode_wp,
1862                                           &ctx->mmap_changing);
1863                 mmput(ctx->mm);
1864         } else {
1865                 return -ESRCH;
1866         }
1867
1868         if (ret)
1869                 return ret;
1870
1871         if (!mode_wp && !mode_dontwake) {
1872                 range.start = uffdio_wp.range.start;
1873                 range.len = uffdio_wp.range.len;
1874                 wake_userfault(ctx, &range);
1875         }
1876         return ret;
1877 }
1878
1879 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1880 {
1881         __s64 ret;
1882         struct uffdio_continue uffdio_continue;
1883         struct uffdio_continue __user *user_uffdio_continue;
1884         struct userfaultfd_wake_range range;
1885         uffd_flags_t flags = 0;
1886
1887         user_uffdio_continue = (struct uffdio_continue __user *)arg;
1888
1889         ret = -EAGAIN;
1890         if (atomic_read(&ctx->mmap_changing))
1891                 goto out;
1892
1893         ret = -EFAULT;
1894         if (copy_from_user(&uffdio_continue, user_uffdio_continue,
1895                            /* don't copy the output fields */
1896                            sizeof(uffdio_continue) - (sizeof(__s64))))
1897                 goto out;
1898
1899         ret = validate_range(ctx->mm, uffdio_continue.range.start,
1900                              uffdio_continue.range.len);
1901         if (ret)
1902                 goto out;
1903
1904         ret = -EINVAL;
1905         if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1906                                      UFFDIO_CONTINUE_MODE_WP))
1907                 goto out;
1908         if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1909                 flags |= MFILL_ATOMIC_WP;
1910
1911         if (mmget_not_zero(ctx->mm)) {
1912                 ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
1913                                             uffdio_continue.range.len,
1914                                             &ctx->mmap_changing, flags);
1915                 mmput(ctx->mm);
1916         } else {
1917                 return -ESRCH;
1918         }
1919
1920         if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1921                 return -EFAULT;
1922         if (ret < 0)
1923                 goto out;
1924
1925         /* len == 0 would wake all */
1926         BUG_ON(!ret);
1927         range.len = ret;
1928         if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1929                 range.start = uffdio_continue.range.start;
1930                 wake_userfault(ctx, &range);
1931         }
1932         ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1933
1934 out:
1935         return ret;
1936 }
1937
1938 static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
1939 {
1940         __s64 ret;
1941         struct uffdio_poison uffdio_poison;
1942         struct uffdio_poison __user *user_uffdio_poison;
1943         struct userfaultfd_wake_range range;
1944
1945         user_uffdio_poison = (struct uffdio_poison __user *)arg;
1946
1947         ret = -EAGAIN;
1948         if (atomic_read(&ctx->mmap_changing))
1949                 goto out;
1950
1951         ret = -EFAULT;
1952         if (copy_from_user(&uffdio_poison, user_uffdio_poison,
1953                            /* don't copy the output fields */
1954                            sizeof(uffdio_poison) - (sizeof(__s64))))
1955                 goto out;
1956
1957         ret = validate_range(ctx->mm, uffdio_poison.range.start,
1958                              uffdio_poison.range.len);
1959         if (ret)
1960                 goto out;
1961
1962         ret = -EINVAL;
1963         if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
1964                 goto out;
1965
1966         if (mmget_not_zero(ctx->mm)) {
1967                 ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
1968                                           uffdio_poison.range.len,
1969                                           &ctx->mmap_changing, 0);
1970                 mmput(ctx->mm);
1971         } else {
1972                 return -ESRCH;
1973         }
1974
1975         if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
1976                 return -EFAULT;
1977         if (ret < 0)
1978                 goto out;
1979
1980         /* len == 0 would wake all */
1981         BUG_ON(!ret);
1982         range.len = ret;
1983         if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
1984                 range.start = uffdio_poison.range.start;
1985                 wake_userfault(ctx, &range);
1986         }
1987         ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
1988
1989 out:
1990         return ret;
1991 }
1992
1993 bool userfaultfd_wp_async(struct vm_area_struct *vma)
1994 {
1995         return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
1996 }
1997
1998 static inline unsigned int uffd_ctx_features(__u64 user_features)
1999 {
2000         /*
2001          * For the current set of features the bits just coincide. Set
2002          * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
2003          */
2004         return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
2005 }
2006
2007 static int userfaultfd_move(struct userfaultfd_ctx *ctx,
2008                             unsigned long arg)
2009 {
2010         __s64 ret;
2011         struct uffdio_move uffdio_move;
2012         struct uffdio_move __user *user_uffdio_move;
2013         struct userfaultfd_wake_range range;
2014         struct mm_struct *mm = ctx->mm;
2015
2016         user_uffdio_move = (struct uffdio_move __user *) arg;
2017
2018         if (atomic_read(&ctx->mmap_changing))
2019                 return -EAGAIN;
2020
2021         if (copy_from_user(&uffdio_move, user_uffdio_move,
2022                            /* don't copy the "move" output field */
2023                            sizeof(uffdio_move)-sizeof(__s64)))
2024                 return -EFAULT;
2025
2026         /* Do not allow cross-mm moves. */
2027         if (mm != current->mm)
2028                 return -EINVAL;
2029
2030         ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
2031         if (ret)
2032                 return ret;
2033
2034         ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
2035         if (ret)
2036                 return ret;
2037
2038         if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
2039                                   UFFDIO_MOVE_MODE_DONTWAKE))
2040                 return -EINVAL;
2041
2042         if (mmget_not_zero(mm)) {
2043                 mmap_read_lock(mm);
2044
2045                 /* Re-check after taking mmap_lock */
2046                 if (likely(!atomic_read(&ctx->mmap_changing)))
2047                         ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
2048                                          uffdio_move.len, uffdio_move.mode);
2049                 else
2050                         ret = -EINVAL;
2051
2052                 mmap_read_unlock(mm);
2053                 mmput(mm);
2054         } else {
2055                 return -ESRCH;
2056         }
2057
2058         if (unlikely(put_user(ret, &user_uffdio_move->move)))
2059                 return -EFAULT;
2060         if (ret < 0)
2061                 goto out;
2062
2063         /* len == 0 would wake all */
2064         VM_WARN_ON(!ret);
2065         range.len = ret;
2066         if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
2067                 range.start = uffdio_move.dst;
2068                 wake_userfault(ctx, &range);
2069         }
2070         ret = range.len == uffdio_move.len ? 0 : -EAGAIN;
2071
2072 out:
2073         return ret;
2074 }
2075
2076 /*
2077  * userland asks for a certain API version and we return which bits
2078  * and ioctl commands are implemented in this kernel for that API
2079  * version, or -EINVAL if the version is unknown.
2080  */
2081 static int userfaultfd_api(struct userfaultfd_ctx *ctx,
2082                            unsigned long arg)
2083 {
2084         struct uffdio_api uffdio_api;
2085         void __user *buf = (void __user *)arg;
2086         unsigned int ctx_features;
2087         int ret;
2088         __u64 features;
2089
2090         ret = -EFAULT;
2091         if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
2092                 goto out;
2093         features = uffdio_api.features;
2094         ret = -EINVAL;
2095         if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
2096                 goto err_out;
2097         ret = -EPERM;
2098         if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
2099                 goto err_out;
2100
2101         /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
2102         if (features & UFFD_FEATURE_WP_ASYNC)
2103                 features |= UFFD_FEATURE_WP_UNPOPULATED;
2104
2105         /* report all available features and ioctls to userland */
2106         uffdio_api.features = UFFD_API_FEATURES;
2107 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
2108         uffdio_api.features &=
2109                 ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
2110 #endif
2111 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
2112         uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
2113 #endif
2114 #ifndef CONFIG_PTE_MARKER_UFFD_WP
2115         uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
2116         uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
2117         uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
2118 #endif
2119         uffdio_api.ioctls = UFFD_API_IOCTLS;
2120         ret = -EFAULT;
2121         if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2122                 goto out;
2123
2124         /* only enable the requested features for this uffd context */
2125         ctx_features = uffd_ctx_features(features);
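        /*
         * The API handshake can only succeed once per context: the
         * cmpxchg() below fails if an earlier or concurrent UFFDIO_API
         * already set the features.
         */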
2126         ret = -EINVAL;
2127         if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
2128                 goto err_out;
2129
2130         ret = 0;
2131 out:
2132         return ret;
2133 err_out:
2134         memset(&uffdio_api, 0, sizeof(uffdio_api));
2135         if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2136                 ret = -EFAULT;
2137         goto out;
2138 }
2139
2140 static long userfaultfd_ioctl(struct file *file, unsigned cmd,
2141                               unsigned long arg)
2142 {
2143         int ret = -EINVAL;
2144         struct userfaultfd_ctx *ctx = file->private_data;
2145
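        /*
         * All ioctls other than UFFDIO_API require a completed
         * UFFDIO_API handshake first.
         */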
2146         if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
2147                 return -EINVAL;
2148
2149         switch(cmd) {
2150         case UFFDIO_API:
2151                 ret = userfaultfd_api(ctx, arg);
2152                 break;
2153         case UFFDIO_REGISTER:
2154                 ret = userfaultfd_register(ctx, arg);
2155                 break;
2156         case UFFDIO_UNREGISTER:
2157                 ret = userfaultfd_unregister(ctx, arg);
2158                 break;
2159         case UFFDIO_WAKE:
2160                 ret = userfaultfd_wake(ctx, arg);
2161                 break;
2162         case UFFDIO_COPY:
2163                 ret = userfaultfd_copy(ctx, arg);
2164                 break;
2165         case UFFDIO_ZEROPAGE:
2166                 ret = userfaultfd_zeropage(ctx, arg);
2167                 break;
2168         case UFFDIO_MOVE:
2169                 ret = userfaultfd_move(ctx, arg);
2170                 break;
2171         case UFFDIO_WRITEPROTECT:
2172                 ret = userfaultfd_writeprotect(ctx, arg);
2173                 break;
2174         case UFFDIO_CONTINUE:
2175                 ret = userfaultfd_continue(ctx, arg);
2176                 break;
2177         case UFFDIO_POISON:
2178                 ret = userfaultfd_poison(ctx, arg);
2179                 break;
2180         }
2181         return ret;
2182 }
2183
2184 #ifdef CONFIG_PROC_FS
2185 static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
2186 {
2187         struct userfaultfd_ctx *ctx = f->private_data;
2188         wait_queue_entry_t *wq;
2189         unsigned long pending = 0, total = 0;
2190
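        /* Count the pending (not yet read) and total queued userfaults. */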
2191         spin_lock_irq(&ctx->fault_pending_wqh.lock);
2192         list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
2193                 pending++;
2194                 total++;
2195         }
2196         list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
2197                 total++;
2198         }
2199         spin_unlock_irq(&ctx->fault_pending_wqh.lock);
2200
2201         /*
2202          * If more protocols are added, they will all be shown
2203          * separated by a space, like this:
2204          *      protocols: aa:... bb:...
2205          */
2206         seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
2207                    pending, total, UFFD_API, ctx->features,
2208                    UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
2209 }
2210 #endif
2211
2212 static const struct file_operations userfaultfd_fops = {
2213 #ifdef CONFIG_PROC_FS
2214         .show_fdinfo    = userfaultfd_show_fdinfo,
2215 #endif
2216         .release        = userfaultfd_release,
2217         .poll           = userfaultfd_poll,
2218         .read           = userfaultfd_read,
2219         .unlocked_ioctl = userfaultfd_ioctl,
2220         .compat_ioctl   = compat_ptr_ioctl,
2221         .llseek         = noop_llseek,
2222 };
2223
2224 static void init_once_userfaultfd_ctx(void *mem)
2225 {
2226         struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
2227
2228         init_waitqueue_head(&ctx->fault_pending_wqh);
2229         init_waitqueue_head(&ctx->fault_wqh);
2230         init_waitqueue_head(&ctx->event_wqh);
2231         init_waitqueue_head(&ctx->fd_wqh);
2232         seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
2233 }
2234
2235 static int new_userfaultfd(int flags)
2236 {
2237         struct userfaultfd_ctx *ctx;
2238         int fd;
2239
2240         BUG_ON(!current->mm);
2241
2242         /* Check the UFFD_* constants for consistency.  */
2243         BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
2244         BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
2245         BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
2246
2247         if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
2248                 return -EINVAL;
2249
2250         ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
2251         if (!ctx)
2252                 return -ENOMEM;
2253
2254         refcount_set(&ctx->refcount, 1);
2255         ctx->flags = flags;
2256         ctx->features = 0;
2257         ctx->released = false;
2258         atomic_set(&ctx->mmap_changing, 0);
2259         ctx->mm = current->mm;
2260         /* prevent the mm struct from being freed */
2261         mmgrab(ctx->mm);
2262
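        /*
         * Create the "[userfaultfd]" file descriptor; on failure undo
         * the mmgrab() and free the context.
         */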
2263         fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
2264                         O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
2265         if (fd < 0) {
2266                 mmdrop(ctx->mm);
2267                 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
2268         }
2269         return fd;
2270 }
2271
2272 static inline bool userfaultfd_syscall_allowed(int flags)
2273 {
2274         /* Userspace-only page faults are always allowed */
2275         if (flags & UFFD_USER_MODE_ONLY)
2276                 return true;
2277
2278         /*
2279          * The user is requesting a userfaultfd which can handle kernel faults.
2280          * Privileged users are always allowed to do this.
2281          */
2282         if (capable(CAP_SYS_PTRACE))
2283                 return true;
2284
2285         /* Otherwise, access to kernel fault handling is sysctl controlled. */
2286         return sysctl_unprivileged_userfaultfd;
2287 }
2288
2289 SYSCALL_DEFINE1(userfaultfd, int, flags)
2290 {
2291         if (!userfaultfd_syscall_allowed(flags))
2292                 return -EPERM;
2293
2294         return new_userfaultfd(flags);
2295 }
2296
2297 static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
2298 {
2299         if (cmd != USERFAULTFD_IOC_NEW)
2300                 return -EINVAL;
2301
2302         return new_userfaultfd(flags);
2303 }
2304
2305 static const struct file_operations userfaultfd_dev_fops = {
2306         .unlocked_ioctl = userfaultfd_dev_ioctl,
2307         .compat_ioctl = userfaultfd_dev_ioctl,
2308         .owner = THIS_MODULE,
2309         .llseek = noop_llseek,
2310 };
2311
2312 static struct miscdevice userfaultfd_misc = {
2313         .minor = MISC_DYNAMIC_MINOR,
2314         .name = "userfaultfd",
2315         .fops = &userfaultfd_dev_fops
2316 };
2317
2318 static int __init userfaultfd_init(void)
2319 {
2320         int ret;
2321
2322         ret = misc_register(&userfaultfd_misc);
2323         if (ret)
2324                 return ret;
2325
2326         userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
2327                                                 sizeof(struct userfaultfd_ctx),
2328                                                 0,
2329                                                 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2330                                                 init_once_userfaultfd_ctx);
2331 #ifdef CONFIG_SYSCTL
2332         register_sysctl_init("vm", vm_userfaultfd_table);
2333 #endif
2334         return 0;
2335 }
2336 __initcall(userfaultfd_init);