/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>


/*
 * Device private memory, see HMM (Documentation/vm/hmm.txt) or hmm.h
 */
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);


#ifdef CONFIG_HMM
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
        struct mm_struct        *mm;
        spinlock_t              lock;
        atomic_t                sequence;
        struct list_head        ranges;
        struct list_head        mirrors;
        struct mmu_notifier     mmu_notifier;
        struct rw_semaphore     mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
        struct hmm *hmm = READ_ONCE(mm->hmm);
        bool cleanup = false;

        /*
         * The hmm struct can only be freed once the mm_struct goes away,
         * hence an existing hmm struct is still valid and can be returned
         * right away.
         */
        if (hmm)
                return hmm;

        hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
        if (!hmm)
                return NULL;
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
        atomic_set(&hmm->sequence, 0);
        hmm->mmu_notifier.ops = NULL;
        INIT_LIST_HEAD(&hmm->ranges);
        spin_lock_init(&hmm->lock);
        hmm->mm = mm;

        /*
         * We should only get here if we hold the mmap_sem in write mode,
         * i.e. on registration of the first mirror through
         * hmm_mirror_register().
         */
        hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
        if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
                kfree(hmm);
                return NULL;
        }

        spin_lock(&mm->page_table_lock);
        if (!mm->hmm)
                mm->hmm = hmm;
        else
                cleanup = true;
        spin_unlock(&mm->page_table_lock);

        if (cleanup) {
                mmu_notifier_unregister(&hmm->mmu_notifier, mm);
                kfree(hmm);
        }

        return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
        kfree(mm->hmm);
}
#endif /* CONFIG_HMM */

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static void hmm_invalidate_range(struct hmm *hmm,
                                 enum hmm_update_type action,
                                 unsigned long start,
                                 unsigned long end)
{
        struct hmm_mirror *mirror;
        struct hmm_range *range;

        spin_lock(&hmm->lock);
        list_for_each_entry(range, &hmm->ranges, list) {
                unsigned long addr, idx, npages;

                if (end < range->start || start >= range->end)
                        continue;

                range->valid = false;
                addr = max(start, range->start);
                idx = (addr - range->start) >> PAGE_SHIFT;
                npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
                memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
        }
        spin_unlock(&hmm->lock);

        down_read(&hmm->mirrors_sem);
        list_for_each_entry(mirror, &hmm->mirrors, list)
                mirror->ops->sync_cpu_device_pagetables(mirror, action,
                                                        start, end);
        up_read(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long start,
                                       unsigned long end)
{
        struct hmm *hmm = mm->hmm;

        VM_BUG_ON(!hmm);

        atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
                                     struct mm_struct *mm,
                                     unsigned long start,
                                     unsigned long end)
{
        struct hmm *hmm = mm->hmm;

        VM_BUG_ON(!hmm);

        hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
        .invalidate_range_start = hmm_invalidate_range_start,
        .invalidate_range_end   = hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
        /* Sanity check */
        if (!mm || !mirror || !mirror->ops)
                return -EINVAL;

        mirror->hmm = hmm_register(mm);
        if (!mirror->hmm)
                return -ENOMEM;

        down_write(&mirror->hmm->mirrors_sem);
        list_add(&mirror->list, &mirror->hmm->mirrors);
        up_write(&mirror->hmm->mirrors_sem);

        return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

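/*
 * Example (illustrative sketch only, not part of this file): a driver is
 * expected to provide hmm_mirror_ops with a sync_cpu_device_pagetables()
 * callback and to call hmm_mirror_register() while holding mmap_sem in
 * write mode. All names prefixed with "example_" below are hypothetical
 * driver code, not kernel API.
 *
 *   static void example_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *                                                  enum hmm_update_type update,
 *                                                  unsigned long start,
 *                                                  unsigned long end)
 *   {
 *       // Invalidate the device page table for [start, end) here, so it
 *       // never holds entries the CPU page table no longer has.
 *   }
 *
 *   static const struct hmm_mirror_ops example_mirror_ops = {
 *       .sync_cpu_device_pagetables = example_sync_cpu_device_pagetables,
 *   };
 *
 *   static int example_mirror_mm(struct hmm_mirror *mirror, struct mm_struct *mm)
 *   {
 *       int ret;
 *
 *       mirror->ops = &example_mirror_ops;
 *       down_write(&mm->mmap_sem);
 *       ret = hmm_mirror_register(mirror, mm);
 *       up_write(&mm->mmap_sem);
 *       return ret;
 *   }
 */
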
/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
        struct hmm *hmm = mirror->hmm;

        down_write(&hmm->mirrors_sem);
        list_del(&mirror->list);
        up_write(&hmm->mirrors_sem);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
        struct hmm_range        *range;
        unsigned long           last;
        bool                    fault;
        bool                    block;
        bool                    write;
};

static int hmm_vma_do_fault(struct mm_walk *walk,
                            unsigned long addr,
                            hmm_pfn_t *pfn)
{
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct vm_area_struct *vma = walk->vma;
        int r;

        flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
        flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
        r = handle_mm_fault(vma, addr, flags);
        if (r & VM_FAULT_RETRY)
                return -EBUSY;
        if (r & VM_FAULT_ERROR) {
                *pfn = HMM_PFN_ERROR;
                return -EFAULT;
        }

        return -EAGAIN;
}

static void hmm_pfns_special(hmm_pfn_t *pfns,
                             unsigned long addr,
                             unsigned long end)
{
        for (; addr < end; addr += PAGE_SIZE, pfns++)
                *pfns = HMM_PFN_SPECIAL;
}

static int hmm_pfns_bad(unsigned long addr,
                        unsigned long end,
                        struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long i;

        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++)
                pfns[i] = HMM_PFN_ERROR;

        return 0;
}

static void hmm_pfns_clear(hmm_pfn_t *pfns,
                           unsigned long addr,
                           unsigned long end)
{
        for (; addr < end; addr += PAGE_SIZE, pfns++)
                *pfns = 0;
}

static int hmm_vma_walk_hole(unsigned long addr,
                             unsigned long end,
                             struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long i;

        hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = HMM_PFN_EMPTY;
                if (hmm_vma_walk->fault) {
                        int ret;

                        ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
                        if (ret != -EAGAIN)
                                return ret;
                }
        }

        return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_clear(unsigned long addr,
                              unsigned long end,
                              struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long i;

        hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = 0;
                if (hmm_vma_walk->fault) {
                        int ret;

                        ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
                        if (ret != -EAGAIN)
                                return ret;
                }
        }

        return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long addr = start, i;
        bool write_fault;
        hmm_pfn_t flag;
        pte_t *ptep;

        i = (addr - range->start) >> PAGE_SHIFT;
        flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
        write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;

again:
        if (pmd_none(*pmdp))
                return hmm_vma_walk_hole(start, end, walk);

        if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
                return hmm_pfns_bad(start, end, walk);

        if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
                unsigned long pfn;
                pmd_t pmd;

                /*
                 * No need to take pmd_lock here; even if some other thread
                 * is splitting the huge pmd, we will get that event through
                 * the mmu_notifier callback.
                 *
                 * So just read the pmd value and check again that it is a
                 * transparent huge page or a device mapping, then compute
                 * the corresponding pfn values.
                 */
                pmd = pmd_read_atomic(pmdp);
                barrier();
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;
                if (pmd_protnone(pmd))
                        return hmm_vma_walk_clear(start, end, walk);

                if (write_fault && !pmd_write(pmd))
                        return hmm_vma_walk_clear(start, end, walk);

                pfn = pmd_pfn(pmd) + pte_index(addr);
                flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
                for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
                        pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
                return 0;
        }

        if (pmd_bad(*pmdp))
                return hmm_pfns_bad(start, end, walk);

        ptep = pte_offset_map(pmdp, addr);
        for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
                pte_t pte = *ptep;

                pfns[i] = 0;

                if (pte_none(pte)) {
                        pfns[i] = HMM_PFN_EMPTY;
                        if (hmm_vma_walk->fault)
                                goto fault;
                        continue;
                }

                if (!pte_present(pte)) {
                        swp_entry_t entry = pte_to_swp_entry(pte);

                        if (!non_swap_entry(entry)) {
                                if (hmm_vma_walk->fault)
                                        goto fault;
                                continue;
                        }

                        /*
                         * This is a special swap entry. Migration entries are
                         * either waited on (when faulting) or skipped;
                         * anything else is reported as an error.
                         */
                        if (is_migration_entry(entry)) {
                                if (hmm_vma_walk->fault) {
                                        pte_unmap(ptep);
                                        hmm_vma_walk->last = addr;
                                        migration_entry_wait(vma->vm_mm,
                                                             pmdp, addr);
                                        return -EAGAIN;
                                }
                                continue;
                        } else {
                                /* Report error for everything else */
                                pfns[i] = HMM_PFN_ERROR;
                        }
                        continue;
                }

                if (write_fault && !pte_write(pte))
                        goto fault;

                pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
                pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
                continue;

fault:
                pte_unmap(ptep);
                /* Fault all pages in range */
                return hmm_vma_walk_clear(start, end, walk);
        }
        pte_unmap(ptep - 1);

        return 0;
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track snapshot validity
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t, provided by the caller, filled in by this function
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by range struct. See hmm_vma_range_done() for further
 * information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct vm_area_struct *vma,
                     struct hmm_range *range,
                     unsigned long start,
                     unsigned long end,
                     hmm_pfn_t *pfns)
{
        struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;

        /* FIXME support hugetlb fs */
        if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
                hmm_pfns_special(pfns, start, end);
                return -EINVAL;
        }

        /* Sanity check, this really should not happen ! */
        if (start < vma->vm_start || start >= vma->vm_end)
                return -EINVAL;
        if (end < vma->vm_start || end > vma->vm_end)
                return -EINVAL;

        hmm = hmm_register(vma->vm_mm);
        if (!hmm)
                return -ENOMEM;
        /* Caller must have registered a mirror, via hmm_mirror_register() ! */
        if (!hmm->mmu_notifier.ops)
                return -EINVAL;

        /* Initialize range to track CPU page table update */
        range->start = start;
        range->pfns = pfns;
        range->end = end;
        spin_lock(&hmm->lock);
        range->valid = true;
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);

        hmm_vma_walk.fault = false;
        hmm_vma_walk.range = range;
        mm_walk.private = &hmm_vma_walk;

        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
        mm_walk.pmd_entry = hmm_vma_walk_pmd;
        mm_walk.pte_hole = hmm_vma_walk_hole;

        walk_page_range(start, end, &mm_walk);
        return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);

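/*
 * Example (illustrative sketch only, assuming a caller that allocates the
 * pfns array itself): the array must hold one hmm_pfn_t per page in
 * [start, end). example_snapshot() below is hypothetical driver code, not
 * kernel API.
 *
 *   static int example_snapshot(struct vm_area_struct *vma,
 *                               unsigned long start, unsigned long end)
 *   {
 *       unsigned long npages = (end - start) >> PAGE_SHIFT;
 *       struct hmm_range range;
 *       hmm_pfn_t *pfns;
 *       int ret;
 *
 *       pfns = kcalloc(npages, sizeof(*pfns), GFP_KERNEL);
 *       if (!pfns)
 *           return -ENOMEM;
 *
 *       ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
 *       if (!ret) {
 *           // ... mirror the snapshot into the device page table ...
 *           hmm_vma_range_done(vma, &range);
 *       }
 *       kfree(pfns);
 *       return ret;
 *   }
 */
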
/*
 * hmm_vma_range_done() - stop tracking change to CPU page table over a range
 * @vma: virtual memory area containing the virtual address range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using the data, or wants to lock updates to the data it got from those
 * functions, it must call the hmm_vma_range_done() function, which will then
 * stop tracking CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or
 * by using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this:
 * again:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(vma, range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(vma, range);
 *   device_update_page_table(pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
{
        unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
        struct hmm *hmm;

        if (range->end <= range->start) {
                BUG();
                return false;
        }

        hmm = hmm_register(vma->vm_mm);
        if (!hmm) {
                memset(range->pfns, 0, sizeof(*range->pfns) * npages);
                return false;
        }

        spin_lock(&hmm->lock);
        list_del_rcu(&range->list);
        spin_unlock(&hmm->lock);

        return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);

/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track pfns array content validity
 * @start: fault range virtual start address (inclusive)
 * @end: fault range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t, only entries with the fault flag set will be faulted
 * @write: is it a write fault
 * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
 * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will set the
 * hmm_pfn_t error flag for the corresponding pfn entry.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(vma, range, start, end, pfns, write, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(vma, range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(vma, range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct vm_area_struct *vma,
                  struct hmm_range *range,
                  unsigned long start,
                  unsigned long end,
                  hmm_pfn_t *pfns,
                  bool write,
                  bool block)
{
        struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;
        int ret;

        /* Sanity check, this really should not happen ! */
        if (start < vma->vm_start || start >= vma->vm_end)
                return -EINVAL;
        if (end < vma->vm_start || end > vma->vm_end)
                return -EINVAL;

        hmm = hmm_register(vma->vm_mm);
        if (!hmm) {
                hmm_pfns_clear(pfns, start, end);
                return -ENOMEM;
        }
        /* Caller must have registered a mirror using hmm_mirror_register() */
        if (!hmm->mmu_notifier.ops)
                return -EINVAL;

        /* Initialize range to track CPU page table update */
        range->start = start;
        range->pfns = pfns;
        range->end = end;
        spin_lock(&hmm->lock);
        range->valid = true;
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);

        /* FIXME support hugetlb fs */
        if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
                hmm_pfns_special(pfns, start, end);
                return 0;
        }

        hmm_vma_walk.fault = true;
        hmm_vma_walk.write = write;
        hmm_vma_walk.block = block;
        hmm_vma_walk.range = range;
        mm_walk.private = &hmm_vma_walk;
        hmm_vma_walk.last = range->start;

        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
        mm_walk.pmd_entry = hmm_vma_walk_pmd;
        mm_walk.pte_hole = hmm_vma_walk_hole;

        do {
                ret = walk_page_range(start, end, &mm_walk);
                start = hmm_vma_walk.last;
        } while (ret == -EAGAIN);

        if (ret) {
                unsigned long i;

                i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
                hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
                hmm_vma_range_done(vma, range);
        }
        return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */