Merge tag 'mm-nonmm-stable-2024-05-19-11-56' of git://git.kernel.org/pub/scm/linux...
[sfrench/cifs-2.6.git] / mm / huge_memory.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 2009  Red Hat, Inc.
4  */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/mm.h>
9 #include <linux/sched.h>
10 #include <linux/sched/mm.h>
11 #include <linux/sched/coredump.h>
12 #include <linux/sched/numa_balancing.h>
13 #include <linux/highmem.h>
14 #include <linux/hugetlb.h>
15 #include <linux/mmu_notifier.h>
16 #include <linux/rmap.h>
17 #include <linux/swap.h>
18 #include <linux/shrinker.h>
19 #include <linux/mm_inline.h>
20 #include <linux/swapops.h>
21 #include <linux/backing-dev.h>
22 #include <linux/dax.h>
23 #include <linux/khugepaged.h>
24 #include <linux/freezer.h>
25 #include <linux/pfn_t.h>
26 #include <linux/mman.h>
27 #include <linux/memremap.h>
28 #include <linux/pagemap.h>
29 #include <linux/debugfs.h>
30 #include <linux/migrate.h>
31 #include <linux/hashtable.h>
32 #include <linux/userfaultfd_k.h>
33 #include <linux/page_idle.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/oom.h>
36 #include <linux/numa.h>
37 #include <linux/page_owner.h>
38 #include <linux/sched/sysctl.h>
39 #include <linux/memory-tiers.h>
40 #include <linux/compat.h>
41 #include <linux/pgalloc_tag.h>
42
43 #include <asm/tlb.h>
44 #include <asm/pgalloc.h>
45 #include "internal.h"
46 #include "swap.h"
47
48 #define CREATE_TRACE_POINTS
49 #include <trace/events/thp.h>
50
51 /*
52  * By default, transparent hugepage support is disabled in order to avoid
53  * risking an increased memory footprint for applications that are not
54  * guaranteed to benefit from it. When transparent hugepage support is
55  * enabled, it is for all mappings, and khugepaged scans all mappings.
56  * Defrag is invoked by khugepaged hugepage allocations and by page faults
57  * for all hugepage allocations.
58  */
59 unsigned long transparent_hugepage_flags __read_mostly =
60 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
61         (1<<TRANSPARENT_HUGEPAGE_FLAG)|
62 #endif
63 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
64         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
65 #endif
66         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
67         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
68         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
69
70 static struct shrinker *deferred_split_shrinker;
71 static unsigned long deferred_split_count(struct shrinker *shrink,
72                                           struct shrink_control *sc);
73 static unsigned long deferred_split_scan(struct shrinker *shrink,
74                                          struct shrink_control *sc);
75
76 static atomic_t huge_zero_refcount;
77 struct folio *huge_zero_folio __read_mostly;
78 unsigned long huge_zero_pfn __read_mostly = ~0UL;
79 unsigned long huge_anon_orders_always __read_mostly;
80 unsigned long huge_anon_orders_madvise __read_mostly;
81 unsigned long huge_anon_orders_inherit __read_mostly;
82
83 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
84                                          unsigned long vm_flags,
85                                          unsigned long tva_flags,
86                                          unsigned long orders)
87 {
88         bool smaps = tva_flags & TVA_SMAPS;
89         bool in_pf = tva_flags & TVA_IN_PF;
90         bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
91         /* Check the intersection of requested and supported orders. */
92         orders &= vma_is_anonymous(vma) ?
93                         THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
94         if (!orders)
95                 return 0;
96
97         if (!vma->vm_mm)                /* vdso */
98                 return 0;
99
100         /*
101          * Explicitly disabled through madvise or prctl, or some
102          * architectures may disable THP for some mappings, for
103          * example, s390 kvm.
104          * */
105         if ((vm_flags & VM_NOHUGEPAGE) ||
106             test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
107                 return 0;
108         /*
109          * If the hardware/firmware marked hugepage support disabled.
110          */
111         if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
112                 return 0;
113
114         /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
115         if (vma_is_dax(vma))
116                 return in_pf ? orders : 0;
117
118         /*
119          * khugepaged special VMA and hugetlb VMA.
120          * Must be checked after dax since some dax mappings may have
121          * VM_MIXEDMAP set.
122          */
123         if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
124                 return 0;
125
126         /*
127          * Check alignment for file vma and size for both file and anon vma by
128          * filtering out the unsuitable orders.
129          *
130          * Skip the check for page fault. Huge fault does the check in fault
131          * handlers.
132          */
133         if (!in_pf) {
134                 int order = highest_order(orders);
135                 unsigned long addr;
136
137                 while (orders) {
138                         addr = vma->vm_end - (PAGE_SIZE << order);
139                         if (thp_vma_suitable_order(vma, addr, order))
140                                 break;
141                         order = next_order(&orders, order);
142                 }
143
144                 if (!orders)
145                         return 0;
146         }
147
148         /*
149          * Enabled via shmem mount options or sysfs settings.
150          * Must be done before hugepage flags check since shmem has its
151          * own flags.
152          */
153         if (!in_pf && shmem_file(vma->vm_file))
154                 return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
155                                      !enforce_sysfs, vma->vm_mm, vm_flags)
156                         ? orders : 0;
157
158         if (!vma_is_anonymous(vma)) {
159                 /*
160                  * Enforce sysfs THP requirements as necessary. Anonymous vmas
161                  * were already handled in thp_vma_allowable_orders().
162                  */
163                 if (enforce_sysfs &&
164                     (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
165                                                     !hugepage_global_always())))
166                         return 0;
167
168                 /*
169                  * Trust that ->huge_fault() handlers know what they are doing
170                  * in fault path.
171                  */
172                 if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
173                         return orders;
174                 /* Only regular file is valid in collapse path */
175                 if (((!in_pf || smaps)) && file_thp_enabled(vma))
176                         return orders;
177                 return 0;
178         }
179
180         if (vma_is_temporary_stack(vma))
181                 return 0;
182
183         /*
184          * THPeligible bit of smaps should show 1 for proper VMAs even
185          * though anon_vma is not initialized yet.
186          *
187          * Allow page fault since anon_vma may be not initialized until
188          * the first page fault.
189          */
190         if (!vma->anon_vma)
191                 return (smaps || in_pf) ? orders : 0;
192
193         return orders;
194 }
195
196 static bool get_huge_zero_page(void)
197 {
198         struct folio *zero_folio;
199 retry:
200         if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
201                 return true;
202
203         zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
204                         HPAGE_PMD_ORDER);
205         if (!zero_folio) {
206                 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
207                 return false;
208         }
209         preempt_disable();
210         if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
211                 preempt_enable();
212                 folio_put(zero_folio);
213                 goto retry;
214         }
215         WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
216
217         /* We take additional reference here. It will be put back by shrinker */
218         atomic_set(&huge_zero_refcount, 2);
219         preempt_enable();
220         count_vm_event(THP_ZERO_PAGE_ALLOC);
221         return true;
222 }
223
224 static void put_huge_zero_page(void)
225 {
226         /*
227          * Counter should never go to zero here. Only shrinker can put
228          * last reference.
229          */
230         BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
231 }
232
233 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
234 {
235         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
236                 return READ_ONCE(huge_zero_folio);
237
238         if (!get_huge_zero_page())
239                 return NULL;
240
241         if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
242                 put_huge_zero_page();
243
244         return READ_ONCE(huge_zero_folio);
245 }
246
247 void mm_put_huge_zero_folio(struct mm_struct *mm)
248 {
249         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
250                 put_huge_zero_page();
251 }
252
253 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
254                                         struct shrink_control *sc)
255 {
256         /* we can free zero page only if last reference remains */
257         return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
258 }
259
260 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
261                                        struct shrink_control *sc)
262 {
263         if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
264                 struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
265                 BUG_ON(zero_folio == NULL);
266                 WRITE_ONCE(huge_zero_pfn, ~0UL);
267                 folio_put(zero_folio);
268                 return HPAGE_PMD_NR;
269         }
270
271         return 0;
272 }
273
274 static struct shrinker *huge_zero_page_shrinker;
275
276 #ifdef CONFIG_SYSFS
277 static ssize_t enabled_show(struct kobject *kobj,
278                             struct kobj_attribute *attr, char *buf)
279 {
280         const char *output;
281
282         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
283                 output = "[always] madvise never";
284         else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
285                           &transparent_hugepage_flags))
286                 output = "always [madvise] never";
287         else
288                 output = "always madvise [never]";
289
290         return sysfs_emit(buf, "%s\n", output);
291 }
292
293 static ssize_t enabled_store(struct kobject *kobj,
294                              struct kobj_attribute *attr,
295                              const char *buf, size_t count)
296 {
297         ssize_t ret = count;
298
299         if (sysfs_streq(buf, "always")) {
300                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
301                 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
302         } else if (sysfs_streq(buf, "madvise")) {
303                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
304                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
305         } else if (sysfs_streq(buf, "never")) {
306                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
307                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
308         } else
309                 ret = -EINVAL;
310
311         if (ret > 0) {
312                 int err = start_stop_khugepaged();
313                 if (err)
314                         ret = err;
315         }
316         return ret;
317 }
318
319 static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
320
321 ssize_t single_hugepage_flag_show(struct kobject *kobj,
322                                   struct kobj_attribute *attr, char *buf,
323                                   enum transparent_hugepage_flag flag)
324 {
325         return sysfs_emit(buf, "%d\n",
326                           !!test_bit(flag, &transparent_hugepage_flags));
327 }
328
329 ssize_t single_hugepage_flag_store(struct kobject *kobj,
330                                  struct kobj_attribute *attr,
331                                  const char *buf, size_t count,
332                                  enum transparent_hugepage_flag flag)
333 {
334         unsigned long value;
335         int ret;
336
337         ret = kstrtoul(buf, 10, &value);
338         if (ret < 0)
339                 return ret;
340         if (value > 1)
341                 return -EINVAL;
342
343         if (value)
344                 set_bit(flag, &transparent_hugepage_flags);
345         else
346                 clear_bit(flag, &transparent_hugepage_flags);
347
348         return count;
349 }
350
351 static ssize_t defrag_show(struct kobject *kobj,
352                            struct kobj_attribute *attr, char *buf)
353 {
354         const char *output;
355
356         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
357                      &transparent_hugepage_flags))
358                 output = "[always] defer defer+madvise madvise never";
359         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
360                           &transparent_hugepage_flags))
361                 output = "always [defer] defer+madvise madvise never";
362         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
363                           &transparent_hugepage_flags))
364                 output = "always defer [defer+madvise] madvise never";
365         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
366                           &transparent_hugepage_flags))
367                 output = "always defer defer+madvise [madvise] never";
368         else
369                 output = "always defer defer+madvise madvise [never]";
370
371         return sysfs_emit(buf, "%s\n", output);
372 }
373
374 static ssize_t defrag_store(struct kobject *kobj,
375                             struct kobj_attribute *attr,
376                             const char *buf, size_t count)
377 {
378         if (sysfs_streq(buf, "always")) {
379                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
380                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
381                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
382                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
383         } else if (sysfs_streq(buf, "defer+madvise")) {
384                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
385                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
386                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
387                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
388         } else if (sysfs_streq(buf, "defer")) {
389                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
390                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
391                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
392                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
393         } else if (sysfs_streq(buf, "madvise")) {
394                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
395                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
396                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
397                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
398         } else if (sysfs_streq(buf, "never")) {
399                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
400                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
401                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
402                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
403         } else
404                 return -EINVAL;
405
406         return count;
407 }
408 static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
409
410 static ssize_t use_zero_page_show(struct kobject *kobj,
411                                   struct kobj_attribute *attr, char *buf)
412 {
413         return single_hugepage_flag_show(kobj, attr, buf,
414                                          TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
415 }
416 static ssize_t use_zero_page_store(struct kobject *kobj,
417                 struct kobj_attribute *attr, const char *buf, size_t count)
418 {
419         return single_hugepage_flag_store(kobj, attr, buf, count,
420                                  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
421 }
422 static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
423
424 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
425                                    struct kobj_attribute *attr, char *buf)
426 {
427         return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
428 }
429 static struct kobj_attribute hpage_pmd_size_attr =
430         __ATTR_RO(hpage_pmd_size);
431
432 static struct attribute *hugepage_attr[] = {
433         &enabled_attr.attr,
434         &defrag_attr.attr,
435         &use_zero_page_attr.attr,
436         &hpage_pmd_size_attr.attr,
437 #ifdef CONFIG_SHMEM
438         &shmem_enabled_attr.attr,
439 #endif
440         NULL,
441 };
442
443 static const struct attribute_group hugepage_attr_group = {
444         .attrs = hugepage_attr,
445 };
446
447 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
448 static void thpsize_release(struct kobject *kobj);
449 static DEFINE_SPINLOCK(huge_anon_orders_lock);
450 static LIST_HEAD(thpsize_list);
451
452 struct thpsize {
453         struct kobject kobj;
454         struct list_head node;
455         int order;
456 };
457
458 #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
459
460 static ssize_t thpsize_enabled_show(struct kobject *kobj,
461                                     struct kobj_attribute *attr, char *buf)
462 {
463         int order = to_thpsize(kobj)->order;
464         const char *output;
465
466         if (test_bit(order, &huge_anon_orders_always))
467                 output = "[always] inherit madvise never";
468         else if (test_bit(order, &huge_anon_orders_inherit))
469                 output = "always [inherit] madvise never";
470         else if (test_bit(order, &huge_anon_orders_madvise))
471                 output = "always inherit [madvise] never";
472         else
473                 output = "always inherit madvise [never]";
474
475         return sysfs_emit(buf, "%s\n", output);
476 }
477
478 static ssize_t thpsize_enabled_store(struct kobject *kobj,
479                                      struct kobj_attribute *attr,
480                                      const char *buf, size_t count)
481 {
482         int order = to_thpsize(kobj)->order;
483         ssize_t ret = count;
484
485         if (sysfs_streq(buf, "always")) {
486                 spin_lock(&huge_anon_orders_lock);
487                 clear_bit(order, &huge_anon_orders_inherit);
488                 clear_bit(order, &huge_anon_orders_madvise);
489                 set_bit(order, &huge_anon_orders_always);
490                 spin_unlock(&huge_anon_orders_lock);
491         } else if (sysfs_streq(buf, "inherit")) {
492                 spin_lock(&huge_anon_orders_lock);
493                 clear_bit(order, &huge_anon_orders_always);
494                 clear_bit(order, &huge_anon_orders_madvise);
495                 set_bit(order, &huge_anon_orders_inherit);
496                 spin_unlock(&huge_anon_orders_lock);
497         } else if (sysfs_streq(buf, "madvise")) {
498                 spin_lock(&huge_anon_orders_lock);
499                 clear_bit(order, &huge_anon_orders_always);
500                 clear_bit(order, &huge_anon_orders_inherit);
501                 set_bit(order, &huge_anon_orders_madvise);
502                 spin_unlock(&huge_anon_orders_lock);
503         } else if (sysfs_streq(buf, "never")) {
504                 spin_lock(&huge_anon_orders_lock);
505                 clear_bit(order, &huge_anon_orders_always);
506                 clear_bit(order, &huge_anon_orders_inherit);
507                 clear_bit(order, &huge_anon_orders_madvise);
508                 spin_unlock(&huge_anon_orders_lock);
509         } else
510                 ret = -EINVAL;
511
512         return ret;
513 }
514
515 static struct kobj_attribute thpsize_enabled_attr =
516         __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
517
518 static struct attribute *thpsize_attrs[] = {
519         &thpsize_enabled_attr.attr,
520         NULL,
521 };
522
523 static const struct attribute_group thpsize_attr_group = {
524         .attrs = thpsize_attrs,
525 };
526
527 static const struct kobj_type thpsize_ktype = {
528         .release = &thpsize_release,
529         .sysfs_ops = &kobj_sysfs_ops,
530 };
531
532 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
533
534 static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
535 {
536         unsigned long sum = 0;
537         int cpu;
538
539         for_each_possible_cpu(cpu) {
540                 struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
541
542                 sum += this->stats[order][item];
543         }
544
545         return sum;
546 }
547
548 #define DEFINE_MTHP_STAT_ATTR(_name, _index)                            \
549 static ssize_t _name##_show(struct kobject *kobj,                       \
550                         struct kobj_attribute *attr, char *buf)         \
551 {                                                                       \
552         int order = to_thpsize(kobj)->order;                            \
553                                                                         \
554         return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
555 }                                                                       \
556 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
557
558 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
559 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
560 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
561 DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
562 DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
563
564 static struct attribute *stats_attrs[] = {
565         &anon_fault_alloc_attr.attr,
566         &anon_fault_fallback_attr.attr,
567         &anon_fault_fallback_charge_attr.attr,
568         &anon_swpout_attr.attr,
569         &anon_swpout_fallback_attr.attr,
570         NULL,
571 };
572
573 static struct attribute_group stats_attr_group = {
574         .name = "stats",
575         .attrs = stats_attrs,
576 };
577
578 static struct thpsize *thpsize_create(int order, struct kobject *parent)
579 {
580         unsigned long size = (PAGE_SIZE << order) / SZ_1K;
581         struct thpsize *thpsize;
582         int ret;
583
584         thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
585         if (!thpsize)
586                 return ERR_PTR(-ENOMEM);
587
588         ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
589                                    "hugepages-%lukB", size);
590         if (ret) {
591                 kfree(thpsize);
592                 return ERR_PTR(ret);
593         }
594
595         ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
596         if (ret) {
597                 kobject_put(&thpsize->kobj);
598                 return ERR_PTR(ret);
599         }
600
601         ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
602         if (ret) {
603                 kobject_put(&thpsize->kobj);
604                 return ERR_PTR(ret);
605         }
606
607         thpsize->order = order;
608         return thpsize;
609 }
610
611 static void thpsize_release(struct kobject *kobj)
612 {
613         kfree(to_thpsize(kobj));
614 }
615
616 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
617 {
618         int err;
619         struct thpsize *thpsize;
620         unsigned long orders;
621         int order;
622
623         /*
624          * Default to setting PMD-sized THP to inherit the global setting and
625          * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
626          * constant so we have to do this here.
627          */
628         huge_anon_orders_inherit = BIT(PMD_ORDER);
629
630         *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
631         if (unlikely(!*hugepage_kobj)) {
632                 pr_err("failed to create transparent hugepage kobject\n");
633                 return -ENOMEM;
634         }
635
636         err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
637         if (err) {
638                 pr_err("failed to register transparent hugepage group\n");
639                 goto delete_obj;
640         }
641
642         err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
643         if (err) {
644                 pr_err("failed to register transparent hugepage group\n");
645                 goto remove_hp_group;
646         }
647
648         orders = THP_ORDERS_ALL_ANON;
649         order = highest_order(orders);
650         while (orders) {
651                 thpsize = thpsize_create(order, *hugepage_kobj);
652                 if (IS_ERR(thpsize)) {
653                         pr_err("failed to create thpsize for order %d\n", order);
654                         err = PTR_ERR(thpsize);
655                         goto remove_all;
656                 }
657                 list_add(&thpsize->node, &thpsize_list);
658                 order = next_order(&orders, order);
659         }
660
661         return 0;
662
663 remove_all:
664         hugepage_exit_sysfs(*hugepage_kobj);
665         return err;
666 remove_hp_group:
667         sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
668 delete_obj:
669         kobject_put(*hugepage_kobj);
670         return err;
671 }
672
673 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
674 {
675         struct thpsize *thpsize, *tmp;
676
677         list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
678                 list_del(&thpsize->node);
679                 kobject_put(&thpsize->kobj);
680         }
681
682         sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
683         sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
684         kobject_put(hugepage_kobj);
685 }
686 #else
687 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
688 {
689         return 0;
690 }
691
692 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
693 {
694 }
695 #endif /* CONFIG_SYSFS */
696
697 static int __init thp_shrinker_init(void)
698 {
699         huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
700         if (!huge_zero_page_shrinker)
701                 return -ENOMEM;
702
703         deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
704                                                  SHRINKER_MEMCG_AWARE |
705                                                  SHRINKER_NONSLAB,
706                                                  "thp-deferred_split");
707         if (!deferred_split_shrinker) {
708                 shrinker_free(huge_zero_page_shrinker);
709                 return -ENOMEM;
710         }
711
712         huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
713         huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
714         shrinker_register(huge_zero_page_shrinker);
715
716         deferred_split_shrinker->count_objects = deferred_split_count;
717         deferred_split_shrinker->scan_objects = deferred_split_scan;
718         shrinker_register(deferred_split_shrinker);
719
720         return 0;
721 }
722
723 static void __init thp_shrinker_exit(void)
724 {
725         shrinker_free(huge_zero_page_shrinker);
726         shrinker_free(deferred_split_shrinker);
727 }
728
729 static int __init hugepage_init(void)
730 {
731         int err;
732         struct kobject *hugepage_kobj;
733
734         if (!has_transparent_hugepage()) {
735                 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
736                 return -EINVAL;
737         }
738
739         /*
740          * hugepages can't be allocated by the buddy allocator
741          */
742         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
743
744         err = hugepage_init_sysfs(&hugepage_kobj);
745         if (err)
746                 goto err_sysfs;
747
748         err = khugepaged_init();
749         if (err)
750                 goto err_slab;
751
752         err = thp_shrinker_init();
753         if (err)
754                 goto err_shrinker;
755
756         /*
757          * By default disable transparent hugepages on smaller systems,
758          * where the extra memory used could hurt more than TLB overhead
759          * is likely to save.  The admin can still enable it through /sys.
760          */
761         if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
762                 transparent_hugepage_flags = 0;
763                 return 0;
764         }
765
766         err = start_stop_khugepaged();
767         if (err)
768                 goto err_khugepaged;
769
770         return 0;
771 err_khugepaged:
772         thp_shrinker_exit();
773 err_shrinker:
774         khugepaged_destroy();
775 err_slab:
776         hugepage_exit_sysfs(hugepage_kobj);
777 err_sysfs:
778         return err;
779 }
780 subsys_initcall(hugepage_init);
781
782 static int __init setup_transparent_hugepage(char *str)
783 {
784         int ret = 0;
785         if (!str)
786                 goto out;
787         if (!strcmp(str, "always")) {
788                 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
789                         &transparent_hugepage_flags);
790                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
791                           &transparent_hugepage_flags);
792                 ret = 1;
793         } else if (!strcmp(str, "madvise")) {
794                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
795                           &transparent_hugepage_flags);
796                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
797                         &transparent_hugepage_flags);
798                 ret = 1;
799         } else if (!strcmp(str, "never")) {
800                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
801                           &transparent_hugepage_flags);
802                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
803                           &transparent_hugepage_flags);
804                 ret = 1;
805         }
806 out:
807         if (!ret)
808                 pr_warn("transparent_hugepage= cannot parse, ignored\n");
809         return ret;
810 }
811 __setup("transparent_hugepage=", setup_transparent_hugepage);
812
813 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
814 {
815         if (likely(vma->vm_flags & VM_WRITE))
816                 pmd = pmd_mkwrite(pmd, vma);
817         return pmd;
818 }
819
820 #ifdef CONFIG_MEMCG
821 static inline
822 struct deferred_split *get_deferred_split_queue(struct folio *folio)
823 {
824         struct mem_cgroup *memcg = folio_memcg(folio);
825         struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
826
827         if (memcg)
828                 return &memcg->deferred_split_queue;
829         else
830                 return &pgdat->deferred_split_queue;
831 }
832 #else
833 static inline
834 struct deferred_split *get_deferred_split_queue(struct folio *folio)
835 {
836         struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
837
838         return &pgdat->deferred_split_queue;
839 }
840 #endif
841
842 static inline bool is_transparent_hugepage(const struct folio *folio)
843 {
844         if (!folio_test_large(folio))
845                 return false;
846
847         return is_huge_zero_folio(folio) ||
848                 folio_test_large_rmappable(folio);
849 }
850
851 static unsigned long __thp_get_unmapped_area(struct file *filp,
852                 unsigned long addr, unsigned long len,
853                 loff_t off, unsigned long flags, unsigned long size,
854                 vm_flags_t vm_flags)
855 {
856         loff_t off_end = off + len;
857         loff_t off_align = round_up(off, size);
858         unsigned long len_pad, ret, off_sub;
859
860         if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
861                 return 0;
862
863         if (off_end <= off_align || (off_end - off_align) < size)
864                 return 0;
865
866         len_pad = len + size;
867         if (len_pad < len || (off + len_pad) < off)
868                 return 0;
869
870         ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
871                                            off >> PAGE_SHIFT, flags, vm_flags);
872
873         /*
874          * The failure might be due to length padding. The caller will retry
875          * without the padding.
876          */
877         if (IS_ERR_VALUE(ret))
878                 return 0;
879
880         /*
881          * Do not try to align to THP boundary if allocation at the address
882          * hint succeeds.
883          */
884         if (ret == addr)
885                 return addr;
886
887         off_sub = (off - ret) & (size - 1);
888
889         if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
890                 return ret + size;
891
892         ret += off_sub;
893         return ret;
894 }
895
896 unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
897                 unsigned long len, unsigned long pgoff, unsigned long flags,
898                 vm_flags_t vm_flags)
899 {
900         unsigned long ret;
901         loff_t off = (loff_t)pgoff << PAGE_SHIFT;
902
903         ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
904         if (ret)
905                 return ret;
906
907         return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
908                                             vm_flags);
909 }
910
911 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
912                 unsigned long len, unsigned long pgoff, unsigned long flags)
913 {
914         return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
915 }
916 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
917
918 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
919                         struct page *page, gfp_t gfp)
920 {
921         struct vm_area_struct *vma = vmf->vma;
922         struct folio *folio = page_folio(page);
923         pgtable_t pgtable;
924         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
925         vm_fault_t ret = 0;
926
927         VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
928
929         if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
930                 folio_put(folio);
931                 count_vm_event(THP_FAULT_FALLBACK);
932                 count_vm_event(THP_FAULT_FALLBACK_CHARGE);
933                 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
934                 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
935                 return VM_FAULT_FALLBACK;
936         }
937         folio_throttle_swaprate(folio, gfp);
938
939         pgtable = pte_alloc_one(vma->vm_mm);
940         if (unlikely(!pgtable)) {
941                 ret = VM_FAULT_OOM;
942                 goto release;
943         }
944
945         clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
946         /*
947          * The memory barrier inside __folio_mark_uptodate makes sure that
948          * clear_huge_page writes become visible before the set_pmd_at()
949          * write.
950          */
951         __folio_mark_uptodate(folio);
952
953         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
954         if (unlikely(!pmd_none(*vmf->pmd))) {
955                 goto unlock_release;
956         } else {
957                 pmd_t entry;
958
959                 ret = check_stable_address_space(vma->vm_mm);
960                 if (ret)
961                         goto unlock_release;
962
963                 /* Deliver the page fault to userland */
964                 if (userfaultfd_missing(vma)) {
965                         spin_unlock(vmf->ptl);
966                         folio_put(folio);
967                         pte_free(vma->vm_mm, pgtable);
968                         ret = handle_userfault(vmf, VM_UFFD_MISSING);
969                         VM_BUG_ON(ret & VM_FAULT_FALLBACK);
970                         return ret;
971                 }
972
973                 entry = mk_huge_pmd(page, vma->vm_page_prot);
974                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
975                 folio_add_new_anon_rmap(folio, vma, haddr);
976                 folio_add_lru_vma(folio, vma);
977                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
978                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
979                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
980                 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
981                 mm_inc_nr_ptes(vma->vm_mm);
982                 spin_unlock(vmf->ptl);
983                 count_vm_event(THP_FAULT_ALLOC);
984                 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
985                 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
986         }
987
988         return 0;
989 unlock_release:
990         spin_unlock(vmf->ptl);
991 release:
992         if (pgtable)
993                 pte_free(vma->vm_mm, pgtable);
994         folio_put(folio);
995         return ret;
996
997 }
998
999 /*
1000  * always: directly stall for all thp allocations
1001  * defer: wake kswapd and fail if not immediately available
1002  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
1003  *                fail if not immediately available
1004  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
1005  *          available
1006  * never: never stall for any thp allocation
1007  */
1008 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
1009 {
1010         const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
1011
1012         /* Always do synchronous compaction */
1013         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
1014                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
1015
1016         /* Kick kcompactd and fail quickly */
1017         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
1018                 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
1019
1020         /* Synchronous compaction if madvised, otherwise kick kcompactd */
1021         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
1022                 return GFP_TRANSHUGE_LIGHT |
1023                         (vma_madvised ? __GFP_DIRECT_RECLAIM :
1024                                         __GFP_KSWAPD_RECLAIM);
1025
1026         /* Only do synchronous compaction if madvised */
1027         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
1028                 return GFP_TRANSHUGE_LIGHT |
1029                        (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
1030
1031         return GFP_TRANSHUGE_LIGHT;
1032 }
1033
1034 /* Caller must hold page table lock. */
1035 static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
1036                 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
1037                 struct folio *zero_folio)
1038 {
1039         pmd_t entry;
1040         if (!pmd_none(*pmd))
1041                 return;
1042         entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
1043         entry = pmd_mkhuge(entry);
1044         pgtable_trans_huge_deposit(mm, pmd, pgtable);
1045         set_pmd_at(mm, haddr, pmd, entry);
1046         mm_inc_nr_ptes(mm);
1047 }
1048
1049 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1050 {
1051         struct vm_area_struct *vma = vmf->vma;
1052         gfp_t gfp;
1053         struct folio *folio;
1054         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1055         vm_fault_t ret;
1056
1057         if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
1058                 return VM_FAULT_FALLBACK;
1059         ret = vmf_anon_prepare(vmf);
1060         if (ret)
1061                 return ret;
1062         khugepaged_enter_vma(vma, vma->vm_flags);
1063
1064         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1065                         !mm_forbids_zeropage(vma->vm_mm) &&
1066                         transparent_hugepage_use_zero_page()) {
1067                 pgtable_t pgtable;
1068                 struct folio *zero_folio;
1069                 vm_fault_t ret;
1070
1071                 pgtable = pte_alloc_one(vma->vm_mm);
1072                 if (unlikely(!pgtable))
1073                         return VM_FAULT_OOM;
1074                 zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
1075                 if (unlikely(!zero_folio)) {
1076                         pte_free(vma->vm_mm, pgtable);
1077                         count_vm_event(THP_FAULT_FALLBACK);
1078                         return VM_FAULT_FALLBACK;
1079                 }
1080                 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1081                 ret = 0;
1082                 if (pmd_none(*vmf->pmd)) {
1083                         ret = check_stable_address_space(vma->vm_mm);
1084                         if (ret) {
1085                                 spin_unlock(vmf->ptl);
1086                                 pte_free(vma->vm_mm, pgtable);
1087                         } else if (userfaultfd_missing(vma)) {
1088                                 spin_unlock(vmf->ptl);
1089                                 pte_free(vma->vm_mm, pgtable);
1090                                 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1091                                 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1092                         } else {
1093                                 set_huge_zero_folio(pgtable, vma->vm_mm, vma,
1094                                                    haddr, vmf->pmd, zero_folio);
1095                                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1096                                 spin_unlock(vmf->ptl);
1097                         }
1098                 } else {
1099                         spin_unlock(vmf->ptl);
1100                         pte_free(vma->vm_mm, pgtable);
1101                 }
1102                 return ret;
1103         }
1104         gfp = vma_thp_gfp_mask(vma);
1105         folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
1106         if (unlikely(!folio)) {
1107                 count_vm_event(THP_FAULT_FALLBACK);
1108                 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
1109                 return VM_FAULT_FALLBACK;
1110         }
1111         return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
1112 }
1113
1114 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
1115                 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
1116                 pgtable_t pgtable)
1117 {
1118         struct mm_struct *mm = vma->vm_mm;
1119         pmd_t entry;
1120         spinlock_t *ptl;
1121
1122         ptl = pmd_lock(mm, pmd);
1123         if (!pmd_none(*pmd)) {
1124                 if (write) {
1125                         if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
1126                                 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1127                                 goto out_unlock;
1128                         }
1129                         entry = pmd_mkyoung(*pmd);
1130                         entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1131                         if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
1132                                 update_mmu_cache_pmd(vma, addr, pmd);
1133                 }
1134
1135                 goto out_unlock;
1136         }
1137
1138         entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
1139         if (pfn_t_devmap(pfn))
1140                 entry = pmd_mkdevmap(entry);
1141         if (write) {
1142                 entry = pmd_mkyoung(pmd_mkdirty(entry));
1143                 entry = maybe_pmd_mkwrite(entry, vma);
1144         }
1145
1146         if (pgtable) {
1147                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1148                 mm_inc_nr_ptes(mm);
1149                 pgtable = NULL;
1150         }
1151
1152         set_pmd_at(mm, addr, pmd, entry);
1153         update_mmu_cache_pmd(vma, addr, pmd);
1154
1155 out_unlock:
1156         spin_unlock(ptl);
1157         if (pgtable)
1158                 pte_free(mm, pgtable);
1159 }
1160
1161 /**
1162  * vmf_insert_pfn_pmd - insert a pmd size pfn
1163  * @vmf: Structure describing the fault
1164  * @pfn: pfn to insert
1165  * @write: whether it's a write fault
1166  *
1167  * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
1168  *
1169  * Return: vm_fault_t value.
1170  */
1171 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
1172 {
1173         unsigned long addr = vmf->address & PMD_MASK;
1174         struct vm_area_struct *vma = vmf->vma;
1175         pgprot_t pgprot = vma->vm_page_prot;
1176         pgtable_t pgtable = NULL;
1177
1178         /*
1179          * If we had pmd_special, we could avoid all these restrictions,
1180          * but we need to be consistent with PTEs and architectures that
1181          * can't support a 'special' bit.
1182          */
1183         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1184                         !pfn_t_devmap(pfn));
1185         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1186                                                 (VM_PFNMAP|VM_MIXEDMAP));
1187         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1188
1189         if (addr < vma->vm_start || addr >= vma->vm_end)
1190                 return VM_FAULT_SIGBUS;
1191
1192         if (arch_needs_pgtable_deposit()) {
1193                 pgtable = pte_alloc_one(vma->vm_mm);
1194                 if (!pgtable)
1195                         return VM_FAULT_OOM;
1196         }
1197
1198         track_pfn_insert(vma, &pgprot, pfn);
1199
1200         insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
1201         return VM_FAULT_NOPAGE;
1202 }
1203 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
1204
1205 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1206 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1207 {
1208         if (likely(vma->vm_flags & VM_WRITE))
1209                 pud = pud_mkwrite(pud);
1210         return pud;
1211 }
1212
1213 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
1214                 pud_t *pud, pfn_t pfn, bool write)
1215 {
1216         struct mm_struct *mm = vma->vm_mm;
1217         pgprot_t prot = vma->vm_page_prot;
1218         pud_t entry;
1219         spinlock_t *ptl;
1220
1221         ptl = pud_lock(mm, pud);
1222         if (!pud_none(*pud)) {
1223                 if (write) {
1224                         if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
1225                                 WARN_ON_ONCE(!is_huge_zero_pud(*pud));
1226                                 goto out_unlock;
1227                         }
1228                         entry = pud_mkyoung(*pud);
1229                         entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1230                         if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1231                                 update_mmu_cache_pud(vma, addr, pud);
1232                 }
1233                 goto out_unlock;
1234         }
1235
1236         entry = pud_mkhuge(pfn_t_pud(pfn, prot));
1237         if (pfn_t_devmap(pfn))
1238                 entry = pud_mkdevmap(entry);
1239         if (write) {
1240                 entry = pud_mkyoung(pud_mkdirty(entry));
1241                 entry = maybe_pud_mkwrite(entry, vma);
1242         }
1243         set_pud_at(mm, addr, pud, entry);
1244         update_mmu_cache_pud(vma, addr, pud);
1245
1246 out_unlock:
1247         spin_unlock(ptl);
1248 }
1249
1250 /**
1251  * vmf_insert_pfn_pud - insert a pud size pfn
1252  * @vmf: Structure describing the fault
1253  * @pfn: pfn to insert
1254  * @write: whether it's a write fault
1255  *
1256  * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
1257  *
1258  * Return: vm_fault_t value.
1259  */
1260 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
1261 {
1262         unsigned long addr = vmf->address & PUD_MASK;
1263         struct vm_area_struct *vma = vmf->vma;
1264         pgprot_t pgprot = vma->vm_page_prot;
1265
1266         /*
1267          * If we had pud_special, we could avoid all these restrictions,
1268          * but we need to be consistent with PTEs and architectures that
1269          * can't support a 'special' bit.
1270          */
1271         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1272                         !pfn_t_devmap(pfn));
1273         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1274                                                 (VM_PFNMAP|VM_MIXEDMAP));
1275         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1276
1277         if (addr < vma->vm_start || addr >= vma->vm_end)
1278                 return VM_FAULT_SIGBUS;
1279
1280         track_pfn_insert(vma, &pgprot, pfn);
1281
1282         insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
1283         return VM_FAULT_NOPAGE;
1284 }
1285 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1286 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1287
1288 void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1289                pmd_t *pmd, bool write)
1290 {
1291         pmd_t _pmd;
1292
1293         _pmd = pmd_mkyoung(*pmd);
1294         if (write)
1295                 _pmd = pmd_mkdirty(_pmd);
1296         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1297                                   pmd, _pmd, write))
1298                 update_mmu_cache_pmd(vma, addr, pmd);
1299 }
1300
1301 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
1302                 pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
1303 {
1304         unsigned long pfn = pmd_pfn(*pmd);
1305         struct mm_struct *mm = vma->vm_mm;
1306         struct page *page;
1307         int ret;
1308
1309         assert_spin_locked(pmd_lockptr(mm, pmd));
1310
1311         if (flags & FOLL_WRITE && !pmd_write(*pmd))
1312                 return NULL;
1313
1314         if (pmd_present(*pmd) && pmd_devmap(*pmd))
1315                 /* pass */;
1316         else
1317                 return NULL;
1318
1319         if (flags & FOLL_TOUCH)
1320                 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
1321
1322         /*
1323          * device mapped pages can only be returned if the
1324          * caller will manage the page reference count.
1325          */
1326         if (!(flags & (FOLL_GET | FOLL_PIN)))
1327                 return ERR_PTR(-EEXIST);
1328
1329         pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1330         *pgmap = get_dev_pagemap(pfn, *pgmap);
1331         if (!*pgmap)
1332                 return ERR_PTR(-EFAULT);
1333         page = pfn_to_page(pfn);
1334         ret = try_grab_page(page, flags);
1335         if (ret)
1336                 page = ERR_PTR(ret);
1337
1338         return page;
1339 }
1340
1341 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1342                   pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1343                   struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1344 {
1345         spinlock_t *dst_ptl, *src_ptl;
1346         struct page *src_page;
1347         struct folio *src_folio;
1348         pmd_t pmd;
1349         pgtable_t pgtable = NULL;
1350         int ret = -ENOMEM;
1351
1352         /* Skip if can be re-fill on fault */
1353         if (!vma_is_anonymous(dst_vma))
1354                 return 0;
1355
1356         pgtable = pte_alloc_one(dst_mm);
1357         if (unlikely(!pgtable))
1358                 goto out;
1359
1360         dst_ptl = pmd_lock(dst_mm, dst_pmd);
1361         src_ptl = pmd_lockptr(src_mm, src_pmd);
1362         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1363
1364         ret = -EAGAIN;
1365         pmd = *src_pmd;
1366
1367 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1368         if (unlikely(is_swap_pmd(pmd))) {
1369                 swp_entry_t entry = pmd_to_swp_entry(pmd);
1370
1371                 VM_BUG_ON(!is_pmd_migration_entry(pmd));
1372                 if (!is_readable_migration_entry(entry)) {
1373                         entry = make_readable_migration_entry(
1374                                                         swp_offset(entry));
1375                         pmd = swp_entry_to_pmd(entry);
1376                         if (pmd_swp_soft_dirty(*src_pmd))
1377                                 pmd = pmd_swp_mksoft_dirty(pmd);
1378                         if (pmd_swp_uffd_wp(*src_pmd))
1379                                 pmd = pmd_swp_mkuffd_wp(pmd);
1380                         set_pmd_at(src_mm, addr, src_pmd, pmd);
1381                 }
1382                 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1383                 mm_inc_nr_ptes(dst_mm);
1384                 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1385                 if (!userfaultfd_wp(dst_vma))
1386                         pmd = pmd_swp_clear_uffd_wp(pmd);
1387                 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1388                 ret = 0;
1389                 goto out_unlock;
1390         }
1391 #endif
1392
1393         if (unlikely(!pmd_trans_huge(pmd))) {
1394                 pte_free(dst_mm, pgtable);
1395                 goto out_unlock;
1396         }
1397         /*
1398          * When page table lock is held, the huge zero pmd should not be
1399          * under splitting since we don't split the page itself, only pmd to
1400          * a page table.
1401          */
1402         if (is_huge_zero_pmd(pmd)) {
1403                 /*
1404                  * mm_get_huge_zero_folio() will never allocate a new
1405                  * folio here, since we already have a zero page to
1406                  * copy. It just takes a reference.
1407                  */
1408                 mm_get_huge_zero_folio(dst_mm);
1409                 goto out_zero_page;
1410         }
1411
1412         src_page = pmd_page(pmd);
1413         VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1414         src_folio = page_folio(src_page);
1415
1416         folio_get(src_folio);
1417         if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
1418                 /* Page maybe pinned: split and retry the fault on PTEs. */
1419                 folio_put(src_folio);
1420                 pte_free(dst_mm, pgtable);
1421                 spin_unlock(src_ptl);
1422                 spin_unlock(dst_ptl);
1423                 __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
1424                 return -EAGAIN;
1425         }
1426         add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1427 out_zero_page:
1428         mm_inc_nr_ptes(dst_mm);
1429         pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1430         pmdp_set_wrprotect(src_mm, addr, src_pmd);
1431         if (!userfaultfd_wp(dst_vma))
1432                 pmd = pmd_clear_uffd_wp(pmd);
1433         pmd = pmd_mkold(pmd_wrprotect(pmd));
1434         set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1435
1436         ret = 0;
1437 out_unlock:
1438         spin_unlock(src_ptl);
1439         spin_unlock(dst_ptl);
1440 out:
1441         return ret;
1442 }
1443
1444 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1445 void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1446                pud_t *pud, bool write)
1447 {
1448         pud_t _pud;
1449
1450         _pud = pud_mkyoung(*pud);
1451         if (write)
1452                 _pud = pud_mkdirty(_pud);
1453         if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1454                                   pud, _pud, write))
1455                 update_mmu_cache_pud(vma, addr, pud);
1456 }
1457
1458 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1459                   pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1460                   struct vm_area_struct *vma)
1461 {
1462         spinlock_t *dst_ptl, *src_ptl;
1463         pud_t pud;
1464         int ret;
1465
1466         dst_ptl = pud_lock(dst_mm, dst_pud);
1467         src_ptl = pud_lockptr(src_mm, src_pud);
1468         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1469
1470         ret = -EAGAIN;
1471         pud = *src_pud;
1472         if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1473                 goto out_unlock;
1474
1475         /*
1476          * When page table lock is held, the huge zero pud should not be
1477          * under splitting since we don't split the page itself, only pud to
1478          * a page table.
1479          */
1480         if (is_huge_zero_pud(pud)) {
1481                 /* No huge zero pud yet */
1482         }
1483
1484         /*
1485          * TODO: once we support anonymous pages, use
1486          * folio_try_dup_anon_rmap_*() and split if duplicating fails.
1487          */
1488         pudp_set_wrprotect(src_mm, addr, src_pud);
1489         pud = pud_mkold(pud_wrprotect(pud));
1490         set_pud_at(dst_mm, addr, dst_pud, pud);
1491
1492         ret = 0;
1493 out_unlock:
1494         spin_unlock(src_ptl);
1495         spin_unlock(dst_ptl);
1496         return ret;
1497 }
1498
1499 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1500 {
1501         bool write = vmf->flags & FAULT_FLAG_WRITE;
1502
1503         vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1504         if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1505                 goto unlock;
1506
1507         touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1508 unlock:
1509         spin_unlock(vmf->ptl);
1510 }
1511 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1512
1513 void huge_pmd_set_accessed(struct vm_fault *vmf)
1514 {
1515         bool write = vmf->flags & FAULT_FLAG_WRITE;
1516
1517         vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1518         if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1519                 goto unlock;
1520
1521         touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1522
1523 unlock:
1524         spin_unlock(vmf->ptl);
1525 }
1526
1527 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
1528 {
1529         const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
1530         struct vm_area_struct *vma = vmf->vma;
1531         struct folio *folio;
1532         struct page *page;
1533         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1534         pmd_t orig_pmd = vmf->orig_pmd;
1535
1536         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1537         VM_BUG_ON_VMA(!vma->anon_vma, vma);
1538
1539         if (is_huge_zero_pmd(orig_pmd))
1540                 goto fallback;
1541
1542         spin_lock(vmf->ptl);
1543
1544         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1545                 spin_unlock(vmf->ptl);
1546                 return 0;
1547         }
1548
1549         page = pmd_page(orig_pmd);
1550         folio = page_folio(page);
1551         VM_BUG_ON_PAGE(!PageHead(page), page);
1552
1553         /* Early check when only holding the PT lock. */
1554         if (PageAnonExclusive(page))
1555                 goto reuse;
1556
1557         if (!folio_trylock(folio)) {
1558                 folio_get(folio);
1559                 spin_unlock(vmf->ptl);
1560                 folio_lock(folio);
1561                 spin_lock(vmf->ptl);
1562                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1563                         spin_unlock(vmf->ptl);
1564                         folio_unlock(folio);
1565                         folio_put(folio);
1566                         return 0;
1567                 }
1568                 folio_put(folio);
1569         }
1570
1571         /* Recheck after temporarily dropping the PT lock. */
1572         if (PageAnonExclusive(page)) {
1573                 folio_unlock(folio);
1574                 goto reuse;
1575         }
1576
1577         /*
1578          * See do_wp_page(): we can only reuse the folio exclusively if
1579          * there are no additional references. Note that we always drain
1580          * the LRU cache immediately after adding a THP.
1581          */
1582         if (folio_ref_count(folio) >
1583                         1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
1584                 goto unlock_fallback;
1585         if (folio_test_swapcache(folio))
1586                 folio_free_swap(folio);
1587         if (folio_ref_count(folio) == 1) {
1588                 pmd_t entry;
1589
1590                 folio_move_anon_rmap(folio, vma);
1591                 SetPageAnonExclusive(page);
1592                 folio_unlock(folio);
1593 reuse:
1594                 if (unlikely(unshare)) {
1595                         spin_unlock(vmf->ptl);
1596                         return 0;
1597                 }
1598                 entry = pmd_mkyoung(orig_pmd);
1599                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1600                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1601                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1602                 spin_unlock(vmf->ptl);
1603                 return 0;
1604         }
1605
1606 unlock_fallback:
1607         folio_unlock(folio);
1608         spin_unlock(vmf->ptl);
1609 fallback:
1610         __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
1611         return VM_FAULT_FALLBACK;
1612 }
1613
1614 static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1615                                            unsigned long addr, pmd_t pmd)
1616 {
1617         struct page *page;
1618
1619         if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1620                 return false;
1621
1622         /* Don't touch entries that are not even readable (NUMA hinting). */
1623         if (pmd_protnone(pmd))
1624                 return false;
1625
1626         /* Do we need write faults for softdirty tracking? */
1627         if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
1628                 return false;
1629
1630         /* Do we need write faults for uffd-wp tracking? */
1631         if (userfaultfd_huge_pmd_wp(vma, pmd))
1632                 return false;
1633
1634         if (!(vma->vm_flags & VM_SHARED)) {
1635                 /* See can_change_pte_writable(). */
1636                 page = vm_normal_page_pmd(vma, addr, pmd);
1637                 return page && PageAnon(page) && PageAnonExclusive(page);
1638         }
1639
1640         /* See can_change_pte_writable(). */
1641         return pmd_dirty(pmd);
1642 }
1643
1644 /* NUMA hinting page fault entry point for trans huge pmds */
1645 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
1646 {
1647         struct vm_area_struct *vma = vmf->vma;
1648         pmd_t oldpmd = vmf->orig_pmd;
1649         pmd_t pmd;
1650         struct folio *folio;
1651         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1652         int nid = NUMA_NO_NODE;
1653         int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
1654         bool migrated = false, writable = false;
1655         int flags = 0;
1656
1657         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1658         if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
1659                 spin_unlock(vmf->ptl);
1660                 goto out;
1661         }
1662
1663         pmd = pmd_modify(oldpmd, vma->vm_page_prot);
1664
1665         /*
1666          * Detect now whether the PMD could be writable; this information
1667          * is only valid while holding the PT lock.
1668          */
1669         writable = pmd_write(pmd);
1670         if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
1671             can_change_pmd_writable(vma, vmf->address, pmd))
1672                 writable = true;
1673
1674         folio = vm_normal_folio_pmd(vma, haddr, pmd);
1675         if (!folio)
1676                 goto out_map;
1677
1678         /* See similar comment in do_numa_page for explanation */
1679         if (!writable)
1680                 flags |= TNF_NO_GROUP;
1681
1682         nid = folio_nid(folio);
1683         /*
1684          * For memory tiering mode, cpupid of slow memory page is used
1685          * to record page access time.  So use default value.
1686          */
1687         if (node_is_toptier(nid))
1688                 last_cpupid = folio_last_cpupid(folio);
1689         target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
1690         if (target_nid == NUMA_NO_NODE) {
1691                 folio_put(folio);
1692                 goto out_map;
1693         }
1694
1695         spin_unlock(vmf->ptl);
1696         writable = false;
1697
1698         migrated = migrate_misplaced_folio(folio, vma, target_nid);
1699         if (migrated) {
1700                 flags |= TNF_MIGRATED;
1701                 nid = target_nid;
1702         } else {
1703                 flags |= TNF_MIGRATE_FAIL;
1704                 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1705                 if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
1706                         spin_unlock(vmf->ptl);
1707                         goto out;
1708                 }
1709                 goto out_map;
1710         }
1711
1712 out:
1713         if (nid != NUMA_NO_NODE)
1714                 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
1715
1716         return 0;
1717
1718 out_map:
1719         /* Restore the PMD */
1720         pmd = pmd_modify(oldpmd, vma->vm_page_prot);
1721         pmd = pmd_mkyoung(pmd);
1722         if (writable)
1723                 pmd = pmd_mkwrite(pmd, vma);
1724         set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1725         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1726         spin_unlock(vmf->ptl);
1727         goto out;
1728 }
1729
1730 /*
1731  * Return true if we do MADV_FREE successfully on entire pmd page.
1732  * Otherwise, return false.
1733  */
1734 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1735                 pmd_t *pmd, unsigned long addr, unsigned long next)
1736 {
1737         spinlock_t *ptl;
1738         pmd_t orig_pmd;
1739         struct folio *folio;
1740         struct mm_struct *mm = tlb->mm;
1741         bool ret = false;
1742
1743         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1744
1745         ptl = pmd_trans_huge_lock(pmd, vma);
1746         if (!ptl)
1747                 goto out_unlocked;
1748
1749         orig_pmd = *pmd;
1750         if (is_huge_zero_pmd(orig_pmd))
1751                 goto out;
1752
1753         if (unlikely(!pmd_present(orig_pmd))) {
1754                 VM_BUG_ON(thp_migration_supported() &&
1755                                   !is_pmd_migration_entry(orig_pmd));
1756                 goto out;
1757         }
1758
1759         folio = pmd_folio(orig_pmd);
1760         /*
1761          * If other processes are mapping this folio, we couldn't discard
1762          * the folio unless they all do MADV_FREE so let's skip the folio.
1763          */
1764         if (folio_likely_mapped_shared(folio))
1765                 goto out;
1766
1767         if (!folio_trylock(folio))
1768                 goto out;
1769
1770         /*
1771          * If user want to discard part-pages of THP, split it so MADV_FREE
1772          * will deactivate only them.
1773          */
1774         if (next - addr != HPAGE_PMD_SIZE) {
1775                 folio_get(folio);
1776                 spin_unlock(ptl);
1777                 split_folio(folio);
1778                 folio_unlock(folio);
1779                 folio_put(folio);
1780                 goto out_unlocked;
1781         }
1782
1783         if (folio_test_dirty(folio))
1784                 folio_clear_dirty(folio);
1785         folio_unlock(folio);
1786
1787         if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1788                 pmdp_invalidate(vma, addr, pmd);
1789                 orig_pmd = pmd_mkold(orig_pmd);
1790                 orig_pmd = pmd_mkclean(orig_pmd);
1791
1792                 set_pmd_at(mm, addr, pmd, orig_pmd);
1793                 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1794         }
1795
1796         folio_mark_lazyfree(folio);
1797         ret = true;
1798 out:
1799         spin_unlock(ptl);
1800 out_unlocked:
1801         return ret;
1802 }
1803
1804 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1805 {
1806         pgtable_t pgtable;
1807
1808         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1809         pte_free(mm, pgtable);
1810         mm_dec_nr_ptes(mm);
1811 }
1812
1813 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1814                  pmd_t *pmd, unsigned long addr)
1815 {
1816         pmd_t orig_pmd;
1817         spinlock_t *ptl;
1818
1819         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1820
1821         ptl = __pmd_trans_huge_lock(pmd, vma);
1822         if (!ptl)
1823                 return 0;
1824         /*
1825          * For architectures like ppc64 we look at deposited pgtable
1826          * when calling pmdp_huge_get_and_clear. So do the
1827          * pgtable_trans_huge_withdraw after finishing pmdp related
1828          * operations.
1829          */
1830         orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
1831                                                 tlb->fullmm);
1832         arch_check_zapped_pmd(vma, orig_pmd);
1833         tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1834         if (vma_is_special_huge(vma)) {
1835                 if (arch_needs_pgtable_deposit())
1836                         zap_deposited_table(tlb->mm, pmd);
1837                 spin_unlock(ptl);
1838         } else if (is_huge_zero_pmd(orig_pmd)) {
1839                 zap_deposited_table(tlb->mm, pmd);
1840                 spin_unlock(ptl);
1841         } else {
1842                 struct folio *folio = NULL;
1843                 int flush_needed = 1;
1844
1845                 if (pmd_present(orig_pmd)) {
1846                         struct page *page = pmd_page(orig_pmd);
1847
1848                         folio = page_folio(page);
1849                         folio_remove_rmap_pmd(folio, page, vma);
1850                         WARN_ON_ONCE(folio_mapcount(folio) < 0);
1851                         VM_BUG_ON_PAGE(!PageHead(page), page);
1852                 } else if (thp_migration_supported()) {
1853                         swp_entry_t entry;
1854
1855                         VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1856                         entry = pmd_to_swp_entry(orig_pmd);
1857                         folio = pfn_swap_entry_folio(entry);
1858                         flush_needed = 0;
1859                 } else
1860                         WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
1861
1862                 if (folio_test_anon(folio)) {
1863                         zap_deposited_table(tlb->mm, pmd);
1864                         add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1865                 } else {
1866                         if (arch_needs_pgtable_deposit())
1867                                 zap_deposited_table(tlb->mm, pmd);
1868                         add_mm_counter(tlb->mm, mm_counter_file(folio),
1869                                        -HPAGE_PMD_NR);
1870                 }
1871
1872                 spin_unlock(ptl);
1873                 if (flush_needed)
1874                         tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
1875         }
1876         return 1;
1877 }
1878
1879 #ifndef pmd_move_must_withdraw
1880 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1881                                          spinlock_t *old_pmd_ptl,
1882                                          struct vm_area_struct *vma)
1883 {
1884         /*
1885          * With split pmd lock we also need to move preallocated
1886          * PTE page table if new_pmd is on different PMD page table.
1887          *
1888          * We also don't deposit and withdraw tables for file pages.
1889          */
1890         return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1891 }
1892 #endif
1893
1894 static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1895 {
1896 #ifdef CONFIG_MEM_SOFT_DIRTY
1897         if (unlikely(is_pmd_migration_entry(pmd)))
1898                 pmd = pmd_swp_mksoft_dirty(pmd);
1899         else if (pmd_present(pmd))
1900                 pmd = pmd_mksoft_dirty(pmd);
1901 #endif
1902         return pmd;
1903 }
1904
1905 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1906                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
1907 {
1908         spinlock_t *old_ptl, *new_ptl;
1909         pmd_t pmd;
1910         struct mm_struct *mm = vma->vm_mm;
1911         bool force_flush = false;
1912
1913         /*
1914          * The destination pmd shouldn't be established, free_pgtables()
1915          * should have released it; but move_page_tables() might have already
1916          * inserted a page table, if racing against shmem/file collapse.
1917          */
1918         if (!pmd_none(*new_pmd)) {
1919                 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1920                 return false;
1921         }
1922
1923         /*
1924          * We don't have to worry about the ordering of src and dst
1925          * ptlocks because exclusive mmap_lock prevents deadlock.
1926          */
1927         old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1928         if (old_ptl) {
1929                 new_ptl = pmd_lockptr(mm, new_pmd);
1930                 if (new_ptl != old_ptl)
1931                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1932                 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1933                 if (pmd_present(pmd))
1934                         force_flush = true;
1935                 VM_BUG_ON(!pmd_none(*new_pmd));
1936
1937                 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
1938                         pgtable_t pgtable;
1939                         pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1940                         pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1941                 }
1942                 pmd = move_soft_dirty_pmd(pmd);
1943                 set_pmd_at(mm, new_addr, new_pmd, pmd);
1944                 if (force_flush)
1945                         flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1946                 if (new_ptl != old_ptl)
1947                         spin_unlock(new_ptl);
1948                 spin_unlock(old_ptl);
1949                 return true;
1950         }
1951         return false;
1952 }
1953
1954 /*
1955  * Returns
1956  *  - 0 if PMD could not be locked
1957  *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
1958  *      or if prot_numa but THP migration is not supported
1959  *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
1960  */
1961 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1962                     pmd_t *pmd, unsigned long addr, pgprot_t newprot,
1963                     unsigned long cp_flags)
1964 {
1965         struct mm_struct *mm = vma->vm_mm;
1966         spinlock_t *ptl;
1967         pmd_t oldpmd, entry;
1968         bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
1969         bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
1970         bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
1971         int ret = 1;
1972
1973         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1974
1975         if (prot_numa && !thp_migration_supported())
1976                 return 1;
1977
1978         ptl = __pmd_trans_huge_lock(pmd, vma);
1979         if (!ptl)
1980                 return 0;
1981
1982 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1983         if (is_swap_pmd(*pmd)) {
1984                 swp_entry_t entry = pmd_to_swp_entry(*pmd);
1985                 struct folio *folio = pfn_swap_entry_folio(entry);
1986                 pmd_t newpmd;
1987
1988                 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
1989                 if (is_writable_migration_entry(entry)) {
1990                         /*
1991                          * A protection check is difficult so
1992                          * just be safe and disable write
1993                          */
1994                         if (folio_test_anon(folio))
1995                                 entry = make_readable_exclusive_migration_entry(swp_offset(entry));
1996                         else
1997                                 entry = make_readable_migration_entry(swp_offset(entry));
1998                         newpmd = swp_entry_to_pmd(entry);
1999                         if (pmd_swp_soft_dirty(*pmd))
2000                                 newpmd = pmd_swp_mksoft_dirty(newpmd);
2001                 } else {
2002                         newpmd = *pmd;
2003                 }
2004
2005                 if (uffd_wp)
2006                         newpmd = pmd_swp_mkuffd_wp(newpmd);
2007                 else if (uffd_wp_resolve)
2008                         newpmd = pmd_swp_clear_uffd_wp(newpmd);
2009                 if (!pmd_same(*pmd, newpmd))
2010                         set_pmd_at(mm, addr, pmd, newpmd);
2011                 goto unlock;
2012         }
2013 #endif
2014
2015         if (prot_numa) {
2016                 struct folio *folio;
2017                 bool toptier;
2018                 /*
2019                  * Avoid trapping faults against the zero page. The read-only
2020                  * data is likely to be read-cached on the local CPU and
2021                  * local/remote hits to the zero page are not interesting.
2022                  */
2023                 if (is_huge_zero_pmd(*pmd))
2024                         goto unlock;
2025
2026                 if (pmd_protnone(*pmd))
2027                         goto unlock;
2028
2029                 folio = pmd_folio(*pmd);
2030                 toptier = node_is_toptier(folio_nid(folio));
2031                 /*
2032                  * Skip scanning top tier node if normal numa
2033                  * balancing is disabled
2034                  */
2035                 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
2036                     toptier)
2037                         goto unlock;
2038
2039                 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
2040                     !toptier)
2041                         folio_xchg_access_time(folio,
2042                                                jiffies_to_msecs(jiffies));
2043         }
2044         /*
2045          * In case prot_numa, we are under mmap_read_lock(mm). It's critical
2046          * to not clear pmd intermittently to avoid race with MADV_DONTNEED
2047          * which is also under mmap_read_lock(mm):
2048          *
2049          *      CPU0:                           CPU1:
2050          *                              change_huge_pmd(prot_numa=1)
2051          *                               pmdp_huge_get_and_clear_notify()
2052          * madvise_dontneed()
2053          *  zap_pmd_range()
2054          *   pmd_trans_huge(*pmd) == 0 (without ptl)
2055          *   // skip the pmd
2056          *                               set_pmd_at();
2057          *                               // pmd is re-established
2058          *
2059          * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
2060          * which may break userspace.
2061          *
2062          * pmdp_invalidate_ad() is required to make sure we don't miss
2063          * dirty/young flags set by hardware.
2064          */
2065         oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
2066
2067         entry = pmd_modify(oldpmd, newprot);
2068         if (uffd_wp)
2069                 entry = pmd_mkuffd_wp(entry);
2070         else if (uffd_wp_resolve)
2071                 /*
2072                  * Leave the write bit to be handled by PF interrupt
2073                  * handler, then things like COW could be properly
2074                  * handled.
2075                  */
2076                 entry = pmd_clear_uffd_wp(entry);
2077
2078         /* See change_pte_range(). */
2079         if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2080             can_change_pmd_writable(vma, addr, entry))
2081                 entry = pmd_mkwrite(entry, vma);
2082
2083         ret = HPAGE_PMD_NR;
2084         set_pmd_at(mm, addr, pmd, entry);
2085
2086         if (huge_pmd_needs_flush(oldpmd, entry))
2087                 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
2088 unlock:
2089         spin_unlock(ptl);
2090         return ret;
2091 }
2092
2093 #ifdef CONFIG_USERFAULTFD
2094 /*
2095  * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
2096  * the caller, but it must return after releasing the page_table_lock.
2097  * Just move the page from src_pmd to dst_pmd if possible.
2098  * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
2099  * repeated by the caller, or other errors in case of failure.
2100  */
2101 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2102                         struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2103                         unsigned long dst_addr, unsigned long src_addr)
2104 {
2105         pmd_t _dst_pmd, src_pmdval;
2106         struct page *src_page;
2107         struct folio *src_folio;
2108         struct anon_vma *src_anon_vma;
2109         spinlock_t *src_ptl, *dst_ptl;
2110         pgtable_t src_pgtable;
2111         struct mmu_notifier_range range;
2112         int err = 0;
2113
2114         src_pmdval = *src_pmd;
2115         src_ptl = pmd_lockptr(mm, src_pmd);
2116
2117         lockdep_assert_held(src_ptl);
2118         vma_assert_locked(src_vma);
2119         vma_assert_locked(dst_vma);
2120
2121         /* Sanity checks before the operation */
2122         if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2123             WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2124                 spin_unlock(src_ptl);
2125                 return -EINVAL;
2126         }
2127
2128         if (!pmd_trans_huge(src_pmdval)) {
2129                 spin_unlock(src_ptl);
2130                 if (is_pmd_migration_entry(src_pmdval)) {
2131                         pmd_migration_entry_wait(mm, &src_pmdval);
2132                         return -EAGAIN;
2133                 }
2134                 return -ENOENT;
2135         }
2136
2137         src_page = pmd_page(src_pmdval);
2138
2139         if (!is_huge_zero_pmd(src_pmdval)) {
2140                 if (unlikely(!PageAnonExclusive(src_page))) {
2141                         spin_unlock(src_ptl);
2142                         return -EBUSY;
2143                 }
2144
2145                 src_folio = page_folio(src_page);
2146                 folio_get(src_folio);
2147         } else
2148                 src_folio = NULL;
2149
2150         spin_unlock(src_ptl);
2151
2152         flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2153         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2154                                 src_addr + HPAGE_PMD_SIZE);
2155         mmu_notifier_invalidate_range_start(&range);
2156
2157         if (src_folio) {
2158                 folio_lock(src_folio);
2159
2160                 /*
2161                  * split_huge_page walks the anon_vma chain without the page
2162                  * lock. Serialize against it with the anon_vma lock, the page
2163                  * lock is not enough.
2164                  */
2165                 src_anon_vma = folio_get_anon_vma(src_folio);
2166                 if (!src_anon_vma) {
2167                         err = -EAGAIN;
2168                         goto unlock_folio;
2169                 }
2170                 anon_vma_lock_write(src_anon_vma);
2171         } else
2172                 src_anon_vma = NULL;
2173
2174         dst_ptl = pmd_lockptr(mm, dst_pmd);
2175         double_pt_lock(src_ptl, dst_ptl);
2176         if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2177                      !pmd_same(*dst_pmd, dst_pmdval))) {
2178                 err = -EAGAIN;
2179                 goto unlock_ptls;
2180         }
2181         if (src_folio) {
2182                 if (folio_maybe_dma_pinned(src_folio) ||
2183                     !PageAnonExclusive(&src_folio->page)) {
2184                         err = -EBUSY;
2185                         goto unlock_ptls;
2186                 }
2187
2188                 if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2189                     WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2190                         err = -EBUSY;
2191                         goto unlock_ptls;
2192                 }
2193
2194                 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2195                 /* Folio got pinned from under us. Put it back and fail the move. */
2196                 if (folio_maybe_dma_pinned(src_folio)) {
2197                         set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2198                         err = -EBUSY;
2199                         goto unlock_ptls;
2200                 }
2201
2202                 folio_move_anon_rmap(src_folio, dst_vma);
2203                 src_folio->index = linear_page_index(dst_vma, dst_addr);
2204
2205                 _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
2206                 /* Follow mremap() behavior and treat the entry dirty after the move */
2207                 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2208         } else {
2209                 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2210                 _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
2211         }
2212         set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2213
2214         src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2215         pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2216 unlock_ptls:
2217         double_pt_unlock(src_ptl, dst_ptl);
2218         if (src_anon_vma) {
2219                 anon_vma_unlock_write(src_anon_vma);
2220                 put_anon_vma(src_anon_vma);
2221         }
2222 unlock_folio:
2223         /* unblock rmap walks */
2224         if (src_folio)
2225                 folio_unlock(src_folio);
2226         mmu_notifier_invalidate_range_end(&range);
2227         if (src_folio)
2228                 folio_put(src_folio);
2229         return err;
2230 }
2231 #endif /* CONFIG_USERFAULTFD */
2232
2233 /*
2234  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2235  *
2236  * Note that if it returns page table lock pointer, this routine returns without
2237  * unlocking page table lock. So callers must unlock it.
2238  */
2239 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2240 {
2241         spinlock_t *ptl;
2242         ptl = pmd_lock(vma->vm_mm, pmd);
2243         if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
2244                         pmd_devmap(*pmd)))
2245                 return ptl;
2246         spin_unlock(ptl);
2247         return NULL;
2248 }
2249
2250 /*
2251  * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2252  *
2253  * Note that if it returns page table lock pointer, this routine returns without
2254  * unlocking page table lock. So callers must unlock it.
2255  */
2256 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2257 {
2258         spinlock_t *ptl;
2259
2260         ptl = pud_lock(vma->vm_mm, pud);
2261         if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2262                 return ptl;
2263         spin_unlock(ptl);
2264         return NULL;
2265 }
2266
2267 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2268 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2269                  pud_t *pud, unsigned long addr)
2270 {
2271         spinlock_t *ptl;
2272
2273         ptl = __pud_trans_huge_lock(pud, vma);
2274         if (!ptl)
2275                 return 0;
2276
2277         pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2278         tlb_remove_pud_tlb_entry(tlb, pud, addr);
2279         if (vma_is_special_huge(vma)) {
2280                 spin_unlock(ptl);
2281                 /* No zero page support yet */
2282         } else {
2283                 /* No support for anonymous PUD pages yet */
2284                 BUG();
2285         }
2286         return 1;
2287 }
2288
2289 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2290                 unsigned long haddr)
2291 {
2292         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2293         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2294         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2295         VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2296
2297         count_vm_event(THP_SPLIT_PUD);
2298
2299         pudp_huge_clear_flush(vma, haddr, pud);
2300 }
2301
2302 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2303                 unsigned long address)
2304 {
2305         spinlock_t *ptl;
2306         struct mmu_notifier_range range;
2307
2308         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2309                                 address & HPAGE_PUD_MASK,
2310                                 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2311         mmu_notifier_invalidate_range_start(&range);
2312         ptl = pud_lock(vma->vm_mm, pud);
2313         if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2314                 goto out;
2315         __split_huge_pud_locked(vma, pud, range.start);
2316
2317 out:
2318         spin_unlock(ptl);
2319         mmu_notifier_invalidate_range_end(&range);
2320 }
2321 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2322
2323 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2324                 unsigned long haddr, pmd_t *pmd)
2325 {
2326         struct mm_struct *mm = vma->vm_mm;
2327         pgtable_t pgtable;
2328         pmd_t _pmd, old_pmd;
2329         unsigned long addr;
2330         pte_t *pte;
2331         int i;
2332
2333         /*
2334          * Leave pmd empty until pte is filled note that it is fine to delay
2335          * notification until mmu_notifier_invalidate_range_end() as we are
2336          * replacing a zero pmd write protected page with a zero pte write
2337          * protected page.
2338          *
2339          * See Documentation/mm/mmu_notifier.rst
2340          */
2341         old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2342
2343         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2344         pmd_populate(mm, &_pmd, pgtable);
2345
2346         pte = pte_offset_map(&_pmd, haddr);
2347         VM_BUG_ON(!pte);
2348         for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2349                 pte_t entry;
2350
2351                 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
2352                 entry = pte_mkspecial(entry);
2353                 if (pmd_uffd_wp(old_pmd))
2354                         entry = pte_mkuffd_wp(entry);
2355                 VM_BUG_ON(!pte_none(ptep_get(pte)));
2356                 set_pte_at(mm, addr, pte, entry);
2357                 pte++;
2358         }
2359         pte_unmap(pte - 1);
2360         smp_wmb(); /* make pte visible before pmd */
2361         pmd_populate(mm, pmd, pgtable);
2362 }
2363
2364 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2365                 unsigned long haddr, bool freeze)
2366 {
2367         struct mm_struct *mm = vma->vm_mm;
2368         struct folio *folio;
2369         struct page *page;
2370         pgtable_t pgtable;
2371         pmd_t old_pmd, _pmd;
2372         bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2373         bool anon_exclusive = false, dirty = false;
2374         unsigned long addr;
2375         pte_t *pte;
2376         int i;
2377
2378         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2379         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2380         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2381         VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2382                                 && !pmd_devmap(*pmd));
2383
2384         count_vm_event(THP_SPLIT_PMD);
2385
2386         if (!vma_is_anonymous(vma)) {
2387                 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2388                 /*
2389                  * We are going to unmap this huge page. So
2390                  * just go ahead and zap it
2391                  */
2392                 if (arch_needs_pgtable_deposit())
2393                         zap_deposited_table(mm, pmd);
2394                 if (vma_is_special_huge(vma))
2395                         return;
2396                 if (unlikely(is_pmd_migration_entry(old_pmd))) {
2397                         swp_entry_t entry;
2398
2399                         entry = pmd_to_swp_entry(old_pmd);
2400                         folio = pfn_swap_entry_folio(entry);
2401                 } else {
2402                         page = pmd_page(old_pmd);
2403                         folio = page_folio(page);
2404                         if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
2405                                 folio_mark_dirty(folio);
2406                         if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2407                                 folio_set_referenced(folio);
2408                         folio_remove_rmap_pmd(folio, page, vma);
2409                         folio_put(folio);
2410                 }
2411                 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
2412                 return;
2413         }
2414
2415         if (is_huge_zero_pmd(*pmd)) {
2416                 /*
2417                  * FIXME: Do we want to invalidate secondary mmu by calling
2418                  * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
2419                  * inside __split_huge_pmd() ?
2420                  *
2421                  * We are going from a zero huge page write protected to zero
2422                  * small page also write protected so it does not seems useful
2423                  * to invalidate secondary mmu at this time.
2424                  */
2425                 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2426         }
2427
2428         pmd_migration = is_pmd_migration_entry(*pmd);
2429         if (unlikely(pmd_migration)) {
2430                 swp_entry_t entry;
2431
2432                 old_pmd = *pmd;
2433                 entry = pmd_to_swp_entry(old_pmd);
2434                 page = pfn_swap_entry_to_page(entry);
2435                 write = is_writable_migration_entry(entry);
2436                 if (PageAnon(page))
2437                         anon_exclusive = is_readable_exclusive_migration_entry(entry);
2438                 young = is_migration_entry_young(entry);
2439                 dirty = is_migration_entry_dirty(entry);
2440                 soft_dirty = pmd_swp_soft_dirty(old_pmd);
2441                 uffd_wp = pmd_swp_uffd_wp(old_pmd);
2442         } else {
2443                 /*
2444                  * Up to this point the pmd is present and huge and userland has
2445                  * the whole access to the hugepage during the split (which
2446                  * happens in place). If we overwrite the pmd with the not-huge
2447                  * version pointing to the pte here (which of course we could if
2448                  * all CPUs were bug free), userland could trigger a small page
2449                  * size TLB miss on the small sized TLB while the hugepage TLB
2450                  * entry is still established in the huge TLB. Some CPU doesn't
2451                  * like that. See
2452                  * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
2453                  * 383 on page 105. Intel should be safe but is also warns that
2454                  * it's only safe if the permission and cache attributes of the
2455                  * two entries loaded in the two TLB is identical (which should
2456                  * be the case here). But it is generally safer to never allow
2457                  * small and huge TLB entries for the same virtual address to be
2458                  * loaded simultaneously. So instead of doing "pmd_populate();
2459                  * flush_pmd_tlb_range();" we first mark the current pmd
2460                  * notpresent (atomically because here the pmd_trans_huge must
2461                  * remain set at all times on the pmd until the split is
2462                  * complete for this pmd), then we flush the SMP TLB and finally
2463                  * we write the non-huge version of the pmd entry with
2464                  * pmd_populate.
2465                  */
2466                 old_pmd = pmdp_invalidate(vma, haddr, pmd);
2467                 page = pmd_page(old_pmd);
2468                 folio = page_folio(page);
2469                 if (pmd_dirty(old_pmd)) {
2470                         dirty = true;
2471                         folio_set_dirty(folio);
2472                 }
2473                 write = pmd_write(old_pmd);
2474                 young = pmd_young(old_pmd);
2475                 soft_dirty = pmd_soft_dirty(old_pmd);
2476                 uffd_wp = pmd_uffd_wp(old_pmd);
2477
2478                 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2479                 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2480
2481                 /*
2482                  * Without "freeze", we'll simply split the PMD, propagating the
2483                  * PageAnonExclusive() flag for each PTE by setting it for
2484                  * each subpage -- no need to (temporarily) clear.
2485                  *
2486                  * With "freeze" we want to replace mapped pages by
2487                  * migration entries right away. This is only possible if we
2488                  * managed to clear PageAnonExclusive() -- see
2489                  * set_pmd_migration_entry().
2490                  *
2491                  * In case we cannot clear PageAnonExclusive(), split the PMD
2492                  * only and let try_to_migrate_one() fail later.
2493                  *
2494                  * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
2495                  */
2496                 anon_exclusive = PageAnonExclusive(page);
2497                 if (freeze && anon_exclusive &&
2498                     folio_try_share_anon_rmap_pmd(folio, page))
2499                         freeze = false;
2500                 if (!freeze) {
2501                         rmap_t rmap_flags = RMAP_NONE;
2502
2503                         folio_ref_add(folio, HPAGE_PMD_NR - 1);
2504                         if (anon_exclusive)
2505                                 rmap_flags |= RMAP_EXCLUSIVE;
2506                         folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2507                                                  vma, haddr, rmap_flags);
2508                 }
2509         }
2510
2511         /*
2512          * Withdraw the table only after we mark the pmd entry invalid.
2513          * This's critical for some architectures (Power).
2514          */
2515         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2516         pmd_populate(mm, &_pmd, pgtable);
2517
2518         pte = pte_offset_map(&_pmd, haddr);
2519         VM_BUG_ON(!pte);
2520
2521         /*
2522          * Note that NUMA hinting access restrictions are not transferred to
2523          * avoid any possibility of altering permissions across VMAs.
2524          */
2525         if (freeze || pmd_migration) {
2526                 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2527                         pte_t entry;
2528                         swp_entry_t swp_entry;
2529
2530                         if (write)
2531                                 swp_entry = make_writable_migration_entry(
2532                                                         page_to_pfn(page + i));
2533                         else if (anon_exclusive)
2534                                 swp_entry = make_readable_exclusive_migration_entry(
2535                                                         page_to_pfn(page + i));
2536                         else
2537                                 swp_entry = make_readable_migration_entry(
2538                                                         page_to_pfn(page + i));
2539                         if (young)
2540                                 swp_entry = make_migration_entry_young(swp_entry);
2541                         if (dirty)
2542                                 swp_entry = make_migration_entry_dirty(swp_entry);
2543                         entry = swp_entry_to_pte(swp_entry);
2544                         if (soft_dirty)
2545                                 entry = pte_swp_mksoft_dirty(entry);
2546                         if (uffd_wp)
2547                                 entry = pte_swp_mkuffd_wp(entry);
2548
2549                         VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2550                         set_pte_at(mm, addr, pte + i, entry);
2551                 }
2552         } else {
2553                 pte_t entry;
2554
2555                 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
2556                 if (write)
2557                         entry = pte_mkwrite(entry, vma);
2558                 if (!young)
2559                         entry = pte_mkold(entry);
2560                 /* NOTE: this may set soft-dirty too on some archs */
2561                 if (dirty)
2562                         entry = pte_mkdirty(entry);
2563                 if (soft_dirty)
2564                         entry = pte_mksoft_dirty(entry);
2565                 if (uffd_wp)
2566                         entry = pte_mkuffd_wp(entry);
2567
2568                 for (i = 0; i < HPAGE_PMD_NR; i++)
2569                         VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2570
2571                 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
2572         }
2573         pte_unmap(pte);
2574
2575         if (!pmd_migration)
2576                 folio_remove_rmap_pmd(folio, page, vma);
2577         if (freeze)
2578                 put_page(page);
2579
2580         smp_wmb(); /* make pte visible before pmd */
2581         pmd_populate(mm, pmd, pgtable);
2582 }
2583
2584 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2585                 unsigned long address, bool freeze, struct folio *folio)
2586 {
2587         spinlock_t *ptl;
2588         struct mmu_notifier_range range;
2589
2590         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2591                                 address & HPAGE_PMD_MASK,
2592                                 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2593         mmu_notifier_invalidate_range_start(&range);
2594         ptl = pmd_lock(vma->vm_mm, pmd);
2595
2596         /*
2597          * If caller asks to setup a migration entry, we need a folio to check
2598          * pmd against. Otherwise we can end up replacing wrong folio.
2599          */
2600         VM_BUG_ON(freeze && !folio);
2601         VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
2602
2603         if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
2604             is_pmd_migration_entry(*pmd)) {
2605                 /*
2606                  * It's safe to call pmd_page when folio is set because it's
2607                  * guaranteed that pmd is present.
2608                  */
2609                 if (folio && folio != pmd_folio(*pmd))
2610                         goto out;
2611                 __split_huge_pmd_locked(vma, pmd, range.start, freeze);
2612         }
2613
2614 out:
2615         spin_unlock(ptl);
2616         mmu_notifier_invalidate_range_end(&range);
2617 }
2618
2619 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2620                 bool freeze, struct folio *folio)
2621 {
2622         pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
2623
2624         if (!pmd)
2625                 return;
2626
2627         __split_huge_pmd(vma, pmd, address, freeze, folio);
2628 }
2629
2630 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
2631 {
2632         /*
2633          * If the new address isn't hpage aligned and it could previously
2634          * contain an hugepage: check if we need to split an huge pmd.
2635          */
2636         if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
2637             range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
2638                          ALIGN(address, HPAGE_PMD_SIZE)))
2639                 split_huge_pmd_address(vma, address, false, NULL);
2640 }
2641
2642 void vma_adjust_trans_huge(struct vm_area_struct *vma,
2643                              unsigned long start,
2644                              unsigned long end,
2645                              long adjust_next)
2646 {
2647         /* Check if we need to split start first. */
2648         split_huge_pmd_if_needed(vma, start);
2649
2650         /* Check if we need to split end next. */
2651         split_huge_pmd_if_needed(vma, end);
2652
2653         /*
2654          * If we're also updating the next vma vm_start,
2655          * check if we need to split it.
2656          */
2657         if (adjust_next > 0) {
2658                 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
2659                 unsigned long nstart = next->vm_start;
2660                 nstart += adjust_next;
2661                 split_huge_pmd_if_needed(next, nstart);
2662         }
2663 }
2664
2665 static void unmap_folio(struct folio *folio)
2666 {
2667         enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
2668                 TTU_BATCH_FLUSH;
2669
2670         VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2671
2672         if (folio_test_pmd_mappable(folio))
2673                 ttu_flags |= TTU_SPLIT_HUGE_PMD;
2674
2675         /*
2676          * Anon pages need migration entries to preserve them, but file
2677          * pages can simply be left unmapped, then faulted back on demand.
2678          * If that is ever changed (perhaps for mlock), update remap_page().
2679          */
2680         if (folio_test_anon(folio))
2681                 try_to_migrate(folio, ttu_flags);
2682         else
2683                 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
2684
2685         try_to_unmap_flush();
2686 }
2687
2688 static void remap_page(struct folio *folio, unsigned long nr)
2689 {
2690         int i = 0;
2691
2692         /* If unmap_folio() uses try_to_migrate() on file, remove this check */
2693         if (!folio_test_anon(folio))
2694                 return;
2695         for (;;) {
2696                 remove_migration_ptes(folio, folio, true);
2697                 i += folio_nr_pages(folio);
2698                 if (i >= nr)
2699                         break;
2700                 folio = folio_next(folio);
2701         }
2702 }
2703
2704 static void lru_add_page_tail(struct page *head, struct page *tail,
2705                 struct lruvec *lruvec, struct list_head *list)
2706 {
2707         VM_BUG_ON_PAGE(!PageHead(head), head);
2708         VM_BUG_ON_PAGE(PageLRU(tail), head);
2709         lockdep_assert_held(&lruvec->lru_lock);
2710
2711         if (list) {
2712                 /* page reclaim is reclaiming a huge page */
2713                 VM_WARN_ON(PageLRU(head));
2714                 get_page(tail);
2715                 list_add_tail(&tail->lru, list);
2716         } else {
2717                 /* head is still on lru (and we have it frozen) */
2718                 VM_WARN_ON(!PageLRU(head));
2719                 if (PageUnevictable(tail))
2720                         tail->mlock_count = 0;
2721                 else
2722                         list_add_tail(&tail->lru, &head->lru);
2723                 SetPageLRU(tail);
2724         }
2725 }
2726
2727 static void __split_huge_page_tail(struct folio *folio, int tail,
2728                 struct lruvec *lruvec, struct list_head *list,
2729                 unsigned int new_order)
2730 {
2731         struct page *head = &folio->page;
2732         struct page *page_tail = head + tail;
2733         /*
2734          * Careful: new_folio is not a "real" folio before we cleared PageTail.
2735          * Don't pass it around before clear_compound_head().
2736          */
2737         struct folio *new_folio = (struct folio *)page_tail;
2738
2739         VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2740
2741         /*
2742          * Clone page flags before unfreezing refcount.
2743          *
2744          * After successful get_page_unless_zero() might follow flags change,
2745          * for example lock_page() which set PG_waiters.
2746          *
2747          * Note that for mapped sub-pages of an anonymous THP,
2748          * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
2749          * the migration entry instead from where remap_page() will restore it.
2750          * We can still have PG_anon_exclusive set on effectively unmapped and
2751          * unreferenced sub-pages of an anonymous THP: we can simply drop
2752          * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
2753          */
2754         page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2755         page_tail->flags |= (head->flags &
2756                         ((1L << PG_referenced) |
2757                          (1L << PG_swapbacked) |
2758                          (1L << PG_swapcache) |
2759                          (1L << PG_mlocked) |
2760                          (1L << PG_uptodate) |
2761                          (1L << PG_active) |
2762                          (1L << PG_workingset) |
2763                          (1L << PG_locked) |
2764                          (1L << PG_unevictable) |
2765 #ifdef CONFIG_ARCH_USES_PG_ARCH_X
2766                          (1L << PG_arch_2) |
2767                          (1L << PG_arch_3) |
2768 #endif
2769                          (1L << PG_dirty) |
2770                          LRU_GEN_MASK | LRU_REFS_MASK));
2771
2772         /* ->mapping in first and second tail page is replaced by other uses */
2773         VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
2774                         page_tail);
2775         page_tail->mapping = head->mapping;
2776         page_tail->index = head->index + tail;
2777
2778         /*
2779          * page->private should not be set in tail pages. Fix up and warn once
2780          * if private is unexpectedly set.
2781          */
2782         if (unlikely(page_tail->private)) {
2783                 VM_WARN_ON_ONCE_PAGE(true, page_tail);
2784                 page_tail->private = 0;
2785         }
2786         if (folio_test_swapcache(folio))
2787                 new_folio->swap.val = folio->swap.val + tail;
2788
2789         /* Page flags must be visible before we make the page non-compound. */
2790         smp_wmb();
2791
2792         /*
2793          * Clear PageTail before unfreezing page refcount.
2794          *
2795          * After successful get_page_unless_zero() might follow put_page()
2796          * which needs correct compound_head().
2797          */
2798         clear_compound_head(page_tail);
2799         if (new_order) {
2800                 prep_compound_page(page_tail, new_order);
2801                 folio_set_large_rmappable(new_folio);
2802         }
2803
2804         /* Finally unfreeze refcount. Additional reference from page cache. */
2805         page_ref_unfreeze(page_tail,
2806                 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
2807                              folio_nr_pages(new_folio) : 0));
2808
2809         if (folio_test_young(folio))
2810                 folio_set_young(new_folio);
2811         if (folio_test_idle(folio))
2812                 folio_set_idle(new_folio);
2813
2814         folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
2815
2816         /*
2817          * always add to the tail because some iterators expect new
2818          * pages to show after the currently processed elements - e.g.
2819          * migrate_pages
2820          */
2821         lru_add_page_tail(head, page_tail, lruvec, list);
2822 }
2823
2824 static void __split_huge_page(struct page *page, struct list_head *list,
2825                 pgoff_t end, unsigned int new_order)
2826 {
2827         struct folio *folio = page_folio(page);
2828         struct page *head = &folio->page;
2829         struct lruvec *lruvec;
2830         struct address_space *swap_cache = NULL;
2831         unsigned long offset = 0;
2832         int i, nr_dropped = 0;
2833         unsigned int new_nr = 1 << new_order;
2834         int order = folio_order(folio);
2835         unsigned int nr = 1 << order;
2836
2837         /* complete memcg works before add pages to LRU */
2838         split_page_memcg(head, order, new_order);
2839
2840         if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
2841                 offset = swp_offset(folio->swap);
2842                 swap_cache = swap_address_space(folio->swap);
2843                 xa_lock(&swap_cache->i_pages);
2844         }
2845
2846         /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
2847         lruvec = folio_lruvec_lock(folio);
2848
2849         ClearPageHasHWPoisoned(head);
2850
2851         for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
2852                 __split_huge_page_tail(folio, i, lruvec, list, new_order);
2853                 /* Some pages can be beyond EOF: drop them from page cache */
2854                 if (head[i].index >= end) {
2855                         struct folio *tail = page_folio(head + i);
2856
2857                         if (shmem_mapping(folio->mapping))
2858                                 nr_dropped++;
2859                         else if (folio_test_clear_dirty(tail))
2860                                 folio_account_cleaned(tail,
2861                                         inode_to_wb(folio->mapping->host));
2862                         __filemap_remove_folio(tail, NULL);
2863                         folio_put(tail);
2864                 } else if (!PageAnon(page)) {
2865                         __xa_store(&folio->mapping->i_pages, head[i].index,
2866                                         head + i, 0);
2867                 } else if (swap_cache) {
2868                         __xa_store(&swap_cache->i_pages, offset + i,
2869                                         head + i, 0);
2870                 }
2871         }
2872
2873         if (!new_order)
2874                 ClearPageCompound(head);
2875         else {
2876                 struct folio *new_folio = (struct folio *)head;
2877
2878                 folio_set_order(new_folio, new_order);
2879         }
2880         unlock_page_lruvec(lruvec);
2881         /* Caller disabled irqs, so they are still disabled here */
2882
2883         split_page_owner(head, order, new_order);
2884         pgalloc_tag_split(head, 1 << order);
2885
2886         /* See comment in __split_huge_page_tail() */
2887         if (folio_test_anon(folio)) {
2888                 /* Additional pin to swap cache */
2889                 if (folio_test_swapcache(folio)) {
2890                         folio_ref_add(folio, 1 + new_nr);
2891                         xa_unlock(&swap_cache->i_pages);
2892                 } else {
2893                         folio_ref_inc(folio);
2894                 }
2895         } else {
2896                 /* Additional pin to page cache */
2897                 folio_ref_add(folio, 1 + new_nr);
2898                 xa_unlock(&folio->mapping->i_pages);
2899         }
2900         local_irq_enable();
2901
2902         if (nr_dropped)
2903                 shmem_uncharge(folio->mapping->host, nr_dropped);
2904         remap_page(folio, nr);
2905
2906         /*
2907          * set page to its compound_head when split to non order-0 pages, so
2908          * we can skip unlocking it below, since PG_locked is transferred to
2909          * the compound_head of the page and the caller will unlock it.
2910          */
2911         if (new_order)
2912                 page = compound_head(page);
2913
2914         for (i = 0; i < nr; i += new_nr) {
2915                 struct page *subpage = head + i;
2916                 struct folio *new_folio = page_folio(subpage);
2917                 if (subpage == page)
2918                         continue;
2919                 folio_unlock(new_folio);
2920
2921                 /*
2922                  * Subpages may be freed if there wasn't any mapping
2923                  * like if add_to_swap() is running on a lru page that
2924                  * had its mapping zapped. And freeing these pages
2925                  * requires taking the lru_lock so we do the put_page
2926                  * of the tail pages after the split is complete.
2927                  */
2928                 free_page_and_swap_cache(subpage);
2929         }
2930 }
2931
2932 /* Racy check whether the huge page can be split */
2933 bool can_split_folio(struct folio *folio, int *pextra_pins)
2934 {
2935         int extra_pins;
2936
2937         /* Additional pins from page cache */
2938         if (folio_test_anon(folio))
2939                 extra_pins = folio_test_swapcache(folio) ?
2940                                 folio_nr_pages(folio) : 0;
2941         else
2942                 extra_pins = folio_nr_pages(folio);
2943         if (pextra_pins)
2944                 *pextra_pins = extra_pins;
2945         return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
2946 }
2947
2948 /*
2949  * This function splits a large folio into smaller folios of order @new_order.
2950  * @page can point to any page of the large folio to split. The split operation
2951  * does not change the position of @page.
2952  *
2953  * Prerequisites:
2954  *
2955  * 1) The caller must hold a reference on the @page's owning folio, also known
2956  *    as the large folio.
2957  *
2958  * 2) The large folio must be locked.
2959  *
2960  * 3) The folio must not be pinned. Any unexpected folio references, including
2961  *    GUP pins, will result in the folio not getting split; instead, the caller
2962  *    will receive an -EAGAIN.
2963  *
2964  * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
2965  *    supported for non-file-backed folios, because folio->_deferred_list, which
2966  *    is used by partially mapped folios, is stored in subpage 2, but an order-1
2967  *    folio only has subpages 0 and 1. File-backed order-1 folios are supported,
2968  *    since they do not use _deferred_list.
2969  *
2970  * After splitting, the caller's folio reference will be transferred to @page,
2971  * resulting in a raised refcount of @page after this call. The other pages may
2972  * be freed if they are not mapped.
2973  *
2974  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
2975  *
2976  * Pages in @new_order will inherit the mapping, flags, and so on from the
2977  * huge page.
2978  *
2979  * Returns 0 if the huge page was split successfully.
2980  *
2981  * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
2982  * the folio was concurrently removed from the page cache.
2983  *
2984  * Returns -EBUSY when trying to split the huge zeropage, if the folio is
2985  * under writeback, if fs-specific folio metadata cannot currently be
2986  * released, or if some unexpected race happened (e.g., anon VMA disappeared,
2987  * truncation).
2988  *
2989  * Returns -EINVAL when trying to split to an order that is incompatible
2990  * with the folio. Splitting to order 0 is compatible with all folios.
2991  */
2992 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
2993                                      unsigned int new_order)
2994 {
2995         struct folio *folio = page_folio(page);
2996         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
2997         /* reset xarray order to new order after split */
2998         XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
2999         struct anon_vma *anon_vma = NULL;
3000         struct address_space *mapping = NULL;
3001         bool is_thp = folio_test_pmd_mappable(folio);
3002         int extra_pins, ret;
3003         pgoff_t end;
3004         bool is_hzp;
3005
3006         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3007         VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3008
3009         if (new_order >= folio_order(folio))
3010                 return -EINVAL;
3011
3012         /* Cannot split anonymous THP to order-1 */
3013         if (new_order == 1 && folio_test_anon(folio)) {
3014                 VM_WARN_ONCE(1, "Cannot split to order-1 folio");
3015                 return -EINVAL;
3016         }
3017
3018         if (new_order) {
3019                 /* Only swapping a whole PMD-mapped folio is supported */
3020                 if (folio_test_swapcache(folio))
3021                         return -EINVAL;
3022                 /* Split shmem folio to non-zero order not supported */
3023                 if (shmem_mapping(folio->mapping)) {
3024                         VM_WARN_ONCE(1,
3025                                 "Cannot split shmem folio to non-0 order");
3026                         return -EINVAL;
3027                 }
3028                 /* No split if the file system does not support large folio */
3029                 if (!mapping_large_folio_support(folio->mapping)) {
3030                         VM_WARN_ONCE(1,
3031                                 "Cannot split file folio to non-0 order");
3032                         return -EINVAL;
3033                 }
3034         }
3035
3036
3037         is_hzp = is_huge_zero_folio(folio);
3038         if (is_hzp) {
3039                 pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
3040                 return -EBUSY;
3041         }
3042
3043         if (folio_test_writeback(folio))
3044                 return -EBUSY;
3045
3046         if (folio_test_anon(folio)) {
3047                 /*
3048                  * The caller does not necessarily hold an mmap_lock that would
3049                  * prevent the anon_vma disappearing so we first we take a
3050                  * reference to it and then lock the anon_vma for write. This
3051                  * is similar to folio_lock_anon_vma_read except the write lock
3052                  * is taken to serialise against parallel split or collapse
3053                  * operations.
3054                  */
3055                 anon_vma = folio_get_anon_vma(folio);
3056                 if (!anon_vma) {
3057                         ret = -EBUSY;
3058                         goto out;
3059                 }
3060                 end = -1;
3061                 mapping = NULL;
3062                 anon_vma_lock_write(anon_vma);
3063         } else {
3064                 gfp_t gfp;
3065
3066                 mapping = folio->mapping;
3067
3068                 /* Truncated ? */
3069                 if (!mapping) {
3070                         ret = -EBUSY;
3071                         goto out;
3072                 }
3073
3074                 gfp = current_gfp_context(mapping_gfp_mask(mapping) &
3075                                                         GFP_RECLAIM_MASK);
3076
3077                 if (!filemap_release_folio(folio, gfp)) {
3078                         ret = -EBUSY;
3079                         goto out;
3080                 }
3081
3082                 xas_split_alloc(&xas, folio, folio_order(folio), gfp);
3083                 if (xas_error(&xas)) {
3084                         ret = xas_error(&xas);
3085                         goto out;
3086                 }
3087
3088                 anon_vma = NULL;
3089                 i_mmap_lock_read(mapping);
3090
3091                 /*
3092                  *__split_huge_page() may need to trim off pages beyond EOF:
3093                  * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
3094                  * which cannot be nested inside the page tree lock. So note
3095                  * end now: i_size itself may be changed at any moment, but
3096                  * folio lock is good enough to serialize the trimming.
3097                  */
3098                 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3099                 if (shmem_mapping(mapping))
3100                         end = shmem_fallocend(mapping->host, end);
3101         }
3102
3103         /*
3104          * Racy check if we can split the page, before unmap_folio() will
3105          * split PMDs
3106          */
3107         if (!can_split_folio(folio, &extra_pins)) {
3108                 ret = -EAGAIN;
3109                 goto out_unlock;
3110         }
3111
3112         unmap_folio(folio);
3113
3114         /* block interrupt reentry in xa_lock and spinlock */
3115         local_irq_disable();
3116         if (mapping) {
3117                 /*
3118                  * Check if the folio is present in page cache.
3119                  * We assume all tail are present too, if folio is there.
3120                  */
3121                 xas_lock(&xas);
3122                 xas_reset(&xas);
3123                 if (xas_load(&xas) != folio)
3124                         goto fail;
3125         }
3126
3127         /* Prevent deferred_split_scan() touching ->_refcount */
3128         spin_lock(&ds_queue->split_queue_lock);
3129         if (folio_ref_freeze(folio, 1 + extra_pins)) {
3130                 if (folio_order(folio) > 1 &&
3131                     !list_empty(&folio->_deferred_list)) {
3132                         ds_queue->split_queue_len--;
3133                         /*
3134                          * Reinitialize page_deferred_list after removing the
3135                          * page from the split_queue, otherwise a subsequent
3136                          * split will see list corruption when checking the
3137                          * page_deferred_list.
3138                          */
3139                         list_del_init(&folio->_deferred_list);
3140                 }
3141                 spin_unlock(&ds_queue->split_queue_lock);
3142                 if (mapping) {
3143                         int nr = folio_nr_pages(folio);
3144
3145                         xas_split(&xas, folio, folio_order(folio));
3146                         if (folio_test_pmd_mappable(folio) &&
3147                             new_order < HPAGE_PMD_ORDER) {
3148                                 if (folio_test_swapbacked(folio)) {
3149                                         __lruvec_stat_mod_folio(folio,
3150                                                         NR_SHMEM_THPS, -nr);
3151                                 } else {
3152                                         __lruvec_stat_mod_folio(folio,
3153                                                         NR_FILE_THPS, -nr);
3154                                         filemap_nr_thps_dec(mapping);
3155                                 }
3156                         }
3157                 }
3158
3159                 __split_huge_page(page, list, end, new_order);
3160                 ret = 0;
3161         } else {
3162                 spin_unlock(&ds_queue->split_queue_lock);
3163 fail:
3164                 if (mapping)
3165                         xas_unlock(&xas);
3166                 local_irq_enable();
3167                 remap_page(folio, folio_nr_pages(folio));
3168                 ret = -EAGAIN;
3169         }
3170
3171 out_unlock:
3172         if (anon_vma) {
3173                 anon_vma_unlock_write(anon_vma);
3174                 put_anon_vma(anon_vma);
3175         }
3176         if (mapping)
3177                 i_mmap_unlock_read(mapping);
3178 out:
3179         xas_destroy(&xas);
3180         if (is_thp)
3181                 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3182         return ret;
3183 }
3184
3185 void folio_undo_large_rmappable(struct folio *folio)
3186 {
3187         struct deferred_split *ds_queue;
3188         unsigned long flags;
3189
3190         if (folio_order(folio) <= 1)
3191                 return;
3192
3193         /*
3194          * At this point, there is no one trying to add the folio to
3195          * deferred_list. If folio is not in deferred_list, it's safe
3196          * to check without acquiring the split_queue_lock.
3197          */
3198         if (data_race(list_empty(&folio->_deferred_list)))
3199                 return;
3200
3201         ds_queue = get_deferred_split_queue(folio);
3202         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3203         if (!list_empty(&folio->_deferred_list)) {
3204                 ds_queue->split_queue_len--;
3205                 list_del_init(&folio->_deferred_list);
3206         }
3207         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3208 }
3209
3210 void deferred_split_folio(struct folio *folio)
3211 {
3212         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3213 #ifdef CONFIG_MEMCG
3214         struct mem_cgroup *memcg = folio_memcg(folio);
3215 #endif
3216         unsigned long flags;
3217
3218         /*
3219          * Order 1 folios have no space for a deferred list, but we also
3220          * won't waste much memory by not adding them to the deferred list.
3221          */
3222         if (folio_order(folio) <= 1)
3223                 return;
3224
3225         /*
3226          * The try_to_unmap() in page reclaim path might reach here too,
3227          * this may cause a race condition to corrupt deferred split queue.
3228          * And, if page reclaim is already handling the same folio, it is
3229          * unnecessary to handle it again in shrinker.
3230          *
3231          * Check the swapcache flag to determine if the folio is being
3232          * handled by page reclaim since THP swap would add the folio into
3233          * swap cache before calling try_to_unmap().
3234          */
3235         if (folio_test_swapcache(folio))
3236                 return;
3237
3238         if (!list_empty(&folio->_deferred_list))
3239                 return;
3240
3241         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3242         if (list_empty(&folio->_deferred_list)) {
3243                 if (folio_test_pmd_mappable(folio))
3244                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
3245                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
3246                 ds_queue->split_queue_len++;
3247 #ifdef CONFIG_MEMCG
3248                 if (memcg)
3249                         set_shrinker_bit(memcg, folio_nid(folio),
3250                                          deferred_split_shrinker->id);
3251 #endif
3252         }
3253         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3254 }
3255
3256 static unsigned long deferred_split_count(struct shrinker *shrink,
3257                 struct shrink_control *sc)
3258 {
3259         struct pglist_data *pgdata = NODE_DATA(sc->nid);
3260         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
3261
3262 #ifdef CONFIG_MEMCG
3263         if (sc->memcg)
3264                 ds_queue = &sc->memcg->deferred_split_queue;
3265 #endif
3266         return READ_ONCE(ds_queue->split_queue_len);
3267 }
3268
3269 static unsigned long deferred_split_scan(struct shrinker *shrink,
3270                 struct shrink_control *sc)
3271 {
3272         struct pglist_data *pgdata = NODE_DATA(sc->nid);
3273         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
3274         unsigned long flags;
3275         LIST_HEAD(list);
3276         struct folio *folio, *next;
3277         int split = 0;
3278
3279 #ifdef CONFIG_MEMCG
3280         if (sc->memcg)
3281                 ds_queue = &sc->memcg->deferred_split_queue;
3282 #endif
3283
3284         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3285         /* Take pin on all head pages to avoid freeing them under us */
3286         list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
3287                                                         _deferred_list) {
3288                 if (folio_try_get(folio)) {
3289                         list_move(&folio->_deferred_list, &list);
3290                 } else {
3291                         /* We lost race with folio_put() */
3292                         list_del_init(&folio->_deferred_list);
3293                         ds_queue->split_queue_len--;
3294                 }
3295                 if (!--sc->nr_to_scan)
3296                         break;
3297         }
3298         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3299
3300         list_for_each_entry_safe(folio, next, &list, _deferred_list) {
3301                 if (!folio_trylock(folio))
3302                         goto next;
3303                 /* split_huge_page() removes page from list on success */
3304                 if (!split_folio(folio))
3305                         split++;
3306                 folio_unlock(folio);
3307 next:
3308                 folio_put(folio);
3309         }
3310
3311         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3312         list_splice_tail(&list, &ds_queue->split_queue);
3313         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3314
3315         /*
3316          * Stop shrinker if we didn't split any page, but the queue is empty.
3317          * This can happen if pages were freed under us.
3318          */
3319         if (!split && list_empty(&ds_queue->split_queue))
3320                 return SHRINK_STOP;
3321         return split;
3322 }
3323
3324 #ifdef CONFIG_DEBUG_FS
3325 static void split_huge_pages_all(void)
3326 {
3327         struct zone *zone;
3328         struct page *page;
3329         struct folio *folio;
3330         unsigned long pfn, max_zone_pfn;
3331         unsigned long total = 0, split = 0;
3332
3333         pr_debug("Split all THPs\n");
3334         for_each_zone(zone) {
3335                 if (!managed_zone(zone))
3336                         continue;
3337                 max_zone_pfn = zone_end_pfn(zone);
3338                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
3339                         int nr_pages;
3340
3341                         page = pfn_to_online_page(pfn);
3342                         if (!page || PageTail(page))
3343                                 continue;
3344                         folio = page_folio(page);
3345                         if (!folio_try_get(folio))
3346                                 continue;
3347
3348                         if (unlikely(page_folio(page) != folio))
3349                                 goto next;
3350
3351                         if (zone != folio_zone(folio))
3352                                 goto next;
3353
3354                         if (!folio_test_large(folio)
3355                                 || folio_test_hugetlb(folio)
3356                                 || !folio_test_lru(folio))
3357                                 goto next;
3358
3359                         total++;
3360                         folio_lock(folio);
3361                         nr_pages = folio_nr_pages(folio);
3362                         if (!split_folio(folio))
3363                                 split++;
3364                         pfn += nr_pages - 1;
3365                         folio_unlock(folio);
3366 next:
3367                         folio_put(folio);
3368                         cond_resched();
3369                 }
3370         }
3371
3372         pr_debug("%lu of %lu THP split\n", split, total);
3373 }
3374
3375 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
3376 {
3377         return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
3378                     is_vm_hugetlb_page(vma);
3379 }
3380
3381 static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
3382                                 unsigned long vaddr_end, unsigned int new_order)
3383 {
3384         int ret = 0;
3385         struct task_struct *task;
3386         struct mm_struct *mm;
3387         unsigned long total = 0, split = 0;
3388         unsigned long addr;
3389
3390         vaddr_start &= PAGE_MASK;
3391         vaddr_end &= PAGE_MASK;
3392
3393         /* Find the task_struct from pid */
3394         rcu_read_lock();
3395         task = find_task_by_vpid(pid);
3396         if (!task) {
3397                 rcu_read_unlock();
3398                 ret = -ESRCH;
3399                 goto out;
3400         }
3401         get_task_struct(task);
3402         rcu_read_unlock();
3403
3404         /* Find the mm_struct */
3405         mm = get_task_mm(task);
3406         put_task_struct(task);
3407
3408         if (!mm) {
3409                 ret = -EINVAL;
3410                 goto out;
3411         }
3412
3413         pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
3414                  pid, vaddr_start, vaddr_end);
3415
3416         mmap_read_lock(mm);
3417         /*
3418          * always increase addr by PAGE_SIZE, since we could have a PTE page
3419          * table filled with PTE-mapped THPs, each of which is distinct.
3420          */
3421         for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
3422                 struct vm_area_struct *vma = vma_lookup(mm, addr);
3423                 struct page *page;
3424                 struct folio *folio;
3425
3426                 if (!vma)
3427                         break;
3428
3429                 /* skip special VMA and hugetlb VMA */
3430                 if (vma_not_suitable_for_thp_split(vma)) {
3431                         addr = vma->vm_end;
3432                         continue;
3433                 }
3434
3435                 /* FOLL_DUMP to ignore special (like zero) pages */
3436                 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
3437
3438                 if (IS_ERR_OR_NULL(page))
3439                         continue;
3440
3441                 folio = page_folio(page);
3442                 if (!is_transparent_hugepage(folio))
3443                         goto next;
3444
3445                 if (new_order >= folio_order(folio))
3446                         goto next;
3447
3448                 total++;
3449                 /*
3450                  * For folios with private, split_huge_page_to_list_to_order()
3451                  * will try to drop it before split and then check if the folio
3452                  * can be split or not. So skip the check here.
3453                  */
3454                 if (!folio_test_private(folio) &&
3455                     !can_split_folio(folio, NULL))
3456                         goto next;
3457
3458                 if (!folio_trylock(folio))
3459                         goto next;
3460
3461                 if (!split_folio_to_order(folio, new_order))
3462                         split++;
3463
3464                 folio_unlock(folio);
3465 next:
3466                 folio_put(folio);
3467                 cond_resched();
3468         }
3469         mmap_read_unlock(mm);
3470         mmput(mm);
3471
3472         pr_debug("%lu of %lu THP split\n", split, total);
3473
3474 out:
3475         return ret;
3476 }
3477
3478 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
3479                                 pgoff_t off_end, unsigned int new_order)
3480 {
3481         struct filename *file;
3482         struct file *candidate;
3483         struct address_space *mapping;
3484         int ret = -EINVAL;
3485         pgoff_t index;
3486         int nr_pages = 1;
3487         unsigned long total = 0, split = 0;
3488
3489         file = getname_kernel(file_path);
3490         if (IS_ERR(file))
3491                 return ret;
3492
3493         candidate = file_open_name(file, O_RDONLY, 0);
3494         if (IS_ERR(candidate))
3495                 goto out;
3496
3497         pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
3498                  file_path, off_start, off_end);
3499
3500         mapping = candidate->f_mapping;
3501
3502         for (index = off_start; index < off_end; index += nr_pages) {
3503                 struct folio *folio = filemap_get_folio(mapping, index);
3504
3505                 nr_pages = 1;
3506                 if (IS_ERR(folio))
3507                         continue;
3508
3509                 if (!folio_test_large(folio))
3510                         goto next;
3511
3512                 total++;
3513                 nr_pages = folio_nr_pages(folio);
3514
3515                 if (new_order >= folio_order(folio))
3516                         goto next;
3517
3518                 if (!folio_trylock(folio))
3519                         goto next;
3520
3521                 if (!split_folio_to_order(folio, new_order))
3522                         split++;
3523
3524                 folio_unlock(folio);
3525 next:
3526                 folio_put(folio);
3527                 cond_resched();
3528         }
3529
3530         filp_close(candidate, NULL);
3531         ret = 0;
3532
3533         pr_debug("%lu of %lu file-backed THP split\n", split, total);
3534 out:
3535         putname(file);
3536         return ret;
3537 }
3538
3539 #define MAX_INPUT_BUF_SZ 255
3540
3541 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
3542                                 size_t count, loff_t *ppops)
3543 {
3544         static DEFINE_MUTEX(split_debug_mutex);
3545         ssize_t ret;
3546         /*
3547          * hold pid, start_vaddr, end_vaddr, new_order or
3548          * file_path, off_start, off_end, new_order
3549          */
3550         char input_buf[MAX_INPUT_BUF_SZ];
3551         int pid;
3552         unsigned long vaddr_start, vaddr_end;
3553         unsigned int new_order = 0;
3554
3555         ret = mutex_lock_interruptible(&split_debug_mutex);
3556         if (ret)
3557                 return ret;
3558
3559         ret = -EFAULT;
3560
3561         memset(input_buf, 0, MAX_INPUT_BUF_SZ);
3562         if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
3563                 goto out;
3564
3565         input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
3566
3567         if (input_buf[0] == '/') {
3568                 char *tok;
3569                 char *buf = input_buf;
3570                 char file_path[MAX_INPUT_BUF_SZ];
3571                 pgoff_t off_start = 0, off_end = 0;
3572                 size_t input_len = strlen(input_buf);
3573
3574                 tok = strsep(&buf, ",");
3575                 if (tok) {
3576                         strcpy(file_path, tok);
3577                 } else {
3578                         ret = -EINVAL;
3579                         goto out;
3580                 }
3581
3582                 ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
3583                 if (ret != 2 && ret != 3) {
3584                         ret = -EINVAL;
3585                         goto out;
3586                 }
3587                 ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
3588                 if (!ret)
3589                         ret = input_len;
3590
3591                 goto out;
3592         }
3593
3594         ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
3595         if (ret == 1 && pid == 1) {
3596                 split_huge_pages_all();
3597                 ret = strlen(input_buf);
3598                 goto out;
3599         } else if (ret != 3 && ret != 4) {
3600                 ret = -EINVAL;
3601                 goto out;
3602         }
3603
3604         ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
3605         if (!ret)
3606                 ret = strlen(input_buf);
3607 out:
3608         mutex_unlock(&split_debug_mutex);
3609         return ret;
3610
3611 }
3612
3613 static const struct file_operations split_huge_pages_fops = {
3614         .owner   = THIS_MODULE,
3615         .write   = split_huge_pages_write,
3616         .llseek  = no_llseek,
3617 };
3618
3619 static int __init split_huge_pages_debugfs(void)
3620 {
3621         debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
3622                             &split_huge_pages_fops);
3623         return 0;
3624 }
3625 late_initcall(split_huge_pages_debugfs);
3626 #endif
3627
3628 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
3629 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
3630                 struct page *page)
3631 {
3632         struct folio *folio = page_folio(page);
3633         struct vm_area_struct *vma = pvmw->vma;
3634         struct mm_struct *mm = vma->vm_mm;
3635         unsigned long address = pvmw->address;
3636         bool anon_exclusive;
3637         pmd_t pmdval;
3638         swp_entry_t entry;
3639         pmd_t pmdswp;
3640
3641         if (!(pvmw->pmd && !pvmw->pte))
3642                 return 0;
3643
3644         flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
3645         pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
3646
3647         /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
3648         anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
3649         if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
3650                 set_pmd_at(mm, address, pvmw->pmd, pmdval);
3651                 return -EBUSY;
3652         }
3653
3654         if (pmd_dirty(pmdval))
3655                 folio_mark_dirty(folio);
3656         if (pmd_write(pmdval))
3657                 entry = make_writable_migration_entry(page_to_pfn(page));
3658         else if (anon_exclusive)
3659                 entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
3660         else
3661                 entry = make_readable_migration_entry(page_to_pfn(page));
3662         if (pmd_young(pmdval))
3663                 entry = make_migration_entry_young(entry);
3664         if (pmd_dirty(pmdval))
3665                 entry = make_migration_entry_dirty(entry);
3666         pmdswp = swp_entry_to_pmd(entry);
3667         if (pmd_soft_dirty(pmdval))
3668                 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
3669         if (pmd_uffd_wp(pmdval))
3670                 pmdswp = pmd_swp_mkuffd_wp(pmdswp);
3671         set_pmd_at(mm, address, pvmw->pmd, pmdswp);
3672         folio_remove_rmap_pmd(folio, page, vma);
3673         folio_put(folio);
3674         trace_set_migration_pmd(address, pmd_val(pmdswp));
3675
3676         return 0;
3677 }
3678
3679 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
3680 {
3681         struct folio *folio = page_folio(new);
3682         struct vm_area_struct *vma = pvmw->vma;
3683         struct mm_struct *mm = vma->vm_mm;
3684         unsigned long address = pvmw->address;
3685         unsigned long haddr = address & HPAGE_PMD_MASK;
3686         pmd_t pmde;
3687         swp_entry_t entry;
3688
3689         if (!(pvmw->pmd && !pvmw->pte))
3690                 return;
3691
3692         entry = pmd_to_swp_entry(*pvmw->pmd);
3693         folio_get(folio);
3694         pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
3695         if (pmd_swp_soft_dirty(*pvmw->pmd))
3696                 pmde = pmd_mksoft_dirty(pmde);
3697         if (is_writable_migration_entry(entry))
3698                 pmde = pmd_mkwrite(pmde, vma);
3699         if (pmd_swp_uffd_wp(*pvmw->pmd))
3700                 pmde = pmd_mkuffd_wp(pmde);
3701         if (!is_migration_entry_young(entry))
3702                 pmde = pmd_mkold(pmde);
3703         /* NOTE: this may contain setting soft-dirty on some archs */
3704         if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
3705                 pmde = pmd_mkdirty(pmde);
3706
3707         if (folio_test_anon(folio)) {
3708                 rmap_t rmap_flags = RMAP_NONE;
3709
3710                 if (!is_readable_migration_entry(entry))
3711                         rmap_flags |= RMAP_EXCLUSIVE;
3712
3713                 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
3714         } else {
3715                 folio_add_file_rmap_pmd(folio, new, vma);
3716         }
3717         VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
3718         set_pmd_at(mm, haddr, pvmw->pmd, pmde);
3719
3720         /* No need to invalidate - it was non-present before */
3721         update_mmu_cache_pmd(vma, address, pvmw->pmd);
3722         trace_remove_migration_pmd(address, pmd_val(pmde));
3723 }
3724 #endif