1 // SPDX-License-Identifier: GPL-2.0-only
3 * Userfaultfd tests util functions
5 * Copyright (C) 2015-2023 Red Hat, Inc.
8 #include "uffd-common.h"
10 #define BASE_PMD_ADDR ((void *)(1UL << 30))
12 volatile bool test_uffdio_copy_eexist = true;
13 unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
14 char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
15 int uffd = -1, uffd_flags, finished, *pipefd, test_type;
17 bool test_uffdio_wp = true;
18 unsigned long long *count_verify;
19 uffd_test_ops_t *uffd_test_ops;
20 uffd_test_case_ops_t *uffd_test_case_ops;
21 atomic_bool ready_for_fork;
23 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
25 unsigned int memfd_flags = 0;
29 memfd_flags = MFD_HUGETLB;
30 mem_fd = memfd_create("uffd-test", memfd_flags);
33 if (ftruncate(mem_fd, mem_size))
36 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
43 static void anon_release_pages(char *rel_area)
45 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
46 err("madvise(MADV_DONTNEED) failed");
49 static int anon_allocate_area(void **alloc_area, bool is_src)
51 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
52 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
53 if (*alloc_area == MAP_FAILED) {
/*
 * alias_mapping hook that does nothing: *start is left exactly as the
 * caller passed it and neither len nor offset is looked at.
 */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	/* Intentionally empty; the casts only mark the parameters as unused. */
	(void)start;
	(void)len;
	(void)offset;
}
64 static void hugetlb_release_pages(char *rel_area)
67 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
68 err("madvise(MADV_DONTNEED) failed");
70 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
71 err("madvise(MADV_REMOVE) failed");
75 static int hugetlb_allocate_area(void **alloc_area, bool is_src)
77 off_t size = nr_pages * page_size;
78 off_t offset = is_src ? 0 : size;
79 void *area_alias = NULL;
80 char **alloc_area_alias;
81 int mem_fd = uffd_mem_fd_create(size * 2, true);
83 *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
84 (map_shared ? MAP_SHARED : MAP_PRIVATE) |
85 (is_src ? 0 : MAP_NORESERVE),
87 if (*alloc_area == MAP_FAILED) {
93 area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
94 MAP_SHARED, mem_fd, offset);
95 if (area_alias == MAP_FAILED)
100 alloc_area_alias = &area_src_alias;
102 alloc_area_alias = &area_dst_alias;
105 *alloc_area_alias = area_alias;
111 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
116 *start = (unsigned long) area_dst_alias + offset;
119 static void shmem_release_pages(char *rel_area)
121 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
122 err("madvise(MADV_REMOVE) failed");
125 static int shmem_allocate_area(void **alloc_area, bool is_src)
127 void *area_alias = NULL;
128 size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
129 unsigned long offset = is_src ? 0 : bytes;
130 char *p = NULL, *p_alias = NULL;
131 int mem_fd = uffd_mem_fd_create(bytes * 2, false);
133 /* TODO: clean this up. Using a static addr is ugly */
136 /* src map + alias + interleaved hpages */
137 p += 2 * (bytes + hpage_size);
140 p_alias += hpage_size; /* Prevent src/dst VMA merge */
142 *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
144 if (*alloc_area == MAP_FAILED) {
148 if (*alloc_area != p)
149 err("mmap of memfd failed at %p", p);
151 area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
153 if (area_alias == MAP_FAILED) {
154 munmap(*alloc_area, bytes);
158 if (area_alias != p_alias)
159 err("mmap of anonymous memory failed at %p", p_alias);
162 area_src_alias = area_alias;
164 area_dst_alias = area_alias;
170 static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
172 *start = (unsigned long)area_dst_alias + offset;
175 static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
177 if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
178 read_pmd_pagesize()))
179 err("Did not find expected %d number of hugepages",
183 struct uffd_test_ops anon_uffd_test_ops = {
184 .allocate_area = anon_allocate_area,
185 .release_pages = anon_release_pages,
186 .alias_mapping = noop_alias_mapping,
187 .check_pmd_mapping = NULL,
190 struct uffd_test_ops shmem_uffd_test_ops = {
191 .allocate_area = shmem_allocate_area,
192 .release_pages = shmem_release_pages,
193 .alias_mapping = shmem_alias_mapping,
194 .check_pmd_mapping = shmem_check_pmd_mapping,
197 struct uffd_test_ops hugetlb_uffd_test_ops = {
198 .allocate_area = hugetlb_allocate_area,
199 .release_pages = hugetlb_release_pages,
200 .alias_mapping = hugetlb_alias_mapping,
201 .check_pmd_mapping = NULL,
204 void uffd_stats_report(struct uffd_args *args, int n_cpus)
207 unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
209 for (i = 0; i < n_cpus; i++) {
210 miss_total += args[i].missing_faults;
211 wp_total += args[i].wp_faults;
212 minor_total += args[i].minor_faults;
215 printf("userfaults: ");
217 printf("%llu missing (", miss_total);
218 for (i = 0; i < n_cpus; i++)
219 printf("%lu+", args[i].missing_faults);
223 printf("%llu wp (", wp_total);
224 for (i = 0; i < n_cpus; i++)
225 printf("%lu+", args[i].wp_faults);
229 printf("%llu minor (", minor_total);
230 for (i = 0; i < n_cpus; i++)
231 printf("%lu+", args[i].minor_faults);
237 int userfaultfd_open(uint64_t *features)
239 struct uffdio_api uffdio_api;
241 uffd = uffd_open(UFFD_FLAGS);
244 uffd_flags = fcntl(uffd, F_GETFD, NULL);
246 uffdio_api.api = UFFD_API;
247 uffdio_api.features = *features;
248 if (ioctl(uffd, UFFDIO_API, &uffdio_api))
249 /* Probably lack of CAP_SYS_PTRACE? */
251 if (uffdio_api.api != UFFD_API)
252 err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
254 *features = uffdio_api.features;
258 static inline void munmap_area(void **area)
261 if (munmap(*area, nr_pages * page_size))
267 void uffd_test_ctx_clear(void)
272 for (i = 0; i < nr_cpus * 2; ++i) {
273 if (close(pipefd[i]))
291 munmap_area((void **)&area_src);
292 munmap_area((void **)&area_src_alias);
293 munmap_area((void **)&area_dst);
294 munmap_area((void **)&area_dst_alias);
295 munmap_area((void **)&area_remap);
298 int uffd_test_ctx_init(uint64_t features, const char **errmsg)
300 unsigned long nr, cpu;
303 if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
304 ret = uffd_test_case_ops->pre_alloc(errmsg);
309 ret = uffd_test_ops->allocate_area((void **)&area_src, true);
310 ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
313 *errmsg = "memory allocation failed";
317 if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
318 ret = uffd_test_case_ops->post_alloc(errmsg);
323 ret = userfaultfd_open(&features);
326 *errmsg = "possible lack of priviledge";
330 count_verify = malloc(nr_pages * sizeof(unsigned long long));
334 for (nr = 0; nr < nr_pages; nr++) {
335 *area_mutex(area_src, nr) =
336 (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
337 count_verify[nr] = *area_count(area_src, nr) = 1;
339 * In the transition between 255 to 256, powerpc will
340 * read out of order in my_bcmp and see both bytes as
341 * zero, so leave a placeholder below always non-zero
342 * after the count, to prevent my_bcmp from triggering false
345 *(area_count(area_src, nr) + 1) = 1;
349 * After initialization of area_src, we must explicitly release pages
350 * for area_dst to make sure it's fully empty. Otherwise we could have
351 * some area_dst pages be erroneously initialized with zero pages,
352 * hence we could hit memory corruption later in the test.
354 * One example is when THP is globally enabled, above allocate_area()
355 * calls could have the two areas merged into a single VMA (as they
356 * will have the same VMA flags so they're mergeable). When we
357 * initialize the area_src above, it's possible that some part of
358 * area_dst could have been faulted in via one huge THP that will be
359 * shared between area_src and area_dst. That could leave some of
360 * area_dst not trapped by missing userfaults.
362 * This release_pages() will guarantee even if that happened, we'll
363 * proactively split the thp and drop any accidentally initialized
364 * pages within area_dst.
366 uffd_test_ops->release_pages(area_dst);
368 pipefd = malloc(sizeof(int) * nr_cpus * 2);
371 for (cpu = 0; cpu < nr_cpus; cpu++)
372 if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
/*
 * Set or clear userfaultfd write protection on a range.
 *
 * @ufd:   descriptor the UFFDIO_WRITEPROTECT ioctl is issued on
 * @start: first address of the range
 * @len:   length of the range in bytes
 * @wp:    true to write-protect the range, false to undo write protection
 *
 * A failing ioctl is reported through err(); the message names the
 * operation that was actually attempted (set vs. clear).
 */
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms = {
		.range = {
			.start = start,
			.len = len,
		},
		/*
		 * Mode 0 undoes write protection; no other mode bit is
		 * set, so the wakeup happens after the protection is gone.
		 */
		.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
	};

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("%s WP failed: address=0x%"PRIx64,
		    wp ? "set" : "clear", (uint64_t)start);
}
392 static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
394 struct uffdio_continue req;
397 req.range.start = start;
401 req.mode |= UFFDIO_CONTINUE_MODE_WP;
403 if (ioctl(ufd, UFFDIO_CONTINUE, &req))
404 err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
408 * Error handling within the kernel for continue is subtly different
409 * from copy or zeropage, so it may be a source of bugs. Trigger an
410 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
413 ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
414 if (ret >= 0 || req.mapped != -EEXIST)
415 err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
416 ret, (int64_t) req.mapped);
/*
 * Read one userfaultfd message from @ufd into @msg.
 *
 * @ufd: descriptor to read from
 * @msg: destination for the message
 *
 * Returns 0 once a full message has been read, or 1 when read() failed
 * with EAGAIN or EINTR and the caller should simply try again.  Any
 * other read failure, and a read that returns fewer bytes than a full
 * message, is reported through err().
 */
int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	/*
	 * Read from the descriptor the caller handed in.  Reading from the
	 * global uffd instead would silently ignore the @ufd argument.
	 */
	ssize_t ret = read(ufd, msg, sizeof(*msg));

	if (ret != (ssize_t)sizeof(*msg)) {
		if (ret < 0) {
			/* Nothing to read yet, or interrupted: not an error. */
			if (errno == EAGAIN || errno == EINTR)
				return 1;
			err("blocking read error");
		} else {
			err("short read");
		}
	}

	return 0;
}
436 void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
438 unsigned long offset;
440 if (msg->event != UFFD_EVENT_PAGEFAULT)
441 err("unexpected msg event %u", msg->event);
443 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
444 /* Write protect page faults */
445 wp_range(uffd, msg->arg.pagefault.address, page_size, false);
447 } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
454 * To prove we can modify the original range for testing
455 * purposes, we're going to bit flip this range before
458 * Note that this requires all minor page fault tests operate on
459 * area_dst (non-UFFD-registered) and area_dst_alias
463 area = (uint8_t *)(area_dst +
464 ((char *)msg->arg.pagefault.address -
466 for (b = 0; b < page_size; ++b)
468 continue_range(uffd, msg->arg.pagefault.address, page_size,
470 args->minor_faults++;
473 * Missing page faults.
475 * Here we force a write check for each of the missing mode
476 * faults. It's guaranteed because the only threads that
477 * will trigger uffd faults are the locking threads, and
478 * their first instruction to touch the missing page will
479 * always be pthread_mutex_lock().
481 * Note that here we relied on an NPTL glibc impl detail to
482 * always read the lock type at the entry of the lock op
483 * (pthread_mutex_t.__data.__type, offset 0x10) before
484 * doing any locking operations to guarantee that. It's
485 * actually not good to rely on this impl detail because
486 * logically a pthread-compatible lib can implement the
487 * locks without types and we can fail when linking with
488 * them. However since we used to find bugs with this
489 * strict check we still keep it around. Hopefully this
490 * could be a good hint when it fails again. If one day
491 * it'll break on some other impl of glibc we'll revisit.
493 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
494 err("unexpected write fault");
496 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
497 offset &= ~(page_size-1);
499 if (copy_page(uffd, offset, args->apply_wp))
500 args->missing_faults++;
504 void *uffd_poll_thread(void *arg)
506 struct uffd_args *args = (struct uffd_args *)arg;
507 unsigned long cpu = args->cpu;
508 struct pollfd pollfd[2];
510 struct uffdio_register uffd_reg;
514 if (!args->handle_fault)
515 args->handle_fault = uffd_handle_page_fault;
518 pollfd[0].events = POLLIN;
519 pollfd[1].fd = pipefd[cpu*2];
520 pollfd[1].events = POLLIN;
522 ready_for_fork = true;
525 ret = poll(pollfd, 2, -1);
527 if (errno == EINTR || errno == EAGAIN)
529 err("poll error: %d", ret);
531 if (pollfd[1].revents) {
532 if (!(pollfd[1].revents & POLLIN))
533 err("pollfd[1].revents %d", pollfd[1].revents);
534 if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
535 err("read pipefd error");
538 if (!(pollfd[0].revents & POLLIN))
539 err("pollfd[0].revents %d", pollfd[0].revents);
540 if (uffd_read_msg(uffd, &msg))
544 err("unexpected msg event %u\n", msg.event);
546 case UFFD_EVENT_PAGEFAULT:
547 args->handle_fault(&msg, args);
549 case UFFD_EVENT_FORK:
551 uffd = msg.arg.fork.ufd;
554 case UFFD_EVENT_REMOVE:
555 uffd_reg.range.start = msg.arg.remove.start;
556 uffd_reg.range.len = msg.arg.remove.end -
557 msg.arg.remove.start;
558 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
559 err("remove failure");
561 case UFFD_EVENT_REMAP:
562 area_remap = area_dst; /* save for later unmap */
563 area_dst = (char *)(unsigned long)msg.arg.remap.to;
571 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
572 unsigned long offset)
574 uffd_test_ops->alias_mapping(&uffdio_copy->dst,
577 if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
578 /* real retval in uffdio_copy->copy */
579 if (uffdio_copy->copy != -EEXIST)
580 err("UFFDIO_COPY retry error: %"PRId64,
581 (int64_t)uffdio_copy->copy);
583 err("UFFDIO_COPY retry unexpected: %"PRId64,
584 (int64_t)uffdio_copy->copy);
588 static void wake_range(int ufd, unsigned long addr, unsigned long len)
590 struct uffdio_range uffdio_wake;
592 uffdio_wake.start = addr;
593 uffdio_wake.len = len;
595 if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
596 fprintf(stderr, "error waking %lu\n",
600 int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
602 struct uffdio_copy uffdio_copy;
604 if (offset >= nr_pages * page_size)
605 err("unexpected offset %lu\n", offset);
606 uffdio_copy.dst = (unsigned long) area_dst + offset;
607 uffdio_copy.src = (unsigned long) area_src + offset;
608 uffdio_copy.len = page_size;
610 uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
612 uffdio_copy.mode = 0;
613 uffdio_copy.copy = 0;
614 if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
615 /* real retval in uffdio_copy.copy */
616 if (uffdio_copy.copy != -EEXIST)
617 err("UFFDIO_COPY error: %"PRId64,
618 (int64_t)uffdio_copy.copy);
619 wake_range(ufd, uffdio_copy.dst, page_size);
620 } else if (uffdio_copy.copy != page_size) {
621 err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
623 if (test_uffdio_copy_eexist && retry) {
624 test_uffdio_copy_eexist = false;
625 retry_copy_page(ufd, &uffdio_copy, offset);
/*
 * Convenience wrapper around __copy_page() that never asks for the
 * retry step.  ufd, offset and wp are forwarded unchanged and the
 * result of __copy_page() is returned as-is.
 */
int copy_page(int ufd, unsigned long offset, bool wp)
{
	const bool retry = false;

	return __copy_page(ufd, offset, retry, wp);
}
637 int move_page(int ufd, unsigned long offset, unsigned long len)
639 struct uffdio_move uffdio_move;
641 if (offset + len > nr_pages * page_size)
642 err("unexpected offset %lu and length %lu\n", offset, len);
643 uffdio_move.dst = (unsigned long) area_dst + offset;
644 uffdio_move.src = (unsigned long) area_src + offset;
645 uffdio_move.len = len;
646 uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
647 uffdio_move.move = 0;
648 if (ioctl(ufd, UFFDIO_MOVE, &uffdio_move)) {
649 /* real retval in uffdio_move.move */
650 if (uffdio_move.move != -EEXIST)
651 err("UFFDIO_MOVE error: %"PRId64,
652 (int64_t)uffdio_move.move);
653 wake_range(ufd, uffdio_move.dst, len);
654 } else if (uffdio_move.move != len) {
655 err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move);
661 int uffd_open_dev(unsigned int flags)
665 fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
668 uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
674 int uffd_open_sys(unsigned int flags)
676 #ifdef __NR_userfaultfd
677 return syscall(__NR_userfaultfd, flags);
683 int uffd_open(unsigned int flags)
685 int uffd = uffd_open_sys(flags);
688 uffd = uffd_open_dev(flags);
693 int uffd_get_features(uint64_t *features)
695 struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
697 * This should by default work in most kernels; the feature list
698 * will be the same no matter what we pass in here.
700 int fd = uffd_open(UFFD_USER_MODE_ONLY);
703 /* Maybe the kernel is older than user-only mode? */
709 if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
714 *features = uffdio_api.features;