tools/testing/selftests/kvm/rseq_test.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 #define _GNU_SOURCE /* for program_invocation_short_name */
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <pthread.h>
   6 #include <sched.h>
   7 #include <stdio.h>
   8 #include <stdlib.h>
   9 #include <string.h>
  10 #include <signal.h>
  11 #include <syscall.h>
  12 #include <sys/ioctl.h>
  13 #include <sys/sysinfo.h>
  14 #include <asm/barrier.h>
  15 #include <linux/atomic.h>
  16 #include <linux/rseq.h>
  17 #include <linux/unistd.h>
  18
  19 #include "kvm_util.h"
  20 #include "processor.h"
  21 #include "test_util.h"
  22
  23 #include "../rseq/rseq.c"
  24
  25 /*
  26  * Any bug related to task migration is likely to be timing-dependent; perform
  27  * a large number of migrations to reduce the odds of a false negative.
  28  */
  29 #define NR_TASK_MIGRATIONS 100000
  30
  31 static pthread_t migration_thread;
  32 static cpu_set_t possible_mask;
  33 static int min_cpu, max_cpu;
  34 static bool done;
  35
  36 static atomic_t seq_cnt;
  37
  38 static void guest_code(void)
  39 {
  40         for (;;)
  41                 GUEST_SYNC(0);
  42 }
  43
  44 static int next_cpu(int cpu)
  45 {
  46         /*
  47          * Advance to the next CPU, skipping those that weren't in the original
  48          * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
  49          * data storage is considered as opaque.  Note, if this task is pinned
  50          * to a small set of discontigous CPUs, e.g. 2 and 1023, this loop will
  51          * burn a lot cycles and the test will take longer than normal to
  52          * complete.
  53          */
  54         do {
  55                 cpu++;
  56                 if (cpu > max_cpu) {
  57                         cpu = min_cpu;
  58                         TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
  59                                     "Min CPU = %d must always be usable", cpu);
  60                         break;
  61                 }
  62         } while (!CPU_ISSET(cpu, &possible_mask));
  63
  64         return cpu;
  65 }
  66
  67 static void *migration_worker(void *__rseq_tid)
  68 {
  69         pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
  70         cpu_set_t allowed_mask;
  71         int r, i, cpu;
  72
  73         CPU_ZERO(&allowed_mask);
  74
  75         for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
  76                 CPU_SET(cpu, &allowed_mask);
  77
  78                 /*
  79                  * Bump the sequence count twice to allow the reader to detect
  80                  * that a migration may have occurred in between rseq and sched
  81                  * CPU ID reads.  An odd sequence count indicates a migration
  82                  * is in-progress, while a completely different count indicates
  83                  * a migration occurred since the count was last read.
  84                  */
  85                 atomic_inc(&seq_cnt);
  86
  87                 /*
  88                  * Ensure the odd count is visible while getcpu() isn't
  89                  * stable, i.e. while changing affinity is in-progress.
  90                  */
  91                 smp_wmb();
  92                 r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
  93                 TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
  94                             errno, strerror(errno));
  95                 smp_wmb();
  96                 atomic_inc(&seq_cnt);
  97
  98                 CPU_CLR(cpu, &allowed_mask);
  99
 100                 /*
 101                  * Wait 1-10us before proceeding to the next iteration and more
 102                  * specifically, before bumping seq_cnt again.  A delay is
 103                  * needed on three fronts:
 104                  *
 105                  *  1. To allow sched_setaffinity() to prompt migration before
 106                  *     ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
 107                  *     (or TIF_NEED_RESCHED, which indirectly leads to handling
 108                  *     NOTIFY_RESUME) is handled in KVM context.
 109                  *
 110                  *     If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
 111                  *     the guest, the guest will trigger a IO/MMIO exit all the
 112                  *     way to userspace and the TIF flags will be handled by
 113                  *     the generic "exit to userspace" logic, not by KVM.  The
 114                  *     exit to userspace is necessary to give the test a chance
 115                  *     to check the rseq CPU ID (see #2).
 116                  *
 117                  *     Alternatively, guest_code() could include an instruction
 118                  *     to trigger an exit that is handled by KVM, but any such
 119                  *     exit requires architecture specific code.
 120                  *
 121                  *  2. To let ioctl(KVM_RUN) make its way back to the test
 122                  *     before the next round of migration.  The test's check on
 123                  *     the rseq CPU ID must wait for migration to complete in
 124                  *     order to avoid false positive, thus any kernel rseq bug
 125                  *     will be missed if the next migration starts before the
 126                  *     check completes.
 127                  *
 128                  *  3. To ensure the read-side makes efficient forward progress,
 129                  *     e.g. if getcpu() involves a syscall. Stalling the read-side
 130                  *     means the test will spend more time waiting for getcpu()
 131                  *     to stabilize and less time trying to hit the timing-dependent
 132                  *     bug.
 133                  *
 134                  * Because any bug in this area is likely to be timing-dependent,
 135                  * run with a range of delays at 1us intervals from 1us to 10us
 136                  * as a best effort to avoid tuning the test to the point where
 137                  * it can hit _only_ the original bug and not detect future
 138                  * regressions.
 139                  *
 140                  * The original bug can reproduce with a delay up to ~500us on
 141                  * x86-64, but starts to require more iterations to reproduce
 142                  * as the delay creeps above ~10us, and the average runtime of
 143                  * each iteration obviously increases as well.  Cap the delay
 144                  * at 10us to keep test runtime reasonable while minimizing
 145                  * potential coverage loss.
 146                  *
 147                  * The lower bound for reproducing the bug is likely below 1us,
 148                  * e.g. failures occur on x86-64 with nanosleep(0), but at that
 149                  * point the overhead of the syscall likely dominates the delay.
 150                  * Use usleep() for simplicity and to avoid unnecessary kernel
 151                  * dependencies.
 152                  */
 153                 usleep((i % 10) + 1);
 154         }
 155         done = true;
 156         return NULL;
 157 }
 158
 159 static void calc_min_max_cpu(void)
 160 {
 161         int i, cnt, nproc;
 162
 163         TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2);
 164
 165         /*
 166          * CPU_SET doesn't provide a FOR_EACH helper, get the min/max CPU that
 167          * this task is affined to in order to reduce the time spent querying
 168          * unusable CPUs, e.g. if this task is pinned to a small percentage of
 169          * total CPUs.
 170          */
 171         nproc = get_nprocs_conf();
 172         min_cpu = -1;
 173         max_cpu = -1;
 174         cnt = 0;
 175
 176         for (i = 0; i < nproc; i++) {
 177                 if (!CPU_ISSET(i, &possible_mask))
 178                         continue;
 179                 if (min_cpu == -1)
 180                         min_cpu = i;
 181                 max_cpu = i;
 182                 cnt++;
 183         }
 184
 185         __TEST_REQUIRE(cnt >= 2,
 186                        "Only one usable CPU, task migration not possible");
 187 }
 188
 189 int main(int argc, char *argv[])
 190 {
 191         int r, i, snapshot;
 192         struct kvm_vm *vm;
 193         struct kvm_vcpu *vcpu;
 194         u32 cpu, rseq_cpu;
 195
 196         r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
 197         TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
 198                     strerror(errno));
 199
 200         calc_min_max_cpu();
 201
 202         r = rseq_register_current_thread();
 203         TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)",
 204                     errno, strerror(errno));
 205
 206         /*
 207          * Create and run a dummy VM that immediately exits to userspace via
 208          * GUEST_SYNC, while concurrently migrating the process by setting its
 209          * CPU affinity.
 210          */
 211         vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 212
 213         pthread_create(&migration_thread, NULL, migration_worker,
 214                        (void *)(unsigned long)syscall(SYS_gettid));
 215
 216         for (i = 0; !done; i++) {
 217                 vcpu_run(vcpu);
 218                 TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
 219                             "Guest failed?");
 220
 221                 /*
 222                  * Verify rseq's CPU matches sched's CPU.  Ensure migration
 223                  * doesn't occur between getcpu() and reading the rseq cpu_id
 224                  * by rereading both if the sequence count changes, or if the
 225                  * count is odd (migration in-progress).
 226                  */
 227                 do {
 228                         /*
 229                          * Drop bit 0 to force a mismatch if the count is odd,
 230                          * i.e. if a migration is in-progress.
 231                          */
 232                         snapshot = atomic_read(&seq_cnt) & ~1;
 233
 234                         /*
 235                          * Ensure calling getcpu() and reading rseq.cpu_id complete
 236                          * in a single "no migration" window, i.e. are not reordered
 237                          * across the seq_cnt reads.
 238                          */
 239                         smp_rmb();
 240                         r = sys_getcpu(&cpu, NULL);
 241                         TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)",
 242                                     errno, strerror(errno));
 243                         rseq_cpu = rseq_current_cpu_raw();
 244                         smp_rmb();
 245                 } while (snapshot != atomic_read(&seq_cnt));
 246
 247                 TEST_ASSERT(rseq_cpu == cpu,
 248                             "rseq CPU = %d, sched CPU = %d", rseq_cpu, cpu);
 249         }
 250
 251         /*
 252          * Sanity check that the test was able to enter the guest a reasonable
 253          * number of times, e.g. didn't get stalled too often/long waiting for
 254          * getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a fairly
 255          * conservative ratio on x86-64, which can do _more_ KVM_RUNs than
 256          * migrations given the 1us+ delay in the migration task.
 257          */
 258         TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2),
 259                     "Only performed %d KVM_RUNs, task stalled too much?", i);
 260
 261         pthread_join(migration_thread, NULL);
 262
 263         kvm_vm_free(vm);
 264
 265         rseq_unregister_current_thread();
 266
 267         return 0;
 268 }