arch/x86/kvm/svm/svm.c

   1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   2
   3 #include <linux/kvm_host.h>
   4
   5 #include "irq.h"
   6 #include "mmu.h"
   7 #include "kvm_cache_regs.h"
   8 #include "x86.h"
   9 #include "smm.h"
  10 #include "cpuid.h"
  11 #include "pmu.h"
  12
  13 #include <linux/module.h>
  14 #include <linux/mod_devicetable.h>
  15 #include <linux/kernel.h>
  16 #include <linux/vmalloc.h>
  17 #include <linux/highmem.h>
  18 #include <linux/amd-iommu.h>
  19 #include <linux/sched.h>
  20 #include <linux/trace_events.h>
  21 #include <linux/slab.h>
  22 #include <linux/hashtable.h>
  23 #include <linux/objtool.h>
  24 #include <linux/psp-sev.h>
  25 #include <linux/file.h>
  26 #include <linux/pagemap.h>
  27 #include <linux/swap.h>
  28 #include <linux/rwsem.h>
  29 #include <linux/cc_platform.h>
  30 #include <linux/smp.h>
  31
  32 #include <asm/apic.h>
  33 #include <asm/perf_event.h>
  34 #include <asm/tlbflush.h>
  35 #include <asm/desc.h>
  36 #include <asm/debugreg.h>
  37 #include <asm/kvm_para.h>
  38 #include <asm/irq_remapping.h>
  39 #include <asm/spec-ctrl.h>
  40 #include <asm/cpu_device_id.h>
  41 #include <asm/traps.h>
  42 #include <asm/reboot.h>
  43 #include <asm/fpu/api.h>
  44
  45 #include <trace/events/ipi.h>
  46
  47 #include "trace.h"
  48
  49 #include "svm.h"
  50 #include "svm_ops.h"
  51
  52 #include "kvm_onhyperv.h"
  53 #include "svm_onhyperv.h"
  54
  55 MODULE_AUTHOR("Qumranet");
  56 MODULE_LICENSE("GPL");
  57
  58 #ifdef MODULE
  59 static const struct x86_cpu_id svm_cpu_id[] = {
  60         X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
  61         {}
  62 };
  63 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  64 #endif
  65
  66 #define SEG_TYPE_LDT 2
  67 #define SEG_TYPE_BUSY_TSS16 3
  68
  69 static bool erratum_383_found __read_mostly;
  70
  71 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  72
  73 /*
  74  * Set osvw_len to higher value when updated Revision Guides
  75  * are published and we know what the new status bits are
  76  */
  77 static uint64_t osvw_len = 4, osvw_status;
  78
  79 static DEFINE_PER_CPU(u64, current_tsc_ratio);
  80
  81 #define X2APIC_MSR(x)   (APIC_BASE_MSR + (x >> 4))
  82
  83 static const struct svm_direct_access_msrs {
  84         u32 index;   /* Index of the MSR */
  85         bool always; /* True if intercept is initially cleared */
  86 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
  87         { .index = MSR_STAR,                            .always = true  },
  88         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
  89         { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
  90         { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
  91 #ifdef CONFIG_X86_64
  92         { .index = MSR_GS_BASE,                         .always = true  },
  93         { .index = MSR_FS_BASE,                         .always = true  },
  94         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
  95         { .index = MSR_LSTAR,                           .always = true  },
  96         { .index = MSR_CSTAR,                           .always = true  },
  97         { .index = MSR_SYSCALL_MASK,                    .always = true  },
  98 #endif
  99         { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
 100         { .index = MSR_IA32_PRED_CMD,                   .always = false },
 101         { .index = MSR_IA32_FLUSH_CMD,                  .always = false },
 102         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 103         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 104         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 105         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 106         { .index = MSR_EFER,                            .always = false },
 107         { .index = MSR_IA32_CR_PAT,                     .always = false },
 108         { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
 109         { .index = MSR_TSC_AUX,                         .always = false },
 110         { .index = X2APIC_MSR(APIC_ID),                 .always = false },
 111         { .index = X2APIC_MSR(APIC_LVR),                .always = false },
 112         { .index = X2APIC_MSR(APIC_TASKPRI),            .always = false },
 113         { .index = X2APIC_MSR(APIC_ARBPRI),             .always = false },
 114         { .index = X2APIC_MSR(APIC_PROCPRI),            .always = false },
 115         { .index = X2APIC_MSR(APIC_EOI),                .always = false },
 116         { .index = X2APIC_MSR(APIC_RRR),                .always = false },
 117         { .index = X2APIC_MSR(APIC_LDR),                .always = false },
 118         { .index = X2APIC_MSR(APIC_DFR),                .always = false },
 119         { .index = X2APIC_MSR(APIC_SPIV),               .always = false },
 120         { .index = X2APIC_MSR(APIC_ISR),                .always = false },
 121         { .index = X2APIC_MSR(APIC_TMR),                .always = false },
 122         { .index = X2APIC_MSR(APIC_IRR),                .always = false },
 123         { .index = X2APIC_MSR(APIC_ESR),                .always = false },
 124         { .index = X2APIC_MSR(APIC_ICR),                .always = false },
 125         { .index = X2APIC_MSR(APIC_ICR2),               .always = false },
 126
 127         /*
 128          * Note:
 129          * AMD does not virtualize APIC TSC-deadline timer mode, but it is
 130          * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
 131          * the AVIC hardware would generate GP fault. Therefore, always
 132          * intercept the MSR 0x832, and do not setup direct_access_msr.
 133          */
 134         { .index = X2APIC_MSR(APIC_LVTTHMR),            .always = false },
 135         { .index = X2APIC_MSR(APIC_LVTPC),              .always = false },
 136         { .index = X2APIC_MSR(APIC_LVT0),               .always = false },
 137         { .index = X2APIC_MSR(APIC_LVT1),               .always = false },
 138         { .index = X2APIC_MSR(APIC_LVTERR),             .always = false },
 139         { .index = X2APIC_MSR(APIC_TMICT),              .always = false },
 140         { .index = X2APIC_MSR(APIC_TMCCT),              .always = false },
 141         { .index = X2APIC_MSR(APIC_TDCR),               .always = false },
 142         { .index = MSR_INVALID,                         .always = false },
 143 };
 144
 145 /*
 146  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 147  * pause_filter_count: On processors that support Pause filtering(indicated
 148  *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 149  *      count value. On VMRUN this value is loaded into an internal counter.
 150  *      Each time a pause instruction is executed, this counter is decremented
 151  *      until it reaches zero at which time a #VMEXIT is generated if pause
 152  *      intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
 153  *      Intercept Filtering for more details.
 154  *      This also indicate if ple logic enabled.
 155  *
 156  * pause_filter_thresh: In addition, some processor families support advanced
 157  *      pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
 158  *      the amount of time a guest is allowed to execute in a pause loop.
 159  *      In this mode, a 16-bit pause filter threshold field is added in the
 160  *      VMCB. The threshold value is a cycle count that is used to reset the
 161  *      pause counter. As with simple pause filtering, VMRUN loads the pause
 162  *      count value from VMCB into an internal counter. Then, on each pause
 163  *      instruction the hardware checks the elapsed number of cycles since
 164  *      the most recent pause instruction against the pause filter threshold.
 165  *      If the elapsed cycle count is greater than the pause filter threshold,
 166  *      then the internal pause count is reloaded from the VMCB and execution
 167  *      continues. If the elapsed cycle count is less than the pause filter
 168  *      threshold, then the internal pause count is decremented. If the count
 169  *      value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 170  *      triggered. If advanced pause filtering is supported and pause filter
 171  *      threshold field is set to zero, the filter will operate in the simpler,
 172  *      count only mode.
 173  */
 174
 175 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 176 module_param(pause_filter_thresh, ushort, 0444);
 177
 178 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 179 module_param(pause_filter_count, ushort, 0444);
 180
 181 /* Default doubles per-vcpu window every exit. */
 182 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 183 module_param(pause_filter_count_grow, ushort, 0444);
 184
 185 /* Default resets per-vcpu window every exit to pause_filter_count. */
 186 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 187 module_param(pause_filter_count_shrink, ushort, 0444);
 188
 189 /* Default is to compute the maximum so we can never overflow. */
 190 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 191 module_param(pause_filter_count_max, ushort, 0444);
 192
 193 /*
 194  * Use nested page tables by default.  Note, NPT may get forced off by
 195  * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 196  */
 197 bool npt_enabled = true;
 198 module_param_named(npt, npt_enabled, bool, 0444);
 199
 200 /* allow nested virtualization in KVM/SVM */
 201 static int nested = true;
 202 module_param(nested, int, S_IRUGO);
 203
 204 /* enable/disable Next RIP Save */
 205 int nrips = true;
 206 module_param(nrips, int, 0444);
 207
 208 /* enable/disable Virtual VMLOAD VMSAVE */
 209 static int vls = true;
 210 module_param(vls, int, 0444);
 211
 212 /* enable/disable Virtual GIF */
 213 int vgif = true;
 214 module_param(vgif, int, 0444);
 215
 216 /* enable/disable LBR virtualization */
 217 static int lbrv = true;
 218 module_param(lbrv, int, 0444);
 219
 220 static int tsc_scaling = true;
 221 module_param(tsc_scaling, int, 0444);
 222
 223 /*
 224  * enable / disable AVIC.  Because the defaults differ for APICv
 225  * support between VMX and SVM we cannot use module_param_named.
 226  */
 227 static bool avic;
 228 module_param(avic, bool, 0444);
 229
 230 bool __read_mostly dump_invalid_vmcb;
 231 module_param(dump_invalid_vmcb, bool, 0644);
 232
 233
 234 bool intercept_smi = true;
 235 module_param(intercept_smi, bool, 0444);
 236
 237 bool vnmi = true;
 238 module_param(vnmi, bool, 0444);
 239
 240 static bool svm_gp_erratum_intercept = true;
 241
 242 static u8 rsm_ins_bytes[] = "\x0f\xaa";
 243
 244 static unsigned long iopm_base;
 245
 246 DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
 247
 248 /*
 249  * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 250  * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 251  *
 252  * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 253  * defer the restoration of TSC_AUX until the CPU returns to userspace.
 254  */
 255 static int tsc_aux_uret_slot __read_mostly = -1;
 256
 257 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 258
 259 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 260 #define MSRS_RANGE_SIZE 2048
 261 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 262
 263 u32 svm_msrpm_offset(u32 msr)
 264 {
 265         u32 offset;
 266         int i;
 267
 268         for (i = 0; i < NUM_MSR_MAPS; i++) {
 269                 if (msr < msrpm_ranges[i] ||
 270                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 271                         continue;
 272
 273                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 274                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 275
 276                 /* Now we have the u8 offset - but need the u32 offset */
 277                 return offset / 4;
 278         }
 279
 280         /* MSR not in any range */
 281         return MSR_INVALID;
 282 }
 283
 284 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
 285
 286 static int get_npt_level(void)
 287 {
 288 #ifdef CONFIG_X86_64
 289         return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 290 #else
 291         return PT32E_ROOT_LEVEL;
 292 #endif
 293 }
 294
 295 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 296 {
 297         struct vcpu_svm *svm = to_svm(vcpu);
 298         u64 old_efer = vcpu->arch.efer;
 299         vcpu->arch.efer = efer;
 300
 301         if (!npt_enabled) {
 302                 /* Shadow paging assumes NX to be available.  */
 303                 efer |= EFER_NX;
 304
 305                 if (!(efer & EFER_LMA))
 306                         efer &= ~EFER_LME;
 307         }
 308
 309         if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 310                 if (!(efer & EFER_SVME)) {
 311                         svm_leave_nested(vcpu);
 312                         svm_set_gif(svm, true);
 313                         /* #GP intercept is still needed for vmware backdoor */
 314                         if (!enable_vmware_backdoor)
 315                                 clr_exception_intercept(svm, GP_VECTOR);
 316
 317                         /*
 318                          * Free the nested guest state, unless we are in SMM.
 319                          * In this case we will return to the nested guest
 320                          * as soon as we leave SMM.
 321                          */
 322                         if (!is_smm(vcpu))
 323                                 svm_free_nested(svm);
 324
 325                 } else {
 326                         int ret = svm_allocate_nested(svm);
 327
 328                         if (ret) {
 329                                 vcpu->arch.efer = old_efer;
 330                                 return ret;
 331                         }
 332
 333                         /*
 334                          * Never intercept #GP for SEV guests, KVM can't
 335                          * decrypt guest memory to workaround the erratum.
 336                          */
 337                         if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
 338                                 set_exception_intercept(svm, GP_VECTOR);
 339                 }
 340         }
 341
 342         svm->vmcb->save.efer = efer | EFER_SVME;
 343         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 344         return 0;
 345 }
 346
 347 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 348 {
 349         struct vcpu_svm *svm = to_svm(vcpu);
 350         u32 ret = 0;
 351
 352         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 353                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 354         return ret;
 355 }
 356
 357 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 358 {
 359         struct vcpu_svm *svm = to_svm(vcpu);
 360
 361         if (mask == 0)
 362                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 363         else
 364                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 365
 366 }
 367 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
 368                                         void *insn, int insn_len);
 369
 370 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 371                                            bool commit_side_effects)
 372 {
 373         struct vcpu_svm *svm = to_svm(vcpu);
 374         unsigned long old_rflags;
 375
 376         /*
 377          * SEV-ES does not expose the next RIP. The RIP update is controlled by
 378          * the type of exit and the #VC handler in the guest.
 379          */
 380         if (sev_es_guest(vcpu->kvm))
 381                 goto done;
 382
 383         if (nrips && svm->vmcb->control.next_rip != 0) {
 384                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 385                 svm->next_rip = svm->vmcb->control.next_rip;
 386         }
 387
 388         if (!svm->next_rip) {
 389                 /*
 390                  * FIXME: Drop this when kvm_emulate_instruction() does the
 391                  * right thing and treats "can't emulate" as outright failure
 392                  * for EMULTYPE_SKIP.
 393                  */
 394                 if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
 395                         return 0;
 396
 397                 if (unlikely(!commit_side_effects))
 398                         old_rflags = svm->vmcb->save.rflags;
 399
 400                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 401                         return 0;
 402
 403                 if (unlikely(!commit_side_effects))
 404                         svm->vmcb->save.rflags = old_rflags;
 405         } else {
 406                 kvm_rip_write(vcpu, svm->next_rip);
 407         }
 408
 409 done:
 410         if (likely(commit_side_effects))
 411                 svm_set_interrupt_shadow(vcpu, 0);
 412
 413         return 1;
 414 }
 415
 416 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 417 {
 418         return __svm_skip_emulated_instruction(vcpu, true);
 419 }
 420
 421 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
 422 {
 423         unsigned long rip, old_rip = kvm_rip_read(vcpu);
 424         struct vcpu_svm *svm = to_svm(vcpu);
 425
 426         /*
 427          * Due to architectural shortcomings, the CPU doesn't always provide
 428          * NextRIP, e.g. if KVM intercepted an exception that occurred while
 429          * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
 430          * the instruction even if NextRIP is supported to acquire the next
 431          * RIP so that it can be shoved into the NextRIP field, otherwise
 432          * hardware will fail to advance guest RIP during event injection.
 433          * Drop the exception/interrupt if emulation fails and effectively
 434          * retry the instruction, it's the least awful option.  If NRIPS is
 435          * in use, the skip must not commit any side effects such as clearing
 436          * the interrupt shadow or RFLAGS.RF.
 437          */
 438         if (!__svm_skip_emulated_instruction(vcpu, !nrips))
 439                 return -EIO;
 440
 441         rip = kvm_rip_read(vcpu);
 442
 443         /*
 444          * Save the injection information, even when using next_rip, as the
 445          * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
 446          * doesn't complete due to a VM-Exit occurring while the CPU is
 447          * vectoring the event.   Decoding the instruction isn't guaranteed to
 448          * work as there may be no backing instruction, e.g. if the event is
 449          * being injected by L1 for L2, or if the guest is patching INT3 into
 450          * a different instruction.
 451          */
 452         svm->soft_int_injected = true;
 453         svm->soft_int_csbase = svm->vmcb->save.cs.base;
 454         svm->soft_int_old_rip = old_rip;
 455         svm->soft_int_next_rip = rip;
 456
 457         if (nrips)
 458                 kvm_rip_write(vcpu, old_rip);
 459
 460         if (static_cpu_has(X86_FEATURE_NRIPS))
 461                 svm->vmcb->control.next_rip = rip;
 462
 463         return 0;
 464 }
 465
 466 static void svm_inject_exception(struct kvm_vcpu *vcpu)
 467 {
 468         struct kvm_queued_exception *ex = &vcpu->arch.exception;
 469         struct vcpu_svm *svm = to_svm(vcpu);
 470
 471         kvm_deliver_exception_payload(vcpu, ex);
 472
 473         if (kvm_exception_is_soft(ex->vector) &&
 474             svm_update_soft_interrupt_rip(vcpu))
 475                 return;
 476
 477         svm->vmcb->control.event_inj = ex->vector
 478                 | SVM_EVTINJ_VALID
 479                 | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 480                 | SVM_EVTINJ_TYPE_EXEPT;
 481         svm->vmcb->control.event_inj_err = ex->error_code;
 482 }
 483
 484 static void svm_init_erratum_383(void)
 485 {
 486         u32 low, high;
 487         int err;
 488         u64 val;
 489
 490         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 491                 return;
 492
 493         /* Use _safe variants to not break nested virtualization */
 494         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 495         if (err)
 496                 return;
 497
 498         val |= (1ULL << 47);
 499
 500         low  = lower_32_bits(val);
 501         high = upper_32_bits(val);
 502
 503         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 504
 505         erratum_383_found = true;
 506 }
 507
 508 static void svm_init_osvw(struct kvm_vcpu *vcpu)
 509 {
 510         /*
 511          * Guests should see errata 400 and 415 as fixed (assuming that
 512          * HLT and IO instructions are intercepted).
 513          */
 514         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 515         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 516
 517         /*
 518          * By increasing VCPU's osvw.length to 3 we are telling the guest that
 519          * all osvw.status bits inside that length, including bit 0 (which is
 520          * reserved for erratum 298), are valid. However, if host processor's
 521          * osvw_len is 0 then osvw_status[0] carries no information. We need to
 522          * be conservative here and therefore we tell the guest that erratum 298
 523          * is present (because we really don't know).
 524          */
 525         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 526                 vcpu->arch.osvw.status |= 1;
 527 }
 528
 529 static bool __kvm_is_svm_supported(void)
 530 {
 531         int cpu = smp_processor_id();
 532         struct cpuinfo_x86 *c = &cpu_data(cpu);
 533
 534         if (c->x86_vendor != X86_VENDOR_AMD &&
 535             c->x86_vendor != X86_VENDOR_HYGON) {
 536                 pr_err("CPU %d isn't AMD or Hygon\n", cpu);
 537                 return false;
 538         }
 539
 540         if (!cpu_has(c, X86_FEATURE_SVM)) {
 541                 pr_err("SVM not supported by CPU %d\n", cpu);
 542                 return false;
 543         }
 544
 545         if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
 546                 pr_info("KVM is unsupported when running as an SEV guest\n");
 547                 return false;
 548         }
 549
 550         return true;
 551 }
 552
 553 static bool kvm_is_svm_supported(void)
 554 {
 555         bool supported;
 556
 557         migrate_disable();
 558         supported = __kvm_is_svm_supported();
 559         migrate_enable();
 560
 561         return supported;
 562 }
 563
 564 static int svm_check_processor_compat(void)
 565 {
 566         if (!__kvm_is_svm_supported())
 567                 return -EIO;
 568
 569         return 0;
 570 }
 571
 572 static void __svm_write_tsc_multiplier(u64 multiplier)
 573 {
 574         if (multiplier == __this_cpu_read(current_tsc_ratio))
 575                 return;
 576
 577         wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 578         __this_cpu_write(current_tsc_ratio, multiplier);
 579 }
 580
 581 static inline void kvm_cpu_svm_disable(void)
 582 {
 583         uint64_t efer;
 584
 585         wrmsrl(MSR_VM_HSAVE_PA, 0);
 586         rdmsrl(MSR_EFER, efer);
 587         if (efer & EFER_SVME) {
 588                 /*
 589                  * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
 590                  * NMI aren't blocked.
 591                  */
 592                 stgi();
 593                 wrmsrl(MSR_EFER, efer & ~EFER_SVME);
 594         }
 595 }
 596
 597 static void svm_emergency_disable(void)
 598 {
 599         kvm_rebooting = true;
 600
 601         kvm_cpu_svm_disable();
 602 }
 603
 604 static void svm_hardware_disable(void)
 605 {
 606         /* Make sure we clean up behind us */
 607         if (tsc_scaling)
 608                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 609
 610         kvm_cpu_svm_disable();
 611
 612         amd_pmu_disable_virt();
 613 }
 614
 615 static int svm_hardware_enable(void)
 616 {
 617
 618         struct svm_cpu_data *sd;
 619         uint64_t efer;
 620         int me = raw_smp_processor_id();
 621
 622         rdmsrl(MSR_EFER, efer);
 623         if (efer & EFER_SVME)
 624                 return -EBUSY;
 625
 626         sd = per_cpu_ptr(&svm_data, me);
 627         sd->asid_generation = 1;
 628         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 629         sd->next_asid = sd->max_asid + 1;
 630         sd->min_asid = max_sev_asid + 1;
 631
 632         wrmsrl(MSR_EFER, efer | EFER_SVME);
 633
 634         wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
 635
 636         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 637                 /*
 638                  * Set the default value, even if we don't use TSC scaling
 639                  * to avoid having stale value in the msr
 640                  */
 641                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 642         }
 643
 644
 645         /*
 646          * Get OSVW bits.
 647          *
 648          * Note that it is possible to have a system with mixed processor
 649          * revisions and therefore different OSVW bits. If bits are not the same
 650          * on different processors then choose the worst case (i.e. if erratum
 651          * is present on one processor and not on another then assume that the
 652          * erratum is present everywhere).
 653          */
 654         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 655                 uint64_t len, status = 0;
 656                 int err;
 657
 658                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 659                 if (!err)
 660                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 661                                                       &err);
 662
 663                 if (err)
 664                         osvw_status = osvw_len = 0;
 665                 else {
 666                         if (len < osvw_len)
 667                                 osvw_len = len;
 668                         osvw_status |= status;
 669                         osvw_status &= (1ULL << osvw_len) - 1;
 670                 }
 671         } else
 672                 osvw_status = osvw_len = 0;
 673
 674         svm_init_erratum_383();
 675
 676         amd_pmu_enable_virt();
 677
 678         return 0;
 679 }
 680
 681 static void svm_cpu_uninit(int cpu)
 682 {
 683         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 684
 685         if (!sd->save_area)
 686                 return;
 687
 688         kfree(sd->sev_vmcbs);
 689         __free_page(sd->save_area);
 690         sd->save_area_pa = 0;
 691         sd->save_area = NULL;
 692 }
 693
 694 static int svm_cpu_init(int cpu)
 695 {
 696         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 697         int ret = -ENOMEM;
 698
 699         memset(sd, 0, sizeof(struct svm_cpu_data));
 700         sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
 701         if (!sd->save_area)
 702                 return ret;
 703
 704         ret = sev_cpu_init(sd);
 705         if (ret)
 706                 goto free_save_area;
 707
 708         sd->save_area_pa = __sme_page_pa(sd->save_area);
 709         return 0;
 710
 711 free_save_area:
 712         __free_page(sd->save_area);
 713         sd->save_area = NULL;
 714         return ret;
 715
 716 }
 717
 718 static void set_dr_intercepts(struct vcpu_svm *svm)
 719 {
 720         struct vmcb *vmcb = svm->vmcb01.ptr;
 721
 722         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
 723         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
 724         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
 725         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
 726         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
 727         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
 728         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
 729         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
 730         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
 731         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
 732         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
 733         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
 734         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
 735         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
 736         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
 737         vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
 738
 739         recalc_intercepts(svm);
 740 }
 741
 742 static void clr_dr_intercepts(struct vcpu_svm *svm)
 743 {
 744         struct vmcb *vmcb = svm->vmcb01.ptr;
 745
 746         vmcb->control.intercepts[INTERCEPT_DR] = 0;
 747
 748         recalc_intercepts(svm);
 749 }
 750
 751 static int direct_access_msr_slot(u32 msr)
 752 {
 753         u32 i;
 754
 755         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 756                 if (direct_access_msrs[i].index == msr)
 757                         return i;
 758
 759         return -ENOENT;
 760 }
 761
 762 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 763                                      int write)
 764 {
 765         struct vcpu_svm *svm = to_svm(vcpu);
 766         int slot = direct_access_msr_slot(msr);
 767
 768         if (slot == -ENOENT)
 769                 return;
 770
 771         /* Set the shadow bitmaps to the desired intercept states */
 772         if (read)
 773                 set_bit(slot, svm->shadow_msr_intercept.read);
 774         else
 775                 clear_bit(slot, svm->shadow_msr_intercept.read);
 776
 777         if (write)
 778                 set_bit(slot, svm->shadow_msr_intercept.write);
 779         else
 780                 clear_bit(slot, svm->shadow_msr_intercept.write);
 781 }
 782
 783 static bool valid_msr_intercept(u32 index)
 784 {
 785         return direct_access_msr_slot(index) != -ENOENT;
 786 }
 787
 788 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 789 {
 790         u8 bit_write;
 791         unsigned long tmp;
 792         u32 offset;
 793         u32 *msrpm;
 794
 795         /*
 796          * For non-nested case:
 797          * If the L01 MSR bitmap does not intercept the MSR, then we need to
 798          * save it.
 799          *
 800          * For nested case:
 801          * If the L02 MSR bitmap does not intercept the MSR, then we need to
 802          * save it.
 803          */
 804         msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 805                                       to_svm(vcpu)->msrpm;
 806
 807         offset    = svm_msrpm_offset(msr);
 808         bit_write = 2 * (msr & 0x0f) + 1;
 809         tmp       = msrpm[offset];
 810
 811         BUG_ON(offset == MSR_INVALID);
 812
 813         return test_bit(bit_write, &tmp);
 814 }
 815
 816 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 817                                         u32 msr, int read, int write)
 818 {
 819         struct vcpu_svm *svm = to_svm(vcpu);
 820         u8 bit_read, bit_write;
 821         unsigned long tmp;
 822         u32 offset;
 823
 824         /*
 825          * If this warning triggers extend the direct_access_msrs list at the
 826          * beginning of the file
 827          */
 828         WARN_ON(!valid_msr_intercept(msr));
 829
 830         /* Enforce non allowed MSRs to trap */
 831         if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 832                 read = 0;
 833
 834         if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 835                 write = 0;
 836
 837         offset    = svm_msrpm_offset(msr);
 838         bit_read  = 2 * (msr & 0x0f);
 839         bit_write = 2 * (msr & 0x0f) + 1;
 840         tmp       = msrpm[offset];
 841
 842         BUG_ON(offset == MSR_INVALID);
 843
 844         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 845         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 846
 847         msrpm[offset] = tmp;
 848
 849         svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 850         svm->nested.force_msr_bitmap_recalc = true;
 851 }
 852
 853 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 854                           int read, int write)
 855 {
 856         set_shadow_msr_intercept(vcpu, msr, read, write);
 857         set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
 858 }
 859
 860 u32 *svm_vcpu_alloc_msrpm(void)
 861 {
 862         unsigned int order = get_order(MSRPM_SIZE);
 863         struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 864         u32 *msrpm;
 865
 866         if (!pages)
 867                 return NULL;
 868
 869         msrpm = page_address(pages);
 870         memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 871
 872         return msrpm;
 873 }
 874
 875 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 876 {
 877         int i;
 878
 879         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 880                 if (!direct_access_msrs[i].always)
 881                         continue;
 882                 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 883         }
 884 }
 885
 886 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
 887 {
 888         int i;
 889
 890         if (intercept == svm->x2avic_msrs_intercepted)
 891                 return;
 892
 893         if (!x2avic_enabled ||
 894             !apic_x2apic_mode(svm->vcpu.arch.apic))
 895                 return;
 896
 897         for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
 898                 int index = direct_access_msrs[i].index;
 899
 900                 if ((index < APIC_BASE_MSR) ||
 901                     (index > APIC_BASE_MSR + 0xff))
 902                         continue;
 903                 set_msr_interception(&svm->vcpu, svm->msrpm, index,
 904                                      !intercept, !intercept);
 905         }
 906
 907         svm->x2avic_msrs_intercepted = intercept;
 908 }
 909
 910 void svm_vcpu_free_msrpm(u32 *msrpm)
 911 {
 912         __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 913 }
 914
 915 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 916 {
 917         struct vcpu_svm *svm = to_svm(vcpu);
 918         u32 i;
 919
 920         /*
 921          * Set intercept permissions for all direct access MSRs again. They
 922          * will automatically get filtered through the MSR filter, so we are
 923          * back in sync after this.
 924          */
 925         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 926                 u32 msr = direct_access_msrs[i].index;
 927                 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 928                 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 929
 930                 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 931         }
 932 }
 933
 934 static void add_msr_offset(u32 offset)
 935 {
 936         int i;
 937
 938         for (i = 0; i < MSRPM_OFFSETS; ++i) {
 939
 940                 /* Offset already in list? */
 941                 if (msrpm_offsets[i] == offset)
 942                         return;
 943
 944                 /* Slot used by another offset? */
 945                 if (msrpm_offsets[i] != MSR_INVALID)
 946                         continue;
 947
 948                 /* Add offset to list */
 949                 msrpm_offsets[i] = offset;
 950
 951                 return;
 952         }
 953
 954         /*
 955          * If this BUG triggers the msrpm_offsets table has an overflow. Just
 956          * increase MSRPM_OFFSETS in this case.
 957          */
 958         BUG();
 959 }
 960
 961 static void init_msrpm_offsets(void)
 962 {
 963         int i;
 964
 965         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 966
 967         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 968                 u32 offset;
 969
 970                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
 971                 BUG_ON(offset == MSR_INVALID);
 972
 973                 add_msr_offset(offset);
 974         }
 975 }
 976
 977 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 978 {
 979         to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
 980         to_vmcb->save.br_from           = from_vmcb->save.br_from;
 981         to_vmcb->save.br_to             = from_vmcb->save.br_to;
 982         to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
 983         to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;
 984
 985         vmcb_mark_dirty(to_vmcb, VMCB_LBR);
 986 }
 987
 988 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 989 {
 990         struct vcpu_svm *svm = to_svm(vcpu);
 991
 992         svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
 993         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 994         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 995         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 996         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 997
 998         /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
 999         if (is_guest_mode(vcpu))
1000                 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
1001 }
1002
1003 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
1004 {
1005         struct vcpu_svm *svm = to_svm(vcpu);
1006
1007         svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
1008         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
1009         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
1010         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
1011         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
1012
1013         /*
1014          * Move the LBR msrs back to the vmcb01 to avoid copying them
1015          * on nested guest entries.
1016          */
1017         if (is_guest_mode(vcpu))
1018                 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
1019 }
1020
1021 static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
1022 {
1023         /*
1024          * If LBR virtualization is disabled, the LBR MSRs are always kept in
1025          * vmcb01.  If LBR virtualization is enabled and L1 is running VMs of
1026          * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
1027          */
1028         return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
1029                                                                    svm->vmcb01.ptr;
1030 }
1031
1032 void svm_update_lbrv(struct kvm_vcpu *vcpu)
1033 {
1034         struct vcpu_svm *svm = to_svm(vcpu);
1035         bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
1036         bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
1037                             (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
1038                             (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
1039
1040         if (enable_lbrv == current_enable_lbrv)
1041                 return;
1042
1043         if (enable_lbrv)
1044                 svm_enable_lbrv(vcpu);
1045         else
1046                 svm_disable_lbrv(vcpu);
1047 }
1048
1049 void disable_nmi_singlestep(struct vcpu_svm *svm)
1050 {
1051         svm->nmi_singlestep = false;
1052
1053         if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1054                 /* Clear our flags if they were not set by the guest */
1055                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1056                         svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1057                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1058                         svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1059         }
1060 }
1061
1062 static void grow_ple_window(struct kvm_vcpu *vcpu)
1063 {
1064         struct vcpu_svm *svm = to_svm(vcpu);
1065         struct vmcb_control_area *control = &svm->vmcb->control;
1066         int old = control->pause_filter_count;
1067
1068         if (kvm_pause_in_guest(vcpu->kvm))
1069                 return;
1070
1071         control->pause_filter_count = __grow_ple_window(old,
1072                                                         pause_filter_count,
1073                                                         pause_filter_count_grow,
1074                                                         pause_filter_count_max);
1075
1076         if (control->pause_filter_count != old) {
1077                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1078                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1079                                             control->pause_filter_count, old);
1080         }
1081 }
1082
1083 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1084 {
1085         struct vcpu_svm *svm = to_svm(vcpu);
1086         struct vmcb_control_area *control = &svm->vmcb->control;
1087         int old = control->pause_filter_count;
1088
1089         if (kvm_pause_in_guest(vcpu->kvm))
1090                 return;
1091
1092         control->pause_filter_count =
1093                                 __shrink_ple_window(old,
1094                                                     pause_filter_count,
1095                                                     pause_filter_count_shrink,
1096                                                     pause_filter_count);
1097         if (control->pause_filter_count != old) {
1098                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1099                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1100                                             control->pause_filter_count, old);
1101         }
1102 }
1103
1104 static void svm_hardware_unsetup(void)
1105 {
1106         int cpu;
1107
1108         sev_hardware_unsetup();
1109
1110         for_each_possible_cpu(cpu)
1111                 svm_cpu_uninit(cpu);
1112
1113         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1114         get_order(IOPM_SIZE));
1115         iopm_base = 0;
1116 }
1117
1118 static void init_seg(struct vmcb_seg *seg)
1119 {
1120         seg->selector = 0;
1121         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1122                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1123         seg->limit = 0xffff;
1124         seg->base = 0;
1125 }
1126
1127 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1128 {
1129         seg->selector = 0;
1130         seg->attrib = SVM_SELECTOR_P_MASK | type;
1131         seg->limit = 0xffff;
1132         seg->base = 0;
1133 }
1134
1135 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1136 {
1137         struct vcpu_svm *svm = to_svm(vcpu);
1138
1139         return svm->nested.ctl.tsc_offset;
1140 }
1141
1142 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1143 {
1144         struct vcpu_svm *svm = to_svm(vcpu);
1145
1146         return svm->tsc_ratio_msr;
1147 }
1148
1149 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
1150 {
1151         struct vcpu_svm *svm = to_svm(vcpu);
1152
1153         svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1154         svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
1155         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1156 }
1157
1158 void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
1159 {
1160         preempt_disable();
1161         if (to_svm(vcpu)->guest_state_loaded)
1162                 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1163         preempt_enable();
1164 }
1165
1166 /* Evaluate instruction intercepts that depend on guest CPUID features. */
1167 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1168                                               struct vcpu_svm *svm)
1169 {
1170         /*
1171          * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1172          * roots, or if INVPCID is disabled in the guest to inject #UD.
1173          */
1174         if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1175                 if (!npt_enabled ||
1176                     !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1177                         svm_set_intercept(svm, INTERCEPT_INVPCID);
1178                 else
1179                         svm_clr_intercept(svm, INTERCEPT_INVPCID);
1180         }
1181
1182         if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1183                 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1184                         svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1185                 else
1186                         svm_set_intercept(svm, INTERCEPT_RDTSCP);
1187         }
1188 }
1189
1190 static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1191 {
1192         struct vcpu_svm *svm = to_svm(vcpu);
1193
1194         if (guest_cpuid_is_intel(vcpu)) {
1195                 /*
1196                  * We must intercept SYSENTER_EIP and SYSENTER_ESP
1197                  * accesses because the processor only stores 32 bits.
1198                  * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1199                  */
1200                 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1201                 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1202                 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1203
1204                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1205                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1206         } else {
1207                 /*
1208                  * If hardware supports Virtual VMLOAD VMSAVE then enable it
1209                  * in VMCB and clear intercepts to avoid #VMEXIT.
1210                  */
1211                 if (vls) {
1212                         svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1213                         svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1214                         svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1215                 }
1216                 /* No need to intercept these MSRs */
1217                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1218                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1219         }
1220 }
1221
1222 static void init_vmcb(struct kvm_vcpu *vcpu)
1223 {
1224         struct vcpu_svm *svm = to_svm(vcpu);
1225         struct vmcb *vmcb = svm->vmcb01.ptr;
1226         struct vmcb_control_area *control = &vmcb->control;
1227         struct vmcb_save_area *save = &vmcb->save;
1228
1229         svm_set_intercept(svm, INTERCEPT_CR0_READ);
1230         svm_set_intercept(svm, INTERCEPT_CR3_READ);
1231         svm_set_intercept(svm, INTERCEPT_CR4_READ);
1232         svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1233         svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1234         svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1235         if (!kvm_vcpu_apicv_active(vcpu))
1236                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1237
1238         set_dr_intercepts(svm);
1239
1240         set_exception_intercept(svm, PF_VECTOR);
1241         set_exception_intercept(svm, UD_VECTOR);
1242         set_exception_intercept(svm, MC_VECTOR);
1243         set_exception_intercept(svm, AC_VECTOR);
1244         set_exception_intercept(svm, DB_VECTOR);
1245         /*
1246          * Guest access to VMware backdoor ports could legitimately
1247          * trigger #GP because of TSS I/O permission bitmap.
1248          * We intercept those #GP and allow access to them anyway
1249          * as VMware does.
1250          */
1251         if (enable_vmware_backdoor)
1252                 set_exception_intercept(svm, GP_VECTOR);
1253
1254         svm_set_intercept(svm, INTERCEPT_INTR);
1255         svm_set_intercept(svm, INTERCEPT_NMI);
1256
1257         if (intercept_smi)
1258                 svm_set_intercept(svm, INTERCEPT_SMI);
1259
1260         svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1261         svm_set_intercept(svm, INTERCEPT_RDPMC);
1262         svm_set_intercept(svm, INTERCEPT_CPUID);
1263         svm_set_intercept(svm, INTERCEPT_INVD);
1264         svm_set_intercept(svm, INTERCEPT_INVLPG);
1265         svm_set_intercept(svm, INTERCEPT_INVLPGA);
1266         svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1267         svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1268         svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1269         svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1270         svm_set_intercept(svm, INTERCEPT_VMRUN);
1271         svm_set_intercept(svm, INTERCEPT_VMMCALL);
1272         svm_set_intercept(svm, INTERCEPT_VMLOAD);
1273         svm_set_intercept(svm, INTERCEPT_VMSAVE);
1274         svm_set_intercept(svm, INTERCEPT_STGI);
1275         svm_set_intercept(svm, INTERCEPT_CLGI);
1276         svm_set_intercept(svm, INTERCEPT_SKINIT);
1277         svm_set_intercept(svm, INTERCEPT_WBINVD);
1278         svm_set_intercept(svm, INTERCEPT_XSETBV);
1279         svm_set_intercept(svm, INTERCEPT_RDPRU);
1280         svm_set_intercept(svm, INTERCEPT_RSM);
1281
1282         if (!kvm_mwait_in_guest(vcpu->kvm)) {
1283                 svm_set_intercept(svm, INTERCEPT_MONITOR);
1284                 svm_set_intercept(svm, INTERCEPT_MWAIT);
1285         }
1286
1287         if (!kvm_hlt_in_guest(vcpu->kvm))
1288                 svm_set_intercept(svm, INTERCEPT_HLT);
1289
1290         control->iopm_base_pa = __sme_set(iopm_base);
1291         control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1292         control->int_ctl = V_INTR_MASKING_MASK;
1293
1294         init_seg(&save->es);
1295         init_seg(&save->ss);
1296         init_seg(&save->ds);
1297         init_seg(&save->fs);
1298         init_seg(&save->gs);
1299
1300         save->cs.selector = 0xf000;
1301         save->cs.base = 0xffff0000;
1302         /* Executable/Readable Code Segment */
1303         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1304                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1305         save->cs.limit = 0xffff;
1306
1307         save->gdtr.base = 0;
1308         save->gdtr.limit = 0xffff;
1309         save->idtr.base = 0;
1310         save->idtr.limit = 0xffff;
1311
1312         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1313         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1314
1315         if (npt_enabled) {
1316                 /* Setup VMCB for Nested Paging */
1317                 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1318                 svm_clr_intercept(svm, INTERCEPT_INVLPG);
1319                 clr_exception_intercept(svm, PF_VECTOR);
1320                 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1321                 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1322                 save->g_pat = vcpu->arch.pat;
1323                 save->cr3 = 0;
1324         }
1325         svm->current_vmcb->asid_generation = 0;
1326         svm->asid = 0;
1327
1328         svm->nested.vmcb12_gpa = INVALID_GPA;
1329         svm->nested.last_vmcb12_gpa = INVALID_GPA;
1330
1331         if (!kvm_pause_in_guest(vcpu->kvm)) {
1332                 control->pause_filter_count = pause_filter_count;
1333                 if (pause_filter_thresh)
1334                         control->pause_filter_thresh = pause_filter_thresh;
1335                 svm_set_intercept(svm, INTERCEPT_PAUSE);
1336         } else {
1337                 svm_clr_intercept(svm, INTERCEPT_PAUSE);
1338         }
1339
1340         svm_recalc_instruction_intercepts(vcpu, svm);
1341
1342         /*
1343          * If the host supports V_SPEC_CTRL then disable the interception
1344          * of MSR_IA32_SPEC_CTRL.
1345          */
1346         if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1347                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1348
1349         if (kvm_vcpu_apicv_active(vcpu))
1350                 avic_init_vmcb(svm, vmcb);
1351
1352         if (vnmi)
1353                 svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
1354
1355         if (vgif) {
1356                 svm_clr_intercept(svm, INTERCEPT_STGI);
1357                 svm_clr_intercept(svm, INTERCEPT_CLGI);
1358                 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1359         }
1360
1361         if (sev_guest(vcpu->kvm))
1362                 sev_init_vmcb(svm);
1363
1364         svm_hv_init_vmcb(vmcb);
1365         init_vmcb_after_set_cpuid(vcpu);
1366
1367         vmcb_mark_all_dirty(vmcb);
1368
1369         enable_gif(svm);
1370 }
1371
1372 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1373 {
1374         struct vcpu_svm *svm = to_svm(vcpu);
1375
1376         svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1377
1378         svm_init_osvw(vcpu);
1379         vcpu->arch.microcode_version = 0x01000065;
1380         svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1381
1382         svm->nmi_masked = false;
1383         svm->awaiting_iret_completion = false;
1384
1385         if (sev_es_guest(vcpu->kvm))
1386                 sev_es_vcpu_reset(svm);
1387 }
1388
1389 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1390 {
1391         struct vcpu_svm *svm = to_svm(vcpu);
1392
1393         svm->spec_ctrl = 0;
1394         svm->virt_spec_ctrl = 0;
1395
1396         init_vmcb(vcpu);
1397
1398         if (!init_event)
1399                 __svm_vcpu_reset(vcpu);
1400 }
1401
1402 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1403 {
1404         svm->current_vmcb = target_vmcb;
1405         svm->vmcb = target_vmcb->ptr;
1406 }
1407
1408 static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1409 {
1410         struct vcpu_svm *svm;
1411         struct page *vmcb01_page;
1412         struct page *vmsa_page = NULL;
1413         int err;
1414
1415         BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1416         svm = to_svm(vcpu);
1417
1418         err = -ENOMEM;
1419         vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1420         if (!vmcb01_page)
1421                 goto out;
1422
1423         if (sev_es_guest(vcpu->kvm)) {
1424                 /*
1425                  * SEV-ES guests require a separate VMSA page used to contain
1426                  * the encrypted register state of the guest.
1427                  */
1428                 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1429                 if (!vmsa_page)
1430                         goto error_free_vmcb_page;
1431
1432                 /*
1433                  * SEV-ES guests maintain an encrypted version of their FPU
1434                  * state which is restored and saved on VMRUN and VMEXIT.
1435                  * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1436                  * do xsave/xrstor on it.
1437                  */
1438                 fpstate_set_confidential(&vcpu->arch.guest_fpu);
1439         }
1440
1441         err = avic_init_vcpu(svm);
1442         if (err)
1443                 goto error_free_vmsa_page;
1444
1445         svm->msrpm = svm_vcpu_alloc_msrpm();
1446         if (!svm->msrpm) {
1447                 err = -ENOMEM;
1448                 goto error_free_vmsa_page;
1449         }
1450
1451         svm->x2avic_msrs_intercepted = true;
1452
1453         svm->vmcb01.ptr = page_address(vmcb01_page);
1454         svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1455         svm_switch_vmcb(svm, &svm->vmcb01);
1456
1457         if (vmsa_page)
1458                 svm->sev_es.vmsa = page_address(vmsa_page);
1459
1460         svm->guest_state_loaded = false;
1461
1462         return 0;
1463
1464 error_free_vmsa_page:
1465         if (vmsa_page)
1466                 __free_page(vmsa_page);
1467 error_free_vmcb_page:
1468         __free_page(vmcb01_page);
1469 out:
1470         return err;
1471 }
1472
1473 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1474 {
1475         int i;
1476
1477         for_each_online_cpu(i)
1478                 cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
1479 }
1480
1481 static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1482 {
1483         struct vcpu_svm *svm = to_svm(vcpu);
1484
1485         /*
1486          * The vmcb page can be recycled, causing a false negative in
1487          * svm_vcpu_load(). So, ensure that no logical CPU has this
1488          * vmcb page recorded as its current vmcb.
1489          */
1490         svm_clear_current_vmcb(svm->vmcb);
1491
1492         svm_leave_nested(vcpu);
1493         svm_free_nested(svm);
1494
1495         sev_free_vcpu(vcpu);
1496
1497         __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1498         __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1499 }
1500
1501 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1502 {
1503         struct vcpu_svm *svm = to_svm(vcpu);
1504         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
1505
1506         if (sev_es_guest(vcpu->kvm))
1507                 sev_es_unmap_ghcb(svm);
1508
1509         if (svm->guest_state_loaded)
1510                 return;
1511
1512         /*
1513          * Save additional host state that will be restored on VMEXIT (sev-es)
1514          * or subsequent vmload of host save area.
1515          */
1516         vmsave(sd->save_area_pa);
1517         if (sev_es_guest(vcpu->kvm)) {
1518                 struct sev_es_save_area *hostsa;
1519                 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1520
1521                 sev_es_prepare_switch_to_guest(hostsa);
1522         }
1523
1524         if (tsc_scaling)
1525                 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1526
1527         if (likely(tsc_aux_uret_slot >= 0))
1528                 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1529
1530         svm->guest_state_loaded = true;
1531 }
1532
1533 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1534 {
1535         to_svm(vcpu)->guest_state_loaded = false;
1536 }
1537
1538 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1539 {
1540         struct vcpu_svm *svm = to_svm(vcpu);
1541         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
1542
1543         if (sd->current_vmcb != svm->vmcb) {
1544                 sd->current_vmcb = svm->vmcb;
1545
1546                 if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
1547                         indirect_branch_prediction_barrier();
1548         }
1549         if (kvm_vcpu_apicv_active(vcpu))
1550                 avic_vcpu_load(vcpu, cpu);
1551 }
1552
1553 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1554 {
1555         if (kvm_vcpu_apicv_active(vcpu))
1556                 avic_vcpu_put(vcpu);
1557
1558         svm_prepare_host_switch(vcpu);
1559
1560         ++vcpu->stat.host_state_reload;
1561 }
1562
1563 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1564 {
1565         struct vcpu_svm *svm = to_svm(vcpu);
1566         unsigned long rflags = svm->vmcb->save.rflags;
1567
1568         if (svm->nmi_singlestep) {
1569                 /* Hide our flags if they were not set by the guest */
1570                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1571                         rflags &= ~X86_EFLAGS_TF;
1572                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1573                         rflags &= ~X86_EFLAGS_RF;
1574         }
1575         return rflags;
1576 }
1577
1578 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1579 {
1580         if (to_svm(vcpu)->nmi_singlestep)
1581                 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1582
1583        /*
1584         * Any change of EFLAGS.VM is accompanied by a reload of SS
1585         * (caused by either a task switch or an inter-privilege IRET),
1586         * so we do not need to update the CPL here.
1587         */
1588         to_svm(vcpu)->vmcb->save.rflags = rflags;
1589 }
1590
1591 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1592 {
1593         struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1594
1595         return sev_es_guest(vcpu->kvm)
1596                 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1597                 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1598 }
1599
1600 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1601 {
1602         kvm_register_mark_available(vcpu, reg);
1603
1604         switch (reg) {
1605         case VCPU_EXREG_PDPTR:
1606                 /*
1607                  * When !npt_enabled, mmu->pdptrs[] is already available since
1608                  * it is always updated per SDM when moving to CRs.
1609                  */
1610                 if (npt_enabled)
1611                         load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1612                 break;
1613         default:
1614                 KVM_BUG_ON(1, vcpu->kvm);
1615         }
1616 }
1617
1618 static void svm_set_vintr(struct vcpu_svm *svm)
1619 {
1620         struct vmcb_control_area *control;
1621
1622         /*
1623          * The following fields are ignored when AVIC is enabled
1624          */
1625         WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1626
1627         svm_set_intercept(svm, INTERCEPT_VINTR);
1628
1629         /*
1630          * Recalculating intercepts may have cleared the VINTR intercept.  If
1631          * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
1632          * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
1633          * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
1634          * interrupts will never be unblocked while L2 is running.
1635          */
1636         if (!svm_is_intercept(svm, INTERCEPT_VINTR))
1637                 return;
1638
1639         /*
1640          * This is just a dummy VINTR to actually cause a vmexit to happen.
1641          * Actual injection of virtual interrupts happens through EVENTINJ.
1642          */
1643         control = &svm->vmcb->control;
1644         control->int_vector = 0x0;
1645         control->int_ctl &= ~V_INTR_PRIO_MASK;
1646         control->int_ctl |= V_IRQ_MASK |
1647                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1648         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1649 }
1650
1651 static void svm_clear_vintr(struct vcpu_svm *svm)
1652 {
1653         svm_clr_intercept(svm, INTERCEPT_VINTR);
1654
1655         /* Drop int_ctl fields related to VINTR injection.  */
1656         svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1657         if (is_guest_mode(&svm->vcpu)) {
1658                 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1659
1660                 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1661                         (svm->nested.ctl.int_ctl & V_TPR_MASK));
1662
1663                 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1664                         V_IRQ_INJECTION_BITS_MASK;
1665
1666                 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1667         }
1668
1669         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1670 }
1671
1672 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1673 {
1674         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1675         struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1676
1677         switch (seg) {
1678         case VCPU_SREG_CS: return &save->cs;
1679         case VCPU_SREG_DS: return &save->ds;
1680         case VCPU_SREG_ES: return &save->es;
1681         case VCPU_SREG_FS: return &save01->fs;
1682         case VCPU_SREG_GS: return &save01->gs;
1683         case VCPU_SREG_SS: return &save->ss;
1684         case VCPU_SREG_TR: return &save01->tr;
1685         case VCPU_SREG_LDTR: return &save01->ldtr;
1686         }
1687         BUG();
1688         return NULL;
1689 }
1690
1691 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1692 {
1693         struct vmcb_seg *s = svm_seg(vcpu, seg);
1694
1695         return s->base;
1696 }
1697
1698 static void svm_get_segment(struct kvm_vcpu *vcpu,
1699                             struct kvm_segment *var, int seg)
1700 {
1701         struct vmcb_seg *s = svm_seg(vcpu, seg);
1702
1703         var->base = s->base;
1704         var->limit = s->limit;
1705         var->selector = s->selector;
1706         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1707         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1708         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1709         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1710         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1711         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1712         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1713
1714         /*
1715          * AMD CPUs circa 2014 track the G bit for all segments except CS.
1716          * However, the SVM spec states that the G bit is not observed by the
1717          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1718          * So let's synthesize a legal G bit for all segments, this helps
1719          * running KVM nested. It also helps cross-vendor migration, because
1720          * Intel's vmentry has a check on the 'G' bit.
1721          */
1722         var->g = s->limit > 0xfffff;
1723
1724         /*
1725          * AMD's VMCB does not have an explicit unusable field, so emulate it
1726          * for cross vendor migration purposes by "not present"
1727          */
1728         var->unusable = !var->present;
1729
1730         switch (seg) {
1731         case VCPU_SREG_TR:
1732                 /*
1733                  * Work around a bug where the busy flag in the tr selector
1734                  * isn't exposed
1735                  */
1736                 var->type |= 0x2;
1737                 break;
1738         case VCPU_SREG_DS:
1739         case VCPU_SREG_ES:
1740         case VCPU_SREG_FS:
1741         case VCPU_SREG_GS:
1742                 /*
1743                  * The accessed bit must always be set in the segment
1744                  * descriptor cache, although it can be cleared in the
1745                  * descriptor, the cached bit always remains at 1. Since
1746                  * Intel has a check on this, set it here to support
1747                  * cross-vendor migration.
1748                  */
1749                 if (!var->unusable)
1750                         var->type |= 0x1;
1751                 break;
1752         case VCPU_SREG_SS:
1753                 /*
1754                  * On AMD CPUs sometimes the DB bit in the segment
1755                  * descriptor is left as 1, although the whole segment has
1756                  * been made unusable. Clear it here to pass an Intel VMX
1757                  * entry check when cross vendor migrating.
1758                  */
1759                 if (var->unusable)
1760                         var->db = 0;
1761                 /* This is symmetric with svm_set_segment() */
1762                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1763                 break;
1764         }
1765 }
1766
1767 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1768 {
1769         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1770
1771         return save->cpl;
1772 }
1773
1774 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1775 {
1776         struct kvm_segment cs;
1777
1778         svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1779         *db = cs.db;
1780         *l = cs.l;
1781 }
1782
1783 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1784 {
1785         struct vcpu_svm *svm = to_svm(vcpu);
1786
1787         dt->size = svm->vmcb->save.idtr.limit;
1788         dt->address = svm->vmcb->save.idtr.base;
1789 }
1790
1791 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1792 {
1793         struct vcpu_svm *svm = to_svm(vcpu);
1794
1795         svm->vmcb->save.idtr.limit = dt->size;
1796         svm->vmcb->save.idtr.base = dt->address ;
1797         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1798 }
1799
1800 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1801 {
1802         struct vcpu_svm *svm = to_svm(vcpu);
1803
1804         dt->size = svm->vmcb->save.gdtr.limit;
1805         dt->address = svm->vmcb->save.gdtr.base;
1806 }
1807
1808 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1809 {
1810         struct vcpu_svm *svm = to_svm(vcpu);
1811
1812         svm->vmcb->save.gdtr.limit = dt->size;
1813         svm->vmcb->save.gdtr.base = dt->address ;
1814         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1815 }
1816
1817 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1818 {
1819         struct vcpu_svm *svm = to_svm(vcpu);
1820
1821         /*
1822          * For guests that don't set guest_state_protected, the cr3 update is
1823          * handled via kvm_mmu_load() while entering the guest. For guests
1824          * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1825          * VMCB save area now, since the save area will become the initial
1826          * contents of the VMSA, and future VMCB save area updates won't be
1827          * seen.
1828          */
1829         if (sev_es_guest(vcpu->kvm)) {
1830                 svm->vmcb->save.cr3 = cr3;
1831                 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1832         }
1833 }
1834
1835 static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1836 {
1837         return true;
1838 }
1839
1840 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1841 {
1842         struct vcpu_svm *svm = to_svm(vcpu);
1843         u64 hcr0 = cr0;
1844         bool old_paging = is_paging(vcpu);
1845
1846 #ifdef CONFIG_X86_64
1847         if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1848                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1849                         vcpu->arch.efer |= EFER_LMA;
1850                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1851                 }
1852
1853                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1854                         vcpu->arch.efer &= ~EFER_LMA;
1855                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1856                 }
1857         }
1858 #endif
1859         vcpu->arch.cr0 = cr0;
1860
1861         if (!npt_enabled) {
1862                 hcr0 |= X86_CR0_PG | X86_CR0_WP;
1863                 if (old_paging != is_paging(vcpu))
1864                         svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1865         }
1866
1867         /*
1868          * re-enable caching here because the QEMU bios
1869          * does not do it - this results in some delay at
1870          * reboot
1871          */
1872         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1873                 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1874
1875         svm->vmcb->save.cr0 = hcr0;
1876         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1877
1878         /*
1879          * SEV-ES guests must always keep the CR intercepts cleared. CR
1880          * tracking is done using the CR write traps.
1881          */
1882         if (sev_es_guest(vcpu->kvm))
1883                 return;
1884
1885         if (hcr0 == cr0) {
1886                 /* Selective CR0 write remains on.  */
1887                 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1888                 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1889         } else {
1890                 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1891                 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1892         }
1893 }
1894
1895 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1896 {
1897         return true;
1898 }
1899
1900 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1901 {
1902         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1903         unsigned long old_cr4 = vcpu->arch.cr4;
1904
1905         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1906                 svm_flush_tlb_current(vcpu);
1907
1908         vcpu->arch.cr4 = cr4;
1909         if (!npt_enabled) {
1910                 cr4 |= X86_CR4_PAE;
1911
1912                 if (!is_paging(vcpu))
1913                         cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1914         }
1915         cr4 |= host_cr4_mce;
1916         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1917         vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1918
1919         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1920                 kvm_update_cpuid_runtime(vcpu);
1921 }
1922
1923 static void svm_set_segment(struct kvm_vcpu *vcpu,
1924                             struct kvm_segment *var, int seg)
1925 {
1926         struct vcpu_svm *svm = to_svm(vcpu);
1927         struct vmcb_seg *s = svm_seg(vcpu, seg);
1928
1929         s->base = var->base;
1930         s->limit = var->limit;
1931         s->selector = var->selector;
1932         s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1933         s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1934         s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1935         s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1936         s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1937         s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1938         s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1939         s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1940
1941         /*
1942          * This is always accurate, except if SYSRET returned to a segment
1943          * with SS.DPL != 3.  Intel does not have this quirk, and always
1944          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1945          * would entail passing the CPL to userspace and back.
1946          */
1947         if (seg == VCPU_SREG_SS)
1948                 /* This is symmetric with svm_get_segment() */
1949                 svm->vmcb->save.cpl = (var->dpl & 3);
1950
1951         vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1952 }
1953
1954 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1955 {
1956         struct vcpu_svm *svm = to_svm(vcpu);
1957
1958         clr_exception_intercept(svm, BP_VECTOR);
1959
1960         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1961                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1962                         set_exception_intercept(svm, BP_VECTOR);
1963         }
1964 }
1965
1966 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1967 {
1968         if (sd->next_asid > sd->max_asid) {
1969                 ++sd->asid_generation;
1970                 sd->next_asid = sd->min_asid;
1971                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1972                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1973         }
1974
1975         svm->current_vmcb->asid_generation = sd->asid_generation;
1976         svm->asid = sd->next_asid++;
1977 }
1978
1979 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1980 {
1981         struct vmcb *vmcb = svm->vmcb;
1982
1983         if (svm->vcpu.arch.guest_state_protected)
1984                 return;
1985
1986         if (unlikely(value != vmcb->save.dr6)) {
1987                 vmcb->save.dr6 = value;
1988                 vmcb_mark_dirty(vmcb, VMCB_DR);
1989         }
1990 }
1991
1992 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1993 {
1994         struct vcpu_svm *svm = to_svm(vcpu);
1995
1996         if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
1997                 return;
1998
1999         get_debugreg(vcpu->arch.db[0], 0);
2000         get_debugreg(vcpu->arch.db[1], 1);
2001         get_debugreg(vcpu->arch.db[2], 2);
2002         get_debugreg(vcpu->arch.db[3], 3);
2003         /*
2004          * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
2005          * because db_interception might need it.  We can do it before vmentry.
2006          */
2007         vcpu->arch.dr6 = svm->vmcb->save.dr6;
2008         vcpu->arch.dr7 = svm->vmcb->save.dr7;
2009         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2010         set_dr_intercepts(svm);
2011 }
2012
2013 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
2014 {
2015         struct vcpu_svm *svm = to_svm(vcpu);
2016
2017         if (vcpu->arch.guest_state_protected)
2018                 return;
2019
2020         svm->vmcb->save.dr7 = value;
2021         vmcb_mark_dirty(svm->vmcb, VMCB_DR);
2022 }
2023
2024 static int pf_interception(struct kvm_vcpu *vcpu)
2025 {
2026         struct vcpu_svm *svm = to_svm(vcpu);
2027
2028         u64 fault_address = svm->vmcb->control.exit_info_2;
2029         u64 error_code = svm->vmcb->control.exit_info_1;
2030
2031         return kvm_handle_page_fault(vcpu, error_code, fault_address,
2032                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2033                         svm->vmcb->control.insn_bytes : NULL,
2034                         svm->vmcb->control.insn_len);
2035 }
2036
2037 static int npf_interception(struct kvm_vcpu *vcpu)
2038 {
2039         struct vcpu_svm *svm = to_svm(vcpu);
2040
2041         u64 fault_address = svm->vmcb->control.exit_info_2;
2042         u64 error_code = svm->vmcb->control.exit_info_1;
2043
2044         trace_kvm_page_fault(vcpu, fault_address, error_code);
2045         return kvm_mmu_page_fault(vcpu, fault_address, error_code,
2046                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2047                         svm->vmcb->control.insn_bytes : NULL,
2048                         svm->vmcb->control.insn_len);
2049 }
2050
2051 static int db_interception(struct kvm_vcpu *vcpu)
2052 {
2053         struct kvm_run *kvm_run = vcpu->run;
2054         struct vcpu_svm *svm = to_svm(vcpu);
2055
2056         if (!(vcpu->guest_debug &
2057               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2058                 !svm->nmi_singlestep) {
2059                 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
2060                 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
2061                 return 1;
2062         }
2063
2064         if (svm->nmi_singlestep) {
2065                 disable_nmi_singlestep(svm);
2066                 /* Make sure we check for pending NMIs upon entry */
2067                 kvm_make_request(KVM_REQ_EVENT, vcpu);
2068         }
2069
2070         if (vcpu->guest_debug &
2071             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2072                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2073                 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2074                 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2075                 kvm_run->debug.arch.pc =
2076                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2077                 kvm_run->debug.arch.exception = DB_VECTOR;
2078                 return 0;
2079         }
2080
2081         return 1;
2082 }
2083
2084 static int bp_interception(struct kvm_vcpu *vcpu)
2085 {
2086         struct vcpu_svm *svm = to_svm(vcpu);
2087         struct kvm_run *kvm_run = vcpu->run;
2088
2089         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2090         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2091         kvm_run->debug.arch.exception = BP_VECTOR;
2092         return 0;
2093 }
2094
2095 static int ud_interception(struct kvm_vcpu *vcpu)
2096 {
2097         return handle_ud(vcpu);
2098 }
2099
2100 static int ac_interception(struct kvm_vcpu *vcpu)
2101 {
2102         kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2103         return 1;
2104 }
2105
2106 static bool is_erratum_383(void)
2107 {
2108         int err, i;
2109         u64 value;
2110
2111         if (!erratum_383_found)
2112                 return false;
2113
2114         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2115         if (err)
2116                 return false;
2117
2118         /* Bit 62 may or may not be set for this mce */
2119         value &= ~(1ULL << 62);
2120
2121         if (value != 0xb600000000010015ULL)
2122                 return false;
2123
2124         /* Clear MCi_STATUS registers */
2125         for (i = 0; i < 6; ++i)
2126                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2127
2128         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2129         if (!err) {
2130                 u32 low, high;
2131
2132                 value &= ~(1ULL << 2);
2133                 low    = lower_32_bits(value);
2134                 high   = upper_32_bits(value);
2135
2136                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2137         }
2138
2139         /* Flush tlb to evict multi-match entries */
2140         __flush_tlb_all();
2141
2142         return true;
2143 }
2144
2145 static void svm_handle_mce(struct kvm_vcpu *vcpu)
2146 {
2147         if (is_erratum_383()) {
2148                 /*
2149                  * Erratum 383 triggered. Guest state is corrupt so kill the
2150                  * guest.
2151                  */
2152                 pr_err("Guest triggered AMD Erratum 383\n");
2153
2154                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2155
2156                 return;
2157         }
2158
2159         /*
2160          * On an #MC intercept the MCE handler is not called automatically in
2161          * the host. So do it by hand here.
2162          */
2163         kvm_machine_check();
2164 }
2165
2166 static int mc_interception(struct kvm_vcpu *vcpu)
2167 {
2168         return 1;
2169 }
2170
2171 static int shutdown_interception(struct kvm_vcpu *vcpu)
2172 {
2173         struct kvm_run *kvm_run = vcpu->run;
2174         struct vcpu_svm *svm = to_svm(vcpu);
2175
2176         /*
2177          * The VM save area has already been encrypted so it
2178          * cannot be reinitialized - just terminate.
2179          */
2180         if (sev_es_guest(vcpu->kvm))
2181                 return -EINVAL;
2182
2183         /*
2184          * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2185          * the VMCB in a known good state.  Unfortuately, KVM doesn't have
2186          * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2187          * userspace.  At a platform view, INIT is acceptable behavior as
2188          * there exist bare metal platforms that automatically INIT the CPU
2189          * in response to shutdown.
2190          */
2191         clear_page(svm->vmcb);
2192         kvm_vcpu_reset(vcpu, true);
2193
2194         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2195         return 0;
2196 }
2197
2198 static int io_interception(struct kvm_vcpu *vcpu)
2199 {
2200         struct vcpu_svm *svm = to_svm(vcpu);
2201         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2202         int size, in, string;
2203         unsigned port;
2204
2205         ++vcpu->stat.io_exits;
2206         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2207         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2208         port = io_info >> 16;
2209         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2210
2211         if (string) {
2212                 if (sev_es_guest(vcpu->kvm))
2213                         return sev_es_string_io(svm, size, port, in);
2214                 else
2215                         return kvm_emulate_instruction(vcpu, 0);
2216         }
2217
2218         svm->next_rip = svm->vmcb->control.exit_info_2;
2219
2220         return kvm_fast_pio(vcpu, size, port, in);
2221 }
2222
2223 static int nmi_interception(struct kvm_vcpu *vcpu)
2224 {
2225         return 1;
2226 }
2227
2228 static int smi_interception(struct kvm_vcpu *vcpu)
2229 {
2230         return 1;
2231 }
2232
2233 static int intr_interception(struct kvm_vcpu *vcpu)
2234 {
2235         ++vcpu->stat.irq_exits;
2236         return 1;
2237 }
2238
2239 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2240 {
2241         struct vcpu_svm *svm = to_svm(vcpu);
2242         struct vmcb *vmcb12;
2243         struct kvm_host_map map;
2244         int ret;
2245
2246         if (nested_svm_check_permissions(vcpu))
2247                 return 1;
2248
2249         ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2250         if (ret) {
2251                 if (ret == -EINVAL)
2252                         kvm_inject_gp(vcpu, 0);
2253                 return 1;
2254         }
2255
2256         vmcb12 = map.hva;
2257
2258         ret = kvm_skip_emulated_instruction(vcpu);
2259
2260         if (vmload) {
2261                 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2262                 svm->sysenter_eip_hi = 0;
2263                 svm->sysenter_esp_hi = 0;
2264         } else {
2265                 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2266         }
2267
2268         kvm_vcpu_unmap(vcpu, &map, true);
2269
2270         return ret;
2271 }
2272
2273 static int vmload_interception(struct kvm_vcpu *vcpu)
2274 {
2275         return vmload_vmsave_interception(vcpu, true);
2276 }
2277
2278 static int vmsave_interception(struct kvm_vcpu *vcpu)
2279 {
2280         return vmload_vmsave_interception(vcpu, false);
2281 }
2282
2283 static int vmrun_interception(struct kvm_vcpu *vcpu)
2284 {
2285         if (nested_svm_check_permissions(vcpu))
2286                 return 1;
2287
2288         return nested_svm_vmrun(vcpu);
2289 }
2290
2291 enum {
2292         NONE_SVM_INSTR,
2293         SVM_INSTR_VMRUN,
2294         SVM_INSTR_VMLOAD,
2295         SVM_INSTR_VMSAVE,
2296 };
2297
2298 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2299 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2300 {
2301         struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2302
2303         if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2304                 return NONE_SVM_INSTR;
2305
2306         switch (ctxt->modrm) {
2307         case 0xd8: /* VMRUN */
2308                 return SVM_INSTR_VMRUN;
2309         case 0xda: /* VMLOAD */
2310                 return SVM_INSTR_VMLOAD;
2311         case 0xdb: /* VMSAVE */
2312                 return SVM_INSTR_VMSAVE;
2313         default:
2314                 break;
2315         }
2316
2317         return NONE_SVM_INSTR;
2318 }
2319
2320 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2321 {
2322         const int guest_mode_exit_codes[] = {
2323                 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2324                 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2325                 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2326         };
2327         int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2328                 [SVM_INSTR_VMRUN] = vmrun_interception,
2329                 [SVM_INSTR_VMLOAD] = vmload_interception,
2330                 [SVM_INSTR_VMSAVE] = vmsave_interception,
2331         };
2332         struct vcpu_svm *svm = to_svm(vcpu);
2333         int ret;
2334
2335         if (is_guest_mode(vcpu)) {
2336                 /* Returns '1' or -errno on failure, '0' on success. */
2337                 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2338                 if (ret)
2339                         return ret;
2340                 return 1;
2341         }
2342         return svm_instr_handlers[opcode](vcpu);
2343 }
2344
2345 /*
2346  * #GP handling code. Note that #GP can be triggered under the following two
2347  * cases:
2348  *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2349  *      some AMD CPUs when EAX of these instructions are in the reserved memory
2350  *      regions (e.g. SMM memory on host).
2351  *   2) VMware backdoor
2352  */
2353 static int gp_interception(struct kvm_vcpu *vcpu)
2354 {
2355         struct vcpu_svm *svm = to_svm(vcpu);
2356         u32 error_code = svm->vmcb->control.exit_info_1;
2357         int opcode;
2358
2359         /* Both #GP cases have zero error_code */
2360         if (error_code)
2361                 goto reinject;
2362
2363         /* Decode the instruction for usage later */
2364         if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2365                 goto reinject;
2366
2367         opcode = svm_instr_opcode(vcpu);
2368
2369         if (opcode == NONE_SVM_INSTR) {
2370                 if (!enable_vmware_backdoor)
2371                         goto reinject;
2372
2373                 /*
2374                  * VMware backdoor emulation on #GP interception only handles
2375                  * IN{S}, OUT{S}, and RDPMC.
2376                  */
2377                 if (!is_guest_mode(vcpu))
2378                         return kvm_emulate_instruction(vcpu,
2379                                 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2380         } else {
2381                 /* All SVM instructions expect page aligned RAX */
2382                 if (svm->vmcb->save.rax & ~PAGE_MASK)
2383                         goto reinject;
2384
2385                 return emulate_svm_instr(vcpu, opcode);
2386         }
2387
2388 reinject:
2389         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2390         return 1;
2391 }
2392
2393 void svm_set_gif(struct vcpu_svm *svm, bool value)
2394 {
2395         if (value) {
2396                 /*
2397                  * If VGIF is enabled, the STGI intercept is only added to
2398                  * detect the opening of the SMI/NMI window; remove it now.
2399                  * Likewise, clear the VINTR intercept, we will set it
2400                  * again while processing KVM_REQ_EVENT if needed.
2401                  */
2402                 if (vgif)
2403                         svm_clr_intercept(svm, INTERCEPT_STGI);
2404                 if (svm_is_intercept(svm, INTERCEPT_VINTR))
2405                         svm_clear_vintr(svm);
2406
2407                 enable_gif(svm);
2408                 if (svm->vcpu.arch.smi_pending ||
2409                     svm->vcpu.arch.nmi_pending ||
2410                     kvm_cpu_has_injectable_intr(&svm->vcpu) ||
2411                     kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
2412                         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2413         } else {
2414                 disable_gif(svm);
2415
2416                 /*
2417                  * After a CLGI no interrupts should come.  But if vGIF is
2418                  * in use, we still rely on the VINTR intercept (rather than
2419                  * STGI) to detect an open interrupt window.
2420                 */
2421                 if (!vgif)
2422                         svm_clear_vintr(svm);
2423         }
2424 }
2425
2426 static int stgi_interception(struct kvm_vcpu *vcpu)
2427 {
2428         int ret;
2429
2430         if (nested_svm_check_permissions(vcpu))
2431                 return 1;
2432
2433         ret = kvm_skip_emulated_instruction(vcpu);
2434         svm_set_gif(to_svm(vcpu), true);
2435         return ret;
2436 }
2437
2438 static int clgi_interception(struct kvm_vcpu *vcpu)
2439 {
2440         int ret;
2441
2442         if (nested_svm_check_permissions(vcpu))
2443                 return 1;
2444
2445         ret = kvm_skip_emulated_instruction(vcpu);
2446         svm_set_gif(to_svm(vcpu), false);
2447         return ret;
2448 }
2449
2450 static int invlpga_interception(struct kvm_vcpu *vcpu)
2451 {
2452         gva_t gva = kvm_rax_read(vcpu);
2453         u32 asid = kvm_rcx_read(vcpu);
2454
2455         /* FIXME: Handle an address size prefix. */
2456         if (!is_long_mode(vcpu))
2457                 gva = (u32)gva;
2458
2459         trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2460
2461         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2462         kvm_mmu_invlpg(vcpu, gva);
2463
2464         return kvm_skip_emulated_instruction(vcpu);
2465 }
2466
2467 static int skinit_interception(struct kvm_vcpu *vcpu)
2468 {
2469         trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2470
2471         kvm_queue_exception(vcpu, UD_VECTOR);
2472         return 1;
2473 }
2474
2475 static int task_switch_interception(struct kvm_vcpu *vcpu)
2476 {
2477         struct vcpu_svm *svm = to_svm(vcpu);
2478         u16 tss_selector;
2479         int reason;
2480         int int_type = svm->vmcb->control.exit_int_info &
2481                 SVM_EXITINTINFO_TYPE_MASK;
2482         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2483         uint32_t type =
2484                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2485         uint32_t idt_v =
2486                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2487         bool has_error_code = false;
2488         u32 error_code = 0;
2489
2490         tss_selector = (u16)svm->vmcb->control.exit_info_1;
2491
2492         if (svm->vmcb->control.exit_info_2 &
2493             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2494                 reason = TASK_SWITCH_IRET;
2495         else if (svm->vmcb->control.exit_info_2 &
2496                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2497                 reason = TASK_SWITCH_JMP;
2498         else if (idt_v)
2499                 reason = TASK_SWITCH_GATE;
2500         else
2501                 reason = TASK_SWITCH_CALL;
2502
2503         if (reason == TASK_SWITCH_GATE) {
2504                 switch (type) {
2505                 case SVM_EXITINTINFO_TYPE_NMI:
2506                         vcpu->arch.nmi_injected = false;
2507                         break;
2508                 case SVM_EXITINTINFO_TYPE_EXEPT:
2509                         if (svm->vmcb->control.exit_info_2 &
2510                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2511                                 has_error_code = true;
2512                                 error_code =
2513                                         (u32)svm->vmcb->control.exit_info_2;
2514                         }
2515                         kvm_clear_exception_queue(vcpu);
2516                         break;
2517                 case SVM_EXITINTINFO_TYPE_INTR:
2518                 case SVM_EXITINTINFO_TYPE_SOFT:
2519                         kvm_clear_interrupt_queue(vcpu);
2520                         break;
2521                 default:
2522                         break;
2523                 }
2524         }
2525
2526         if (reason != TASK_SWITCH_GATE ||
2527             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2528             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2529              (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2530                 if (!svm_skip_emulated_instruction(vcpu))
2531                         return 0;
2532         }
2533
2534         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2535                 int_vec = -1;
2536
2537         return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2538                                has_error_code, error_code);
2539 }
2540
2541 static void svm_clr_iret_intercept(struct vcpu_svm *svm)
2542 {
2543         if (!sev_es_guest(svm->vcpu.kvm))
2544                 svm_clr_intercept(svm, INTERCEPT_IRET);
2545 }
2546
2547 static void svm_set_iret_intercept(struct vcpu_svm *svm)
2548 {
2549         if (!sev_es_guest(svm->vcpu.kvm))
2550                 svm_set_intercept(svm, INTERCEPT_IRET);
2551 }
2552
2553 static int iret_interception(struct kvm_vcpu *vcpu)
2554 {
2555         struct vcpu_svm *svm = to_svm(vcpu);
2556
2557         WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
2558
2559         ++vcpu->stat.nmi_window_exits;
2560         svm->awaiting_iret_completion = true;
2561
2562         svm_clr_iret_intercept(svm);
2563         svm->nmi_iret_rip = kvm_rip_read(vcpu);
2564
2565         kvm_make_request(KVM_REQ_EVENT, vcpu);
2566         return 1;
2567 }
2568
2569 static int invlpg_interception(struct kvm_vcpu *vcpu)
2570 {
2571         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2572                 return kvm_emulate_instruction(vcpu, 0);
2573
2574         kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2575         return kvm_skip_emulated_instruction(vcpu);
2576 }
2577
2578 static int emulate_on_interception(struct kvm_vcpu *vcpu)
2579 {
2580         return kvm_emulate_instruction(vcpu, 0);
2581 }
2582
2583 static int rsm_interception(struct kvm_vcpu *vcpu)
2584 {
2585         return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2586 }
2587
2588 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2589                                             unsigned long val)
2590 {
2591         struct vcpu_svm *svm = to_svm(vcpu);
2592         unsigned long cr0 = vcpu->arch.cr0;
2593         bool ret = false;
2594
2595         if (!is_guest_mode(vcpu) ||
2596             (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2597                 return false;
2598
2599         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2600         val &= ~SVM_CR0_SELECTIVE_MASK;
2601
2602         if (cr0 ^ val) {
2603                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2604                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2605         }
2606
2607         return ret;
2608 }
2609
2610 #define CR_VALID (1ULL << 63)
2611
2612 static int cr_interception(struct kvm_vcpu *vcpu)
2613 {
2614         struct vcpu_svm *svm = to_svm(vcpu);
2615         int reg, cr;
2616         unsigned long val;
2617         int err;
2618
2619         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2620                 return emulate_on_interception(vcpu);
2621
2622         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2623                 return emulate_on_interception(vcpu);
2624
2625         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2626         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2627                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2628         else
2629                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2630
2631         err = 0;
2632         if (cr >= 16) { /* mov to cr */
2633                 cr -= 16;
2634                 val = kvm_register_read(vcpu, reg);
2635                 trace_kvm_cr_write(cr, val);
2636                 switch (cr) {
2637                 case 0:
2638                         if (!check_selective_cr0_intercepted(vcpu, val))
2639                                 err = kvm_set_cr0(vcpu, val);
2640                         else
2641                                 return 1;
2642
2643                         break;
2644                 case 3:
2645                         err = kvm_set_cr3(vcpu, val);
2646                         break;
2647                 case 4:
2648                         err = kvm_set_cr4(vcpu, val);
2649                         break;
2650                 case 8:
2651                         err = kvm_set_cr8(vcpu, val);
2652                         break;
2653                 default:
2654                         WARN(1, "unhandled write to CR%d", cr);
2655                         kvm_queue_exception(vcpu, UD_VECTOR);
2656                         return 1;
2657                 }
2658         } else { /* mov from cr */
2659                 switch (cr) {
2660                 case 0:
2661                         val = kvm_read_cr0(vcpu);
2662                         break;
2663                 case 2:
2664                         val = vcpu->arch.cr2;
2665                         break;
2666                 case 3:
2667                         val = kvm_read_cr3(vcpu);
2668                         break;
2669                 case 4:
2670                         val = kvm_read_cr4(vcpu);
2671                         break;
2672                 case 8:
2673                         val = kvm_get_cr8(vcpu);
2674                         break;
2675                 default:
2676                         WARN(1, "unhandled read from CR%d", cr);
2677                         kvm_queue_exception(vcpu, UD_VECTOR);
2678                         return 1;
2679                 }
2680                 kvm_register_write(vcpu, reg, val);
2681                 trace_kvm_cr_read(cr, val);
2682         }
2683         return kvm_complete_insn_gp(vcpu, err);
2684 }
2685
2686 static int cr_trap(struct kvm_vcpu *vcpu)
2687 {
2688         struct vcpu_svm *svm = to_svm(vcpu);
2689         unsigned long old_value, new_value;
2690         unsigned int cr;
2691         int ret = 0;
2692
2693         new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2694
2695         cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2696         switch (cr) {
2697         case 0:
2698                 old_value = kvm_read_cr0(vcpu);
2699                 svm_set_cr0(vcpu, new_value);
2700
2701                 kvm_post_set_cr0(vcpu, old_value, new_value);
2702                 break;
2703         case 4:
2704                 old_value = kvm_read_cr4(vcpu);
2705                 svm_set_cr4(vcpu, new_value);
2706
2707                 kvm_post_set_cr4(vcpu, old_value, new_value);
2708                 break;
2709         case 8:
2710                 ret = kvm_set_cr8(vcpu, new_value);
2711                 break;
2712         default:
2713                 WARN(1, "unhandled CR%d write trap", cr);
2714                 kvm_queue_exception(vcpu, UD_VECTOR);
2715                 return 1;
2716         }
2717
2718         return kvm_complete_insn_gp(vcpu, ret);
2719 }
2720
2721 static int dr_interception(struct kvm_vcpu *vcpu)
2722 {
2723         struct vcpu_svm *svm = to_svm(vcpu);
2724         int reg, dr;
2725         unsigned long val;
2726         int err = 0;
2727
2728         /*
2729          * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
2730          * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
2731          */
2732         if (sev_es_guest(vcpu->kvm))
2733                 return 1;
2734
2735         if (vcpu->guest_debug == 0) {
2736                 /*
2737                  * No more DR vmexits; force a reload of the debug registers
2738                  * and reenter on this instruction.  The next vmexit will
2739                  * retrieve the full state of the debug registers.
2740                  */
2741                 clr_dr_intercepts(svm);
2742                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2743                 return 1;
2744         }
2745
2746         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2747                 return emulate_on_interception(vcpu);
2748
2749         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2750         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2751         if (dr >= 16) { /* mov to DRn  */
2752                 dr -= 16;
2753                 val = kvm_register_read(vcpu, reg);
2754                 err = kvm_set_dr(vcpu, dr, val);
2755         } else {
2756                 kvm_get_dr(vcpu, dr, &val);
2757                 kvm_register_write(vcpu, reg, val);
2758         }
2759
2760         return kvm_complete_insn_gp(vcpu, err);
2761 }
2762
2763 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2764 {
2765         int r;
2766
2767         u8 cr8_prev = kvm_get_cr8(vcpu);
2768         /* instruction emulation calls kvm_set_cr8() */
2769         r = cr_interception(vcpu);
2770         if (lapic_in_kernel(vcpu))
2771                 return r;
2772         if (cr8_prev <= kvm_get_cr8(vcpu))
2773                 return r;
2774         vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2775         return 0;
2776 }
2777
2778 static int efer_trap(struct kvm_vcpu *vcpu)
2779 {
2780         struct msr_data msr_info;
2781         int ret;
2782
2783         /*
2784          * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2785          * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2786          * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2787          * the guest doesn't have X86_FEATURE_SVM.
2788          */
2789         msr_info.host_initiated = false;
2790         msr_info.index = MSR_EFER;
2791         msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2792         ret = kvm_set_msr_common(vcpu, &msr_info);
2793
2794         return kvm_complete_insn_gp(vcpu, ret);
2795 }
2796
2797 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2798 {
2799         msr->data = 0;
2800
2801         switch (msr->index) {
2802         case MSR_AMD64_DE_CFG:
2803                 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
2804                         msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
2805                 break;
2806         default:
2807                 return KVM_MSR_RET_INVALID;
2808         }
2809
2810         return 0;
2811 }
2812
2813 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2814 {
2815         struct vcpu_svm *svm = to_svm(vcpu);
2816
2817         switch (msr_info->index) {
2818         case MSR_AMD64_TSC_RATIO:
2819                 if (!msr_info->host_initiated &&
2820                     !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
2821                         return 1;
2822                 msr_info->data = svm->tsc_ratio_msr;
2823                 break;
2824         case MSR_STAR:
2825                 msr_info->data = svm->vmcb01.ptr->save.star;
2826                 break;
2827 #ifdef CONFIG_X86_64
2828         case MSR_LSTAR:
2829                 msr_info->data = svm->vmcb01.ptr->save.lstar;
2830                 break;
2831         case MSR_CSTAR:
2832                 msr_info->data = svm->vmcb01.ptr->save.cstar;
2833                 break;
2834         case MSR_KERNEL_GS_BASE:
2835                 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2836                 break;
2837         case MSR_SYSCALL_MASK:
2838                 msr_info->data = svm->vmcb01.ptr->save.sfmask;
2839                 break;
2840 #endif
2841         case MSR_IA32_SYSENTER_CS:
2842                 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2843                 break;
2844         case MSR_IA32_SYSENTER_EIP:
2845                 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2846                 if (guest_cpuid_is_intel(vcpu))
2847                         msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2848                 break;
2849         case MSR_IA32_SYSENTER_ESP:
2850                 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2851                 if (guest_cpuid_is_intel(vcpu))
2852                         msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2853                 break;
2854         case MSR_TSC_AUX:
2855                 msr_info->data = svm->tsc_aux;
2856                 break;
2857         case MSR_IA32_DEBUGCTLMSR:
2858                 msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
2859                 break;
2860         case MSR_IA32_LASTBRANCHFROMIP:
2861                 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
2862                 break;
2863         case MSR_IA32_LASTBRANCHTOIP:
2864                 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
2865                 break;
2866         case MSR_IA32_LASTINTFROMIP:
2867                 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
2868                 break;
2869         case MSR_IA32_LASTINTTOIP:
2870                 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
2871                 break;
2872         case MSR_VM_HSAVE_PA:
2873                 msr_info->data = svm->nested.hsave_msr;
2874                 break;
2875         case MSR_VM_CR:
2876                 msr_info->data = svm->nested.vm_cr_msr;
2877                 break;
2878         case MSR_IA32_SPEC_CTRL:
2879                 if (!msr_info->host_initiated &&
2880                     !guest_has_spec_ctrl_msr(vcpu))
2881                         return 1;
2882
2883                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2884                         msr_info->data = svm->vmcb->save.spec_ctrl;
2885                 else
2886                         msr_info->data = svm->spec_ctrl;
2887                 break;
2888         case MSR_AMD64_VIRT_SPEC_CTRL:
2889                 if (!msr_info->host_initiated &&
2890                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2891                         return 1;
2892
2893                 msr_info->data = svm->virt_spec_ctrl;
2894                 break;
2895         case MSR_F15H_IC_CFG: {
2896
2897                 int family, model;
2898
2899                 family = guest_cpuid_family(vcpu);
2900                 model  = guest_cpuid_model(vcpu);
2901
2902                 if (family < 0 || model < 0)
2903                         return kvm_get_msr_common(vcpu, msr_info);
2904
2905                 msr_info->data = 0;
2906
2907                 if (family == 0x15 &&
2908                     (model >= 0x2 && model < 0x20))
2909                         msr_info->data = 0x1E;
2910                 }
2911                 break;
2912         case MSR_AMD64_DE_CFG:
2913                 msr_info->data = svm->msr_decfg;
2914                 break;
2915         default:
2916                 return kvm_get_msr_common(vcpu, msr_info);
2917         }
2918         return 0;
2919 }
2920
2921 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2922 {
2923         struct vcpu_svm *svm = to_svm(vcpu);
2924         if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2925                 return kvm_complete_insn_gp(vcpu, err);
2926
2927         ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2928         ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2929                                 X86_TRAP_GP |
2930                                 SVM_EVTINJ_TYPE_EXEPT |
2931                                 SVM_EVTINJ_VALID);
2932         return 1;
2933 }
2934
2935 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2936 {
2937         struct vcpu_svm *svm = to_svm(vcpu);
2938         int svm_dis, chg_mask;
2939
2940         if (data & ~SVM_VM_CR_VALID_MASK)
2941                 return 1;
2942
2943         chg_mask = SVM_VM_CR_VALID_MASK;
2944
2945         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2946                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2947
2948         svm->nested.vm_cr_msr &= ~chg_mask;
2949         svm->nested.vm_cr_msr |= (data & chg_mask);
2950
2951         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2952
2953         /* check for svm_disable while efer.svme is set */
2954         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2955                 return 1;
2956
2957         return 0;
2958 }
2959
2960 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2961 {
2962         struct vcpu_svm *svm = to_svm(vcpu);
2963         int ret = 0;
2964
2965         u32 ecx = msr->index;
2966         u64 data = msr->data;
2967         switch (ecx) {
2968         case MSR_AMD64_TSC_RATIO:
2969
2970                 if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
2971
2972                         if (!msr->host_initiated)
2973                                 return 1;
2974                         /*
2975                          * In case TSC scaling is not enabled, always
2976                          * leave this MSR at the default value.
2977                          *
2978                          * Due to bug in qemu 6.2.0, it would try to set
2979                          * this msr to 0 if tsc scaling is not enabled.
2980                          * Ignore this value as well.
2981                          */
2982                         if (data != 0 && data != svm->tsc_ratio_msr)
2983                                 return 1;
2984                         break;
2985                 }
2986
2987                 if (data & SVM_TSC_RATIO_RSVD)
2988                         return 1;
2989
2990                 svm->tsc_ratio_msr = data;
2991
2992                 if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
2993                     is_guest_mode(vcpu))
2994                         nested_svm_update_tsc_ratio_msr(vcpu);
2995
2996                 break;
2997         case MSR_IA32_CR_PAT:
2998                 ret = kvm_set_msr_common(vcpu, msr);
2999                 if (ret)
3000                         break;
3001
3002                 svm->vmcb01.ptr->save.g_pat = data;
3003                 if (is_guest_mode(vcpu))
3004                         nested_vmcb02_compute_g_pat(svm);
3005                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
3006                 break;
3007         case MSR_IA32_SPEC_CTRL:
3008                 if (!msr->host_initiated &&
3009                     !guest_has_spec_ctrl_msr(vcpu))
3010                         return 1;
3011
3012                 if (kvm_spec_ctrl_test_value(data))
3013                         return 1;
3014
3015                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3016                         svm->vmcb->save.spec_ctrl = data;
3017                 else
3018                         svm->spec_ctrl = data;
3019                 if (!data)
3020                         break;
3021
3022                 /*
3023                  * For non-nested:
3024                  * When it's written (to non-zero) for the first time, pass
3025                  * it through.
3026                  *
3027                  * For nested:
3028                  * The handling of the MSR bitmap for L2 guests is done in
3029                  * nested_svm_vmrun_msrpm.
3030                  * We update the L1 MSR bit as well since it will end up
3031                  * touching the MSR anyway now.
3032                  */
3033                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
3034                 break;
3035         case MSR_AMD64_VIRT_SPEC_CTRL:
3036                 if (!msr->host_initiated &&
3037                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
3038                         return 1;
3039
3040                 if (data & ~SPEC_CTRL_SSBD)
3041                         return 1;
3042
3043                 svm->virt_spec_ctrl = data;
3044                 break;
3045         case MSR_STAR:
3046                 svm->vmcb01.ptr->save.star = data;
3047                 break;
3048 #ifdef CONFIG_X86_64
3049         case MSR_LSTAR:
3050                 svm->vmcb01.ptr->save.lstar = data;
3051                 break;
3052         case MSR_CSTAR:
3053                 svm->vmcb01.ptr->save.cstar = data;
3054                 break;
3055         case MSR_KERNEL_GS_BASE:
3056                 svm->vmcb01.ptr->save.kernel_gs_base = data;
3057                 break;
3058         case MSR_SYSCALL_MASK:
3059                 svm->vmcb01.ptr->save.sfmask = data;
3060                 break;
3061 #endif
3062         case MSR_IA32_SYSENTER_CS:
3063                 svm->vmcb01.ptr->save.sysenter_cs = data;
3064                 break;
3065         case MSR_IA32_SYSENTER_EIP:
3066                 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
3067                 /*
3068                  * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
3069                  * when we spoof an Intel vendor ID (for cross vendor migration).
3070                  * In this case we use this intercept to track the high
3071                  * 32 bit part of these msrs to support Intel's
3072                  * implementation of SYSENTER/SYSEXIT.
3073                  */
3074                 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3075                 break;
3076         case MSR_IA32_SYSENTER_ESP:
3077                 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
3078                 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3079                 break;
3080         case MSR_TSC_AUX:
3081                 /*
3082                  * TSC_AUX is usually changed only during boot and never read
3083                  * directly.  Intercept TSC_AUX instead of exposing it to the
3084                  * guest via direct_access_msrs, and switch it via user return.
3085                  */
3086                 preempt_disable();
3087                 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3088                 preempt_enable();
3089                 if (ret)
3090                         break;
3091
3092                 svm->tsc_aux = data;
3093                 break;
3094         case MSR_IA32_DEBUGCTLMSR:
3095                 if (!lbrv) {
3096                         kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3097                         break;
3098                 }
3099                 if (data & DEBUGCTL_RESERVED_BITS)
3100                         return 1;
3101
3102                 svm_get_lbr_vmcb(svm)->save.dbgctl = data;
3103                 svm_update_lbrv(vcpu);
3104                 break;
3105         case MSR_VM_HSAVE_PA:
3106                 /*
3107                  * Old kernels did not validate the value written to
3108                  * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3109                  * value to allow live migrating buggy or malicious guests
3110                  * originating from those kernels.
3111                  */
3112                 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3113                         return 1;
3114
3115                 svm->nested.hsave_msr = data & PAGE_MASK;
3116                 break;
3117         case MSR_VM_CR:
3118                 return svm_set_vm_cr(vcpu, data);
3119         case MSR_VM_IGNNE:
3120                 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3121                 break;
3122         case MSR_AMD64_DE_CFG: {
3123                 struct kvm_msr_entry msr_entry;
3124
3125                 msr_entry.index = msr->index;
3126                 if (svm_get_msr_feature(&msr_entry))
3127                         return 1;
3128
3129                 /* Check the supported bits */
3130                 if (data & ~msr_entry.data)
3131                         return 1;
3132
3133                 /* Don't allow the guest to change a bit, #GP */
3134                 if (!msr->host_initiated && (data ^ msr_entry.data))
3135                         return 1;
3136
3137                 svm->msr_decfg = data;
3138                 break;
3139         }
3140         default:
3141                 return kvm_set_msr_common(vcpu, msr);
3142         }
3143         return ret;
3144 }
3145
3146 static int msr_interception(struct kvm_vcpu *vcpu)
3147 {
3148         if (to_svm(vcpu)->vmcb->control.exit_info_1)
3149                 return kvm_emulate_wrmsr(vcpu);
3150         else
3151                 return kvm_emulate_rdmsr(vcpu);
3152 }
3153
3154 static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3155 {
3156         kvm_make_request(KVM_REQ_EVENT, vcpu);
3157         svm_clear_vintr(to_svm(vcpu));
3158
3159         /*
3160          * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
3161          * In this case AVIC was temporarily disabled for
3162          * requesting the IRQ window and we have to re-enable it.
3163          *
3164          * If running nested, still remove the VM wide AVIC inhibit to
3165          * support case in which the interrupt window was requested when the
3166          * vCPU was not running nested.
3167
3168          * All vCPUs which run still run nested, will remain to have their
3169          * AVIC still inhibited due to per-cpu AVIC inhibition.
3170          */
3171         kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3172
3173         ++vcpu->stat.irq_window_exits;
3174         return 1;
3175 }
3176
3177 static int pause_interception(struct kvm_vcpu *vcpu)
3178 {
3179         bool in_kernel;
3180         /*
3181          * CPL is not made available for an SEV-ES guest, therefore
3182          * vcpu->arch.preempted_in_kernel can never be true.  Just
3183          * set in_kernel to false as well.
3184          */
3185         in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3186
3187         grow_ple_window(vcpu);
3188
3189         kvm_vcpu_on_spin(vcpu, in_kernel);
3190         return kvm_skip_emulated_instruction(vcpu);
3191 }
3192
3193 static int invpcid_interception(struct kvm_vcpu *vcpu)
3194 {
3195         struct vcpu_svm *svm = to_svm(vcpu);
3196         unsigned long type;
3197         gva_t gva;
3198
3199         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3200                 kvm_queue_exception(vcpu, UD_VECTOR);
3201                 return 1;
3202         }
3203
3204         /*
3205          * For an INVPCID intercept:
3206          * EXITINFO1 provides the linear address of the memory operand.
3207          * EXITINFO2 provides the contents of the register operand.
3208          */
3209         type = svm->vmcb->control.exit_info_2;
3210         gva = svm->vmcb->control.exit_info_1;
3211
3212         return kvm_handle_invpcid(vcpu, type, gva);
3213 }
3214
3215 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3216         [SVM_EXIT_READ_CR0]                     = cr_interception,
3217         [SVM_EXIT_READ_CR3]                     = cr_interception,
3218         [SVM_EXIT_READ_CR4]                     = cr_interception,
3219         [SVM_EXIT_READ_CR8]                     = cr_interception,
3220         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3221         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3222         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3223         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3224         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3225         [SVM_EXIT_READ_DR0]                     = dr_interception,
3226         [SVM_EXIT_READ_DR1]                     = dr_interception,
3227         [SVM_EXIT_READ_DR2]                     = dr_interception,
3228         [SVM_EXIT_READ_DR3]                     = dr_interception,
3229         [SVM_EXIT_READ_DR4]                     = dr_interception,
3230         [SVM_EXIT_READ_DR5]                     = dr_interception,
3231         [SVM_EXIT_READ_DR6]                     = dr_interception,
3232         [SVM_EXIT_READ_DR7]                     = dr_interception,
3233         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3234         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3235         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3236         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3237         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3238         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3239         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3240         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3241         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3242         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3243         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3244         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3245         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3246         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3247         [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
3248         [SVM_EXIT_INTR]                         = intr_interception,
3249         [SVM_EXIT_NMI]                          = nmi_interception,
3250         [SVM_EXIT_SMI]                          = smi_interception,
3251         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3252         [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
3253         [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
3254         [SVM_EXIT_IRET]                         = iret_interception,
3255         [SVM_EXIT_INVD]                         = kvm_emulate_invd,
3256         [SVM_EXIT_PAUSE]                        = pause_interception,
3257         [SVM_EXIT_HLT]                          = kvm_emulate_halt,
3258         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3259         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3260         [SVM_EXIT_IOIO]                         = io_interception,
3261         [SVM_EXIT_MSR]                          = msr_interception,
3262         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3263         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3264         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3265         [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
3266         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3267         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3268         [SVM_EXIT_STGI]                         = stgi_interception,
3269         [SVM_EXIT_CLGI]                         = clgi_interception,
3270         [SVM_EXIT_SKINIT]                       = skinit_interception,
3271         [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
3272         [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3273         [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
3274         [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
3275         [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
3276         [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
3277         [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
3278         [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
3279         [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
3280         [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
3281         [SVM_EXIT_INVPCID]                      = invpcid_interception,
3282         [SVM_EXIT_NPF]                          = npf_interception,
3283         [SVM_EXIT_RSM]                          = rsm_interception,
3284         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3285         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3286         [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
3287 };
3288
3289 static void dump_vmcb(struct kvm_vcpu *vcpu)
3290 {
3291         struct vcpu_svm *svm = to_svm(vcpu);
3292         struct vmcb_control_area *control = &svm->vmcb->control;
3293         struct vmcb_save_area *save = &svm->vmcb->save;
3294         struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3295
3296         if (!dump_invalid_vmcb) {
3297                 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3298                 return;
3299         }
3300
3301         pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3302                svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3303         pr_err("VMCB Control Area:\n");
3304         pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3305         pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3306         pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3307         pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3308         pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3309         pr_err("%-20s%08x %08x\n", "intercepts:",
3310               control->intercepts[INTERCEPT_WORD3],
3311                control->intercepts[INTERCEPT_WORD4]);
3312         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3313         pr_err("%-20s%d\n", "pause filter threshold:",
3314                control->pause_filter_thresh);
3315         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3316         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3317         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3318         pr_err("%-20s%d\n", "asid:", control->asid);
3319         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3320         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3321         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3322         pr_err("%-20s%08x\n", "int_state:", control->int_state);
3323         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3324         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3325         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3326         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3327         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3328         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3329         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3330         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3331         pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3332         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3333         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3334         pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3335         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3336         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3337         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3338         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3339         pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3340         pr_err("VMCB State Save Area:\n");
3341         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3342                "es:",
3343                save->es.selector, save->es.attrib,
3344                save->es.limit, save->es.base);
3345         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3346                "cs:",
3347                save->cs.selector, save->cs.attrib,
3348                save->cs.limit, save->cs.base);
3349         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3350                "ss:",
3351                save->ss.selector, save->ss.attrib,
3352                save->ss.limit, save->ss.base);
3353         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3354                "ds:",
3355                save->ds.selector, save->ds.attrib,
3356                save->ds.limit, save->ds.base);
3357         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3358                "fs:",
3359                save01->fs.selector, save01->fs.attrib,
3360                save01->fs.limit, save01->fs.base);
3361         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3362                "gs:",
3363                save01->gs.selector, save01->gs.attrib,
3364                save01->gs.limit, save01->gs.base);
3365         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3366                "gdtr:",
3367                save->gdtr.selector, save->gdtr.attrib,
3368                save->gdtr.limit, save->gdtr.base);
3369         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3370                "ldtr:",
3371                save01->ldtr.selector, save01->ldtr.attrib,
3372                save01->ldtr.limit, save01->ldtr.base);
3373         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3374                "idtr:",
3375                save->idtr.selector, save->idtr.attrib,
3376                save->idtr.limit, save->idtr.base);
3377         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3378                "tr:",
3379                save01->tr.selector, save01->tr.attrib,
3380                save01->tr.limit, save01->tr.base);
3381         pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3382                save->vmpl, save->cpl, save->efer);
3383         pr_err("%-15s %016llx %-13s %016llx\n",
3384                "cr0:", save->cr0, "cr2:", save->cr2);
3385         pr_err("%-15s %016llx %-13s %016llx\n",
3386                "cr3:", save->cr3, "cr4:", save->cr4);
3387         pr_err("%-15s %016llx %-13s %016llx\n",
3388                "dr6:", save->dr6, "dr7:", save->dr7);
3389         pr_err("%-15s %016llx %-13s %016llx\n",
3390                "rip:", save->rip, "rflags:", save->rflags);
3391         pr_err("%-15s %016llx %-13s %016llx\n",
3392                "rsp:", save->rsp, "rax:", save->rax);
3393         pr_err("%-15s %016llx %-13s %016llx\n",
3394                "star:", save01->star, "lstar:", save01->lstar);
3395         pr_err("%-15s %016llx %-13s %016llx\n",
3396                "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3397         pr_err("%-15s %016llx %-13s %016llx\n",
3398                "kernel_gs_base:", save01->kernel_gs_base,
3399                "sysenter_cs:", save01->sysenter_cs);
3400         pr_err("%-15s %016llx %-13s %016llx\n",
3401                "sysenter_esp:", save01->sysenter_esp,
3402                "sysenter_eip:", save01->sysenter_eip);
3403         pr_err("%-15s %016llx %-13s %016llx\n",
3404                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3405         pr_err("%-15s %016llx %-13s %016llx\n",
3406                "br_from:", save->br_from, "br_to:", save->br_to);
3407         pr_err("%-15s %016llx %-13s %016llx\n",
3408                "excp_from:", save->last_excp_from,
3409                "excp_to:", save->last_excp_to);
3410 }
3411
3412 static bool svm_check_exit_valid(u64 exit_code)
3413 {
3414         return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3415                 svm_exit_handlers[exit_code]);
3416 }
3417
3418 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3419 {
3420         vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3421         dump_vmcb(vcpu);
3422         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3423         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3424         vcpu->run->internal.ndata = 2;
3425         vcpu->run->internal.data[0] = exit_code;
3426         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3427         return 0;
3428 }
3429
3430 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3431 {
3432         if (!svm_check_exit_valid(exit_code))
3433                 return svm_handle_invalid_exit(vcpu, exit_code);
3434
3435 #ifdef CONFIG_RETPOLINE
3436         if (exit_code == SVM_EXIT_MSR)
3437                 return msr_interception(vcpu);
3438         else if (exit_code == SVM_EXIT_VINTR)
3439                 return interrupt_window_interception(vcpu);
3440         else if (exit_code == SVM_EXIT_INTR)
3441                 return intr_interception(vcpu);
3442         else if (exit_code == SVM_EXIT_HLT)
3443                 return kvm_emulate_halt(vcpu);
3444         else if (exit_code == SVM_EXIT_NPF)
3445                 return npf_interception(vcpu);
3446 #endif
3447         return svm_exit_handlers[exit_code](vcpu);
3448 }
3449
3450 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3451                               u64 *info1, u64 *info2,
3452                               u32 *intr_info, u32 *error_code)
3453 {
3454         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3455
3456         *reason = control->exit_code;
3457         *info1 = control->exit_info_1;
3458         *info2 = control->exit_info_2;
3459         *intr_info = control->exit_int_info;
3460         if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3461             (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3462                 *error_code = control->exit_int_info_err;
3463         else
3464                 *error_code = 0;
3465 }
3466
3467 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3468 {
3469         struct vcpu_svm *svm = to_svm(vcpu);
3470         struct kvm_run *kvm_run = vcpu->run;
3471         u32 exit_code = svm->vmcb->control.exit_code;
3472
3473         /* SEV-ES guests must use the CR write traps to track CR registers. */
3474         if (!sev_es_guest(vcpu->kvm)) {
3475                 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3476                         vcpu->arch.cr0 = svm->vmcb->save.cr0;
3477                 if (npt_enabled)
3478                         vcpu->arch.cr3 = svm->vmcb->save.cr3;
3479         }
3480
3481         if (is_guest_mode(vcpu)) {
3482                 int vmexit;
3483
3484                 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3485
3486                 vmexit = nested_svm_exit_special(svm);
3487
3488                 if (vmexit == NESTED_EXIT_CONTINUE)
3489                         vmexit = nested_svm_exit_handled(svm);
3490
3491                 if (vmexit == NESTED_EXIT_DONE)
3492                         return 1;
3493         }
3494
3495         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3496                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3497                 kvm_run->fail_entry.hardware_entry_failure_reason
3498                         = svm->vmcb->control.exit_code;
3499                 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3500                 dump_vmcb(vcpu);
3501                 return 0;
3502         }
3503
3504         if (exit_fastpath != EXIT_FASTPATH_NONE)
3505                 return 1;
3506
3507         return svm_invoke_exit_handler(vcpu, exit_code);
3508 }
3509
3510 static void pre_svm_run(struct kvm_vcpu *vcpu)
3511 {
3512         struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3513         struct vcpu_svm *svm = to_svm(vcpu);
3514
3515         /*
3516          * If the previous vmrun of the vmcb occurred on a different physical
3517          * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3518          * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3519          */
3520         if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3521                 svm->current_vmcb->asid_generation = 0;
3522                 vmcb_mark_all_dirty(svm->vmcb);
3523                 svm->current_vmcb->cpu = vcpu->cpu;
3524         }
3525
3526         if (sev_guest(vcpu->kvm))
3527                 return pre_sev_run(svm, vcpu->cpu);
3528
3529         /* FIXME: handle wraparound of asid_generation */
3530         if (svm->current_vmcb->asid_generation != sd->asid_generation)
3531                 new_asid(svm, sd);
3532 }
3533
3534 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3535 {
3536         struct vcpu_svm *svm = to_svm(vcpu);
3537
3538         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3539
3540         if (svm->nmi_l1_to_l2)
3541                 return;
3542
3543         svm->nmi_masked = true;
3544         svm_set_iret_intercept(svm);
3545         ++vcpu->stat.nmi_injections;
3546 }
3547
3548 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
3549 {
3550         struct vcpu_svm *svm = to_svm(vcpu);
3551
3552         if (!is_vnmi_enabled(svm))
3553                 return false;
3554
3555         return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
3556 }
3557
3558 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
3559 {
3560         struct vcpu_svm *svm = to_svm(vcpu);
3561
3562         if (!is_vnmi_enabled(svm))
3563                 return false;
3564
3565         if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
3566                 return false;
3567
3568         svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
3569         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
3570
3571         /*
3572          * Because the pending NMI is serviced by hardware, KVM can't know when
3573          * the NMI is "injected", but for all intents and purposes, passing the
3574          * NMI off to hardware counts as injection.
3575          */
3576         ++vcpu->stat.nmi_injections;
3577
3578         return true;
3579 }
3580
3581 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3582 {
3583         struct vcpu_svm *svm = to_svm(vcpu);
3584         u32 type;
3585
3586         if (vcpu->arch.interrupt.soft) {
3587                 if (svm_update_soft_interrupt_rip(vcpu))
3588                         return;
3589
3590                 type = SVM_EVTINJ_TYPE_SOFT;
3591         } else {
3592                 type = SVM_EVTINJ_TYPE_INTR;
3593         }
3594
3595         trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3596                            vcpu->arch.interrupt.soft, reinjected);
3597         ++vcpu->stat.irq_injections;
3598
3599         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3600                                        SVM_EVTINJ_VALID | type;
3601 }
3602
3603 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3604                                      int trig_mode, int vector)
3605 {
3606         /*
3607          * apic->apicv_active must be read after vcpu->mode.
3608          * Pairs with smp_store_release in vcpu_enter_guest.
3609          */
3610         bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3611
3612         /* Note, this is called iff the local APIC is in-kernel. */
3613         if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3614                 /* Process the interrupt via kvm_check_and_inject_events(). */
3615                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3616                 kvm_vcpu_kick(vcpu);
3617                 return;
3618         }
3619
3620         trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3621         if (in_guest_mode) {
3622                 /*
3623                  * Signal the doorbell to tell hardware to inject the IRQ.  If
3624                  * the vCPU exits the guest before the doorbell chimes, hardware
3625                  * will automatically process AVIC interrupts at the next VMRUN.
3626                  */
3627                 avic_ring_doorbell(vcpu);
3628         } else {
3629                 /*
3630                  * Wake the vCPU if it was blocking.  KVM will then detect the
3631                  * pending IRQ when checking if the vCPU has a wake event.
3632                  */
3633                 kvm_vcpu_wake_up(vcpu);
3634         }
3635 }
3636
3637 static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3638                                   int trig_mode, int vector)
3639 {
3640         kvm_lapic_set_irr(vector, apic);
3641
3642         /*
3643          * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3644          * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3645          * the read of guest_mode.  This guarantees that either VMRUN will see
3646          * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3647          * will signal the doorbell if the CPU has already entered the guest.
3648          */
3649         smp_mb__after_atomic();
3650         svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3651 }
3652
3653 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3654 {
3655         struct vcpu_svm *svm = to_svm(vcpu);
3656
3657         /*
3658          * SEV-ES guests must always keep the CR intercepts cleared. CR
3659          * tracking is done using the CR write traps.
3660          */
3661         if (sev_es_guest(vcpu->kvm))
3662                 return;
3663
3664         if (nested_svm_virtualize_tpr(vcpu))
3665                 return;
3666
3667         svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3668
3669         if (irr == -1)
3670                 return;
3671
3672         if (tpr >= irr)
3673                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3674 }
3675
3676 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3677 {
3678         struct vcpu_svm *svm = to_svm(vcpu);
3679
3680         if (is_vnmi_enabled(svm))
3681                 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
3682         else
3683                 return svm->nmi_masked;
3684 }
3685
3686 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3687 {
3688         struct vcpu_svm *svm = to_svm(vcpu);
3689
3690         if (is_vnmi_enabled(svm)) {
3691                 if (masked)
3692                         svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
3693                 else
3694                         svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
3695
3696         } else {
3697                 svm->nmi_masked = masked;
3698                 if (masked)
3699                         svm_set_iret_intercept(svm);
3700                 else
3701                         svm_clr_iret_intercept(svm);
3702         }
3703 }
3704
3705 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3706 {
3707         struct vcpu_svm *svm = to_svm(vcpu);
3708         struct vmcb *vmcb = svm->vmcb;
3709
3710         if (!gif_set(svm))
3711                 return true;
3712
3713         if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3714                 return false;
3715
3716         if (svm_get_nmi_mask(vcpu))
3717                 return true;
3718
3719         return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
3720 }
3721
3722 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3723 {
3724         struct vcpu_svm *svm = to_svm(vcpu);
3725         if (svm->nested.nested_run_pending)
3726                 return -EBUSY;
3727
3728         if (svm_nmi_blocked(vcpu))
3729                 return 0;
3730
3731         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3732         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3733                 return -EBUSY;
3734         return 1;
3735 }
3736
3737 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3738 {
3739         struct vcpu_svm *svm = to_svm(vcpu);
3740         struct vmcb *vmcb = svm->vmcb;
3741
3742         if (!gif_set(svm))
3743                 return true;
3744
3745         if (is_guest_mode(vcpu)) {
3746                 /* As long as interrupts are being delivered...  */
3747                 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3748                     ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3749                     : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3750                         return true;
3751
3752                 /* ... vmexits aren't blocked by the interrupt shadow  */
3753                 if (nested_exit_on_intr(svm))
3754                         return false;
3755         } else {
3756                 if (!svm_get_if_flag(vcpu))
3757                         return true;
3758         }
3759
3760         return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3761 }
3762
3763 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3764 {
3765         struct vcpu_svm *svm = to_svm(vcpu);
3766
3767         if (svm->nested.nested_run_pending)
3768                 return -EBUSY;
3769
3770         if (svm_interrupt_blocked(vcpu))
3771                 return 0;
3772
3773         /*
3774          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3775          * e.g. if the IRQ arrived asynchronously after checking nested events.
3776          */
3777         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3778                 return -EBUSY;
3779
3780         return 1;
3781 }
3782
3783 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3784 {
3785         struct vcpu_svm *svm = to_svm(vcpu);
3786
3787         /*
3788          * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3789          * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3790          * get that intercept, this function will be called again though and
3791          * we'll get the vintr intercept. However, if the vGIF feature is
3792          * enabled, the STGI interception will not occur. Enable the irq
3793          * window under the assumption that the hardware will set the GIF.
3794          */
3795         if (vgif || gif_set(svm)) {
3796                 /*
3797                  * IRQ window is not needed when AVIC is enabled,
3798                  * unless we have pending ExtINT since it cannot be injected
3799                  * via AVIC. In such case, KVM needs to temporarily disable AVIC,
3800                  * and fallback to injecting IRQ via V_IRQ.
3801                  *
3802                  * If running nested, AVIC is already locally inhibited
3803                  * on this vCPU, therefore there is no need to request
3804                  * the VM wide AVIC inhibition.
3805                  */
3806                 if (!is_guest_mode(vcpu))
3807                         kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3808
3809                 svm_set_vintr(svm);
3810         }
3811 }
3812
3813 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3814 {
3815         struct vcpu_svm *svm = to_svm(vcpu);
3816
3817         /*
3818          * KVM should never request an NMI window when vNMI is enabled, as KVM
3819          * allows at most one to-be-injected NMI and one pending NMI, i.e. if
3820          * two NMIs arrive simultaneously, KVM will inject one and set
3821          * V_NMI_PENDING for the other.  WARN, but continue with the standard
3822          * single-step approach to try and salvage the pending NMI.
3823          */
3824         WARN_ON_ONCE(is_vnmi_enabled(svm));
3825
3826         if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
3827                 return; /* IRET will cause a vm exit */
3828
3829         /*
3830          * SEV-ES guests are responsible for signaling when a vCPU is ready to
3831          * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
3832          * KVM can't intercept and single-step IRET to detect when NMIs are
3833          * unblocked (architecturally speaking).  See SVM_VMGEXIT_NMI_COMPLETE.
3834          *
3835          * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
3836          * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
3837          * supported NAEs in the GHCB protocol.
3838          */
3839         if (sev_es_guest(vcpu->kvm))
3840                 return;
3841
3842         if (!gif_set(svm)) {
3843                 if (vgif)
3844                         svm_set_intercept(svm, INTERCEPT_STGI);
3845                 return; /* STGI will cause a vm exit */
3846         }
3847
3848         /*
3849          * Something prevents NMI from been injected. Single step over possible
3850          * problem (IRET or exception injection or interrupt shadow)
3851          */
3852         svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3853         svm->nmi_singlestep = true;
3854         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3855 }
3856
3857 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
3858 {
3859         struct vcpu_svm *svm = to_svm(vcpu);
3860
3861         /*
3862          * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
3863          * A TLB flush for the current ASID flushes both "host" and "guest" TLB
3864          * entries, and thus is a superset of Hyper-V's fine grained flushing.
3865          */
3866         kvm_hv_vcpu_purge_flush_tlb(vcpu);
3867
3868         /*
3869          * Flush only the current ASID even if the TLB flush was invoked via
3870          * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3871          * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3872          * unconditionally does a TLB flush on both nested VM-Enter and nested
3873          * VM-Exit (via kvm_mmu_reset_context()).
3874          */
3875         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3876                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3877         else
3878                 svm->current_vmcb->asid_generation--;
3879 }
3880
3881 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3882 {
3883         hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
3884
3885         /*
3886          * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
3887          * flush the NPT mappings via hypercall as flushing the ASID only
3888          * affects virtual to physical mappings, it does not invalidate guest
3889          * physical to host physical mappings.
3890          */
3891         if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
3892                 hyperv_flush_guest_mapping(root_tdp);
3893
3894         svm_flush_tlb_asid(vcpu);
3895 }
3896
3897 static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
3898 {
3899         /*
3900          * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
3901          * flushes should be routed to hv_flush_remote_tlbs() without requesting
3902          * a "regular" remote flush.  Reaching this point means either there's
3903          * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
3904          * which might be fatal to the guest.  Yell, but try to recover.
3905          */
3906         if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
3907                 hv_flush_remote_tlbs(vcpu->kvm);
3908
3909         svm_flush_tlb_asid(vcpu);
3910 }
3911
3912 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3913 {
3914         struct vcpu_svm *svm = to_svm(vcpu);
3915
3916         invlpga(gva, svm->vmcb->control.asid);
3917 }
3918
3919 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3920 {
3921         struct vcpu_svm *svm = to_svm(vcpu);
3922
3923         if (nested_svm_virtualize_tpr(vcpu))
3924                 return;
3925
3926         if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3927                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3928                 kvm_set_cr8(vcpu, cr8);
3929         }
3930 }
3931
3932 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3933 {
3934         struct vcpu_svm *svm = to_svm(vcpu);
3935         u64 cr8;
3936
3937         if (nested_svm_virtualize_tpr(vcpu) ||
3938             kvm_vcpu_apicv_active(vcpu))
3939                 return;
3940
3941         cr8 = kvm_get_cr8(vcpu);
3942         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3943         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3944 }
3945
3946 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3947                                         int type)
3948 {
3949         bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3950         bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3951         struct vcpu_svm *svm = to_svm(vcpu);
3952
3953         /*
3954          * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3955          * associated with the original soft exception/interrupt.  next_rip is
3956          * cleared on all exits that can occur while vectoring an event, so KVM
3957          * needs to manually set next_rip for re-injection.  Unlike the !nrips
3958          * case below, this needs to be done if and only if KVM is re-injecting
3959          * the same event, i.e. if the event is a soft exception/interrupt,
3960          * otherwise next_rip is unused on VMRUN.
3961          */
3962         if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3963             kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3964                 svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3965         /*
3966          * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3967          * injecting the soft exception/interrupt.  That advancement needs to
3968          * be unwound if vectoring didn't complete.  Note, the new event may
3969          * not be the injected event, e.g. if KVM injected an INTn, the INTn
3970          * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3971          * be the reported vectored event, but RIP still needs to be unwound.
3972          */
3973         else if (!nrips && (is_soft || is_exception) &&
3974                  kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3975                 kvm_rip_write(vcpu, svm->soft_int_old_rip);
3976 }
3977
3978 static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3979 {
3980         struct vcpu_svm *svm = to_svm(vcpu);
3981         u8 vector;
3982         int type;
3983         u32 exitintinfo = svm->vmcb->control.exit_int_info;
3984         bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3985         bool soft_int_injected = svm->soft_int_injected;
3986
3987         svm->nmi_l1_to_l2 = false;
3988         svm->soft_int_injected = false;
3989
3990         /*
3991          * If we've made progress since setting awaiting_iret_completion, we've
3992          * executed an IRET and can allow NMI injection.
3993          */
3994         if (svm->awaiting_iret_completion &&
3995             kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
3996                 svm->awaiting_iret_completion = false;
3997                 svm->nmi_masked = false;
3998                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3999         }
4000
4001         vcpu->arch.nmi_injected = false;
4002         kvm_clear_exception_queue(vcpu);
4003         kvm_clear_interrupt_queue(vcpu);
4004
4005         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
4006                 return;
4007
4008         kvm_make_request(KVM_REQ_EVENT, vcpu);
4009
4010         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
4011         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
4012
4013         if (soft_int_injected)
4014                 svm_complete_soft_interrupt(vcpu, vector, type);
4015
4016         switch (type) {
4017         case SVM_EXITINTINFO_TYPE_NMI:
4018                 vcpu->arch.nmi_injected = true;
4019                 svm->nmi_l1_to_l2 = nmi_l1_to_l2;
4020                 break;
4021         case SVM_EXITINTINFO_TYPE_EXEPT:
4022                 /*
4023                  * Never re-inject a #VC exception.
4024                  */
4025                 if (vector == X86_TRAP_VC)
4026                         break;
4027
4028                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
4029                         u32 err = svm->vmcb->control.exit_int_info_err;
4030                         kvm_requeue_exception_e(vcpu, vector, err);
4031
4032                 } else
4033                         kvm_requeue_exception(vcpu, vector);
4034                 break;
4035         case SVM_EXITINTINFO_TYPE_INTR:
4036                 kvm_queue_interrupt(vcpu, vector, false);
4037                 break;
4038         case SVM_EXITINTINFO_TYPE_SOFT:
4039                 kvm_queue_interrupt(vcpu, vector, true);
4040                 break;
4041         default:
4042                 break;
4043         }
4044
4045 }
4046
4047 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4048 {
4049         struct vcpu_svm *svm = to_svm(vcpu);
4050         struct vmcb_control_area *control = &svm->vmcb->control;
4051
4052         control->exit_int_info = control->event_inj;
4053         control->exit_int_info_err = control->event_inj_err;
4054         control->event_inj = 0;
4055         svm_complete_interrupts(vcpu);
4056 }
4057
4058 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
4059 {
4060         return 1;
4061 }
4062
4063 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
4064 {
4065         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
4066             to_svm(vcpu)->vmcb->control.exit_info_1)
4067                 return handle_fastpath_set_msr_irqoff(vcpu);
4068
4069         return EXIT_FASTPATH_NONE;
4070 }
4071
4072 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
4073 {
4074         struct vcpu_svm *svm = to_svm(vcpu);
4075
4076         guest_state_enter_irqoff();
4077
4078         amd_clear_divider();
4079
4080         if (sev_es_guest(vcpu->kvm))
4081                 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
4082         else
4083                 __svm_vcpu_run(svm, spec_ctrl_intercepted);
4084
4085         guest_state_exit_irqoff();
4086 }
4087
4088 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
4089 {
4090         struct vcpu_svm *svm = to_svm(vcpu);
4091         bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
4092
4093         trace_kvm_entry(vcpu);
4094
4095         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4096         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4097         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4098
4099         /*
4100          * Disable singlestep if we're injecting an interrupt/exception.
4101          * We don't want our modified rflags to be pushed on the stack where
4102          * we might not be able to easily reset them if we disabled NMI
4103          * singlestep later.
4104          */
4105         if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
4106                 /*
4107                  * Event injection happens before external interrupts cause a
4108                  * vmexit and interrupts are disabled here, so smp_send_reschedule
4109                  * is enough to force an immediate vmexit.
4110                  */
4111                 disable_nmi_singlestep(svm);
4112                 smp_send_reschedule(vcpu->cpu);
4113         }
4114
4115         pre_svm_run(vcpu);
4116
4117         sync_lapic_to_cr8(vcpu);
4118
4119         if (unlikely(svm->asid != svm->vmcb->control.asid)) {
4120                 svm->vmcb->control.asid = svm->asid;
4121                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
4122         }
4123         svm->vmcb->save.cr2 = vcpu->arch.cr2;
4124
4125         svm_hv_update_vp_id(svm->vmcb, vcpu);
4126
4127         /*
4128          * Run with all-zero DR6 unless needed, so that we can get the exact cause
4129          * of a #DB.
4130          */
4131         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
4132                 svm_set_dr6(svm, vcpu->arch.dr6);
4133         else
4134                 svm_set_dr6(svm, DR6_ACTIVE_LOW);
4135
4136         clgi();
4137         kvm_load_guest_xsave_state(vcpu);
4138
4139         kvm_wait_lapic_expire(vcpu);
4140
4141         /*
4142          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
4143          * it's non-zero. Since vmentry is serialising on affected CPUs, there
4144          * is no need to worry about the conditional branch over the wrmsr
4145          * being speculatively taken.
4146          */
4147         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4148                 x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
4149
4150         svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
4151
4152         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4153                 x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
4154
4155         if (!sev_es_guest(vcpu->kvm)) {
4156                 vcpu->arch.cr2 = svm->vmcb->save.cr2;
4157                 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4158                 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4159                 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4160         }
4161         vcpu->arch.regs_dirty = 0;
4162
4163         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4164                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4165
4166         kvm_load_host_xsave_state(vcpu);
4167         stgi();
4168
4169         /* Any pending NMI will happen here */
4170
4171         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4172                 kvm_after_interrupt(vcpu);
4173
4174         sync_cr8_to_lapic(vcpu);
4175
4176         svm->next_rip = 0;
4177         if (is_guest_mode(vcpu)) {
4178                 nested_sync_control_from_vmcb02(svm);
4179
4180                 /* Track VMRUNs that have made past consistency checking */
4181                 if (svm->nested.nested_run_pending &&
4182                     svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4183                         ++vcpu->stat.nested_run;
4184
4185                 svm->nested.nested_run_pending = 0;
4186         }
4187
4188         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4189         vmcb_mark_all_clean(svm->vmcb);
4190
4191         /* if exit due to PF check for async PF */
4192         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4193                 vcpu->arch.apf.host_apf_flags =
4194                         kvm_read_and_reset_apf_flags();
4195
4196         vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4197
4198         /*
4199          * We need to handle MC intercepts here before the vcpu has a chance to
4200          * change the physical cpu
4201          */
4202         if (unlikely(svm->vmcb->control.exit_code ==
4203                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
4204                 svm_handle_mce(vcpu);
4205
4206         trace_kvm_exit(vcpu, KVM_ISA_SVM);
4207
4208         svm_complete_interrupts(vcpu);
4209
4210         if (is_guest_mode(vcpu))
4211                 return EXIT_FASTPATH_NONE;
4212
4213         return svm_exit_handlers_fastpath(vcpu);
4214 }
4215
4216 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4217                              int root_level)
4218 {
4219         struct vcpu_svm *svm = to_svm(vcpu);
4220         unsigned long cr3;
4221
4222         if (npt_enabled) {
4223                 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4224                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4225
4226                 hv_track_root_tdp(vcpu, root_hpa);
4227
4228                 cr3 = vcpu->arch.cr3;
4229         } else if (root_level >= PT64_ROOT_4LEVEL) {
4230                 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4231         } else {
4232                 /* PCID in the guest should be impossible with a 32-bit MMU. */
4233                 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4234                 cr3 = root_hpa;
4235         }
4236
4237         svm->vmcb->save.cr3 = cr3;
4238         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4239 }
4240
4241 static void
4242 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4243 {
4244         /*
4245          * Patch in the VMMCALL instruction:
4246          */
4247         hypercall[0] = 0x0f;
4248         hypercall[1] = 0x01;
4249         hypercall[2] = 0xd9;
4250 }
4251
4252 /*
4253  * The kvm parameter can be NULL (module initialization, or invocation before
4254  * VM creation). Be sure to check the kvm parameter before using it.
4255  */
4256 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4257 {
4258         switch (index) {
4259         case MSR_IA32_MCG_EXT_CTL:
4260         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
4261                 return false;
4262         case MSR_IA32_SMBASE:
4263                 if (!IS_ENABLED(CONFIG_KVM_SMM))
4264                         return false;
4265                 /* SEV-ES guests do not support SMM, so report false */
4266                 if (kvm && sev_es_guest(kvm))
4267                         return false;
4268                 break;
4269         default:
4270                 break;
4271         }
4272
4273         return true;
4274 }
4275
4276 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4277 {
4278         struct vcpu_svm *svm = to_svm(vcpu);
4279         struct kvm_cpuid_entry2 *best;
4280
4281         /*
4282          * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
4283          * can only disable all variants of by disallowing CR4.OSXSAVE from
4284          * being set.  As a result, if the host has XSAVE and XSAVES, and the
4285          * guest has XSAVE enabled, the guest can execute XSAVES without
4286          * faulting.  Treat XSAVES as enabled in this case regardless of
4287          * whether it's advertised to the guest so that KVM context switches
4288          * XSS on VM-Enter/VM-Exit.  Failure to do so would effectively give
4289          * the guest read/write access to the host's XSS.
4290          */
4291         if (boot_cpu_has(X86_FEATURE_XSAVE) &&
4292             boot_cpu_has(X86_FEATURE_XSAVES) &&
4293             guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
4294                 kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
4295
4296         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
4297         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
4298         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
4299
4300         /*
4301          * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
4302          * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
4303          * SVM on Intel is bonkers and extremely unlikely to work).
4304          */
4305         if (!guest_cpuid_is_intel(vcpu))
4306                 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4307
4308         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
4309         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
4310         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
4311         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
4312
4313         svm_recalc_instruction_intercepts(vcpu, svm);
4314
4315         if (boot_cpu_has(X86_FEATURE_IBPB))
4316                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
4317                                      !!guest_has_pred_cmd_msr(vcpu));
4318
4319         if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
4320                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
4321                                      !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
4322
4323         /* For sev guests, the memory encryption bit is not reserved in CR3.  */
4324         if (sev_guest(vcpu->kvm)) {
4325                 best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4326                 if (best)
4327                         vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4328         }
4329
4330         init_vmcb_after_set_cpuid(vcpu);
4331 }
4332
4333 static bool svm_has_wbinvd_exit(void)
4334 {
4335         return true;
4336 }
4337
4338 #define PRE_EX(exit)  { .exit_code = (exit), \
4339                         .stage = X86_ICPT_PRE_EXCEPT, }
4340 #define POST_EX(exit) { .exit_code = (exit), \
4341                         .stage = X86_ICPT_POST_EXCEPT, }
4342 #define POST_MEM(exit) { .exit_code = (exit), \
4343                         .stage = X86_ICPT_POST_MEMACCESS, }
4344
4345 static const struct __x86_intercept {
4346         u32 exit_code;
4347         enum x86_intercept_stage stage;
4348 } x86_intercept_map[] = {
4349         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4350         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4351         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4352         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4353         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4354         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4355         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4356         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4357         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4358         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4359         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4360         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4361         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4362         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4363         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4364         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4365         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4366         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4367         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4368         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4369         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4370         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4371         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4372         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4373         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4374         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4375         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4376         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4377         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4378         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4379         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4380         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4381         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4382         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4383         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4384         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4385         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4386         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4387         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4388         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4389         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4390         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4391         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4392         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4393         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4394         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4395         [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
4396 };
4397
4398 #undef PRE_EX
4399 #undef POST_EX
4400 #undef POST_MEM
4401
4402 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4403                                struct x86_instruction_info *info,
4404                                enum x86_intercept_stage stage,
4405                                struct x86_exception *exception)
4406 {
4407         struct vcpu_svm *svm = to_svm(vcpu);
4408         int vmexit, ret = X86EMUL_CONTINUE;
4409         struct __x86_intercept icpt_info;
4410         struct vmcb *vmcb = svm->vmcb;
4411
4412         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4413                 goto out;
4414
4415         icpt_info = x86_intercept_map[info->intercept];
4416
4417         if (stage != icpt_info.stage)
4418                 goto out;
4419
4420         switch (icpt_info.exit_code) {
4421         case SVM_EXIT_READ_CR0:
4422                 if (info->intercept == x86_intercept_cr_read)
4423                         icpt_info.exit_code += info->modrm_reg;
4424                 break;
4425         case SVM_EXIT_WRITE_CR0: {
4426                 unsigned long cr0, val;
4427
4428                 if (info->intercept == x86_intercept_cr_write)
4429                         icpt_info.exit_code += info->modrm_reg;
4430
4431                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4432                     info->intercept == x86_intercept_clts)
4433                         break;
4434
4435                 if (!(vmcb12_is_intercept(&svm->nested.ctl,
4436                                         INTERCEPT_SELECTIVE_CR0)))
4437                         break;
4438
4439                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4440                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4441
4442                 if (info->intercept == x86_intercept_lmsw) {
4443                         cr0 &= 0xfUL;
4444                         val &= 0xfUL;
4445                         /* lmsw can't clear PE - catch this here */
4446                         if (cr0 & X86_CR0_PE)
4447                                 val |= X86_CR0_PE;
4448                 }
4449
4450                 if (cr0 ^ val)
4451                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4452
4453                 break;
4454         }
4455         case SVM_EXIT_READ_DR0:
4456         case SVM_EXIT_WRITE_DR0:
4457                 icpt_info.exit_code += info->modrm_reg;
4458                 break;
4459         case SVM_EXIT_MSR:
4460                 if (info->intercept == x86_intercept_wrmsr)
4461                         vmcb->control.exit_info_1 = 1;
4462                 else
4463                         vmcb->control.exit_info_1 = 0;
4464                 break;
4465         case SVM_EXIT_PAUSE:
4466                 /*
4467                  * We get this for NOP only, but pause
4468                  * is rep not, check this here
4469                  */
4470                 if (info->rep_prefix != REPE_PREFIX)
4471                         goto out;
4472                 break;
4473         case SVM_EXIT_IOIO: {
4474                 u64 exit_info;
4475                 u32 bytes;
4476
4477                 if (info->intercept == x86_intercept_in ||
4478                     info->intercept == x86_intercept_ins) {
4479                         exit_info = ((info->src_val & 0xffff) << 16) |
4480                                 SVM_IOIO_TYPE_MASK;
4481                         bytes = info->dst_bytes;
4482                 } else {
4483                         exit_info = (info->dst_val & 0xffff) << 16;
4484                         bytes = info->src_bytes;
4485                 }
4486
4487                 if (info->intercept == x86_intercept_outs ||
4488                     info->intercept == x86_intercept_ins)
4489                         exit_info |= SVM_IOIO_STR_MASK;
4490
4491                 if (info->rep_prefix)
4492                         exit_info |= SVM_IOIO_REP_MASK;
4493
4494                 bytes = min(bytes, 4u);
4495
4496                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4497
4498                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4499
4500                 vmcb->control.exit_info_1 = exit_info;
4501                 vmcb->control.exit_info_2 = info->next_rip;
4502
4503                 break;
4504         }
4505         default:
4506                 break;
4507         }
4508
4509         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4510         if (static_cpu_has(X86_FEATURE_NRIPS))
4511                 vmcb->control.next_rip  = info->next_rip;
4512         vmcb->control.exit_code = icpt_info.exit_code;
4513         vmexit = nested_svm_exit_handled(svm);
4514
4515         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4516                                            : X86EMUL_CONTINUE;
4517
4518 out:
4519         return ret;
4520 }
4521
4522 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4523 {
4524         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4525                 vcpu->arch.at_instruction_boundary = true;
4526 }
4527
4528 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4529 {
4530         if (!kvm_pause_in_guest(vcpu->kvm))
4531                 shrink_ple_window(vcpu);
4532 }
4533
4534 static void svm_setup_mce(struct kvm_vcpu *vcpu)
4535 {
4536         /* [63:9] are reserved. */
4537         vcpu->arch.mcg_cap &= 0x1ff;
4538 }
4539
4540 #ifdef CONFIG_KVM_SMM
4541 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4542 {
4543         struct vcpu_svm *svm = to_svm(vcpu);
4544
4545         /* Per APM Vol.2 15.22.2 "Response to SMI" */
4546         if (!gif_set(svm))
4547                 return true;
4548
4549         return is_smm(vcpu);
4550 }
4551
4552 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4553 {
4554         struct vcpu_svm *svm = to_svm(vcpu);
4555         if (svm->nested.nested_run_pending)
4556                 return -EBUSY;
4557
4558         if (svm_smi_blocked(vcpu))
4559                 return 0;
4560
4561         /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4562         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4563                 return -EBUSY;
4564
4565         return 1;
4566 }
4567
4568 static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
4569 {
4570         struct vcpu_svm *svm = to_svm(vcpu);
4571         struct kvm_host_map map_save;
4572         int ret;
4573
4574         if (!is_guest_mode(vcpu))
4575                 return 0;
4576
4577         /*
4578          * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
4579          * responsible for ensuring nested SVM and SMIs are mutually exclusive.
4580          */
4581
4582         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4583                 return 1;
4584
4585         smram->smram64.svm_guest_flag = 1;
4586         smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
4587
4588         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4589         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4590         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4591
4592         ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4593         if (ret)
4594                 return ret;
4595
4596         /*
4597          * KVM uses VMCB01 to store L1 host state while L2 runs but
4598          * VMCB01 is going to be used during SMM and thus the state will
4599          * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
4600          * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
4601          * format of the area is identical to guest save area offsetted
4602          * by 0x400 (matches the offset of 'struct vmcb_save_area'
4603          * within 'struct vmcb'). Note: HSAVE area may also be used by
4604          * L1 hypervisor to save additional host context (e.g. KVM does
4605          * that, see svm_prepare_switch_to_guest()) which must be
4606          * preserved.
4607          */
4608         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4609                 return 1;
4610
4611         BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4612
4613         svm_copy_vmrun_state(map_save.hva + 0x400,
4614                              &svm->vmcb01.ptr->save);
4615
4616         kvm_vcpu_unmap(vcpu, &map_save, true);
4617         return 0;
4618 }
4619
4620 static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4621 {
4622         struct vcpu_svm *svm = to_svm(vcpu);
4623         struct kvm_host_map map, map_save;
4624         struct vmcb *vmcb12;
4625         int ret;
4626
4627         const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4628
4629         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4630                 return 0;
4631
4632         /* Non-zero if SMI arrived while vCPU was in guest mode. */
4633         if (!smram64->svm_guest_flag)
4634                 return 0;
4635
4636         if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4637                 return 1;
4638
4639         if (!(smram64->efer & EFER_SVME))
4640                 return 1;
4641
4642         if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4643                 return 1;
4644
4645         ret = 1;
4646         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4647                 goto unmap_map;
4648
4649         if (svm_allocate_nested(svm))
4650                 goto unmap_save;
4651
4652         /*
4653          * Restore L1 host state from L1 HSAVE area as VMCB01 was
4654          * used during SMM (see svm_enter_smm())
4655          */
4656
4657         svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4658
4659         /*
4660          * Enter the nested guest now
4661          */
4662
4663         vmcb_mark_all_dirty(svm->vmcb01.ptr);
4664
4665         vmcb12 = map.hva;
4666         nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4667         nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4668         ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4669
4670         if (ret)
4671                 goto unmap_save;
4672
4673         svm->nested.nested_run_pending = 1;
4674
4675 unmap_save:
4676         kvm_vcpu_unmap(vcpu, &map_save, true);
4677 unmap_map:
4678         kvm_vcpu_unmap(vcpu, &map, true);
4679         return ret;
4680 }
4681
4682 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4683 {
4684         struct vcpu_svm *svm = to_svm(vcpu);
4685
4686         if (!gif_set(svm)) {
4687                 if (vgif)
4688                         svm_set_intercept(svm, INTERCEPT_STGI);
4689                 /* STGI will cause a vm exit */
4690         } else {
4691                 /* We must be in SMM; RSM will cause a vmexit anyway.  */
4692         }
4693 }
4694 #endif
4695
4696 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4697                                         void *insn, int insn_len)
4698 {
4699         bool smep, smap, is_user;
4700         u64 error_code;
4701
4702         /* Emulation is always possible when KVM has access to all guest state. */
4703         if (!sev_guest(vcpu->kvm))
4704                 return true;
4705
4706         /* #UD and #GP should never be intercepted for SEV guests. */
4707         WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4708                                   EMULTYPE_TRAP_UD_FORCED |
4709                                   EMULTYPE_VMWARE_GP));
4710
4711         /*
4712          * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4713          * to guest register state.
4714          */
4715         if (sev_es_guest(vcpu->kvm))
4716                 return false;
4717
4718         /*
4719          * Emulation is possible if the instruction is already decoded, e.g.
4720          * when completing I/O after returning from userspace.
4721          */
4722         if (emul_type & EMULTYPE_NO_DECODE)
4723                 return true;
4724
4725         /*
4726          * Emulation is possible for SEV guests if and only if a prefilled
4727          * buffer containing the bytes of the intercepted instruction is
4728          * available. SEV guest memory is encrypted with a guest specific key
4729          * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
4730          * decode garbage.
4731          *
4732          * If KVM is NOT trying to simply skip an instruction, inject #UD if
4733          * KVM reached this point without an instruction buffer.  In practice,
4734          * this path should never be hit by a well-behaved guest, e.g. KVM
4735          * doesn't intercept #UD or #GP for SEV guests, but this path is still
4736          * theoretically reachable, e.g. via unaccelerated fault-like AVIC
4737          * access, and needs to be handled by KVM to avoid putting the guest
4738          * into an infinite loop.   Injecting #UD is somewhat arbitrary, but
4739          * its the least awful option given lack of insight into the guest.
4740          *
4741          * If KVM is trying to skip an instruction, simply resume the guest.
4742          * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
4743          * will attempt to re-inject the INT3/INTO and skip the instruction.
4744          * In that scenario, retrying the INT3/INTO and hoping the guest will
4745          * make forward progress is the only option that has a chance of
4746          * success (and in practice it will work the vast majority of the time).
4747          */
4748         if (unlikely(!insn)) {
4749                 if (!(emul_type & EMULTYPE_SKIP))
4750                         kvm_queue_exception(vcpu, UD_VECTOR);
4751                 return false;
4752         }
4753
4754         /*
4755          * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4756          * will be empty if the DecodeAssist microcode cannot fetch bytes for
4757          * the faulting instruction because the code fetch itself faulted, e.g.
4758          * the guest attempted to fetch from emulated MMIO or a guest page
4759          * table used to translate CS:RIP resides in emulated MMIO.
4760          */
4761         if (likely(insn_len))
4762                 return true;
4763
4764         /*
4765          * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4766          *
4767          * Errata:
4768          * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4769          * possible that CPU microcode implementing DecodeAssist will fail to
4770          * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4771          * be '0'.  This happens because microcode reads CS:RIP using a _data_
4772          * loap uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4773          * gives up and does not fill the instruction bytes buffer.
4774          *
4775          * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4776          * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4777          * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4778          * GuestIntrBytes field of the VMCB.
4779          *
4780          * This does _not_ mean that the erratum has been encountered, as the
4781          * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4782          * #PF, e.g. if the guest attempt to execute from emulated MMIO and
4783          * encountered a reserved/not-present #PF.
4784          *
4785          * To hit the erratum, the following conditions must be true:
4786          *    1. CR4.SMAP=1 (obviously).
4787          *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4788          *       have been hit as the guest would have encountered a SMEP
4789          *       violation #PF, not a #NPF.
4790          *    3. The #NPF is not due to a code fetch, in which case failure to
4791          *       retrieve the instruction bytes is legitimate (see abvoe).
4792          *
4793          * In addition, don't apply the erratum workaround if the #NPF occurred
4794          * while translating guest page tables (see below).
4795          */
4796         error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4797         if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4798                 goto resume_guest;
4799
4800         smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
4801         smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
4802         is_user = svm_get_cpl(vcpu) == 3;
4803         if (smap && (!smep || is_user)) {
4804                 pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
4805
4806                 /*
4807                  * If the fault occurred in userspace, arbitrarily inject #GP
4808                  * to avoid killing the guest and to hopefully avoid confusing
4809                  * the guest kernel too much, e.g. injecting #PF would not be
4810                  * coherent with respect to the guest's page tables.  Request
4811                  * triple fault if the fault occurred in the kernel as there's
4812                  * no fault that KVM can inject without confusing the guest.
4813                  * In practice, the triple fault is moot as no sane SEV kernel
4814                  * will execute from user memory while also running with SMAP=1.
4815                  */
4816                 if (is_user)
4817                         kvm_inject_gp(vcpu, 0);
4818                 else
4819                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4820         }
4821
4822 resume_guest:
4823         /*
4824          * If the erratum was not hit, simply resume the guest and let it fault
4825          * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4826          * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4827          * userspace will kill the guest, and letting the emulator read garbage
4828          * will yield random behavior and potentially corrupt the guest.
4829          *
4830          * Simply resuming the guest is technically not a violation of the SEV
4831          * architecture.  AMD's APM states that all code fetches and page table
4832          * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
4833          * APM also states that encrypted accesses to MMIO are "ignored", but
4834          * doesn't explicitly define "ignored", i.e. doing nothing and letting
4835          * the guest spin is technically "ignoring" the access.
4836          */
4837         return false;
4838 }
4839
4840 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4841 {
4842         struct vcpu_svm *svm = to_svm(vcpu);
4843
4844         return !gif_set(svm);
4845 }
4846
4847 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4848 {
4849         if (!sev_es_guest(vcpu->kvm))
4850                 return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4851
4852         sev_vcpu_deliver_sipi_vector(vcpu, vector);
4853 }
4854
4855 static void svm_vm_destroy(struct kvm *kvm)
4856 {
4857         avic_vm_destroy(kvm);
4858         sev_vm_destroy(kvm);
4859 }
4860
4861 static int svm_vm_init(struct kvm *kvm)
4862 {
4863         if (!pause_filter_count || !pause_filter_thresh)
4864                 kvm->arch.pause_in_guest = true;
4865
4866         if (enable_apicv) {
4867                 int ret = avic_vm_init(kvm);
4868                 if (ret)
4869                         return ret;
4870         }
4871
4872         return 0;
4873 }
4874
4875 static struct kvm_x86_ops svm_x86_ops __initdata = {
4876         .name = KBUILD_MODNAME,
4877
4878         .check_processor_compatibility = svm_check_processor_compat,
4879
4880         .hardware_unsetup = svm_hardware_unsetup,
4881         .hardware_enable = svm_hardware_enable,
4882         .hardware_disable = svm_hardware_disable,
4883         .has_emulated_msr = svm_has_emulated_msr,
4884
4885         .vcpu_create = svm_vcpu_create,
4886         .vcpu_free = svm_vcpu_free,
4887         .vcpu_reset = svm_vcpu_reset,
4888
4889         .vm_size = sizeof(struct kvm_svm),
4890         .vm_init = svm_vm_init,
4891         .vm_destroy = svm_vm_destroy,
4892
4893         .prepare_switch_to_guest = svm_prepare_switch_to_guest,
4894         .vcpu_load = svm_vcpu_load,
4895         .vcpu_put = svm_vcpu_put,
4896         .vcpu_blocking = avic_vcpu_blocking,
4897         .vcpu_unblocking = avic_vcpu_unblocking,
4898
4899         .update_exception_bitmap = svm_update_exception_bitmap,
4900         .get_msr_feature = svm_get_msr_feature,
4901         .get_msr = svm_get_msr,
4902         .set_msr = svm_set_msr,
4903         .get_segment_base = svm_get_segment_base,
4904         .get_segment = svm_get_segment,
4905         .set_segment = svm_set_segment,
4906         .get_cpl = svm_get_cpl,
4907         .get_cs_db_l_bits = svm_get_cs_db_l_bits,
4908         .is_valid_cr0 = svm_is_valid_cr0,
4909         .set_cr0 = svm_set_cr0,
4910         .post_set_cr3 = sev_post_set_cr3,
4911         .is_valid_cr4 = svm_is_valid_cr4,
4912         .set_cr4 = svm_set_cr4,
4913         .set_efer = svm_set_efer,
4914         .get_idt = svm_get_idt,
4915         .set_idt = svm_set_idt,
4916         .get_gdt = svm_get_gdt,
4917         .set_gdt = svm_set_gdt,
4918         .set_dr7 = svm_set_dr7,
4919         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4920         .cache_reg = svm_cache_reg,
4921         .get_rflags = svm_get_rflags,
4922         .set_rflags = svm_set_rflags,
4923         .get_if_flag = svm_get_if_flag,
4924
4925         .flush_tlb_all = svm_flush_tlb_all,
4926         .flush_tlb_current = svm_flush_tlb_current,
4927         .flush_tlb_gva = svm_flush_tlb_gva,
4928         .flush_tlb_guest = svm_flush_tlb_asid,
4929
4930         .vcpu_pre_run = svm_vcpu_pre_run,
4931         .vcpu_run = svm_vcpu_run,
4932         .handle_exit = svm_handle_exit,
4933         .skip_emulated_instruction = svm_skip_emulated_instruction,
4934         .update_emulated_instruction = NULL,
4935         .set_interrupt_shadow = svm_set_interrupt_shadow,
4936         .get_interrupt_shadow = svm_get_interrupt_shadow,
4937         .patch_hypercall = svm_patch_hypercall,
4938         .inject_irq = svm_inject_irq,
4939         .inject_nmi = svm_inject_nmi,
4940         .is_vnmi_pending = svm_is_vnmi_pending,
4941         .set_vnmi_pending = svm_set_vnmi_pending,
4942         .inject_exception = svm_inject_exception,
4943         .cancel_injection = svm_cancel_injection,
4944         .interrupt_allowed = svm_interrupt_allowed,
4945         .nmi_allowed = svm_nmi_allowed,
4946         .get_nmi_mask = svm_get_nmi_mask,
4947         .set_nmi_mask = svm_set_nmi_mask,
4948         .enable_nmi_window = svm_enable_nmi_window,
4949         .enable_irq_window = svm_enable_irq_window,
4950         .update_cr8_intercept = svm_update_cr8_intercept,
4951         .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
4952         .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4953         .apicv_post_state_restore = avic_apicv_post_state_restore,
4954         .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
4955
4956         .get_exit_info = svm_get_exit_info,
4957
4958         .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4959
4960         .has_wbinvd_exit = svm_has_wbinvd_exit,
4961
4962         .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4963         .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4964         .write_tsc_offset = svm_write_tsc_offset,
4965         .write_tsc_multiplier = svm_write_tsc_multiplier,
4966
4967         .load_mmu_pgd = svm_load_mmu_pgd,
4968
4969         .check_intercept = svm_check_intercept,
4970         .handle_exit_irqoff = svm_handle_exit_irqoff,
4971
4972         .request_immediate_exit = __kvm_request_immediate_exit,
4973
4974         .sched_in = svm_sched_in,
4975
4976         .nested_ops = &svm_nested_ops,
4977
4978         .deliver_interrupt = svm_deliver_interrupt,
4979         .pi_update_irte = avic_pi_update_irte,
4980         .setup_mce = svm_setup_mce,
4981
4982 #ifdef CONFIG_KVM_SMM
4983         .smi_allowed = svm_smi_allowed,
4984         .enter_smm = svm_enter_smm,
4985         .leave_smm = svm_leave_smm,
4986         .enable_smi_window = svm_enable_smi_window,
4987 #endif
4988
4989         .mem_enc_ioctl = sev_mem_enc_ioctl,
4990         .mem_enc_register_region = sev_mem_enc_register_region,
4991         .mem_enc_unregister_region = sev_mem_enc_unregister_region,
4992         .guest_memory_reclaimed = sev_guest_memory_reclaimed,
4993
4994         .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4995         .vm_move_enc_context_from = sev_vm_move_enc_context_from,
4996
4997         .can_emulate_instruction = svm_can_emulate_instruction,
4998
4999         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
5000
5001         .msr_filter_changed = svm_msr_filter_changed,
5002         .complete_emulated_msr = svm_complete_emulated_msr,
5003
5004         .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
5005         .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
5006 };
5007
5008 /*
5009  * The default MMIO mask is a single bit (excluding the present bit),
5010  * which could conflict with the memory encryption bit. Check for
5011  * memory encryption support and override the default MMIO mask if
5012  * memory encryption is enabled.
5013  */
5014 static __init void svm_adjust_mmio_mask(void)
5015 {
5016         unsigned int enc_bit, mask_bit;
5017         u64 msr, mask;
5018
5019         /* If there is no memory encryption support, use existing mask */
5020         if (cpuid_eax(0x80000000) < 0x8000001f)
5021                 return;
5022
5023         /* If memory encryption is not enabled, use existing mask */
5024         rdmsrl(MSR_AMD64_SYSCFG, msr);
5025         if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
5026                 return;
5027
5028         enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
5029         mask_bit = boot_cpu_data.x86_phys_bits;
5030
5031         /* Increment the mask bit if it is the same as the encryption bit */
5032         if (enc_bit == mask_bit)
5033                 mask_bit++;
5034
5035         /*
5036          * If the mask bit location is below 52, then some bits above the
5037          * physical addressing limit will always be reserved, so use the
5038          * rsvd_bits() function to generate the mask. This mask, along with
5039          * the present bit, will be used to generate a page fault with
5040          * PFER.RSV = 1.
5041          *
5042          * If the mask bit location is 52 (or above), then clear the mask.
5043          */
5044         mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
5045
5046         kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
5047 }
5048
5049 static __init void svm_set_cpu_caps(void)
5050 {
5051         kvm_set_cpu_caps();
5052
5053         kvm_caps.supported_perf_cap = 0;
5054         kvm_caps.supported_xss = 0;
5055
5056         /* CPUID 0x80000001 and 0x8000000A (SVM features) */
5057         if (nested) {
5058                 kvm_cpu_cap_set(X86_FEATURE_SVM);
5059                 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
5060
5061                 if (nrips)
5062                         kvm_cpu_cap_set(X86_FEATURE_NRIPS);
5063
5064                 if (npt_enabled)
5065                         kvm_cpu_cap_set(X86_FEATURE_NPT);
5066
5067                 if (tsc_scaling)
5068                         kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
5069
5070                 if (vls)
5071                         kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
5072                 if (lbrv)
5073                         kvm_cpu_cap_set(X86_FEATURE_LBRV);
5074
5075                 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
5076                         kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
5077
5078                 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
5079                         kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
5080
5081                 if (vgif)
5082                         kvm_cpu_cap_set(X86_FEATURE_VGIF);
5083
5084                 if (vnmi)
5085                         kvm_cpu_cap_set(X86_FEATURE_VNMI);
5086
5087                 /* Nested VM can receive #VMEXIT instead of triggering #GP */
5088                 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
5089         }
5090
5091         /* CPUID 0x80000008 */
5092         if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
5093             boot_cpu_has(X86_FEATURE_AMD_SSBD))
5094                 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
5095
5096         if (enable_pmu) {
5097                 /*
5098                  * Enumerate support for PERFCTR_CORE if and only if KVM has
5099                  * access to enough counters to virtualize "core" support,
5100                  * otherwise limit vPMU support to the legacy number of counters.
5101                  */
5102                 if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
5103                         kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
5104                                                           kvm_pmu_cap.num_counters_gp);
5105                 else
5106                         kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);
5107
5108                 if (kvm_pmu_cap.version != 2 ||
5109                     !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
5110                         kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
5111         }
5112
5113         /* CPUID 0x8000001F (SME/SEV features) */
5114         sev_set_cpu_caps();
5115 }
5116
5117 static __init int svm_hardware_setup(void)
5118 {
5119         int cpu;
5120         struct page *iopm_pages;
5121         void *iopm_va;
5122         int r;
5123         unsigned int order = get_order(IOPM_SIZE);
5124
5125         /*
5126          * NX is required for shadow paging and for NPT if the NX huge pages
5127          * mitigation is enabled.
5128          */
5129         if (!boot_cpu_has(X86_FEATURE_NX)) {
5130                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
5131                 return -EOPNOTSUPP;
5132         }
5133         kvm_enable_efer_bits(EFER_NX);
5134
5135         iopm_pages = alloc_pages(GFP_KERNEL, order);
5136
5137         if (!iopm_pages)
5138                 return -ENOMEM;
5139
5140         iopm_va = page_address(iopm_pages);
5141         memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
5142         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
5143
5144         init_msrpm_offsets();
5145
5146         kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
5147                                      XFEATURE_MASK_BNDCSR);
5148
5149         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
5150                 kvm_enable_efer_bits(EFER_FFXSR);
5151
5152         if (tsc_scaling) {
5153                 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
5154                         tsc_scaling = false;
5155                 } else {
5156                         pr_info("TSC scaling supported\n");
5157                         kvm_caps.has_tsc_control = true;
5158                 }
5159         }
5160         kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
5161         kvm_caps.tsc_scaling_ratio_frac_bits = 32;
5162
5163         tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
5164
5165         if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
5166                 kvm_enable_efer_bits(EFER_AUTOIBRS);
5167
5168         /* Check for pause filtering support */
5169         if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
5170                 pause_filter_count = 0;
5171                 pause_filter_thresh = 0;
5172         } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
5173                 pause_filter_thresh = 0;
5174         }
5175
5176         if (nested) {
5177                 pr_info("Nested Virtualization enabled\n");
5178                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
5179         }
5180
5181         /*
5182          * KVM's MMU doesn't support using 2-level paging for itself, and thus
5183          * NPT isn't supported if the host is using 2-level paging since host
5184          * CR4 is unchanged on VMRUN.
5185          */
5186         if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
5187                 npt_enabled = false;
5188
5189         if (!boot_cpu_has(X86_FEATURE_NPT))
5190                 npt_enabled = false;
5191
5192         /* Force VM NPT level equal to the host's paging level */
5193         kvm_configure_mmu(npt_enabled, get_npt_level(),
5194                           get_npt_level(), PG_LEVEL_1G);
5195         pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5196
5197         /* Setup shadow_me_value and shadow_me_mask */
5198         kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5199
5200         svm_adjust_mmio_mask();
5201
5202         nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
5203
5204         /*
5205          * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5206          * may be modified by svm_adjust_mmio_mask()), as well as nrips.
5207          */
5208         sev_hardware_setup();
5209
5210         svm_hv_hardware_setup();
5211
5212         for_each_possible_cpu(cpu) {
5213                 r = svm_cpu_init(cpu);
5214                 if (r)
5215                         goto err;
5216         }
5217
5218         enable_apicv = avic = avic && avic_hardware_setup();
5219
5220         if (!enable_apicv) {
5221                 svm_x86_ops.vcpu_blocking = NULL;
5222                 svm_x86_ops.vcpu_unblocking = NULL;
5223                 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5224         } else if (!x2avic_enabled) {
5225                 svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
5226         }
5227
5228         if (vls) {
5229                 if (!npt_enabled ||
5230                     !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5231                     !IS_ENABLED(CONFIG_X86_64)) {
5232                         vls = false;
5233                 } else {
5234                         pr_info("Virtual VMLOAD VMSAVE supported\n");
5235                 }
5236         }
5237
5238         if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5239                 svm_gp_erratum_intercept = false;
5240
5241         if (vgif) {
5242                 if (!boot_cpu_has(X86_FEATURE_VGIF))
5243                         vgif = false;
5244                 else
5245                         pr_info("Virtual GIF supported\n");
5246         }
5247
5248         vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
5249         if (vnmi)
5250                 pr_info("Virtual NMI enabled\n");
5251
5252         if (!vnmi) {
5253                 svm_x86_ops.is_vnmi_pending = NULL;
5254                 svm_x86_ops.set_vnmi_pending = NULL;
5255         }
5256
5257
5258         if (lbrv) {
5259                 if (!boot_cpu_has(X86_FEATURE_LBRV))
5260                         lbrv = false;
5261                 else
5262                         pr_info("LBR virtualization supported\n");
5263         }
5264
5265         if (!enable_pmu)
5266                 pr_info("PMU virtualization is disabled\n");
5267
5268         svm_set_cpu_caps();
5269
5270         /*
5271          * It seems that on AMD processors PTE's accessed bit is
5272          * being set by the CPU hardware before the NPF vmexit.
5273          * This is not expected behaviour and our tests fail because
5274          * of it.
5275          * A workaround here is to disable support for
5276          * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
5277          * In this case userspace can know if there is support using
5278          * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
5279          * it
5280          * If future AMD CPU models change the behaviour described above,
5281          * this variable can be changed accordingly
5282          */
5283         allow_smaller_maxphyaddr = !npt_enabled;
5284
5285         return 0;
5286
5287 err:
5288         svm_hardware_unsetup();
5289         return r;
5290 }
5291
5292
5293 static struct kvm_x86_init_ops svm_init_ops __initdata = {
5294         .hardware_setup = svm_hardware_setup,
5295
5296         .runtime_ops = &svm_x86_ops,
5297         .pmu_ops = &amd_pmu_ops,
5298 };
5299
5300 static void __svm_exit(void)
5301 {
5302         kvm_x86_vendor_exit();
5303
5304         cpu_emergency_unregister_virt_callback(svm_emergency_disable);
5305 }
5306
5307 static int __init svm_init(void)
5308 {
5309         int r;
5310
5311         __unused_size_checks();
5312
5313         if (!kvm_is_svm_supported())
5314                 return -EOPNOTSUPP;
5315
5316         r = kvm_x86_vendor_init(&svm_init_ops);
5317         if (r)
5318                 return r;
5319
5320         cpu_emergency_register_virt_callback(svm_emergency_disable);
5321
5322         /*
5323          * Common KVM initialization _must_ come last, after this, /dev/kvm is
5324          * exposed to userspace!
5325          */
5326         r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
5327                      THIS_MODULE);
5328         if (r)
5329                 goto err_kvm_init;
5330
5331         return 0;
5332
5333 err_kvm_init:
5334         __svm_exit();
5335         return r;
5336 }
5337
5338 static void __exit svm_exit(void)
5339 {
5340         kvm_exit();
5341         __svm_exit();
5342 }
5343
5344 module_init(svm_init)
5345 module_exit(svm_exit)