Merge tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 16 Nov 2017 21:00:24 +0000 (13:00 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 16 Nov 2017 21:00:24 +0000 (13:00 -0800)
Pull KVM updates from Radim Krčmář:
 "First batch of KVM changes for 4.15

  Common:
   - Python 3 support in kvm_stat
   - Accounting of slabs to kmemcg

  ARM:
   - Optimized arch timer handling for KVM/ARM
   - Improvements to the VGIC ITS code and introduction of an ITS reset
     ioctl (a userspace usage sketch follows the commit list below)
   - Unification of the 32-bit fault injection logic
   - More exact external abort matching logic

  PPC:
   - Support for running hashed page table (HPT) MMU mode on a host that
     is using the radix MMU mode; single-threaded mode on POWER9 is
     added as a prerequisite
   - Resolution of merge conflicts with the last-second 4.14 HPT fixes
   - Fixes and cleanups

  s390:
   - Some initial preparation patches for exitless interrupts and crypto
   - New capability for AIS migration (a capability-probe sketch follows
     this summary)
   - Fixes

  x86:
   - Improved emulation of LAPIC timer mode changes, MCi_STATUS MSRs,
     and after-reset state
   - Refined dependencies for VMX features
   - Fixes for nested SMI injection
   - A lot of cleanups"

* tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (89 commits)
  KVM: s390: provide a capability for AIS state migration
  KVM: s390: clear_io_irq() requests are not expected for adapter interrupts
  KVM: s390: abstract conversion between isc and enum irq_types
  KVM: s390: vsie: use common code functions for pinning
  KVM: s390: SIE considerations for AP Queue virtualization
  KVM: s390: document memory ordering for kvm_s390_vcpu_wakeup
  KVM: PPC: Book3S HV: Cosmetic post-merge cleanups
  KVM: arm/arm64: fix the incompatible matching for external abort
  KVM: arm/arm64: Unify 32bit fault injection
  KVM: arm/arm64: vgic-its: Implement KVM_DEV_ARM_ITS_CTRL_RESET
  KVM: arm/arm64: Document KVM_DEV_ARM_ITS_CTRL_RESET
  KVM: arm/arm64: vgic-its: Free caches when GITS_BASER Valid bit is cleared
  KVM: arm/arm64: vgic-its: New helper functions to free the caches
  KVM: arm/arm64: vgic-its: Remove kvm_its_unmap_device
  arm/arm64: KVM: Load the timer state when enabling the timer
  KVM: arm/arm64: Rework kvm_timer_should_fire
  KVM: arm/arm64: Get rid of kvm_timer_flush_hwstate
  KVM: arm/arm64: Avoid phys timer emulation in vcpu entry/exit
  KVM: arm/arm64: Move phys_timer_emulate function
  KVM: arm/arm64: Use kvm_arm_timer_set/get_reg for guest register traps
  ...
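
The VGIC ITS reset mentioned in the summary is exposed through the device
control API rather than a VM ioctl; the hedged sketch below shows how
userspace might drive it. It is not code from this merge: its_fd is
assumed to come from an earlier KVM_CREATE_DEVICE call for
KVM_DEV_TYPE_ARM_VGIC_ITS, and the attribute value follows the
KVM_DEV_ARM_ITS_CTRL_RESET definition added here.

/*
 * Hedged sketch, not from this merge: ask the kernel to reset an ITS
 * device.  its_fd is assumed to be the fd returned by KVM_CREATE_DEVICE
 * with KVM_DEV_TYPE_ARM_VGIC_ITS on an arm/arm64 VM.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int vgic_its_reset(int its_fd)
{
        struct kvm_device_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.group = KVM_DEV_ARM_VGIC_GRP_CTRL;
        attr.attr  = KVM_DEV_ARM_ITS_CTRL_RESET;

        /* 0 on success; -1 with errno set (e.g. ENXIO on older kernels). */
        return ioctl(its_fd, KVM_SET_DEVICE_ATTR, &attr);
}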

22 files changed:
arch/arm/include/uapi/asm/kvm.h
arch/arm64/include/asm/arch_timer.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kvm/hyp/switch.c
arch/arm64/kvm/sys_regs.c
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/book3s_hv.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/vmx.c
drivers/clocksource/arm_arch_timer.c
drivers/irqchip/irq-gic-v3.c
drivers/irqchip/irq-gic.c
include/uapi/linux/kvm.h
virt/kvm/arm/arm.c
virt/kvm/arm/vgic/vgic-its.c
virt/kvm/kvm_main.c

index 1f57bbe82b6fb8582c2a3a1617345266c22e33e8,b56895593c84007386a54ab9a2fbbba478d717c7..6edd177bb1c7c66e0ec32caf7ec8d2c3680ed2f3
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
  /*
   * Copyright (C) 2012 - Virtual Open Systems and Columbia University
   * Author: Christoffer Dall <c.dall@virtualopensystems.com>
@@@ -152,6 -151,12 +152,12 @@@ struct kvm_arch_memory_slot 
        (__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64)
  #define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__)
  
+ /* PL1 Physical Timer Registers */
+ #define KVM_REG_ARM_PTIMER_CTL                ARM_CP15_REG32(0, 14, 2, 1)
+ #define KVM_REG_ARM_PTIMER_CNT                ARM_CP15_REG64(0, 14)
+ #define KVM_REG_ARM_PTIMER_CVAL               ARM_CP15_REG64(2, 14)
+ /* Virtual Timer Registers */
  #define KVM_REG_ARM_TIMER_CTL         ARM_CP15_REG32(0, 14, 3, 1)
  #define KVM_REG_ARM_TIMER_CNT         ARM_CP15_REG64(1, 14)
  #define KVM_REG_ARM_TIMER_CVAL                ARM_CP15_REG64(3, 14)
  #define   KVM_DEV_ARM_ITS_SAVE_TABLES         1
  #define   KVM_DEV_ARM_ITS_RESTORE_TABLES      2
  #define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES        3
+ #define   KVM_DEV_ARM_ITS_CTRL_RESET          4
  
  /* KVM_IRQ_LINE irq field index values */
  #define KVM_ARM_IRQ_TYPE_SHIFT                24
index bdedd8f748d17c42c36a033b80c26b86e48c3661,04275de614dbdff465738c132cef49fb68bd7789..f2a234d6516cf5b537b80134b137729dd9abb798
@@@ -52,6 -52,7 +52,7 @@@ struct arch_timer_erratum_workaround 
        const char *desc;
        u32 (*read_cntp_tval_el0)(void);
        u32 (*read_cntv_tval_el0)(void);
+       u64 (*read_cntpct_el0)(void);
        u64 (*read_cntvct_el0)(void);
        int (*set_next_event_phys)(unsigned long, struct clock_event_device *);
        int (*set_next_event_virt)(unsigned long, struct clock_event_device *);
@@@ -144,16 -145,12 +145,13 @@@ static inline u32 arch_timer_get_cntkct
  static inline void arch_timer_set_cntkctl(u32 cntkctl)
  {
        write_sysreg(cntkctl, cntkctl_el1);
 +      isb();
  }
  
  static inline u64 arch_counter_get_cntpct(void)
  {
-       /*
-        * AArch64 kernel and user space mandate the use of CNTVCT.
-        */
-       BUG();
-       return 0;
+       isb();
+       return arch_timer_reg_read_stable(cntpct_el0);
  }
  
  static inline u64 arch_counter_get_cntvct(void)
index 51149ec75fe480b324fd74d2697579a936438fdc,37ca7394549cf8191cd982c301f2662c8bd59dad..9abbf30446545a0668083b0891461f015563bcb1
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
  /*
   * Copyright (C) 2012,2013 - ARM Ltd
   * Author: Marc Zyngier <marc.zyngier@arm.com>
@@@ -196,6 -195,12 +196,12 @@@ struct kvm_arch_memory_slot 
  
  #define ARM64_SYS_REG(...) (__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64)
  
+ /* Physical Timer EL0 Registers */
+ #define KVM_REG_ARM_PTIMER_CTL                ARM64_SYS_REG(3, 3, 14, 2, 1)
+ #define KVM_REG_ARM_PTIMER_CVAL               ARM64_SYS_REG(3, 3, 14, 2, 2)
+ #define KVM_REG_ARM_PTIMER_CNT                ARM64_SYS_REG(3, 3, 14, 0, 1)
+ /* EL0 Virtual Timer Registers */
  #define KVM_REG_ARM_TIMER_CTL         ARM64_SYS_REG(3, 3, 14, 3, 1)
  #define KVM_REG_ARM_TIMER_CNT         ARM64_SYS_REG(3, 3, 14, 3, 2)
  #define KVM_REG_ARM_TIMER_CVAL                ARM64_SYS_REG(3, 3, 14, 0, 2)
  #define   KVM_DEV_ARM_ITS_SAVE_TABLES           1
  #define   KVM_DEV_ARM_ITS_RESTORE_TABLES        2
  #define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES        3
+ #define   KVM_DEV_ARM_ITS_CTRL_RESET          4
  
  /* Device Control API on vcpu fd */
  #define KVM_ARM_VCPU_PMU_V3_CTRL      0
index 951f3ebaff26e6df8e58a8af1e63548c587e0b4d,4994f4bdaca5d49f7bc31f6a229b46ce7f0257d6..525c01f48867808b6efa257063daaa4c8207252e
@@@ -48,7 -48,7 +48,7 @@@ static void __hyp_text __activate_traps
  
        val = read_sysreg(cpacr_el1);
        val |= CPACR_EL1_TTA;
 -      val &= ~CPACR_EL1_FPEN;
 +      val &= ~(CPACR_EL1_FPEN | CPACR_EL1_ZEN);
        write_sysreg(val, cpacr_el1);
  
        write_sysreg(__kvm_hyp_vector, vbar_el1);
@@@ -59,7 -59,7 +59,7 @@@ static void __hyp_text __activate_traps
        u64 val;
  
        val = CPTR_EL2_DEFAULT;
 -      val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
 +      val |= CPTR_EL2_TTA | CPTR_EL2_TFP | CPTR_EL2_TZ;
        write_sysreg(val, cptr_el2);
  }
  
@@@ -81,17 -81,11 +81,17 @@@ static void __hyp_text __activate_traps
         * it will cause an exception.
         */
        val = vcpu->arch.hcr_el2;
 +
        if (!(val & HCR_RW) && system_supports_fpsimd()) {
                write_sysreg(1 << 30, fpexc32_el2);
                isb();
        }
 +
 +      if (val & HCR_RW) /* for AArch64 only: */
 +              val |= HCR_TID3; /* TID3: trap feature register accesses */
 +
        write_sysreg(val, hcr_el2);
 +
        /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
        write_sysreg(1 << 15, hstr_el2);
        /*
@@@ -117,7 -111,7 +117,7 @@@ static void __hyp_text __deactivate_tra
  
        write_sysreg(mdcr_el2, mdcr_el2);
        write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
 -      write_sysreg(CPACR_EL1_FPEN, cpacr_el1);
 +      write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
        write_sysreg(vectors, vbar_el1);
  }
  
@@@ -304,7 -298,7 +304,7 @@@ int __hyp_text __kvm_vcpu_run(struct kv
        __activate_vm(vcpu);
  
        __vgic_restore_state(vcpu);
-       __timer_restore_state(vcpu);
+       __timer_enable_traps(vcpu);
  
        /*
         * We must restore the 32-bit state before the sysregs, thanks
@@@ -374,7 -368,7 +374,7 @@@ again
  
        __sysreg_save_guest_state(guest_ctxt);
        __sysreg32_save_state(vcpu);
-       __timer_save_state(vcpu);
+       __timer_disable_traps(vcpu);
        __vgic_save_state(vcpu);
  
        __deactivate_traps(vcpu);
@@@ -442,7 -436,7 +442,7 @@@ void __hyp_text __noreturn __hyp_panic(
  
                vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
                host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
-               __timer_save_state(vcpu);
+               __timer_disable_traps(vcpu);
                __deactivate_traps(vcpu);
                __deactivate_vm(vcpu);
                __sysreg_restore_host_state(host_ctxt);
index a0ee9b05e3d445b80011c71f016555c927609324,bb0e41b3154e619458a61abff97960842a198276..1830ebc227d18d7c5ad06e8f1858bbe8f33fa53a
@@@ -23,7 -23,6 +23,7 @@@
  #include <linux/bsearch.h>
  #include <linux/kvm_host.h>
  #include <linux/mm.h>
 +#include <linux/printk.h>
  #include <linux/uaccess.h>
  
  #include <asm/cacheflush.h>
@@@ -842,13 -841,16 +842,16 @@@ static bool access_cntp_tval(struct kvm
                struct sys_reg_params *p,
                const struct sys_reg_desc *r)
  {
-       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
        u64 now = kvm_phys_timer_read();
+       u64 cval;
  
-       if (p->is_write)
-               ptimer->cnt_cval = p->regval + now;
-       else
-               p->regval = ptimer->cnt_cval - now;
+       if (p->is_write) {
+               kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL,
+                                     p->regval + now);
+       } else {
+               cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
+               p->regval = cval - now;
+       }
  
        return true;
  }
@@@ -857,24 -859,10 +860,10 @@@ static bool access_cntp_ctl(struct kvm_
                struct sys_reg_params *p,
                const struct sys_reg_desc *r)
  {
-       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-       if (p->is_write) {
-               /* ISTATUS bit is read-only */
-               ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT;
-       } else {
-               u64 now = kvm_phys_timer_read();
-               p->regval = ptimer->cnt_ctl;
-               /*
-                * Set ISTATUS bit if it's expired.
-                * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
-                * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
-                * regardless of ENABLE bit for our implementation convenience.
-                */
-               if (ptimer->cnt_cval <= now)
-                       p->regval |= ARCH_TIMER_CTRL_IT_STAT;
-       }
+       if (p->is_write)
+               kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval);
+       else
+               p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
  
        return true;
  }
@@@ -883,156 -871,14 +872,154 @@@ static bool access_cntp_cval(struct kvm
                struct sys_reg_params *p,
                const struct sys_reg_desc *r)
  {
-       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
        if (p->is_write)
-               ptimer->cnt_cval = p->regval;
+               kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval);
        else
-               p->regval = ptimer->cnt_cval;
+               p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
  
        return true;
  }
  
 +/* Read a sanitised cpufeature ID register by sys_reg_desc */
 +static u64 read_id_reg(struct sys_reg_desc const *r, bool raz)
 +{
 +      u32 id = sys_reg((u32)r->Op0, (u32)r->Op1,
 +                       (u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
 +      u64 val = raz ? 0 : read_sanitised_ftr_reg(id);
 +
 +      if (id == SYS_ID_AA64PFR0_EL1) {
 +              if (val & (0xfUL << ID_AA64PFR0_SVE_SHIFT))
 +                      pr_err_once("kvm [%i]: SVE unsupported for guests, suppressing\n",
 +                                  task_pid_nr(current));
 +
 +              val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
 +      }
 +
 +      return val;
 +}
 +
 +/* cpufeature ID register access trap handlers */
 +
 +static bool __access_id_reg(struct kvm_vcpu *vcpu,
 +                          struct sys_reg_params *p,
 +                          const struct sys_reg_desc *r,
 +                          bool raz)
 +{
 +      if (p->is_write)
 +              return write_to_read_only(vcpu, p, r);
 +
 +      p->regval = read_id_reg(r, raz);
 +      return true;
 +}
 +
 +static bool access_id_reg(struct kvm_vcpu *vcpu,
 +                        struct sys_reg_params *p,
 +                        const struct sys_reg_desc *r)
 +{
 +      return __access_id_reg(vcpu, p, r, false);
 +}
 +
 +static bool access_raz_id_reg(struct kvm_vcpu *vcpu,
 +                            struct sys_reg_params *p,
 +                            const struct sys_reg_desc *r)
 +{
 +      return __access_id_reg(vcpu, p, r, true);
 +}
 +
 +static int reg_from_user(u64 *val, const void __user *uaddr, u64 id);
 +static int reg_to_user(void __user *uaddr, const u64 *val, u64 id);
 +static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
 +
 +/*
 + * cpufeature ID register user accessors
 + *
 + * For now, these registers are immutable for userspace, so no values
 + * are stored, and for set_id_reg() we don't allow the effective value
 + * to be changed.
 + */
 +static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
 +                      bool raz)
 +{
 +      const u64 id = sys_reg_to_index(rd);
 +      const u64 val = read_id_reg(rd, raz);
 +
 +      return reg_to_user(uaddr, &val, id);
 +}
 +
 +static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
 +                      bool raz)
 +{
 +      const u64 id = sys_reg_to_index(rd);
 +      int err;
 +      u64 val;
 +
 +      err = reg_from_user(&val, uaddr, id);
 +      if (err)
 +              return err;
 +
 +      /* This is what we mean by invariant: you can't change it. */
 +      if (val != read_id_reg(rd, raz))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
 +static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 +                    const struct kvm_one_reg *reg, void __user *uaddr)
 +{
 +      return __get_id_reg(rd, uaddr, false);
 +}
 +
 +static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 +                    const struct kvm_one_reg *reg, void __user *uaddr)
 +{
 +      return __set_id_reg(rd, uaddr, false);
 +}
 +
 +static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 +                        const struct kvm_one_reg *reg, void __user *uaddr)
 +{
 +      return __get_id_reg(rd, uaddr, true);
 +}
 +
 +static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 +                        const struct kvm_one_reg *reg, void __user *uaddr)
 +{
 +      return __set_id_reg(rd, uaddr, true);
 +}
 +
 +/* sys_reg_desc initialiser for known cpufeature ID registers */
 +#define ID_SANITISED(name) {                  \
 +      SYS_DESC(SYS_##name),                   \
 +      .access = access_id_reg,                \
 +      .get_user = get_id_reg,                 \
 +      .set_user = set_id_reg,                 \
 +}
 +
 +/*
 + * sys_reg_desc initialiser for architecturally unallocated cpufeature ID
 + * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
 + * (1 <= crm < 8, 0 <= Op2 < 8).
 + */
 +#define ID_UNALLOCATED(crm, op2) {                    \
 +      Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2),     \
 +      .access = access_raz_id_reg,                    \
 +      .get_user = get_raz_id_reg,                     \
 +      .set_user = set_raz_id_reg,                     \
 +}
 +
 +/*
 + * sys_reg_desc initialiser for known ID registers that we hide from guests.
 + * For now, these are exposed just like unallocated ID regs: they appear
 + * RAZ for the guest.
 + */
 +#define ID_HIDDEN(name) {                     \
 +      SYS_DESC(SYS_##name),                   \
 +      .access = access_raz_id_reg,            \
 +      .get_user = get_raz_id_reg,             \
 +      .set_user = set_raz_id_reg,             \
 +}
 +
  /*
   * Architected system registers.
   * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@@ -1085,84 -931,6 +1072,84 @@@ static const struct sys_reg_desc sys_re
        { SYS_DESC(SYS_DBGVCR32_EL2), NULL, reset_val, DBGVCR32_EL2, 0 },
  
        { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 },
 +
 +      /*
 +       * ID regs: all ID_SANITISED() entries here must have corresponding
 +       * entries in arm64_ftr_regs[].
 +       */
 +
 +      /* AArch64 mappings of the AArch32 ID registers */
 +      /* CRm=1 */
 +      ID_SANITISED(ID_PFR0_EL1),
 +      ID_SANITISED(ID_PFR1_EL1),
 +      ID_SANITISED(ID_DFR0_EL1),
 +      ID_HIDDEN(ID_AFR0_EL1),
 +      ID_SANITISED(ID_MMFR0_EL1),
 +      ID_SANITISED(ID_MMFR1_EL1),
 +      ID_SANITISED(ID_MMFR2_EL1),
 +      ID_SANITISED(ID_MMFR3_EL1),
 +
 +      /* CRm=2 */
 +      ID_SANITISED(ID_ISAR0_EL1),
 +      ID_SANITISED(ID_ISAR1_EL1),
 +      ID_SANITISED(ID_ISAR2_EL1),
 +      ID_SANITISED(ID_ISAR3_EL1),
 +      ID_SANITISED(ID_ISAR4_EL1),
 +      ID_SANITISED(ID_ISAR5_EL1),
 +      ID_SANITISED(ID_MMFR4_EL1),
 +      ID_UNALLOCATED(2,7),
 +
 +      /* CRm=3 */
 +      ID_SANITISED(MVFR0_EL1),
 +      ID_SANITISED(MVFR1_EL1),
 +      ID_SANITISED(MVFR2_EL1),
 +      ID_UNALLOCATED(3,3),
 +      ID_UNALLOCATED(3,4),
 +      ID_UNALLOCATED(3,5),
 +      ID_UNALLOCATED(3,6),
 +      ID_UNALLOCATED(3,7),
 +
 +      /* AArch64 ID registers */
 +      /* CRm=4 */
 +      ID_SANITISED(ID_AA64PFR0_EL1),
 +      ID_SANITISED(ID_AA64PFR1_EL1),
 +      ID_UNALLOCATED(4,2),
 +      ID_UNALLOCATED(4,3),
 +      ID_UNALLOCATED(4,4),
 +      ID_UNALLOCATED(4,5),
 +      ID_UNALLOCATED(4,6),
 +      ID_UNALLOCATED(4,7),
 +
 +      /* CRm=5 */
 +      ID_SANITISED(ID_AA64DFR0_EL1),
 +      ID_SANITISED(ID_AA64DFR1_EL1),
 +      ID_UNALLOCATED(5,2),
 +      ID_UNALLOCATED(5,3),
 +      ID_HIDDEN(ID_AA64AFR0_EL1),
 +      ID_HIDDEN(ID_AA64AFR1_EL1),
 +      ID_UNALLOCATED(5,6),
 +      ID_UNALLOCATED(5,7),
 +
 +      /* CRm=6 */
 +      ID_SANITISED(ID_AA64ISAR0_EL1),
 +      ID_SANITISED(ID_AA64ISAR1_EL1),
 +      ID_UNALLOCATED(6,2),
 +      ID_UNALLOCATED(6,3),
 +      ID_UNALLOCATED(6,4),
 +      ID_UNALLOCATED(6,5),
 +      ID_UNALLOCATED(6,6),
 +      ID_UNALLOCATED(6,7),
 +
 +      /* CRm=7 */
 +      ID_SANITISED(ID_AA64MMFR0_EL1),
 +      ID_SANITISED(ID_AA64MMFR1_EL1),
 +      ID_SANITISED(ID_AA64MMFR2_EL1),
 +      ID_UNALLOCATED(7,3),
 +      ID_UNALLOCATED(7,4),
 +      ID_UNALLOCATED(7,5),
 +      ID_UNALLOCATED(7,6),
 +      ID_UNALLOCATED(7,7),
 +
        { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
        { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
        { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
@@@ -2009,8 -1777,8 +1996,8 @@@ static const struct sys_reg_desc *index
        if (!r)
                r = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
  
 -      /* Not saved in the sys_reg array? */
 -      if (r && !r->reg)
 +      /* Not saved in the sys_reg array and not otherwise accessible? */
 +      if (r && !(r->reg || r->get_user))
                r = NULL;
  
        return r;
  FUNCTION_INVARIANT(midr_el1)
  FUNCTION_INVARIANT(ctr_el0)
  FUNCTION_INVARIANT(revidr_el1)
 -FUNCTION_INVARIANT(id_pfr0_el1)
 -FUNCTION_INVARIANT(id_pfr1_el1)
 -FUNCTION_INVARIANT(id_dfr0_el1)
 -FUNCTION_INVARIANT(id_afr0_el1)
 -FUNCTION_INVARIANT(id_mmfr0_el1)
 -FUNCTION_INVARIANT(id_mmfr1_el1)
 -FUNCTION_INVARIANT(id_mmfr2_el1)
 -FUNCTION_INVARIANT(id_mmfr3_el1)
 -FUNCTION_INVARIANT(id_isar0_el1)
 -FUNCTION_INVARIANT(id_isar1_el1)
 -FUNCTION_INVARIANT(id_isar2_el1)
 -FUNCTION_INVARIANT(id_isar3_el1)
 -FUNCTION_INVARIANT(id_isar4_el1)
 -FUNCTION_INVARIANT(id_isar5_el1)
  FUNCTION_INVARIANT(clidr_el1)
  FUNCTION_INVARIANT(aidr_el1)
  
  static struct sys_reg_desc invariant_sys_regs[] = {
        { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 },
        { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 },
 -      { SYS_DESC(SYS_ID_PFR0_EL1), NULL, get_id_pfr0_el1 },
 -      { SYS_DESC(SYS_ID_PFR1_EL1), NULL, get_id_pfr1_el1 },
 -      { SYS_DESC(SYS_ID_DFR0_EL1), NULL, get_id_dfr0_el1 },
 -      { SYS_DESC(SYS_ID_AFR0_EL1), NULL, get_id_afr0_el1 },
 -      { SYS_DESC(SYS_ID_MMFR0_EL1), NULL, get_id_mmfr0_el1 },
 -      { SYS_DESC(SYS_ID_MMFR1_EL1), NULL, get_id_mmfr1_el1 },
 -      { SYS_DESC(SYS_ID_MMFR2_EL1), NULL, get_id_mmfr2_el1 },
 -      { SYS_DESC(SYS_ID_MMFR3_EL1), NULL, get_id_mmfr3_el1 },
 -      { SYS_DESC(SYS_ID_ISAR0_EL1), NULL, get_id_isar0_el1 },
 -      { SYS_DESC(SYS_ID_ISAR1_EL1), NULL, get_id_isar1_el1 },
 -      { SYS_DESC(SYS_ID_ISAR2_EL1), NULL, get_id_isar2_el1 },
 -      { SYS_DESC(SYS_ID_ISAR3_EL1), NULL, get_id_isar3_el1 },
 -      { SYS_DESC(SYS_ID_ISAR4_EL1), NULL, get_id_isar4_el1 },
 -      { SYS_DESC(SYS_ID_ISAR5_EL1), NULL, get_id_isar5_el1 },
        { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 },
        { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 },
        { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 },
@@@ -2270,31 -2066,12 +2257,31 @@@ static bool copy_reg_to_user(const stru
        return true;
  }
  
 +static int walk_one_sys_reg(const struct sys_reg_desc *rd,
 +                          u64 __user **uind,
 +                          unsigned int *total)
 +{
 +      /*
 +       * Ignore registers we trap but don't save,
 +       * and for which no custom user accessor is provided.
 +       */
 +      if (!(rd->reg || rd->get_user))
 +              return 0;
 +
 +      if (!copy_reg_to_user(rd, uind))
 +              return -EFAULT;
 +
 +      (*total)++;
 +      return 0;
 +}
 +
  /* Assumed ordered tables, see kvm_sys_reg_table_init. */
  static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
  {
        const struct sys_reg_desc *i1, *i2, *end1, *end2;
        unsigned int total = 0;
        size_t num;
 +      int err;
  
        /* We check for duplicates here, to allow arch-specific overrides. */
        i1 = get_target_table(vcpu->arch.target, true, &num);
        while (i1 || i2) {
                int cmp = cmp_sys_reg(i1, i2);
                /* target-specific overrides generic entry. */
 -              if (cmp <= 0) {
 -                      /* Ignore registers we trap but don't save. */
 -                      if (i1->reg) {
 -                              if (!copy_reg_to_user(i1, &uind))
 -                                      return -EFAULT;
 -                              total++;
 -                      }
 -              } else {
 -                      /* Ignore registers we trap but don't save. */
 -                      if (i2->reg) {
 -                              if (!copy_reg_to_user(i2, &uind))
 -                                      return -EFAULT;
 -                              total++;
 -                      }
 -              }
 +              if (cmp <= 0)
 +                      err = walk_one_sys_reg(i1, &uind, &total);
 +              else
 +                      err = walk_one_sys_reg(i2, &uind, &total);
 +
 +              if (err)
 +                      return err;
  
                if (cmp <= 0 && ++i1 == end1)
                        i1 = NULL;
index 9aace433491adf30ebfaaad68fac9723f5585e5f,519fad556113196273fd38d13f4d18c7918d5ebd..6b958414b4e036ac1e4c97bceb61277ffab65e76
@@@ -185,7 -185,7 +185,7 @@@ int main(void
  #ifdef CONFIG_PPC_MM_SLICES
        OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
        OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
 -      DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
 +      OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit);
        DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
  #endif /* CONFIG_PPC_MM_SLICES */
  #endif
        OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first);
  #endif /* CONFIG_PPC_BOOK3E */
  
 -#ifdef CONFIG_PPC_STD_MMU_64
 +#ifdef CONFIG_PPC_BOOK3S_64
        OFFSET(PACASLBCACHE, paca_struct, slb_cache);
        OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr);
        OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp);
        OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx);
        OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count);
        OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx);
 -#endif /* CONFIG_PPC_STD_MMU_64 */
 +#endif /* CONFIG_PPC_BOOK3S_64 */
        OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
  #ifdef CONFIG_PPC_BOOK3S_64
        OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
        HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
        HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
        HSTATE_FIELD(HSTATE_PTID, ptid);
+       HSTATE_FIELD(HSTATE_TID, tid);
        HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
        HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
        HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
        OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
        OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
        OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+       OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+       OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
  
  #ifdef CONFIG_PPC_BOOK3S_64
index 40e5857c4b1c44312dc9c74eab27d3b835a8be16,18b16c3957fcd745c0f65e532f75b878196e280b..79ea3d9269dbf568904e504d78cc56850c77860d
@@@ -19,6 -19,7 +19,7 @@@
   */
  
  #include <linux/kvm_host.h>
+ #include <linux/kernel.h>
  #include <linux/err.h>
  #include <linux/slab.h>
  #include <linux/preempt.h>
@@@ -47,7 -48,6 +48,7 @@@
  
  #include <asm/reg.h>
  #include <asm/ppc-opcode.h>
 +#include <asm/asm-prototypes.h>
  #include <asm/disassemble.h>
  #include <asm/cputable.h>
  #include <asm/cacheflush.h>
@@@ -98,6 -98,10 +99,10 @@@ static int target_smt_mode
  module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
  MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
  
+ static bool indep_threads_mode = true;
+ module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
+ MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
  #ifdef CONFIG_KVM_XICS
  static struct kernel_param_ops module_param_ops = {
        .set = param_set_int,
@@@ -115,6 -119,7 +120,7 @@@ MODULE_PARM_DESC(h_ipi_redirect, "Redir
  
  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+ static void kvmppc_setup_partition_table(struct kvm *kvm);
  
  static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
                int *ip)
@@@ -1090,10 -1095,9 +1096,10 @@@ static int kvmppc_handle_exit_hv(struc
                vcpu->stat.ext_intr_exits++;
                r = RESUME_GUEST;
                break;
 -      /* HMI is hypervisor interrupt and host has handled it. Resume guest.*/
 +      /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
        case BOOK3S_INTERRUPT_HMI:
        case BOOK3S_INTERRUPT_PERFMON:
 +      case BOOK3S_INTERRUPT_SYSTEM_RESET:
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_MACHINE_CHECK:
@@@ -1734,9 -1738,9 +1740,9 @@@ static int kvmppc_set_one_reg_hv(struc
   * MMU mode (radix or HPT), unfortunately, but since we only support
   * HPT guests on a HPT host so far, that isn't an impediment yet.
   */
- static int threads_per_vcore(void)
+ static int threads_per_vcore(struct kvm *kvm)
  {
-       if (cpu_has_feature(CPU_FTR_ARCH_300))
+       if (kvm->arch.threads_indep)
                return 1;
        return threads_per_subcore;
  }
@@@ -1774,7 -1778,7 +1780,7 @@@ static struct debugfs_timings_element 
        {"cede",        offsetof(struct kvm_vcpu, arch.cede_time)},
  };
  
- #define N_TIMINGS     (sizeof(timings) / sizeof(timings[0]))
+ #define N_TIMINGS     (ARRAY_SIZE(timings))
  
  struct debugfs_timings_state {
        struct kvm_vcpu *vcpu;
@@@ -2228,11 -2232,10 +2234,10 @@@ static void kvmppc_start_thread(struct 
                kvmppc_ipi_thread(cpu);
  }
  
- static void kvmppc_wait_for_nap(void)
+ static void kvmppc_wait_for_nap(int n_threads)
  {
        int cpu = smp_processor_id();
        int i, loops;
-       int n_threads = threads_per_vcore();
  
        if (n_threads <= 1)
                return;
@@@ -2319,7 -2322,7 +2324,7 @@@ static void kvmppc_vcore_preempt(struc
  
        vc->vcore_state = VCORE_PREEMPT;
        vc->pcpu = smp_processor_id();
-       if (vc->num_threads < threads_per_vcore()) {
+       if (vc->num_threads < threads_per_vcore(vc->kvm)) {
                spin_lock(&lp->lock);
                list_add_tail(&vc->preempt_list, &lp->list);
                spin_unlock(&lp->lock);
@@@ -2357,7 -2360,7 +2362,7 @@@ struct core_info 
  
  /*
   * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
-  * respectively in 2-way micro-threading (split-core) mode.
+  * respectively in 2-way micro-threading (split-core) mode on POWER8.
   */
  static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
  
@@@ -2373,7 -2376,14 +2378,14 @@@ static void init_core_info(struct core_
  
  static bool subcore_config_ok(int n_subcores, int n_threads)
  {
-       /* Can only dynamically split if unsplit to begin with */
+       /*
+        * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core
+        * mode, with one thread per subcore.
+        */
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               return n_subcores <= 4 && n_threads == 1;
+       /* On POWER8, can only dynamically split if unsplit to begin with */
        if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
                return false;
        if (n_subcores > MAX_SUBCORES)
@@@ -2404,6 -2414,11 +2416,11 @@@ static bool can_dynamic_split(struct kv
        if (!cpu_has_feature(CPU_FTR_ARCH_207S))
                return false;
  
+       /* POWER9 currently requires all threads to be in the same MMU mode */
+       if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+           kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
+               return false;
        if (n_threads < cip->max_subcore_threads)
                n_threads = cip->max_subcore_threads;
        if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@@ -2606,9 -2621,6 +2623,9 @@@ static void set_irq_happened(int trap
        case BOOK3S_INTERRUPT_HMI:
                local_paca->irq_happened |= PACA_IRQ_HMI;
                break;
 +      case BOOK3S_INTERRUPT_SYSTEM_RESET:
 +              replay_system_reset();
 +              break;
        }
  }
  
@@@ -2632,6 -2644,8 +2649,8 @@@ static noinline void kvmppc_run_core(st
        int target_threads;
        int controlled_threads;
        int trap;
+       bool is_power8;
+       bool hpt_on_radix;
  
        /*
         * Remove from the list any threads that have a signal pending
         * the number of threads per subcore, except on POWER9,
         * where it's 1 because the threads are (mostly) independent.
         */
-       controlled_threads = threads_per_vcore();
+       controlled_threads = threads_per_vcore(vc->kvm);
  
        /*
         * Make sure we are running on primary threads, and that secondary
         * threads are offline.  Also check if the number of threads in this
         * guest are greater than the current system threads per guest.
+        * On POWER9, we need to be not in independent-threads mode if
+        * this is a HPT guest on a radix host.
         */
-       if ((controlled_threads > 1) &&
-           ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+       hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
+       if (((controlled_threads > 1) &&
+            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
+           (hpt_on_radix && vc->kvm->arch.threads_indep)) {
                for_each_runnable_thread(i, vcpu, vc) {
                        vcpu->arch.ret = -EBUSY;
                        kvmppc_remove_runnable(vc, vcpu);
         * Hard-disable interrupts, and check resched flag and signals.
         * If we need to reschedule or deliver a signal, clean up
         * and return without going into the guest(s).
-        * If the hpte_setup_done flag has been cleared, don't go into the
+        * If the mmu_ready flag has been cleared, don't go into the
         * guest because that means a HPT resize operation is in progress.
         */
        local_irq_disable();
        hard_irq_disable();
        if (lazy_irq_pending() || need_resched() ||
-           recheck_signals(&core_info) ||
-           (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) {
+           recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) {
                local_irq_enable();
                vc->vcore_state = VCORE_INACTIVE;
                /* Unlock all except the primary vcore */
        cmd_bit = stat_bit = 0;
        split = core_info.n_subcores;
        sip = NULL;
-       if (split > 1) {
-               /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
-               if (split == 2 && (dynamic_mt_modes & 2)) {
-                       cmd_bit = HID0_POWER8_1TO2LPAR;
-                       stat_bit = HID0_POWER8_2LPARMODE;
-               } else {
-                       split = 4;
-                       cmd_bit = HID0_POWER8_1TO4LPAR;
-                       stat_bit = HID0_POWER8_4LPARMODE;
-               }
-               subcore_size = MAX_SMT_THREADS / split;
+       is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
+               && !cpu_has_feature(CPU_FTR_ARCH_300);
+       if (split > 1 || hpt_on_radix) {
                sip = &split_info;
                memset(&split_info, 0, sizeof(split_info));
-               split_info.rpr = mfspr(SPRN_RPR);
-               split_info.pmmar = mfspr(SPRN_PMMAR);
-               split_info.ldbar = mfspr(SPRN_LDBAR);
-               split_info.subcore_size = subcore_size;
                for (sub = 0; sub < core_info.n_subcores; ++sub)
                        split_info.vc[sub] = core_info.vc[sub];
+               if (is_power8) {
+                       if (split == 2 && (dynamic_mt_modes & 2)) {
+                               cmd_bit = HID0_POWER8_1TO2LPAR;
+                               stat_bit = HID0_POWER8_2LPARMODE;
+                       } else {
+                               split = 4;
+                               cmd_bit = HID0_POWER8_1TO4LPAR;
+                               stat_bit = HID0_POWER8_4LPARMODE;
+                       }
+                       subcore_size = MAX_SMT_THREADS / split;
+                       split_info.rpr = mfspr(SPRN_RPR);
+                       split_info.pmmar = mfspr(SPRN_PMMAR);
+                       split_info.ldbar = mfspr(SPRN_LDBAR);
+                       split_info.subcore_size = subcore_size;
+               } else {
+                       split_info.subcore_size = 1;
+                       if (hpt_on_radix) {
+                               /* Use the split_info for LPCR/LPIDR changes */
+                               split_info.lpcr_req = vc->lpcr;
+                               split_info.lpidr_req = vc->kvm->arch.lpid;
+                               split_info.host_lpcr = vc->kvm->arch.host_lpcr;
+                               split_info.do_set = 1;
+                       }
+               }
                /* order writes to split_info before kvm_split_mode pointer */
                smp_wmb();
        }
-       for (thr = 0; thr < controlled_threads; ++thr)
+       for (thr = 0; thr < controlled_threads; ++thr) {
+               paca[pcpu + thr].kvm_hstate.tid = thr;
+               paca[pcpu + thr].kvm_hstate.napping = 0;
                paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+       }
  
-       /* Initiate micro-threading (split-core) if required */
+       /* Initiate micro-threading (split-core) on POWER8 if required */
        if (cmd_bit) {
                unsigned long hid0 = mfspr(SPRN_HID0);
  
        /* Start all the threads */
        active = 0;
        for (sub = 0; sub < core_info.n_subcores; ++sub) {
-               thr = subcore_thread_map[sub];
+               thr = is_power8 ? subcore_thread_map[sub] : sub;
                thr0_done = false;
                active |= 1 << thr;
                pvc = core_info.vc[sub];
         * the vcore pointer in the PACA of the secondaries.
         */
        smp_mb();
-       if (cmd_bit)
-               split_info.do_nap = 1;  /* ask secondaries to nap when done */
  
        /*
         * When doing micro-threading, poke the inactive threads as well.
         * This gets them to the nap instruction after kvm_do_nap,
         * which reduces the time taken to unsplit later.
+        * For POWER9 HPT guest on radix host, we need all the secondary
+        * threads woken up so they can do the LPCR/LPIDR change.
         */
-       if (split > 1)
+       if (cmd_bit || hpt_on_radix) {
+               split_info.do_nap = 1;  /* ask secondaries to nap when done */
                for (thr = 1; thr < threads_per_subcore; ++thr)
                        if (!(active & (1 << thr)))
                                kvmppc_ipi_thread(pcpu + thr);
+       }
  
        vc->vcore_state = VCORE_RUNNING;
        preempt_disable();
        vc->vcore_state = VCORE_EXITING;
  
        /* wait for secondary threads to finish writing their state to memory */
-       kvmppc_wait_for_nap();
+       kvmppc_wait_for_nap(controlled_threads);
  
        /* Return to whole-core mode if we split the core earlier */
-       if (split > 1) {
+       if (cmd_bit) {
                unsigned long hid0 = mfspr(SPRN_HID0);
                unsigned long loops = 0;
  
                        cpu_relax();
                        ++loops;
                }
-               split_info.do_nap = 0;
+       } else if (hpt_on_radix) {
+               /* Wait for all threads to have seen final sync */
+               for (thr = 1; thr < controlled_threads; ++thr) {
+                       while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
+                               HMT_low();
+                               barrier();
+                       }
+                       HMT_medium();
+               }
        }
+       split_info.do_nap = 0;
  
        kvmppc_set_host_core(pcpu);
  
        trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
  }
  
+ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
+ {
+       int r = 0;
+       struct kvm *kvm = vcpu->kvm;
+       mutex_lock(&kvm->lock);
+       if (!kvm->arch.mmu_ready) {
+               if (!kvm_is_radix(kvm))
+                       r = kvmppc_hv_setup_htab_rma(vcpu);
+               if (!r) {
+                       if (cpu_has_feature(CPU_FTR_ARCH_300))
+                               kvmppc_setup_partition_table(kvm);
+                       kvm->arch.mmu_ready = 1;
+               }
+       }
+       mutex_unlock(&kvm->lock);
+       return r;
+ }
  static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
  {
        int n_ceded, i, r;
  
        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
               !signal_pending(current)) {
-               /* See if the HPT and VRMA are ready to go */
-               if (!kvm_is_radix(vcpu->kvm) &&
-                   !vcpu->kvm->arch.hpte_setup_done) {
+               /* See if the MMU is ready to go */
+               if (!vcpu->kvm->arch.mmu_ready) {
                        spin_unlock(&vc->lock);
-                       r = kvmppc_hv_setup_htab_rma(vcpu);
+                       r = kvmhv_setup_mmu(vcpu);
                        spin_lock(&vc->lock);
                        if (r) {
                                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-                               kvm_run->fail_entry.hardware_entry_failure_reason = 0;
+                               kvm_run->fail_entry.
+                                       hardware_entry_failure_reason = 0;
                                vcpu->arch.ret = r;
                                break;
                        }
@@@ -3219,6 -3285,7 +3290,7 @@@ static int kvmppc_vcpu_run_hv(struct kv
        unsigned long ebb_regs[3] = {}; /* shut up GCC */
        unsigned long user_tar = 0;
        unsigned int user_vrsave;
+       struct kvm *kvm;
  
        if (!vcpu->arch.sane) {
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                return -EINTR;
        }
  
-       atomic_inc(&vcpu->kvm->arch.vcpus_running);
-       /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
+       kvm = vcpu->kvm;
+       atomic_inc(&kvm->arch.vcpus_running);
+       /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
        smp_mb();
  
        flush_all_to_thread(current);
                        trace_kvm_hcall_exit(vcpu, r);
                        kvmppc_core_prepare_to_enter(vcpu);
                } else if (r == RESUME_PAGE_FAULT) {
-                       srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+                       srcu_idx = srcu_read_lock(&kvm->srcu);
                        r = kvmppc_book3s_hv_page_fault(run, vcpu,
                                vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
-                       srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+                       srcu_read_unlock(&kvm->srcu, srcu_idx);
                } else if (r == RESUME_PASSTHROUGH) {
                        if (WARN_ON(xive_enabled()))
                                r = H_SUCCESS;
        mtspr(SPRN_VRSAVE, user_vrsave);
  
        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
-       atomic_dec(&vcpu->kvm->arch.vcpus_running);
+       atomic_dec(&kvm->arch.vcpus_running);
        return r;
  }
  
  static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
-                                    int linux_psize)
+                                    int shift, int sllp)
  {
-       struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
-       if (!def->shift)
-               return;
-       (*sps)->page_shift = def->shift;
-       (*sps)->slb_enc = def->sllp;
-       (*sps)->enc[0].page_shift = def->shift;
-       (*sps)->enc[0].pte_enc = def->penc[linux_psize];
+       (*sps)->page_shift = shift;
+       (*sps)->slb_enc = sllp;
+       (*sps)->enc[0].page_shift = shift;
+       (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
        /*
-        * Add 16MB MPSS support if host supports it
+        * Add 16MB MPSS support (may get filtered out by userspace)
         */
-       if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
-               (*sps)->enc[1].page_shift = 24;
-               (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+       if (shift != 24) {
+               int penc = kvmppc_pgsize_lp_encoding(shift, 24);
+               if (penc != -1) {
+                       (*sps)->enc[1].page_shift = 24;
+                       (*sps)->enc[1].pte_enc = penc;
+               }
        }
        (*sps)++;
  }
@@@ -3338,13 -3405,6 +3410,6 @@@ static int kvm_vm_ioctl_get_smmu_info_h
  {
        struct kvm_ppc_one_seg_page_size *sps;
  
-       /*
-        * Since we don't yet support HPT guests on a radix host,
-        * return an error if the host uses radix.
-        */
-       if (radix_enabled())
-               return -EINVAL;
        /*
         * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
         * POWER7 doesn't support keys for instruction accesses,
        info->data_keys = 32;
        info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
  
-       info->flags = KVM_PPC_PAGE_SIZES_REAL;
-       if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
-               info->flags |= KVM_PPC_1T_SEGMENTS;
-       info->slb_size = mmu_slb_size;
+       /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
+       info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
+       info->slb_size = 32;
  
        /* We only support these sizes for now, and no muti-size segments */
        sps = &info->sps[0];
-       kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
-       kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
-       kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
+       kvmppc_add_seg_page_size(&sps, 12, 0);
+       kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
+       kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
  
        return 0;
  }
@@@ -3377,7 -3436,7 +3441,7 @@@ static int kvm_vm_ioctl_get_dirty_log_h
        struct kvm_memory_slot *memslot;
        int i, r;
        unsigned long n;
-       unsigned long *buf;
+       unsigned long *buf, *p;
        struct kvm_vcpu *vcpu;
  
        mutex_lock(&kvm->slots_lock);
                goto out;
  
        /*
-        * Use second half of bitmap area because radix accumulates
-        * bits in the first half.
+        * Use second half of bitmap area because both HPT and radix
+        * accumulate bits in the first half.
         */
        n = kvm_dirty_bitmap_bytes(memslot);
        buf = memslot->dirty_bitmap + n / sizeof(long);
        if (r)
                goto out;
  
+       /*
+        * We accumulate dirty bits in the first half of the
+        * memslot's dirty_bitmap area, for when pages are paged
+        * out or modified by the host directly.  Pick up these
+        * bits and add them to the map.
+        */
+       p = memslot->dirty_bitmap;
+       for (i = 0; i < n / sizeof(long); ++i)
+               buf[i] |= xchg(&p[i], 0);
        /* Harvest dirty bits from VPA and DTL updates */
        /* Note: we never modify the SLB shadow buffer areas */
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@@ -3438,15 -3507,6 +3512,6 @@@ static void kvmppc_core_free_memslot_hv
  static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
                                         unsigned long npages)
  {
-       /*
-        * For now, if radix_enabled() then we only support radix guests,
-        * and in that case we don't need the rmap array.
-        */
-       if (radix_enabled()) {
-               slot->arch.rmap = NULL;
-               return 0;
-       }
        slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
        if (!slot->arch.rmap)
                return -ENOMEM;
@@@ -3467,8 -3527,6 +3532,6 @@@ static void kvmppc_core_commit_memory_r
                                const struct kvm_memory_slot *new)
  {
        unsigned long npages = mem->memory_size >> PAGE_SHIFT;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
  
        /*
         * If we are making a new memslot, it might make
         */
        if (npages)
                atomic64_inc(&kvm->arch.mmio_update);
-       if (npages && old->npages && !kvm_is_radix(kvm)) {
-               /*
-                * If modifying a memslot, reset all the rmap dirty bits.
-                * If this is a new memslot, we don't need to do anything
-                * since the rmap array starts out as all zeroes,
-                * i.e. no pages are dirty.
-                */
-               slots = kvm_memslots(kvm);
-               memslot = id_to_memslot(slots, mem->slot);
-               kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
-       }
  }
  
  /*
@@@ -3545,6 -3591,10 +3596,10 @@@ static void kvmppc_setup_partition_tabl
        mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
  }
  
+ /*
+  * Set up HPT (hashed page table) and RMA (real-mode area).
+  * Must be called with kvm->lock held.
+  */
  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
  {
        int err = 0;
        unsigned long psize, porder;
        int srcu_idx;
  
-       mutex_lock(&kvm->lock);
-       if (kvm->arch.hpte_setup_done)
-               goto out;       /* another vcpu beat us to it */
        /* Allocate hashed page table (if not done already) and reset it */
        if (!kvm->arch.hpt.virt) {
                int order = KVM_DEFAULT_HPT_ORDER;
                /* the -4 is to account for senc values starting at 0x10 */
                lpcr = senc << (LPCR_VRMASD_SH - 4);
                kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
-       } else {
-               kvmppc_setup_partition_table(kvm);
        }
  
-       /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
+       /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
        smp_wmb();
-       kvm->arch.hpte_setup_done = 1;
        err = 0;
   out_srcu:
        srcu_read_unlock(&kvm->srcu, srcu_idx);
   out:
-       mutex_unlock(&kvm->lock);
        return err;
  
   up_out:
        goto out_srcu;
  }
  
+ /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
+ {
+       kvmppc_free_radix(kvm);
+       kvmppc_update_lpcr(kvm, LPCR_VPM1,
+                          LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+       kvmppc_rmap_reset(kvm);
+       kvm->arch.radix = 0;
+       kvm->arch.process_table = 0;
+       return 0;
+ }
+ /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
+ {
+       int err;
+       err = kvmppc_init_vm_radix(kvm);
+       if (err)
+               return err;
+       kvmppc_free_hpt(&kvm->arch.hpt);
+       kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
+                          LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+       kvm->arch.radix = 1;
+       return 0;
+ }
  #ifdef CONFIG_KVM_XICS
  /*
   * Allocate a per-core structure for managing state about which cores are
@@@ -3780,10 -3850,11 +3855,11 @@@ static int kvmppc_core_init_vm_hv(struc
        }
  
        /*
-        * For now, if the host uses radix, the guest must be radix.
+        * If the host uses radix, the guest starts out as radix.
         */
        if (radix_enabled()) {
                kvm->arch.radix = 1;
+               kvm->arch.mmu_ready = 1;
                lpcr &= ~LPCR_VPM1;
                lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
                ret = kvmppc_init_vm_radix(kvm);
         * Work out how many sets the TLB has, for the use of
         * the TLB invalidation loop in book3s_hv_rmhandlers.S.
         */
-       if (kvm_is_radix(kvm))
+       if (radix_enabled())
                kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;     /* 128 */
        else if (cpu_has_feature(CPU_FTR_ARCH_300))
                kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;      /* 256 */
        /*
         * Track that we now have a HV mode VM active. This blocks secondary
         * CPU threads from coming online.
-        * On POWER9, we only need to do this for HPT guests on a radix
-        * host, which is not yet supported.
+        * On POWER9, we only need to do this if the "indep_threads_mode"
+        * module parameter has been set to N.
         */
-       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               kvm->arch.threads_indep = indep_threads_mode;
+       if (!kvm->arch.threads_indep)
                kvm_hv_vm_activated();
  
        /*
@@@ -3858,7 -3931,7 +3936,7 @@@ static void kvmppc_core_destroy_vm_hv(s
  {
        debugfs_remove_recursive(kvm->arch.debugfs_dir);
  
-       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+       if (!kvm->arch.threads_indep)
                kvm_hv_vm_deactivated();
  
        kvmppc_free_vcores(kvm);
@@@ -4193,6 -4266,7 +4271,7 @@@ static int kvmhv_configure_mmu(struct k
  {
        unsigned long lpcr;
        int radix;
+       int err;
  
        /* If not on a POWER9, reject it */
        if (!cpu_has_feature(CPU_FTR_ARCH_300))
        if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
                return -EINVAL;
  
-       /* We can't change a guest to/from radix yet */
-       radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
-       if (radix != kvm_is_radix(kvm))
-               return -EINVAL;
        /* GR (guest radix) bit in process_table field must match */
+       radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
        if (!!(cfg->process_table & PATB_GR) != radix)
                return -EINVAL;
  
        if ((cfg->process_table & PRTS_MASK) > 24)
                return -EINVAL;
  
+       /* We can change a guest to/from radix now, if the host is radix */
+       if (radix && !radix_enabled())
+               return -EINVAL;
        mutex_lock(&kvm->lock);
+       if (radix != kvm_is_radix(kvm)) {
+               if (kvm->arch.mmu_ready) {
+                       kvm->arch.mmu_ready = 0;
+                       /* order mmu_ready vs. vcpus_running */
+                       smp_mb();
+                       if (atomic_read(&kvm->arch.vcpus_running)) {
+                               kvm->arch.mmu_ready = 1;
+                               err = -EBUSY;
+                               goto out_unlock;
+                       }
+               }
+               if (radix)
+                       err = kvmppc_switch_mmu_to_radix(kvm);
+               else
+                       err = kvmppc_switch_mmu_to_hpt(kvm);
+               if (err)
+                       goto out_unlock;
+       }
        kvm->arch.process_table = cfg->process_table;
        kvmppc_setup_partition_table(kvm);
  
        lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
        kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
-       mutex_unlock(&kvm->lock);
+       err = 0;
  
-       return 0;
+  out_unlock:
+       mutex_unlock(&kvm->lock);
+       return err;
  }
  
  static struct kvmppc_ops kvm_ops_hv = {
@@@ -4365,4 -4460,3 +4465,3 @@@ module_exit(kvmppc_book3s_exit_hv)
  MODULE_LICENSE("GPL");
  MODULE_ALIAS_MISCDEV(KVM_MINOR);
  MODULE_ALIAS("devname:kvm");
index 329b2843fee2161093f13ef55db77977f6c5449a,c8aacced23fb3d2fccec48290527b556620688e7..fa557372d600a0283663635ff198895cfad91709
@@@ -213,6 -213,16 +213,16 @@@ static inline unsigned long pending_irq
               vcpu->arch.local_int.pending_irqs;
  }
  
+ static inline int isc_to_irq_type(unsigned long isc)
+ {
+       return IRQ_PEND_IO_ISC_0 + isc;
+ }
+ static inline int irq_type_to_isc(unsigned long irq_type)
+ {
+       return irq_type - IRQ_PEND_IO_ISC_0;
+ }
  static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
                                   unsigned long active_mask)
  {
  
        for (i = 0; i <= MAX_ISC; i++)
                if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i)))
-                       active_mask &= ~(1UL << (IRQ_PEND_IO_ISC_0 + i));
+                       active_mask &= ~(1UL << (isc_to_irq_type(i)));
  
        return active_mask;
  }
@@@ -901,7 -911,7 +911,7 @@@ static int __must_check __deliver_io(st
        fi = &vcpu->kvm->arch.float_int;
  
        spin_lock(&fi->lock);
-       isc_list = &fi->lists[irq_type - IRQ_PEND_IO_ISC_0];
+       isc_list = &fi->lists[irq_type_to_isc(irq_type)];
        inti = list_first_entry_or_null(isc_list,
                                        struct kvm_s390_interrupt_info,
                                        list);
@@@ -1074,6 -1084,12 +1084,12 @@@ void kvm_s390_vcpu_wakeup(struct kvm_vc
         * in kvm_vcpu_block without having the waitqueue set (polling)
         */
        vcpu->valid_wakeup = true;
+       /*
+        * This is mostly to document, that the read in swait_active could
+        * be moved before other stores, leading to subtle races.
+        * All current users do not store or use an atomic like update
+        */
+       smp_mb__after_atomic();
        if (swait_active(&vcpu->wq)) {
                /*
                 * The vcpu gave up the cpu voluntarily, mark it as a good
@@@ -1395,7 -1411,7 +1411,7 @@@ static struct kvm_s390_interrupt_info *
                list_del_init(&iter->list);
                fi->counters[FIRQ_CNTR_IO] -= 1;
                if (list_empty(isc_list))
-                       clear_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+                       clear_bit(isc_to_irq_type(isc), &fi->pending_irqs);
                spin_unlock(&fi->lock);
                return iter;
        }
@@@ -1522,7 -1538,7 +1538,7 @@@ static int __inject_io(struct kvm *kvm
        isc = int_word_to_isc(inti->io.io_int_word);
        list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
        list_add_tail(&inti->list, list);
-       set_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+       set_bit(isc_to_irq_type(isc), &fi->pending_irqs);
        spin_unlock(&fi->lock);
        return 0;
  }
@@@ -2175,6 -2191,8 +2191,8 @@@ static int clear_io_irq(struct kvm *kvm
                return -EINVAL;
        if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid)))
                return -EFAULT;
+       if (!schid)
+               return -EINVAL;
        kfree(kvm_s390_get_io_int(kvm, isc_mask, schid));
        /*
         * If userspace is conforming to the architecture, we can have at most
@@@ -2483,11 -2501,11 +2501,11 @@@ void kvm_s390_reinject_machine_check(st
  
        mci.val = mcck_info->mcic;
        if (mci.sr)
 -              cr14 |= MCCK_CR14_RECOVERY_SUB_MASK;
 +              cr14 |= CR14_RECOVERY_SUBMASK;
        if (mci.dg)
 -              cr14 |= MCCK_CR14_DEGRAD_SUB_MASK;
 +              cr14 |= CR14_DEGRADATION_SUBMASK;
        if (mci.w)
 -              cr14 |= MCCK_CR14_WARN_SUB_MASK;
 +              cr14 |= CR14_WARNING_SUBMASK;
  
        mchk = mci.ck ? &inti.mchk : &irq.u.mchk;
        mchk->cr14 = cr14;
diff --combined arch/s390/kvm/kvm-s390.c
index 4bc70afe0a104dcb2680cc7c8762983b952eb180,8f4b655f65d78f4d9bf0b57094f26a0b6c627d8b..98ad8b9e036093c8a784cfc0dfd3887e925c6357
@@@ -395,6 -395,7 +395,7 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_S390_USER_INSTR0:
        case KVM_CAP_S390_CMMA_MIGRATION:
        case KVM_CAP_S390_AIS:
+       case KVM_CAP_S390_AIS_MIGRATION:
                r = 1;
                break;
        case KVM_CAP_S390_MEM_OP:
@@@ -3281,7 -3282,7 +3282,7 @@@ static void sync_regs(struct kvm_vcpu *
         */
        if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
            test_kvm_facility(vcpu->kvm, 64) &&
 -          riccb->valid &&
 +          riccb->v &&
            !(vcpu->arch.sie_block->ecb3 & ECB3_RI)) {
                VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)");
                vcpu->arch.sie_block->ecb3 |= ECB3_RI;
index ee23a43386a2908c140e96b8b85e8b82bc4fbd27,ad38c5e918ecc97d02301f3b69e667569e960d27..034caa1a084e360ff74c77e84116bb0de6e28dcd
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  /******************************************************************************
   * x86_emulate.h
   *
@@@ -226,6 -225,8 +226,8 @@@ struct x86_emulate_ops 
  
        unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
        void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
+       int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
  };
  
  typedef u32 __attribute__((vector_size(16))) sse128_t;
index 9d7d856b2d8965f605412d2716717e7069cefdb9,7233445a20bdf2e7b3a4691b33f6c95316876242..1bfb99770c34197b6c0627897753d282b3e5c378
@@@ -1061,6 -1061,11 +1061,11 @@@ struct kvm_x86_ops 
        void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
  
        void (*setup_mce)(struct kvm_vcpu *vcpu);
+       int (*smi_allowed)(struct kvm_vcpu *vcpu);
+       int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+       int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
+       int (*enable_smi_window)(struct kvm_vcpu *vcpu);
  };
  
  struct kvm_arch_async_pf {
@@@ -1419,11 -1424,14 +1424,14 @@@ static inline void kvm_arch_vcpu_block_
  static inline int kvm_cpu_get_apicid(int mps_cpu)
  {
  #ifdef CONFIG_X86_LOCAL_APIC
 -      return __default_cpu_present_to_apicid(mps_cpu);
 +      return default_cpu_present_to_apicid(mps_cpu);
  #else
        WARN_ON_ONCE(1);
        return BAD_APICID;
  #endif
  }
  
+ #define put_smstate(type, buf, offset, val)                      \
+       *(type *)((buf) + (offset) - 0x7e00) = val
  #endif /* _ASM_X86_KVM_HOST_H */
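
The put_smstate() macro added above rebases an SMM state-save offset into the 512-byte buffer handled by the new pre_enter_smm/pre_leave_smm hooks; the subtraction suggests the buffer covers offsets 0x7e00-0x7fff of the state-save map. A stand-alone illustration, with a purely hypothetical field offset rather than one taken from the SDM:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same shape as the new macro: rebase a 0x7exx offset into a 512-byte buffer. */
#define put_smstate(type, buf, offset, val) \
	(*(type *)((buf) + (offset) - 0x7e00) = (val))

int main(void)
{
	_Alignas(8) char buf[512];

	memset(buf, 0, sizeof(buf));
	put_smstate(uint64_t, buf, 0x7f00, 0x1234u);	/* hypothetical field at 0x7f00 */
	printf("%#llx\n", (unsigned long long)*(uint64_t *)(buf + 0x100));
	return 0;
}
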
diff --combined arch/x86/kvm/lapic.c
index 36c90d631096d8c4eea10291d2958d6fb1393b4b,a778f1ae2927df889aa4416aba59ef676011ecbd..943acbf00c69d8f423289116bc363159144f883a
@@@ -1301,14 -1301,42 +1301,42 @@@ static void update_divide_count(struct 
                                   apic->divide_count);
  }
  
+ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+ {
+       /*
+        * Do not allow the guest to program periodic timers with a small
+        * interval, since the hrtimers are not throttled by the host
+        * scheduler.
+        */
+       if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+               s64 min_period = min_timer_period_us * 1000LL;
+               if (apic->lapic_timer.period < min_period) {
+                       pr_info_ratelimited(
+                           "kvm: vcpu %i: requested %lld ns "
+                           "lapic timer period limited to %lld ns\n",
+                           apic->vcpu->vcpu_id,
+                           apic->lapic_timer.period, min_period);
+                       apic->lapic_timer.period = min_period;
+               }
+       }
+ }
  static void apic_update_lvtt(struct kvm_lapic *apic)
  {
        u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
                        apic->lapic_timer.timer_mode_mask;
  
        if (apic->lapic_timer.timer_mode != timer_mode) {
+               if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+                               APIC_LVT_TIMER_TSCDEADLINE)) {
+                       hrtimer_cancel(&apic->lapic_timer.timer);
+                       kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+                       apic->lapic_timer.period = 0;
+                       apic->lapic_timer.tscdeadline = 0;
+               }
                apic->lapic_timer.timer_mode = timer_mode;
-               hrtimer_cancel(&apic->lapic_timer.timer);
+               limit_periodic_timer_frequency(apic);
        }
  }
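
limit_periodic_timer_frequency() above simply clamps short periodic intervals to a floor derived from min_timer_period_us, so a guest cannot flood the host with hrtimer callbacks. A user-space sketch of that clamp, with an assumed default for the module parameter:

#include <stdint.h>
#include <stdio.h>

static uint32_t min_timer_period_us = 500;	/* assumed default, it is a module parameter */

static int64_t clamp_period_ns(int64_t period_ns)
{
	int64_t min_period = (int64_t)min_timer_period_us * 1000;

	/* a zero period means "timer not armed" and is left alone */
	return period_ns && period_ns < min_period ? min_period : period_ns;
}

int main(void)
{
	printf("%lld\n", (long long)clamp_period_ns(100000));	/* raised to 500000 */
	return 0;
}
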
  
@@@ -1430,6 -1458,30 +1458,30 @@@ static void start_sw_period(struct kvm_
                HRTIMER_MODE_ABS_PINNED);
  }
  
+ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+ {
+       ktime_t now, remaining;
+       u64 ns_remaining_old, ns_remaining_new;
+       apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+               * APIC_BUS_CYCLE_NS * apic->divide_count;
+       limit_periodic_timer_frequency(apic);
+       now = ktime_get();
+       remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+       if (ktime_to_ns(remaining) < 0)
+               remaining = 0;
+       ns_remaining_old = ktime_to_ns(remaining);
+       ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+                                          apic->divide_count, old_divisor);
+       apic->lapic_timer.tscdeadline +=
+               nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+               nsec_to_cycles(apic->vcpu, ns_remaining_old);
+       apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+ }
  static bool set_target_expiration(struct kvm_lapic *apic)
  {
        ktime_t now;
        apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
                * APIC_BUS_CYCLE_NS * apic->divide_count;
  
-       if (!apic->lapic_timer.period)
+       if (!apic->lapic_timer.period) {
+               apic->lapic_timer.tscdeadline = 0;
                return false;
-       /*
-        * Do not allow the guest to program periodic timers with small
-        * interval, since the hrtimers are not throttled by the host
-        * scheduler.
-        */
-       if (apic_lvtt_period(apic)) {
-               s64 min_period = min_timer_period_us * 1000LL;
-               if (apic->lapic_timer.period < min_period) {
-                       pr_info_ratelimited(
-                           "kvm: vcpu %i: requested %lld ns "
-                           "lapic timer period limited to %lld ns\n",
-                           apic->vcpu->vcpu_id,
-                           apic->lapic_timer.period, min_period);
-                       apic->lapic_timer.period = min_period;
-               }
        }
  
+       limit_periodic_timer_frequency(apic);
        apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
                   PRIx64 ", "
                   "timer initial count 0x%x, period %lldns, "
@@@ -1515,6 -1553,9 +1553,9 @@@ static bool start_hv_timer(struct kvm_l
        if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
                return false;
  
+       if (!ktimer->tscdeadline)
+               return false;
        r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
        if (r < 0)
                return false;
@@@ -1738,13 -1779,21 +1779,21 @@@ int kvm_lapic_reg_write(struct kvm_lapi
                start_apic_timer(apic);
                break;
  
-       case APIC_TDCR:
+       case APIC_TDCR: {
+               uint32_t old_divisor = apic->divide_count;
                if (val & 4)
                        apic_debug("KVM_WRITE:TDCR %x\n", val);
                kvm_lapic_set_reg(apic, APIC_TDCR, val);
                update_divide_count(apic);
+               if (apic->divide_count != old_divisor &&
+                               apic->lapic_timer.period) {
+                       hrtimer_cancel(&apic->lapic_timer.timer);
+                       update_target_expiration(apic, old_divisor);
+                       restart_apic_timer(apic);
+               }
                break;
+       }
        case APIC_ESR:
                if (apic_x2apic_mode(apic) && val != 0) {
                        apic_debug("KVM_WRITE:ESR not zero %x\n", val);
@@@ -1992,11 -2041,6 +2041,11 @@@ void kvm_lapic_reset(struct kvm_vcpu *v
                                vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
        vcpu->arch.pv_eoi.msr_val = 0;
        apic_update_ppr(apic);
 +      if (vcpu->arch.apicv_active) {
 +              kvm_x86_ops->apicv_post_state_restore(vcpu);
 +              kvm_x86_ops->hwapic_irr_update(vcpu, -1);
 +              kvm_x86_ops->hwapic_isr_update(vcpu, -1);
 +      }
  
        vcpu->arch.apic_arb_prio = 0;
        vcpu->arch.apic_attention = 0;
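
update_target_expiration() above rescales the time left on the timer when the guest changes APIC_TDCR mid-countdown: the remaining nanoseconds are multiplied by the new divide count and divided by the old one. A stand-alone sketch of that arithmetic, using a 128-bit intermediate in the same spirit as mul_u64_u32_div():

#include <stdint.h>
#include <stdio.h>

static uint64_t rescale_remaining_ns(uint64_t ns_remaining, uint32_t new_div,
				     uint32_t old_div)
{
	return (unsigned __int128)ns_remaining * new_div / old_div;
}

int main(void)
{
	/* 1 ms left at divide-by-2; switching to divide-by-16 leaves 8 ms */
	printf("%llu\n",
	       (unsigned long long)rescale_remaining_ns(1000000, 16, 2));
	return 0;
}
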
diff --combined arch/x86/kvm/mmu.c
index a119b361b8b7a9c916e4df7ecd9e69622e64c1b3,0b481cc9c72533d5eb5f9f43856d05354f39f880..e5e66e5c664057bb5cc5ad2660008ccbf19b69e5
@@@ -150,6 -150,20 +150,20 @@@ module_param(dbg, bool, 0644)
  /* make pte_list_desc fit well in cache line */
  #define PTE_LIST_EXT 3
  
+ /*
+  * Return values of handle_mmio_page_fault and mmu.page_fault:
+  * RET_PF_RETRY: let CPU fault again on the address.
+  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+  *
+  * For handle_mmio_page_fault only:
+  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+  */
+ enum {
+       RET_PF_RETRY = 0,
+       RET_PF_EMULATE = 1,
+       RET_PF_INVALID = 2,
+ };
  struct pte_list_desc {
        u64 *sptes[PTE_LIST_EXT];
        struct pte_list_desc *more;
@@@ -443,7 -457,7 +457,7 @@@ static u64 __update_clear_spte_slow(u6
  
  static u64 __get_spte_lockless(u64 *sptep)
  {
 -      return ACCESS_ONCE(*sptep);
 +      return READ_ONCE(*sptep);
  }
  #else
  union split_spte {
@@@ -2424,7 -2438,7 +2438,7 @@@ static void __shadow_walk_next(struct k
  
  static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
  {
-       return __shadow_walk_next(iterator, *iterator->sptep);
+       __shadow_walk_next(iterator, *iterator->sptep);
  }
  
  static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
@@@ -2794,13 -2808,13 +2808,13 @@@ done
        return ret;
  }
  
- static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
-                        int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
-                        bool speculative, bool host_writable)
+ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+                       int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+                       bool speculative, bool host_writable)
  {
        int was_rmapped = 0;
        int rmap_count;
-       bool emulate = false;
+       int ret = RET_PF_RETRY;
  
        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
                 *sptep, write_fault, gfn);
        if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
              true, host_writable)) {
                if (write_fault)
-                       emulate = true;
+                       ret = RET_PF_EMULATE;
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
        }
  
        if (unlikely(is_mmio_spte(*sptep)))
-               emulate = true;
+               ret = RET_PF_EMULATE;
  
        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
        pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
  
        kvm_release_pfn_clean(pfn);
  
-       return emulate;
+       return ret;
  }
  
  static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@@ -2994,14 -3008,13 +3008,13 @@@ static int kvm_handle_bad_page(struct k
         * Do not cache the mmio info caused by writing the readonly gfn
         * into the spte otherwise read access on readonly gfn also can
         * caused mmio page fault and treat it as mmio access.
-        * Return 1 to tell kvm to emulate it.
         */
        if (pfn == KVM_PFN_ERR_RO_FAULT)
-               return 1;
+               return RET_PF_EMULATE;
  
        if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
-               return 0;
+               return RET_PF_RETRY;
        }
  
        return -EFAULT;
@@@ -3286,13 -3299,13 +3299,13 @@@ static int nonpaging_map(struct kvm_vcp
        }
  
        if (fast_page_fault(vcpu, v, level, error_code))
-               return 0;
+               return RET_PF_RETRY;
  
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
  
        if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
-               return 0;
+               return RET_PF_RETRY;
  
        if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
                return r;
  out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return 0;
+       return RET_PF_RETRY;
  }
  
  
@@@ -3659,54 -3672,38 +3672,38 @@@ exit
        return reserved;
  }
  
- /*
-  * Return values of handle_mmio_page_fault:
-  * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
-  *                    directly.
-  * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
-  *                    fault path update the mmio spte.
-  * RET_MMIO_PF_RETRY: let CPU fault again on the address.
-  * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
-  */
- enum {
-       RET_MMIO_PF_EMULATE = 1,
-       RET_MMIO_PF_INVALID = 2,
-       RET_MMIO_PF_RETRY = 0,
-       RET_MMIO_PF_BUG = -1
- };
  static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
  {
        u64 spte;
        bool reserved;
  
        if (mmio_info_in_cache(vcpu, addr, direct))
-               return RET_MMIO_PF_EMULATE;
+               return RET_PF_EMULATE;
  
        reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
        if (WARN_ON(reserved))
-               return RET_MMIO_PF_BUG;
+               return -EINVAL;
  
        if (is_mmio_spte(spte)) {
                gfn_t gfn = get_mmio_spte_gfn(spte);
                unsigned access = get_mmio_spte_access(spte);
  
                if (!check_mmio_spte(vcpu, spte))
-                       return RET_MMIO_PF_INVALID;
+                       return RET_PF_INVALID;
  
                if (direct)
                        addr = 0;
  
                trace_handle_mmio_page_fault(addr, gfn, access);
                vcpu_cache_mmio_info(vcpu, addr, gfn, access);
-               return RET_MMIO_PF_EMULATE;
+               return RET_PF_EMULATE;
        }
  
        /*
         * If the page table is zapped by other cpus, let CPU fault again on
         * the address.
         */
-       return RET_MMIO_PF_RETRY;
+       return RET_PF_RETRY;
  }
  EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
  
@@@ -3756,7 -3753,7 +3753,7 @@@ static int nonpaging_page_fault(struct 
        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
  
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
-               return 1;
+               return RET_PF_EMULATE;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@@ -3820,8 -3817,7 +3817,7 @@@ static bool try_async_pf(struct kvm_vcp
  }
  
  int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
-                               u64 fault_address, char *insn, int insn_len,
-                               bool need_unprotect)
+                               u64 fault_address, char *insn, int insn_len)
  {
        int r = 1;
  
        default:
                trace_kvm_page_fault(fault_address, error_code);
  
-               if (need_unprotect && kvm_event_needs_reinjection(vcpu))
+               if (kvm_event_needs_reinjection(vcpu))
                        kvm_mmu_unprotect_page_virt(vcpu, fault_address);
                r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
                                insn_len);
@@@ -3876,7 -3872,7 +3872,7 @@@ static int tdp_page_fault(struct kvm_vc
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
-               return 1;
+               return RET_PF_EMULATE;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
        }
  
        if (fast_page_fault(vcpu, gpa, level, error_code))
-               return 0;
+               return RET_PF_RETRY;
  
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
  
        if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
-               return 0;
+               return RET_PF_RETRY;
  
        if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
                return r;
  out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return 0;
+       return RET_PF_RETRY;
  }
  
  static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@@ -4819,7 -4815,7 +4815,7 @@@ static void kvm_mmu_pte_write(struct kv
         * If we don't have indirect shadow pages, it means no page is
         * write-protected, so we can exit simply.
         */
 -      if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
 +      if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
                return;
  
        remote_flush = local_flush = false;
@@@ -4918,25 -4914,25 +4914,25 @@@ int kvm_mmu_page_fault(struct kvm_vcpu 
                vcpu->arch.gpa_val = cr2;
        }
  
+       r = RET_PF_INVALID;
        if (unlikely(error_code & PFERR_RSVD_MASK)) {
                r = handle_mmio_page_fault(vcpu, cr2, direct);
-               if (r == RET_MMIO_PF_EMULATE) {
+               if (r == RET_PF_EMULATE) {
                        emulation_type = 0;
                        goto emulate;
                }
-               if (r == RET_MMIO_PF_RETRY)
-                       return 1;
-               if (r < 0)
-                       return r;
-               /* Must be RET_MMIO_PF_INVALID.  */
        }
  
-       r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
-                                     false);
+       if (r == RET_PF_INVALID) {
+               r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+                                             false);
+               WARN_ON(r == RET_PF_INVALID);
+       }
+       if (r == RET_PF_RETRY)
+               return 1;
        if (r < 0)
                return r;
-       if (!r)
-               return 1;
  
        /*
         * Before emulating the instruction, check if the error code
@@@ -4993,8 -4989,7 +4989,7 @@@ EXPORT_SYMBOL_GPL(kvm_disable_tdp)
  static void free_mmu_pages(struct kvm_vcpu *vcpu)
  {
        free_page((unsigned long)vcpu->arch.mmu.pae_root);
-       if (vcpu->arch.mmu.lm_root != NULL)
-               free_page((unsigned long)vcpu->arch.mmu.lm_root);
+       free_page((unsigned long)vcpu->arch.mmu.lm_root);
  }
  
  static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@@ -5464,10 -5459,8 +5459,8 @@@ static struct shrinker mmu_shrinker = 
  
  static void mmu_destroy_caches(void)
  {
-       if (pte_list_desc_cache)
-               kmem_cache_destroy(pte_list_desc_cache);
-       if (mmu_page_header_cache)
-               kmem_cache_destroy(mmu_page_header_cache);
+       kmem_cache_destroy(pte_list_desc_cache);
+       kmem_cache_destroy(mmu_page_header_cache);
  }
  
  int kvm_mmu_module_init(void)
  
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
                                            sizeof(struct pte_list_desc),
-                                           0, 0, NULL);
+                                           0, SLAB_ACCOUNT, NULL);
        if (!pte_list_desc_cache)
                goto nomem;
  
        mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
                                                  sizeof(struct kvm_mmu_page),
-                                                 0, 0, NULL);
+                                                 0, SLAB_ACCOUNT, NULL);
        if (!mmu_page_header_cache)
                goto nomem;
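
The RET_PF_* values introduced above let kvm_mmu_page_fault() treat MMIO and regular faults uniformly: RETRY re-enters the guest, EMULATE goes to the instruction emulator, INVALID falls through to the normal fault path, and negative values are propagated as errors. A compressed user-space sketch of that dispatch (not the kernel function):

#include <stdio.h>

enum { RET_PF_RETRY = 0, RET_PF_EMULATE = 1, RET_PF_INVALID = 2 };

static int handle_fault_result(int r)
{
	if (r == RET_PF_INVALID)
		r = RET_PF_EMULATE;	/* stand-in for the regular page-fault path */
	if (r == RET_PF_RETRY)
		return 1;		/* re-enter the guest */
	if (r < 0)
		return r;		/* hard error */
	return 0;			/* RET_PF_EMULATE: go emulate the instruction */
}

int main(void)
{
	printf("%d %d %d\n", handle_fault_result(RET_PF_RETRY),
	       handle_fault_result(RET_PF_EMULATE),
	       handle_fault_result(RET_PF_INVALID));
	return 0;
}
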
  
diff --combined arch/x86/kvm/mmu.h
index efc857615d8ea48305be79ba4205371f70837540,1092302aa16a5404a9f8f9fabf0a669faa109343..5b408c0ad6121223db18caf757e18f7bebd38300
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef __KVM_X86_MMU_H
  #define __KVM_X86_MMU_H
  
@@@ -66,8 -65,7 +66,7 @@@ void kvm_init_shadow_ept_mmu(struct kvm
                             bool accessed_dirty);
  bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
  int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
-                               u64 fault_address, char *insn, int insn_len,
-                               bool need_unprotect);
+                               u64 fault_address, char *insn, int insn_len);
  
  static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
  {
diff --combined arch/x86/kvm/vmx.c
index a6f4f095f8f4eb4aa5b4bae2a21dd66cccd824e7,e6c8ffa849683161734ee25af2e14752542a3500..7c3522a989d0b37713a802be82ee1f265fe64c9a
@@@ -486,6 -486,14 +486,14 @@@ struct nested_vmx 
        u64 nested_vmx_cr4_fixed1;
        u64 nested_vmx_vmcs_enum;
        u64 nested_vmx_vmfunc_controls;
+       /* SMM related state */
+       struct {
+               /* in VMX operation on SMM entry? */
+               bool vmxon;
+               /* in guest mode on SMM entry? */
+               bool guest_mode;
+       } smm;
  };
  
  #define POSTED_INTR_ON  0
@@@ -900,16 -908,13 +908,13 @@@ static bool nested_ept_ad_enabled(struc
  static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
  static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
  static bool vmx_xsaves_supported(void);
- static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
  static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
  static void vmx_get_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
  static bool guest_state_valid(struct kvm_vcpu *vcpu);
  static u32 vmx_segment_access_rights(struct kvm_segment *var);
- static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
  static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
- static int alloc_identity_pagetable(struct kvm *kvm);
  static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
  static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
  static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@@ -1598,18 -1603,15 +1603,15 @@@ static inline void vpid_sync_context(in
  
  static inline void ept_sync_global(void)
  {
-       if (cpu_has_vmx_invept_global())
-               __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+       __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
  }
  
  static inline void ept_sync_context(u64 eptp)
  {
-       if (enable_ept) {
-               if (cpu_has_vmx_invept_context())
-                       __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
-               else
-                       ept_sync_global();
-       }
+       if (cpu_has_vmx_invept_context())
+               __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+       else
+               ept_sync_global();
  }
  
  static __always_inline void vmcs_check16(unsigned long field)
@@@ -2831,8 -2833,7 +2833,7 @@@ static void nested_vmx_setup_ctls_msrs(
                                SECONDARY_EXEC_ENABLE_PML;
                        vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
                }
-       } else
-               vmx->nested.nested_vmx_ept_caps = 0;
+       }
  
        if (cpu_has_vmx_vmfunc()) {
                vmx->nested.nested_vmx_secondary_ctls_high |=
                 * Advertise EPTP switching unconditionally
                 * since we emulate it
                 */
-               vmx->nested.nested_vmx_vmfunc_controls =
-                       VMX_VMFUNC_EPTP_SWITCHING;
+               if (enable_ept)
+                       vmx->nested.nested_vmx_vmfunc_controls =
+                               VMX_VMFUNC_EPTP_SWITCHING;
        }
  
        /*
                        SECONDARY_EXEC_ENABLE_VPID;
                vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
                        VMX_VPID_EXTENT_SUPPORTED_MASK;
-       } else
-               vmx->nested.nested_vmx_vpid_caps = 0;
+       }
  
        if (enable_unrestricted_guest)
                vmx->nested.nested_vmx_secondary_ctls_high |=
@@@ -3544,7 -3545,8 +3545,8 @@@ static int hardware_enable(void
                wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
        }
        kvm_cpu_vmxon(phys_addr);
-       ept_sync_global();
+       if (enable_ept)
+               ept_sync_global();
  
        return 0;
  }
@@@ -3657,8 -3659,8 +3659,8 @@@ static __init int setup_vmcs_config(str
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                        SECONDARY_EXEC_SHADOW_VMCS |
                        SECONDARY_EXEC_XSAVES |
-                       SECONDARY_EXEC_RDSEED |
-                       SECONDARY_EXEC_RDRAND |
+                       SECONDARY_EXEC_RDSEED_EXITING |
+                       SECONDARY_EXEC_RDRAND_EXITING |
                        SECONDARY_EXEC_ENABLE_PML |
                        SECONDARY_EXEC_TSC_SCALING |
                        SECONDARY_EXEC_ENABLE_VMFUNC;
                                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
  
+       rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+               &vmx_capability.ept, &vmx_capability.vpid);
        if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
                /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
                   enabled */
                _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
                                             CPU_BASED_CR3_STORE_EXITING |
                                             CPU_BASED_INVLPG_EXITING);
-               rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
-                     vmx_capability.ept, vmx_capability.vpid);
+       } else if (vmx_capability.ept) {
+               vmx_capability.ept = 0;
+               pr_warn_once("EPT CAP should not be reported when the 1-setting of "
+                               "the enable EPT VM-execution control is not supported\n");
+       }
+       if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+               vmx_capability.vpid) {
+               vmx_capability.vpid = 0;
+               pr_warn_once("VPID CAP should not be reported when the 1-setting of "
+                               "the enable VPID VM-execution control is not supported\n");
        }
  
        min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
@@@ -4781,18 -4794,18 +4794,18 @@@ static int init_rmode_identity_map(stru
        kvm_pfn_t identity_map_pfn;
        u32 tmp;
  
-       if (!enable_ept)
-               return 0;
        /* Protect kvm->arch.ept_identity_pagetable_done. */
        mutex_lock(&kvm->slots_lock);
  
        if (likely(kvm->arch.ept_identity_pagetable_done))
                goto out2;
  
+       if (!kvm->arch.ept_identity_map_addr)
+               kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
        identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
  
-       r = alloc_identity_pagetable(kvm);
+       r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+                                   kvm->arch.ept_identity_map_addr, PAGE_SIZE);
        if (r < 0)
                goto out2;
  
@@@ -4864,20 -4877,6 +4877,6 @@@ out
        return r;
  }
  
- static int alloc_identity_pagetable(struct kvm *kvm)
- {
-       /* Called with kvm->slots_lock held. */
-       int r = 0;
-       BUG_ON(kvm->arch.ept_identity_pagetable_done);
-       r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
-                                   kvm->arch.ept_identity_map_addr, PAGE_SIZE);
-       return r;
- }
  static int allocate_vpid(void)
  {
        int vpid;
@@@ -5282,13 -5281,13 +5281,13 @@@ static u32 vmx_exec_control(struct vcpu
  static bool vmx_rdrand_supported(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
-               SECONDARY_EXEC_RDRAND;
+               SECONDARY_EXEC_RDRAND_EXITING;
  }
  
  static bool vmx_rdseed_supported(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
-               SECONDARY_EXEC_RDSEED;
+               SECONDARY_EXEC_RDSEED_EXITING;
  }
  
  static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        if (vmx_rdrand_supported()) {
                bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
                if (rdrand_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDRAND;
+                       exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
  
                if (nested) {
                        if (rdrand_enabled)
                                vmx->nested.nested_vmx_secondary_ctls_high |=
-                                       SECONDARY_EXEC_RDRAND;
+                                       SECONDARY_EXEC_RDRAND_EXITING;
                        else
                                vmx->nested.nested_vmx_secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_RDRAND;
+                                       ~SECONDARY_EXEC_RDRAND_EXITING;
                }
        }
  
        if (vmx_rdseed_supported()) {
                bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
                if (rdseed_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDSEED;
+                       exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
  
                if (nested) {
                        if (rdseed_enabled)
                                vmx->nested.nested_vmx_secondary_ctls_high |=
-                                       SECONDARY_EXEC_RDSEED;
+                                       SECONDARY_EXEC_RDSEED_EXITING;
                        else
                                vmx->nested.nested_vmx_secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_RDSEED;
+                                       ~SECONDARY_EXEC_RDSEED_EXITING;
                }
        }
  
@@@ -5426,7 -5425,7 +5425,7 @@@ static void ept_set_mmio_spte_mask(void
  /*
   * Sets up the vmcs for emulated real mode.
   */
- static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
  {
  #ifdef CONFIG_X86_64
        unsigned long a;
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
-       return 0;
  }
  
  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+       if (kvm_mpx_supported())
+               vmcs_write64(GUEST_BNDCFGS, 0);
  
        setup_msrs(vmx);
  
  
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
  
 -      if (kvm_vcpu_apicv_active(vcpu))
 -              memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
 -
        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
  
@@@ -5912,8 -5914,7 +5911,7 @@@ static int handle_exception(struct kvm_
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
                /* EPT won't cause page fault directly */
                WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
-               return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
-                               true);
+               return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
        }
  
        ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@@ -6747,16 -6748,14 +6745,14 @@@ static __init int hardware_setup(void
  
        if (!cpu_has_vmx_ept() ||
            !cpu_has_vmx_ept_4levels() ||
-           !cpu_has_vmx_ept_mt_wb()) {
+           !cpu_has_vmx_ept_mt_wb() ||
+           !cpu_has_vmx_invept_global())
                enable_ept = 0;
-               enable_unrestricted_guest = 0;
-               enable_ept_ad_bits = 0;
-       }
  
        if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
                enable_ept_ad_bits = 0;
  
-       if (!cpu_has_vmx_unrestricted_guest())
+       if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
                enable_unrestricted_guest = 0;
  
        if (!cpu_has_vmx_flexpriority())
        if (enable_ept && !cpu_has_vmx_ept_2m_page())
                kvm_disable_largepages();
  
-       if (!cpu_has_vmx_ple())
+       if (!cpu_has_vmx_ple()) {
                ple_gap = 0;
+               ple_window = 0;
+               ple_window_grow = 0;
+               ple_window_max = 0;
+               ple_window_shrink = 0;
+       }
  
        if (!cpu_has_vmx_apicv()) {
                enable_apicv = 0;
@@@ -8415,9 -8419,9 +8416,9 @@@ static bool nested_vmx_exit_reflected(s
        case EXIT_REASON_RDPMC:
                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
        case EXIT_REASON_RDRAND:
-               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
        case EXIT_REASON_RDSEED:
-               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
+               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
        case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@@ -9475,7 -9479,6 +9476,6 @@@ static void vmx_switch_vmcs(struct kvm_
        vmx->loaded_vmcs = vmcs;
        vmx_vcpu_put(vcpu);
        vmx_vcpu_load(vcpu, cpu);
-       vcpu->cpu = cpu;
        put_cpu();
  }
  
@@@ -9556,11 -9559,9 +9556,9 @@@ static struct kvm_vcpu *vmx_create_vcpu
        cpu = get_cpu();
        vmx_vcpu_load(&vmx->vcpu, cpu);
        vmx->vcpu.cpu = cpu;
-       err = vmx_vcpu_setup(vmx);
+       vmx_vcpu_setup(vmx);
        vmx_vcpu_put(&vmx->vcpu);
        put_cpu();
-       if (err)
-               goto free_vmcs;
        if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
                err = alloc_apic_access_page(kvm);
                if (err)
        }
  
        if (enable_ept) {
-               if (!kvm->arch.ept_identity_map_addr)
-                       kvm->arch.ept_identity_map_addr =
-                               VMX_EPT_IDENTITY_PAGETABLE_ADDR;
                err = init_rmode_identity_map(kvm);
                if (err)
                        goto free_vmcs;
@@@ -11325,6 -11323,8 +11320,8 @@@ static void load_vmcs12_host_state(stru
        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+       vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+       vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
  
        /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
        if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
@@@ -11421,8 -11421,11 +11418,11 @@@ static void nested_vmx_vmexit(struct kv
        leave_guest_mode(vcpu);
  
        if (likely(!vmx->fail)) {
-               prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
-                              exit_qualification);
+               if (exit_reason == -1)
+                       sync_vmcs12(vcpu, vmcs12);
+               else
+                       prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+                                      exit_qualification);
  
                if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
                                         vmcs12->vm_exit_msr_store_count))
         */
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
  
-       if (enable_shadow_vmcs)
+       if (enable_shadow_vmcs && exit_reason != -1)
                vmx->nested.sync_shadow_vmcs = true;
  
        /* in case we halted in L2 */
                                INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
                }
  
-               trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
-                                              vmcs12->exit_qualification,
-                                              vmcs12->idt_vectoring_info_field,
-                                              vmcs12->vm_exit_intr_info,
-                                              vmcs12->vm_exit_intr_error_code,
-                                              KVM_ISA_VMX);
+               if (exit_reason != -1)
+                       trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+                                                      vmcs12->exit_qualification,
+                                                      vmcs12->idt_vectoring_info_field,
+                                                      vmcs12->vm_exit_intr_info,
+                                                      vmcs12->vm_exit_intr_error_code,
+                                                      KVM_ISA_VMX);
  
                load_vmcs12_host_state(vcpu, vmcs12);
  
@@@ -11938,6 -11942,54 +11939,54 @@@ static void vmx_setup_mce(struct kvm_vc
                        ~FEATURE_CONTROL_LMCE;
  }
  
+ static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+ {
+       /* we need a nested vmexit to enter SMM, postpone if run is pending */
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return 0;
+       return 1;
+ }
+ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+       if (vmx->nested.smm.guest_mode)
+               nested_vmx_vmexit(vcpu, -1, 0, 0);
+       vmx->nested.smm.vmxon = vmx->nested.vmxon;
+       vmx->nested.vmxon = false;
+       return 0;
+ }
+ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int ret;
+       if (vmx->nested.smm.vmxon) {
+               vmx->nested.vmxon = true;
+               vmx->nested.smm.vmxon = false;
+       }
+       if (vmx->nested.smm.guest_mode) {
+               vcpu->arch.hflags &= ~HF_SMM_MASK;
+               ret = enter_vmx_non_root_mode(vcpu, false);
+               vcpu->arch.hflags |= HF_SMM_MASK;
+               if (ret)
+                       return ret;
+               vmx->nested.smm.guest_mode = false;
+       }
+       return 0;
+ }
+ static int enable_smi_window(struct kvm_vcpu *vcpu)
+ {
+       return 0;
+ }
  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
  #endif
  
        .setup_mce = vmx_setup_mce,
+       .smi_allowed = vmx_smi_allowed,
+       .pre_enter_smm = vmx_pre_enter_smm,
+       .pre_leave_smm = vmx_pre_leave_smm,
+       .enable_smi_window = enable_smi_window,
  };
  
  static int __init vmx_init(void)
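
The SMM hooks added above stash the nested vmxon and guest-mode flags on SMM entry (after forcing a vmexit from L2) and restore them on RSM. A toy model of that hand-off, with a made-up state structure instead of the real struct vcpu_vmx:

#include <stdbool.h>
#include <stdio.h>

struct vmx_state {
	bool vmxon, guest_mode;
	struct { bool vmxon, guest_mode; } smm;
};

static void pre_enter_smm(struct vmx_state *v)
{
	v->smm.guest_mode = v->guest_mode;
	v->guest_mode = false;			/* models the forced nested vmexit */
	v->smm.vmxon = v->vmxon;
	v->vmxon = false;
}

static void pre_leave_smm(struct vmx_state *v)
{
	if (v->smm.vmxon) {
		v->vmxon = true;
		v->smm.vmxon = false;
	}
	if (v->smm.guest_mode) {
		v->guest_mode = true;		/* models re-entering non-root mode */
		v->smm.guest_mode = false;
	}
}

int main(void)
{
	struct vmx_state v = { .vmxon = true, .guest_mode = true };

	pre_enter_smm(&v);
	pre_leave_smm(&v);
	printf("vmxon=%d guest_mode=%d\n", v.vmxon, v.guest_mode);	/* 1 1 */
	return 0;
}
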
index 538bfa8ba9b4a321beb3157ad74cb41723eb44a0,061476e92db724f9fcce3acd61483260f365a5fd..57cb2f00fc07ce7f5ffb526bd9bb03ed11287626
@@@ -77,7 -77,6 +77,7 @@@ static bool arch_timer_mem_use_virtual
  static bool arch_counter_suspend_stop;
  static bool vdso_default = true;
  
 +static cpumask_t evtstrm_available = CPU_MASK_NONE;
  static bool evtstrm_enable = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);
  
  static int __init early_evtstrm_cfg(char *buf)
@@@ -159,6 -158,7 +159,7 @@@ u32 arch_timer_reg_read(int access, enu
   * if we don't have the cp15 accessors we won't have a problem.
   */
  u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct;
+ EXPORT_SYMBOL_GPL(arch_timer_read_counter);
  
  static u64 arch_counter_read(struct clocksource *cs)
  {
@@@ -218,6 -218,11 +219,11 @@@ static u32 notrace fsl_a008585_read_cnt
        return __fsl_a008585_read_reg(cntv_tval_el0);
  }
  
+ static u64 notrace fsl_a008585_read_cntpct_el0(void)
+ {
+       return __fsl_a008585_read_reg(cntpct_el0);
+ }
  static u64 notrace fsl_a008585_read_cntvct_el0(void)
  {
        return __fsl_a008585_read_reg(cntvct_el0);
@@@ -259,6 -264,11 +265,11 @@@ static u32 notrace hisi_161010101_read_
        return __hisi_161010101_read_reg(cntv_tval_el0);
  }
  
+ static u64 notrace hisi_161010101_read_cntpct_el0(void)
+ {
+       return __hisi_161010101_read_reg(cntpct_el0);
+ }
  static u64 notrace hisi_161010101_read_cntvct_el0(void)
  {
        return __hisi_161010101_read_reg(cntvct_el0);
@@@ -289,6 -299,15 +300,15 @@@ static struct ate_acpi_oem_info hisi_16
  #endif
  
  #ifdef CONFIG_ARM64_ERRATUM_858921
+ static u64 notrace arm64_858921_read_cntpct_el0(void)
+ {
+       u64 old, new;
+       old = read_sysreg(cntpct_el0);
+       new = read_sysreg(cntpct_el0);
+       return (((old ^ new) >> 32) & 1) ? old : new;
+ }
  static u64 notrace arm64_858921_read_cntvct_el0(void)
  {
        u64 old, new;
  #endif
  
  #ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
 -DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *,
 -             timer_unstable_counter_workaround);
 +DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround);
  EXPORT_SYMBOL_GPL(timer_unstable_counter_workaround);
  
  DEFINE_STATIC_KEY_FALSE(arch_timer_read_ool_enabled);
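
The new *_read_cntpct_el0 accessors reuse the double-read trick already applied to the virtual counter. For ARM erratum 858921 the counter is read twice and the older value is kept whenever bit 32 differs between the two reads, i.e. when a low-word rollover happened in between. A stand-alone sketch of that selection:

#include <stdint.h>
#include <stdio.h>

static uint64_t erratum_858921_pick(uint64_t old, uint64_t new)
{
	return (((old ^ new) >> 32) & 1) ? old : new;
}

int main(void)
{
	/* rollover between the two reads: the earlier value is kept */
	printf("%llx\n", (unsigned long long)
	       erratum_858921_pick(0xffffffffULL, 0x100000000ULL));
	return 0;
}
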
@@@ -310,16 -330,19 +330,19 @@@ static void erratum_set_next_event_tval
                                                struct clock_event_device *clk)
  {
        unsigned long ctrl;
-       u64 cval = evt + arch_counter_get_cntvct();
+       u64 cval;
  
        ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);
        ctrl |= ARCH_TIMER_CTRL_ENABLE;
        ctrl &= ~ARCH_TIMER_CTRL_IT_MASK;
  
-       if (access == ARCH_TIMER_PHYS_ACCESS)
+       if (access == ARCH_TIMER_PHYS_ACCESS) {
+               cval = evt + arch_counter_get_cntpct();
                write_sysreg(cval, cntp_cval_el0);
-       else
+       } else {
+               cval = evt + arch_counter_get_cntvct();
                write_sysreg(cval, cntv_cval_el0);
+       }
  
        arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
  }
@@@ -346,6 -369,7 +369,7 @@@ static const struct arch_timer_erratum_
                .desc = "Freescale erratum a008585",
                .read_cntp_tval_el0 = fsl_a008585_read_cntp_tval_el0,
                .read_cntv_tval_el0 = fsl_a008585_read_cntv_tval_el0,
+               .read_cntpct_el0 = fsl_a008585_read_cntpct_el0,
                .read_cntvct_el0 = fsl_a008585_read_cntvct_el0,
                .set_next_event_phys = erratum_set_next_event_tval_phys,
                .set_next_event_virt = erratum_set_next_event_tval_virt,
                .desc = "HiSilicon erratum 161010101",
                .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0,
                .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0,
+               .read_cntpct_el0 = hisi_161010101_read_cntpct_el0,
                .read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
                .set_next_event_phys = erratum_set_next_event_tval_phys,
                .set_next_event_virt = erratum_set_next_event_tval_virt,
                .desc = "HiSilicon erratum 161010101",
                .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0,
                .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0,
+               .read_cntpct_el0 = hisi_161010101_read_cntpct_el0,
                .read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
                .set_next_event_phys = erratum_set_next_event_tval_phys,
                .set_next_event_virt = erratum_set_next_event_tval_virt,
                .match_type = ate_match_local_cap_id,
                .id = (void *)ARM64_WORKAROUND_858921,
                .desc = "ARM erratum 858921",
+               .read_cntpct_el0 = arm64_858921_read_cntpct_el0,
                .read_cntvct_el0 = arm64_858921_read_cntvct_el0,
        },
  #endif
@@@ -740,7 -767,6 +767,7 @@@ static void arch_timer_evtstrm_enable(i
  #ifdef CONFIG_COMPAT
        compat_elf_hwcap |= COMPAT_HWCAP_EVTSTRM;
  #endif
 +      cpumask_set_cpu(smp_processor_id(), &evtstrm_available);
  }
  
  static void arch_timer_configure_evtstream(void)
@@@ -865,16 -891,6 +892,16 @@@ u32 arch_timer_get_rate(void
        return arch_timer_rate;
  }
  
 +bool arch_timer_evtstrm_available(void)
 +{
 +      /*
 +       * We might get called from a preemptible context. This is fine
 +       * because availability of the event stream should always be the same
 +       * for a preemptible context and the context where we might resume a task.
 +       */
 +      return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available);
 +}
 +
  static u64 arch_counter_get_cntvct_mem(void)
  {
        u32 vct_lo, vct_hi, tmp_hi;
@@@ -901,7 -917,7 +928,7 @@@ static void __init arch_counter_registe
  
        /* Register the CP15 based counter if we have one */
        if (type & ARCH_TIMER_TYPE_CP15) {
-               if (IS_ENABLED(CONFIG_ARM64) ||
+               if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) ||
                    arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
                        arch_timer_read_counter = arch_counter_get_cntvct;
                else
@@@ -940,8 -956,6 +967,8 @@@ static int arch_timer_dying_cpu(unsigne
  {
        struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
  
 +      cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
 +
        arch_timer_stop(clk);
        return 0;
  }
@@@ -951,16 -965,10 +978,16 @@@ static DEFINE_PER_CPU(unsigned long, sa
  static int arch_timer_cpu_pm_notify(struct notifier_block *self,
                                    unsigned long action, void *hcpu)
  {
 -      if (action == CPU_PM_ENTER)
 +      if (action == CPU_PM_ENTER) {
                __this_cpu_write(saved_cntkctl, arch_timer_get_cntkctl());
 -      else if (action == CPU_PM_ENTER_FAILED || action == CPU_PM_EXIT)
 +
 +              cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
 +      } else if (action == CPU_PM_ENTER_FAILED || action == CPU_PM_EXIT) {
                arch_timer_set_cntkctl(__this_cpu_read(saved_cntkctl));
 +
 +              if (elf_hwcap & HWCAP_EVTSTRM)
 +                      cpumask_set_cpu(smp_processor_id(), &evtstrm_available);
 +      }
        return NOTIFY_OK;
  }
  
@@@ -1036,6 -1044,7 +1063,6 @@@ static int __init arch_timer_register(v
        if (err)
                goto out_unreg_notify;
  
 -
        /* Register and immediately configure the timer on the boot CPU */
        err = cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_STARTING,
                                "clockevents/arm/arch_timer:starting",
@@@ -1286,6 -1295,10 +1313,6 @@@ arch_timer_mem_find_best_frame(struct a
  
        iounmap(cntctlbase);
  
 -      if (!best_frame)
 -              pr_err("Unable to find a suitable frame in timer @ %pa\n",
 -                      &timer_mem->cntctlbase);
 -
        return best_frame;
  }
  
@@@ -1386,8 -1399,6 +1413,8 @@@ static int __init arch_timer_mem_of_ini
  
        frame = arch_timer_mem_find_best_frame(timer_mem);
        if (!frame) {
 +              pr_err("Unable to find a suitable frame in timer @ %pa\n",
 +                      &timer_mem->cntctlbase);
                ret = -EINVAL;
                goto out;
        }
@@@ -1436,7 -1447,7 +1463,7 @@@ arch_timer_mem_verify_cntfrq(struct arc
  static int __init arch_timer_mem_acpi_init(int platform_timer_count)
  {
        struct arch_timer_mem *timers, *timer;
 -      struct arch_timer_mem_frame *frame;
 +      struct arch_timer_mem_frame *frame, *best_frame = NULL;
        int timer_count, i, ret = 0;
  
        timers = kcalloc(platform_timer_count, sizeof(*timers),
        if (ret || !timer_count)
                goto out;
  
 -      for (i = 0; i < timer_count; i++) {
 -              ret = arch_timer_mem_verify_cntfrq(&timers[i]);
 -              if (ret) {
 -                      pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n");
 -                      goto out;
 -              }
 -      }
 -
        /*
         * While unlikely, it's theoretically possible that none of the frames
         * in a timer expose the combination of feature we want.
                timer = &timers[i];
  
                frame = arch_timer_mem_find_best_frame(timer);
 -              if (frame)
 -                      break;
 +              if (!best_frame)
 +                      best_frame = frame;
 +
 +              ret = arch_timer_mem_verify_cntfrq(timer);
 +              if (ret) {
 +                      pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n");
 +                      goto out;
 +              }
 +
 +              if (!best_frame) /* implies !frame */
 +                      /*
 +                       * Only complain about missing suitable frames if we
 +                       * haven't already found one in a previous iteration.
 +                       */
 +                      pr_err("Unable to find a suitable frame in timer @ %pa\n",
 +                              &timer->cntctlbase);
        }
  
 -      if (frame)
 -              ret = arch_timer_mem_frame_register(frame);
 +      if (best_frame)
 +              ret = arch_timer_mem_frame_register(best_frame);
  out:
        kfree(timers);
        return ret;
index b54b55597ffb9c8351ff98e97dc05535503ec493,854334a6f225488f2cd4dd89f47e61e1fd086a40..17221143f5057ce35f84f6021bf972f32b61bc48
@@@ -55,7 -55,6 +55,7 @@@ struct gic_chip_data 
        struct irq_domain       *domain;
        u64                     redist_stride;
        u32                     nr_redist_regions;
 +      bool                    has_rss;
        unsigned int            irq_nr;
        struct partition_desc   *ppi_descs[16];
  };
@@@ -64,9 -63,7 +64,9 @@@ static struct gic_chip_data gic_data __
  static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
  
  static struct gic_kvm_info gic_v3_kvm_info;
 +static DEFINE_PER_CPU(bool, has_rss);
  
 +#define MPIDR_RS(mpidr)                       (((mpidr) & 0xF0UL) >> 4)
  #define gic_data_rdist()              (this_cpu_ptr(gic_data.rdists.rdist))
  #define gic_data_rdist_rd_base()      (gic_data_rdist()->rd_base)
  #define gic_data_rdist_sgi_base()     (gic_data_rdist_rd_base() + SZ_64K)
@@@ -529,10 -526,6 +529,10 @@@ static void gic_update_vlpi_properties(
  
  static void gic_cpu_sys_reg_init(void)
  {
 +      int i, cpu = smp_processor_id();
 +      u64 mpidr = cpu_logical_map(cpu);
 +      u64 need_rss = MPIDR_RS(mpidr);
 +
        /*
         * Need to check that the SRE bit has actually been set. If
         * not, it means that SRE is disabled at EL2. We're going to
  
        /* ... and let's hit the road... */
        gic_write_grpen1(1);
 +
 +      /* Keep the RSS capability status in a per-CPU variable */
 +      per_cpu(has_rss, cpu) = !!(gic_read_ctlr() & ICC_CTLR_EL1_RSS);
 +
 +      /* Check that all CPUs are capable of sending SGIs to other CPUs */
 +      for_each_online_cpu(i) {
 +              bool have_rss = per_cpu(has_rss, i) && per_cpu(has_rss, cpu);
 +
 +              need_rss |= MPIDR_RS(cpu_logical_map(i));
 +              if (need_rss && (!have_rss))
 +                      pr_crit("CPU%d (%lx) can't SGI CPU%d (%lx), no RSS\n",
 +                              cpu, (unsigned long)mpidr,
 +                              i, (unsigned long)cpu_logical_map(i));
 +      }
 +
 +      /*
 +       * The GIC spec says that when ICC_CTLR_EL1.RSS==1 and GICD_TYPER.RSS==0,
 +       * writing the ICC_ASGI1R_EL1 register with RS != 0 is a CONSTRAINED
 +       * UNPREDICTABLE choice of:
 +       *   - The write is ignored.
 +       *   - The RS field is treated as 0.
 +       */
 +      if (need_rss && (!gic_data.has_rss))
 +              pr_crit_once("RSS is required but GICD doesn't support it\n");
  }
  
  static int gic_dist_supports_lpis(void)
@@@ -622,9 -591,6 +622,9 @@@ static void gic_cpu_init(void
  
  #ifdef CONFIG_SMP
  
 +#define MPIDR_TO_SGI_RS(mpidr)        (MPIDR_RS(mpidr) << ICC_SGI1R_RS_SHIFT)
 +#define MPIDR_TO_SGI_CLUSTER_ID(mpidr)        ((mpidr) & ~0xFUL)
 +
  static int gic_starting_cpu(unsigned int cpu)
  {
        gic_cpu_init();
@@@ -639,6 -605,13 +639,6 @@@ static u16 gic_compute_target_list(int 
        u16 tlist = 0;
  
        while (cpu < nr_cpu_ids) {
 -              /*
 -               * If we ever get a cluster of more than 16 CPUs, just
 -               * scream and skip that CPU.
 -               */
 -              if (WARN_ON((mpidr & 0xff) >= 16))
 -                      goto out;
 -
                tlist |= 1 << (mpidr & 0xf);
  
                next_cpu = cpumask_next(cpu, mask);
  
                mpidr = cpu_logical_map(cpu);
  
 -              if (cluster_id != (mpidr & ~0xffUL)) {
 +              if (cluster_id != MPIDR_TO_SGI_CLUSTER_ID(mpidr)) {
                        cpu--;
                        goto out;
                }
@@@ -670,7 -643,6 +670,7 @@@ static void gic_send_sgi(u64 cluster_id
               MPIDR_TO_SGI_AFFINITY(cluster_id, 2)     |
               irq << ICC_SGI1R_SGI_ID_SHIFT            |
               MPIDR_TO_SGI_AFFINITY(cluster_id, 1)     |
 +             MPIDR_TO_SGI_RS(cluster_id)              |
               tlist << ICC_SGI1R_TARGET_LIST_SHIFT);
  
        pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val);
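
With the RSS changes above, Aff0 is split in two: the low four bits select a bit in the 16-bit SGI target list, while the next four bits form the Range Selector that MPIDR_RS() extracts and gic_send_sgi() now ORs into ICC_SGI1R_EL1. A small sketch of that split, for a hypothetical MPIDR value:

#include <stdint.h>
#include <stdio.h>

#define MPIDR_RS(mpidr)		(((mpidr) & 0xF0UL) >> 4)

int main(void)
{
	unsigned long mpidr = 0x25;		/* hypothetical Aff0: range 2, CPU 5 */
	uint16_t tlist = 1u << (mpidr & 0xf);

	printf("rs=%lu tlist=%#x\n", MPIDR_RS(mpidr), (unsigned)tlist);	/* rs=2 tlist=0x20 */
	return 0;
}
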
@@@ -691,7 -663,7 +691,7 @@@ static void gic_raise_softirq(const str
        smp_wmb();
  
        for_each_cpu(cpu, mask) {
 -              unsigned long cluster_id = cpu_logical_map(cpu) & ~0xffUL;
 +              u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(cpu_logical_map(cpu));
                u16 tlist;
  
                tlist = gic_compute_target_list(&cpu, mask, cluster_id);
@@@ -1035,10 -1007,6 +1035,10 @@@ static int __init gic_init_bases(void _
                goto out_free;
        }
  
 +      gic_data.has_rss = !!(typer & GICD_TYPER_RSS);
 +      pr_info("Distributor has %sRange Selector support\n",
 +              gic_data.has_rss ? "" : "no ");
 +
        set_handle_irq(gic_handle_irq);
  
        gic_update_vlpi_properties();
@@@ -1260,7 -1228,9 +1260,9 @@@ static int __init gic_of_init(struct de
                goto out_unmap_rdist;
  
        gic_populate_ppi_partitions(node);
-       gic_of_setup_kvm_info(node);
+       if (static_key_true(&supports_deactivate))
+               gic_of_setup_kvm_info(node);
        return 0;
  
  out_unmap_rdist:
@@@ -1549,7 -1519,9 +1551,9 @@@ gic_acpi_init(struct acpi_subtable_head
                goto out_fwhandle_free;
  
        acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
-       gic_acpi_setup_kvm_info();
+       if (static_key_true(&supports_deactivate))
+               gic_acpi_setup_kvm_info();
  
        return 0;
  
index f641e8e2c78d1e7af926b808fa46d00285c86fff,cd9371b749c2e67902ec7938230bcf70fb7ab075..121af5cf688f957fa209ef29aa9656a79674c5ef
@@@ -1256,19 -1256,6 +1256,19 @@@ static void gic_teardown(struct gic_chi
  
  #ifdef CONFIG_OF
  static int gic_cnt __initdata;
 +static bool gicv2_force_probe;
 +
 +static int __init gicv2_force_probe_cfg(char *buf)
 +{
 +      return strtobool(buf, &gicv2_force_probe);
 +}
 +early_param("irqchip.gicv2_force_probe", gicv2_force_probe_cfg);
 +
 +static bool gic_check_gicv2(void __iomem *base)
 +{
 +      u32 val = readl_relaxed(base + GIC_CPU_IDENT);
 +      return (val & 0xff0fff) == 0x02043B;
 +}
  
  static bool gic_check_eoimode(struct device_node *node, void __iomem **base)
  {
  
        if (!is_hyp_mode_available())
                return false;
 -      if (resource_size(&cpuif_res) < SZ_8K)
 -              return false;
 -      if (resource_size(&cpuif_res) == SZ_128K) {
 -              u32 val_low, val_high;
 +      if (resource_size(&cpuif_res) < SZ_8K) {
 +              void __iomem *alt;
 +              /*
 +               * Check for a stupid firmware that only exposes the
 +               * first page of a GICv2.
 +               */
 +              if (!gic_check_gicv2(*base))
 +                      return false;
 +
 +              if (!gicv2_force_probe) {
 +                      pr_warn("GIC: GICv2 detected, but range too small and irqchip.gicv2_force_probe not set\n");
 +                      return false;
 +              }
 +
 +              alt = ioremap(cpuif_res.start, SZ_8K);
 +              if (!alt)
 +                      return false;
 +              if (!gic_check_gicv2(alt + SZ_4K)) {
 +                      /*
 +                       * The first page was that of a GICv2, and
 +                       * the second was *something*. Let's trust it
 +                       * to be a GICv2, and update the mapping.
 +                       */
 +                      pr_warn("GIC: GICv2 at %pa, but range is too small (broken DT?), assuming 8kB\n",
 +                              &cpuif_res.start);
 +                      iounmap(*base);
 +                      *base = alt;
 +                      return true;
 +              }
  
                /*
 -               * Verify that we have the first 4kB of a GIC400
 +               * We detected *two* initial GICv2 pages in a
 +               * row. Could be a GICv2 aliased over two 64kB
 +               * pages. Update the resource, map the iospace, and
 +               * pray.
 +               */
 +              iounmap(alt);
 +              alt = ioremap(cpuif_res.start, SZ_128K);
 +              if (!alt)
 +                      return false;
 +              pr_warn("GIC: Aliased GICv2 at %pa, trying to find the canonical range over 128kB\n",
 +                      &cpuif_res.start);
 +              cpuif_res.end = cpuif_res.start + SZ_128K -1;
 +              iounmap(*base);
 +              *base = alt;
 +      }
 +      if (resource_size(&cpuif_res) == SZ_128K) {
 +              /*
 +               * Verify that we have the first 4kB of a GICv2
                 * aliased over the first 64kB by checking the
                 * GICC_IIDR register on both ends.
                 */
 -              val_low = readl_relaxed(*base + GIC_CPU_IDENT);
 -              val_high = readl_relaxed(*base + GIC_CPU_IDENT + 0xf000);
 -              if ((val_low & 0xffff0fff) != 0x0202043B ||
 -                  val_low != val_high)
 +              if (!gic_check_gicv2(*base) ||
 +                  !gic_check_gicv2(*base + 0xf000))
                        return false;
  
                /*
@@@ -1420,7 -1367,8 +1420,8 @@@ static void __init gic_of_setup_kvm_inf
        if (ret)
                return;
  
-       gic_set_kvm_info(&gic_v2_kvm_info);
+       if (static_key_true(&supports_deactivate))
+               gic_set_kvm_info(&gic_v2_kvm_info);
  }
  
  int __init
@@@ -1652,7 -1600,8 +1653,8 @@@ static int __init gic_v2_acpi_init(stru
        if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
                gicv2m_init(NULL, gic_data[0].domain);
  
-       gic_acpi_setup_kvm_info();
+       if (static_key_true(&supports_deactivate))
+               gic_acpi_setup_kvm_info();
  
        return 0;
  }
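
The irq-gic.c hunks work around firmware that describes too small a GICv2 CPU interface region, gated by the new irqchip.gicv2_force_probe command-line parameter. The identification test they rely on is a masked compare of GICC_IIDR; below is a minimal restatement of that check with the mask and value copied from the hunk, applied to a raw register value rather than an ioremapped region.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Same test as gic_check_gicv2() above: keep the Implementer field, the
 * architecture-version nibble and the low nibble of the ProductID of
 * GICC_IIDR, ignore the revision and the remaining ProductID bits, so the
 * check is not tied to one exact product revision.
 */
static bool looks_like_gicv2(uint32_t iidr)
{
	return (iidr & 0x00ff0fff) == 0x0002043B;
}

int main(void)
{
	/* 0x0202043B is the GICC_IIDR value the removed GIC-400-only check expected. */
	printf("0x0202043B -> %s\n", looks_like_gicv2(0x0202043B) ? "GICv2" : "not GICv2");
	printf("0x00000000 -> %s\n", looks_like_gicv2(0x00000000) ? "GICv2" : "not GICv2");
	return 0;
}
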
diff --combined include/uapi/linux/kvm.h
index 7e99999d6236fa2940fa2b565442e8b1b1331407,b605956968368ac86de08e7c0b4753d3940066f1..282d7613fce8788bc466913d7fcacc960dd1c6de
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
  #ifndef __LINUX_KVM_H
  #define __LINUX_KVM_H
  
@@@ -931,6 -930,7 +931,7 @@@ struct kvm_ppc_resize_hpt 
  #define KVM_CAP_PPC_SMT_POSSIBLE 147
  #define KVM_CAP_HYPERV_SYNIC2 148
  #define KVM_CAP_HYPERV_VP_INDEX 149
+ #define KVM_CAP_S390_AIS_MIGRATION 150
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
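The uapi hunk only adds a capability number, KVM_CAP_S390_AIS_MIGRATION (150). Userspace discovers it the same way as any other capability, via KVM_CHECK_EXTENSION on the /dev/kvm file descriptor; a minimal sketch, with a fallback define for builds against older headers:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#ifndef KVM_CAP_S390_AIS_MIGRATION
#define KVM_CAP_S390_AIS_MIGRATION 150	/* value from the hunk above */
#endif

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int ret;

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* > 0 means the running kernel can migrate s390 AIS state. */
	ret = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_S390_AIS_MIGRATION);
	printf("KVM_CAP_S390_AIS_MIGRATION: %s\n",
	       ret > 0 ? "supported" : "not supported");
	return 0;
}
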
diff --combined virt/kvm/arm/arm.c
index 4cf9b91e6c9b28ab0d2c92f23701daa7f9ad24ce,bc126fb99a3d2ce1c4bc55781c1db51f4aada264..772bf74ac2e9ae8380e0ba2b87b385883eca6d4c
@@@ -307,8 -307,7 +307,7 @@@ void kvm_arch_vcpu_destroy(struct kvm_v
  
  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
  {
-       return kvm_timer_should_fire(vcpu_vtimer(vcpu)) ||
-              kvm_timer_should_fire(vcpu_ptimer(vcpu));
+       return kvm_timer_is_pending(vcpu);
  }
  
  void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
@@@ -354,18 -353,18 +353,18 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
        vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
  
        kvm_arm_set_running_vcpu(vcpu);
        kvm_vgic_load(vcpu);
+       kvm_timer_vcpu_load(vcpu);
  }
  
  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
  {
+       kvm_timer_vcpu_put(vcpu);
        kvm_vgic_put(vcpu);
  
        vcpu->cpu = -1;
  
        kvm_arm_set_running_vcpu(NULL);
-       kvm_timer_vcpu_put(vcpu);
  }
  
  static void vcpu_power_off(struct kvm_vcpu *vcpu)
@@@ -652,16 -651,12 +651,15 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                 */
                preempt_disable();
  
 +              /* Flush FP/SIMD state that can't survive guest entry/exit */
 +              kvm_fpsimd_flush_cpu_state();
 +
                kvm_pmu_flush_hwstate(vcpu);
  
-               kvm_timer_flush_hwstate(vcpu);
-               kvm_vgic_flush_hwstate(vcpu);
                local_irq_disable();
  
+               kvm_vgic_flush_hwstate(vcpu);
                /*
                 * If we have a signal pending, or need to notify a userspace
                 * irqchip about timer or PMU level changes, then we exit (and
                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
                    kvm_request_pending(vcpu)) {
                        vcpu->mode = OUTSIDE_GUEST_MODE;
-                       local_irq_enable();
                        kvm_pmu_sync_hwstate(vcpu);
                        kvm_timer_sync_hwstate(vcpu);
                        kvm_vgic_sync_hwstate(vcpu);
+                       local_irq_enable();
                        preempt_enable();
                        continue;
                }
  
                kvm_arm_clear_debug(vcpu);
  
+               /*
+                * We must sync the PMU state before the vgic state so
+                * that the vgic can properly sample the updated state of the
+                * interrupt line.
+                */
+               kvm_pmu_sync_hwstate(vcpu);
+               /*
+                * Sync the vgic state before syncing the timer state because
+                * the timer code needs to know if the virtual timer
+                * interrupts are active.
+                */
+               kvm_vgic_sync_hwstate(vcpu);
+               /*
+                * Sync the timer hardware state before enabling interrupts as
+                * we don't want vtimer interrupts to race with syncing the
+                * timer virtual interrupt state.
+                */
+               kvm_timer_sync_hwstate(vcpu);
                /*
                 * We may have taken a host interrupt in HYP mode (ie
                 * while executing the guest). This interrupt is still
                guest_exit();
                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
  
-               /*
-                * We must sync the PMU and timer state before the vgic state so
-                * that the vgic can properly sample the updated state of the
-                * interrupt line.
-                */
-               kvm_pmu_sync_hwstate(vcpu);
-               kvm_timer_sync_hwstate(vcpu);
-               kvm_vgic_sync_hwstate(vcpu);
                preempt_enable();
  
                ret = handle_exit(vcpu, run, ret);
@@@ -1329,12 -1335,21 +1338,12 @@@ static void teardown_hyp_mode(void
  {
        int cpu;
  
 -      if (is_kernel_in_hyp_mode())
 -              return;
 -
        free_hyp_pgds();
        for_each_possible_cpu(cpu)
                free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
        hyp_cpu_pm_exit();
  }
  
 -static int init_vhe_mode(void)
 -{
 -      kvm_info("VHE mode initialized successfully\n");
 -      return 0;
 -}
 -
  /**
   * Inits Hyp-mode on all online CPUs
   */
@@@ -1415,6 -1430,8 +1424,6 @@@ static int init_hyp_mode(void
                }
        }
  
 -      kvm_info("Hyp mode initialized successfully\n");
 -
        return 0;
  
  out_err:
@@@ -1448,7 -1465,6 +1457,7 @@@ int kvm_arch_init(void *opaque
  {
        int err;
        int ret, cpu;
 +      bool in_hyp_mode;
  
        if (!is_hyp_mode_available()) {
                kvm_err("HYP mode not available\n");
        if (err)
                return err;
  
 -      if (is_kernel_in_hyp_mode())
 -              err = init_vhe_mode();
 -      else
 +      in_hyp_mode = is_kernel_in_hyp_mode();
 +
 +      if (!in_hyp_mode) {
                err = init_hyp_mode();
 -      if (err)
 -              goto out_err;
 +              if (err)
 +                      goto out_err;
 +      }
  
        err = init_subsystems();
        if (err)
                goto out_hyp;
  
 +      if (in_hyp_mode)
 +              kvm_info("VHE mode initialized successfully\n");
 +      else
 +              kvm_info("Hyp mode initialized successfully\n");
 +
        return 0;
  
  out_hyp:
 -      teardown_hyp_mode();
 +      if (!in_hyp_mode)
 +              teardown_hyp_mode();
  out_err:
        teardown_common_resources();
        return err;
diff --combined virt/kvm/arm/vgic/vgic-its.c
index 547f12dc4d543bafd3b28c74352761aafc62f0a0,40791c12171059c8444928f7e03e5edacb057e68..d2a99ab0ade7a2a83a36466d3e76bff88b1e48a5
@@@ -278,6 -278,7 +278,7 @@@ static int update_lpi_config(struct kv
        u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
        u8 prop;
        int ret;
+       unsigned long flags;
  
        ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
                             &prop, 1);
        if (ret)
                return ret;
  
-       spin_lock(&irq->irq_lock);
+       spin_lock_irqsave(&irq->irq_lock, flags);
  
        if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
                irq->priority = LPI_PROP_PRIORITY(prop);
                irq->enabled = LPI_PROP_ENABLE_BIT(prop);
  
-               vgic_queue_irq_unlock(kvm, irq);
+               vgic_queue_irq_unlock(kvm, irq, flags);
        } else {
-               spin_unlock(&irq->irq_lock);
+               spin_unlock_irqrestore(&irq->irq_lock, flags);
        }
  
        return 0;
@@@ -393,6 -394,7 +394,7 @@@ static int its_sync_lpi_pending_table(s
        int ret = 0;
        u32 *intids;
        int nr_irqs, i;
+       unsigned long flags;
  
        nr_irqs = vgic_copy_lpi_list(vcpu, &intids);
        if (nr_irqs < 0)
                }
  
                irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
-               spin_lock(&irq->irq_lock);
+               spin_lock_irqsave(&irq->irq_lock, flags);
                irq->pending_latch = pendmask & (1U << bit_nr);
-               vgic_queue_irq_unlock(vcpu->kvm, irq);
+               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
                vgic_put_irq(vcpu->kvm, irq);
        }
  
@@@ -515,6 -517,7 +517,7 @@@ static int vgic_its_trigger_msi(struct 
  {
        struct kvm_vcpu *vcpu;
        struct its_ite *ite;
+       unsigned long flags;
  
        if (!its->enabled)
                return -EBUSY;
        if (!vcpu->arch.vgic_cpu.lpis_enabled)
                return -EBUSY;
  
-       spin_lock(&ite->irq->irq_lock);
+       spin_lock_irqsave(&ite->irq->irq_lock, flags);
        ite->irq->pending_latch = true;
-       vgic_queue_irq_unlock(kvm, ite->irq);
+       vgic_queue_irq_unlock(kvm, ite->irq, flags);
  
        return 0;
  }
@@@ -894,7 -897,7 +897,7 @@@ static int vgic_its_cmd_handle_mapi(str
  }
  
  /* Requires the its_lock to be held. */
- static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
+ static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
  {
        struct its_ite *ite, *temp;
  
        kfree(device);
  }
  
+ /* its lock must be held */
+ static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
+ {
+       struct its_device *cur, *temp;
+       list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
+               vgic_its_free_device(kvm, cur);
+ }
+ /* its lock must be held */
+ static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
+ {
+       struct its_collection *cur, *temp;
+       list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
+               vgic_its_free_collection(its, cur->collection_id);
+ }
  /* Must be called with its_lock mutex held */
  static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
                                                u32 device_id, gpa_t itt_addr,
@@@ -957,7 -978,7 +978,7 @@@ static int vgic_its_cmd_handle_mapd(str
         * by removing the mapping and re-establishing it.
         */
        if (device)
-               vgic_its_unmap_device(kvm, device);
+               vgic_its_free_device(kvm, device);
  
        /*
         * The spec does not say whether unmapping a not-mapped device
@@@ -1410,7 -1431,7 +1431,7 @@@ static void vgic_mmio_write_its_baser(s
                                      unsigned long val)
  {
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       u64 entry_size, device_type;
+       u64 entry_size, table_type;
        u64 reg, *regptr, clearbits = 0;
  
        /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
        case 0:
                regptr = &its->baser_device_table;
                entry_size = abi->dte_esz;
-               device_type = GITS_BASER_TYPE_DEVICE;
+               table_type = GITS_BASER_TYPE_DEVICE;
                break;
        case 1:
                regptr = &its->baser_coll_table;
                entry_size = abi->cte_esz;
-               device_type = GITS_BASER_TYPE_COLLECTION;
+               table_type = GITS_BASER_TYPE_COLLECTION;
                clearbits = GITS_BASER_INDIRECT;
                break;
        default:
        reg &= ~clearbits;
  
        reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
-       reg |= device_type << GITS_BASER_TYPE_SHIFT;
+       reg |= table_type << GITS_BASER_TYPE_SHIFT;
        reg = vgic_sanitise_its_baser(reg);
  
        *regptr = reg;
+       if (!(reg & GITS_BASER_VALID)) {
+               /* Take the its_lock to prevent a race with a save/restore */
+               mutex_lock(&its->its_lock);
+               switch (table_type) {
+               case GITS_BASER_TYPE_DEVICE:
+                       vgic_its_free_device_list(kvm, its);
+                       break;
+               case GITS_BASER_TYPE_COLLECTION:
+                       vgic_its_free_collection_list(kvm, its);
+                       break;
+               }
+               mutex_unlock(&its->its_lock);
+       }
  }
  
  static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
@@@ -1466,16 -1501,6 +1501,16 @@@ static void vgic_mmio_write_its_ctlr(st
  {
        mutex_lock(&its->cmd_lock);
  
 +      /*
 +       * It is UNPREDICTABLE to enable the ITS if any of the CBASER or
 +       * device/collection BASER are invalid
 +       */
 +      if (!its->enabled && (val & GITS_CTLR_ENABLE) &&
 +              (!(its->baser_device_table & GITS_BASER_VALID) ||
 +               !(its->baser_coll_table & GITS_BASER_VALID) ||
 +               !(its->cbaser & GITS_CBASER_VALID)))
 +              goto out;
 +
        its->enabled = !!(val & GITS_CTLR_ENABLE);
  
        /*
         */
        vgic_its_process_commands(kvm, its);
  
 +out:
        mutex_unlock(&its->cmd_lock);
  }
  
@@@ -1623,46 -1647,17 +1658,17 @@@ static int vgic_its_create(struct kvm_d
        return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
  }
  
- static void vgic_its_free_device(struct kvm *kvm, struct its_device *dev)
- {
-       struct its_ite *ite, *tmp;
-       list_for_each_entry_safe(ite, tmp, &dev->itt_head, ite_list)
-               its_free_ite(kvm, ite);
-       list_del(&dev->dev_list);
-       kfree(dev);
- }
  static void vgic_its_destroy(struct kvm_device *kvm_dev)
  {
        struct kvm *kvm = kvm_dev->kvm;
        struct vgic_its *its = kvm_dev->private;
-       struct list_head *cur, *temp;
-       /*
-        * We may end up here without the lists ever having been initialized.
-        * Check this and bail out early to avoid dereferencing a NULL pointer.
-        */
-       if (!its->device_list.next)
-               return;
  
        mutex_lock(&its->its_lock);
-       list_for_each_safe(cur, temp, &its->device_list) {
-               struct its_device *dev;
  
-               dev = list_entry(cur, struct its_device, dev_list);
-               vgic_its_free_device(kvm, dev);
-       }
+       vgic_its_free_device_list(kvm, its);
+       vgic_its_free_collection_list(kvm, its);
  
-       list_for_each_safe(cur, temp, &its->collection_list) {
-               struct its_collection *coll;
-               coll = list_entry(cur, struct its_collection, coll_list);
-               list_del(cur);
-               kfree(coll);
-       }
        mutex_unlock(&its->its_lock);
        kfree(its);
  }
  
@@@ -1812,33 -1807,37 +1818,33 @@@ typedef int (*entry_fn_t)(struct vgic_i
  static int scan_its_table(struct vgic_its *its, gpa_t base, int size, int esz,
                          int start_id, entry_fn_t fn, void *opaque)
  {
 -      void *entry = kzalloc(esz, GFP_KERNEL);
        struct kvm *kvm = its->dev->kvm;
        unsigned long len = size;
        int id = start_id;
        gpa_t gpa = base;
 +      char entry[esz];
        int ret;
  
 +      memset(entry, 0, esz);
 +
        while (len > 0) {
                int next_offset;
                size_t byte_offset;
  
                ret = kvm_read_guest(kvm, gpa, entry, esz);
                if (ret)
 -                      goto out;
 +                      return ret;
  
                next_offset = fn(its, id, entry, opaque);
 -              if (next_offset <= 0) {
 -                      ret = next_offset;
 -                      goto out;
 -              }
 +              if (next_offset <= 0)
 +                      return next_offset;
  
                byte_offset = next_offset * esz;
                id += next_offset;
                gpa += byte_offset;
                len -= byte_offset;
        }
 -      ret =  1;
 -
 -out:
 -      kfree(entry);
 -      return ret;
 +      return 1;
  }
  
  /**
@@@ -1947,14 -1946,6 +1953,14 @@@ static int vgic_its_save_itt(struct vgi
        return 0;
  }
  
 +/**
 + * vgic_its_restore_itt - restore the ITT of a device
 + *
 + * @its: its handle
 + * @dev: device handle
 + *
 + * Return 0 on success, < 0 on error
 + */
  static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
  {
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        ret = scan_its_table(its, base, max_size, ite_esz, 0,
                             vgic_its_restore_ite, dev);
  
 +      /* scan_its_table returns +1 if all ITEs are invalid */
 +      if (ret > 0)
 +              ret = 0;
 +
        return ret;
  }
  
@@@ -2067,12 -2054,11 +2073,12 @@@ static int vgic_its_device_cmp(void *pr
  static int vgic_its_save_device_tables(struct vgic_its *its)
  {
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
 +      u64 baser = its->baser_device_table;
        struct its_device *dev;
        int dte_esz = abi->dte_esz;
 -      u64 baser;
  
 -      baser = its->baser_device_table;
 +      if (!(baser & GITS_BASER_VALID))
 +              return 0;
  
        list_sort(NULL, &its->device_list, vgic_its_device_cmp);
  
@@@ -2127,7 -2113,10 +2133,7 @@@ static int handle_l1_dte(struct vgic_it
        ret = scan_its_table(its, gpa, SZ_64K, dte_esz,
                             l2_start_id, vgic_its_restore_dte, NULL);
  
 -      if (ret <= 0)
 -              return ret;
 -
 -      return 1;
 +      return ret;
  }
  
  /**
@@@ -2157,9 -2146,8 +2163,9 @@@ static int vgic_its_restore_device_tabl
                                     vgic_its_restore_dte, NULL);
        }
  
 +      /* scan_its_table returns +1 if all entries are invalid */
        if (ret > 0)
 -              ret = -EINVAL;
 +              ret = 0;
  
        return ret;
  }
@@@ -2216,17 -2204,17 +2222,17 @@@ static int vgic_its_restore_cte(struct 
  static int vgic_its_save_collection_table(struct vgic_its *its)
  {
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
 +      u64 baser = its->baser_coll_table;
 +      gpa_t gpa = BASER_ADDRESS(baser);
        struct its_collection *collection;
        u64 val;
 -      gpa_t gpa;
        size_t max_size, filled = 0;
        int ret, cte_esz = abi->cte_esz;
  
 -      gpa = BASER_ADDRESS(its->baser_coll_table);
 -      if (!gpa)
 +      if (!(baser & GITS_BASER_VALID))
                return 0;
  
 -      max_size = GITS_BASER_NR_PAGES(its->baser_coll_table) * SZ_64K;
 +      max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
  
        list_for_each_entry(collection, &its->collection_list, coll_list) {
                ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
  static int vgic_its_restore_collection_table(struct vgic_its *its)
  {
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
 +      u64 baser = its->baser_coll_table;
        int cte_esz = abi->cte_esz;
        size_t max_size, read = 0;
        gpa_t gpa;
        int ret;
  
 -      if (!(its->baser_coll_table & GITS_BASER_VALID))
 +      if (!(baser & GITS_BASER_VALID))
                return 0;
  
 -      gpa = BASER_ADDRESS(its->baser_coll_table);
 +      gpa = BASER_ADDRESS(baser);
  
 -      max_size = GITS_BASER_NR_PAGES(its->baser_coll_table) * SZ_64K;
 +      max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
  
        while (read < max_size) {
                ret = vgic_its_restore_cte(its, gpa, cte_esz);
                gpa += cte_esz;
                read += cte_esz;
        }
 +
 +      if (ret > 0)
 +              return 0;
 +
        return ret;
  }
  
   */
  static int vgic_its_save_tables_v0(struct vgic_its *its)
  {
-       struct kvm *kvm = its->dev->kvm;
        int ret;
  
-       mutex_lock(&kvm->lock);
-       mutex_lock(&its->its_lock);
-       if (!lock_all_vcpus(kvm)) {
-               mutex_unlock(&its->its_lock);
-               mutex_unlock(&kvm->lock);
-               return -EBUSY;
-       }
        ret = vgic_its_save_device_tables(its);
        if (ret)
-               goto out;
-       ret = vgic_its_save_collection_table(its);
+               return ret;
  
- out:
-       unlock_all_vcpus(kvm);
-       mutex_unlock(&its->its_lock);
-       mutex_unlock(&kvm->lock);
-       return ret;
+       return vgic_its_save_collection_table(its);
  }
  
  /**
   */
  static int vgic_its_restore_tables_v0(struct vgic_its *its)
  {
-       struct kvm *kvm = its->dev->kvm;
        int ret;
  
-       mutex_lock(&kvm->lock);
-       mutex_lock(&its->its_lock);
-       if (!lock_all_vcpus(kvm)) {
-               mutex_unlock(&its->its_lock);
-               mutex_unlock(&kvm->lock);
-               return -EBUSY;
-       }
        ret = vgic_its_restore_collection_table(its);
        if (ret)
-               goto out;
-       ret = vgic_its_restore_device_tables(its);
- out:
-       unlock_all_vcpus(kvm);
-       mutex_unlock(&its->its_lock);
-       mutex_unlock(&kvm->lock);
+               return ret;
  
-       return ret;
+       return vgic_its_restore_device_tables(its);
  }
  
  static int vgic_its_commit_v0(struct vgic_its *its)
        return 0;
  }
  
+ static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
+ {
+       /* We need to keep the ABI specific field values */
+       its->baser_coll_table &= ~GITS_BASER_VALID;
+       its->baser_device_table &= ~GITS_BASER_VALID;
+       its->cbaser = 0;
+       its->creadr = 0;
+       its->cwriter = 0;
+       its->enabled = 0;
+       vgic_its_free_device_list(kvm, its);
+       vgic_its_free_collection_list(kvm, its);
+ }
  static int vgic_its_has_attr(struct kvm_device *dev,
                             struct kvm_device_attr *attr)
  {
                switch (attr->attr) {
                case KVM_DEV_ARM_VGIC_CTRL_INIT:
                        return 0;
+               case KVM_DEV_ARM_ITS_CTRL_RESET:
+                       return 0;
                case KVM_DEV_ARM_ITS_SAVE_TABLES:
                        return 0;
                case KVM_DEV_ARM_ITS_RESTORE_TABLES:
        return -ENXIO;
  }
  
+ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
+ {
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       int ret = 0;
+       if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
+               return 0;
+       mutex_lock(&kvm->lock);
+       mutex_lock(&its->its_lock);
+       if (!lock_all_vcpus(kvm)) {
+               mutex_unlock(&its->its_lock);
+               mutex_unlock(&kvm->lock);
+               return -EBUSY;
+       }
+       switch (attr) {
+       case KVM_DEV_ARM_ITS_CTRL_RESET:
+               vgic_its_reset(kvm, its);
+               break;
+       case KVM_DEV_ARM_ITS_SAVE_TABLES:
+               ret = abi->save_tables(its);
+               break;
+       case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+               ret = abi->restore_tables(its);
+               break;
+       }
+       unlock_all_vcpus(kvm);
+       mutex_unlock(&its->its_lock);
+       mutex_unlock(&kvm->lock);
+       return ret;
+ }
  static int vgic_its_set_attr(struct kvm_device *dev,
                             struct kvm_device_attr *attr)
  {
  
                return vgic_register_its_iodev(dev->kvm, its, addr);
        }
-       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-               const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       /* Nothing to do */
-                       return 0;
-               case KVM_DEV_ARM_ITS_SAVE_TABLES:
-                       return abi->save_tables(its);
-               case KVM_DEV_ARM_ITS_RESTORE_TABLES:
-                       return abi->restore_tables(its);
-               }
-       }
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               return vgic_its_ctrl(dev->kvm, its, attr->attr);
        case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
                u64 __user *uaddr = (u64 __user *)(long)attr->addr;
                u64 reg;
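
The vgic-its.c changes culminate in vgic_its_ctrl(), which funnels INIT, SAVE_TABLES, RESTORE_TABLES and the new RESET control through one properly locked path. From userspace the reset is issued through the usual device-attribute interface on the ITS device fd; the sketch below assumes the fd came from KVM_CREATE_DEVICE with KVM_DEV_TYPE_ARM_VGIC_ITS and a build against arm64 uapi headers, and the fallback value for the new attribute is an assumption based on this series.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#ifndef KVM_DEV_ARM_ITS_CTRL_RESET
#define KVM_DEV_ARM_ITS_CTRL_RESET 4	/* assumed value, added by this series */
#endif

/* its_fd: descriptor returned by KVM_CREATE_DEVICE(KVM_DEV_TYPE_ARM_VGIC_ITS). */
int its_reset(int its_fd)
{
	struct kvm_device_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.group = KVM_DEV_ARM_VGIC_GRP_CTRL;
	attr.attr  = KVM_DEV_ARM_ITS_CTRL_RESET;

	if (ioctl(its_fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
		perror("KVM_DEV_ARM_ITS_CTRL_RESET");
		return -1;
	}
	return 0;
}

The same locked path now also refuses to enable the ITS while CBASER or either BASER is invalid, per the UNPREDICTABLE note added to vgic_mmio_write_its_ctlr().
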
diff --combined virt/kvm/kvm_main.c
index ce507ae1d4f50e6af019c94baaa800a8aaaa695c,c114d7948743f8e24420e29bd5d7837c887dce44..2dd1a9ca459988f6101c952b07e09995577e4986
@@@ -122,7 -122,6 +122,6 @@@ static void hardware_disable_all(void)
  
  static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
  
- static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
  static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
  
  __visible bool kvm_rebooting;
@@@ -1679,11 -1678,12 +1678,12 @@@ void kvm_release_page_dirty(struct pag
  }
  EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
  
- static void kvm_release_pfn_dirty(kvm_pfn_t pfn)
+ void kvm_release_pfn_dirty(kvm_pfn_t pfn)
  {
        kvm_set_pfn_dirty(pfn);
        kvm_release_pfn_clean(pfn);
  }
+ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
  
  void kvm_set_pfn_dirty(kvm_pfn_t pfn)
  {
@@@ -2302,7 -2302,7 +2302,7 @@@ void kvm_vcpu_on_spin(struct kvm_vcpu *
                                continue;
                        } else if (pass && i > last_boosted_vcpu)
                                break;
 -                      if (!ACCESS_ONCE(vcpu->preempted))
 +                      if (!READ_ONCE(vcpu->preempted))
                                continue;
                        if (vcpu == me)
                                continue;
@@@ -4010,7 -4010,7 +4010,7 @@@ int kvm_init(void *opaque, unsigned vcp
        if (!vcpu_align)
                vcpu_align = __alignof__(struct kvm_vcpu);
        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
-                                          0, NULL);
+                                          SLAB_ACCOUNT, NULL);
        if (!kvm_vcpu_cache) {
                r = -ENOMEM;
                goto out_free_3;
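
The final hunk switches the kvm_vcpu cache to SLAB_ACCOUNT, so vCPU allocations are charged to the allocating task's memory cgroup. The same one-flag change applies to any cache whose objects should be accounted; a minimal module-style sketch with illustrative names:

#include <linux/module.h>
#include <linux/slab.h>

struct my_obj {
	long payload[64];
};

static struct kmem_cache *my_cache;

static int __init my_cache_init(void)
{
	/* SLAB_ACCOUNT charges each allocation to the allocating task's memcg. */
	my_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
				     __alignof__(struct my_obj),
				     SLAB_ACCOUNT, NULL);
	return my_cache ? 0 : -ENOMEM;
}

static void __exit my_cache_exit(void)
{
	kmem_cache_destroy(my_cache);
}

module_init(my_cache_init);
module_exit(my_cache_exit);
MODULE_LICENSE("GPL");
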