+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* Copyright (C) 2012 - Virtual Open Systems and Columbia University
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
(__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64)
#define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__)
+ /* PL1 Physical Timer Registers */
+ #define KVM_REG_ARM_PTIMER_CTL ARM_CP15_REG32(0, 14, 2, 1)
+ #define KVM_REG_ARM_PTIMER_CNT ARM_CP15_REG64(0, 14)
+ #define KVM_REG_ARM_PTIMER_CVAL ARM_CP15_REG64(2, 14)
+
+ /* Virtual Timer Registers */
#define KVM_REG_ARM_TIMER_CTL ARM_CP15_REG32(0, 14, 3, 1)
#define KVM_REG_ARM_TIMER_CNT ARM_CP15_REG64(1, 14)
#define KVM_REG_ARM_TIMER_CVAL ARM_CP15_REG64(3, 14)
#define KVM_DEV_ARM_ITS_SAVE_TABLES 1
#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2
#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3
+ #define KVM_DEV_ARM_ITS_CTRL_RESET 4
/* KVM_IRQ_LINE irq field index values */
#define KVM_ARM_IRQ_TYPE_SHIFT 24
const char *desc;
u32 (*read_cntp_tval_el0)(void);
u32 (*read_cntv_tval_el0)(void);
+ u64 (*read_cntpct_el0)(void);
u64 (*read_cntvct_el0)(void);
int (*set_next_event_phys)(unsigned long, struct clock_event_device *);
int (*set_next_event_virt)(unsigned long, struct clock_event_device *);
static inline void arch_timer_set_cntkctl(u32 cntkctl)
{
write_sysreg(cntkctl, cntkctl_el1);
+ isb();
}
static inline u64 arch_counter_get_cntpct(void)
{
- /*
- * AArch64 kernel and user space mandate the use of CNTVCT.
- */
- BUG();
- return 0;
+ isb();
+ return arch_timer_reg_read_stable(cntpct_el0);
}
static inline u64 arch_counter_get_cntvct(void)
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* Copyright (C) 2012,2013 - ARM Ltd
* Author: Marc Zyngier <marc.zyngier@arm.com>
#define ARM64_SYS_REG(...) (__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64)
+ /* Physical Timer EL0 Registers */
+ #define KVM_REG_ARM_PTIMER_CTL ARM64_SYS_REG(3, 3, 14, 2, 1)
+ #define KVM_REG_ARM_PTIMER_CVAL ARM64_SYS_REG(3, 3, 14, 2, 2)
+ #define KVM_REG_ARM_PTIMER_CNT ARM64_SYS_REG(3, 3, 14, 0, 1)
+
+ /* EL0 Virtual Timer Registers */
#define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1)
#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2)
#define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2)
#define KVM_DEV_ARM_ITS_SAVE_TABLES 1
#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2
#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3
+ #define KVM_DEV_ARM_ITS_CTRL_RESET 4
/* Device Control API on vcpu fd */
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
val = read_sysreg(cpacr_el1);
val |= CPACR_EL1_TTA;
- val &= ~CPACR_EL1_FPEN;
+ val &= ~(CPACR_EL1_FPEN | CPACR_EL1_ZEN);
write_sysreg(val, cpacr_el1);
write_sysreg(__kvm_hyp_vector, vbar_el1);
u64 val;
val = CPTR_EL2_DEFAULT;
- val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
+ val |= CPTR_EL2_TTA | CPTR_EL2_TFP | CPTR_EL2_TZ;
write_sysreg(val, cptr_el2);
}
* it will cause an exception.
*/
val = vcpu->arch.hcr_el2;
+
if (!(val & HCR_RW) && system_supports_fpsimd()) {
write_sysreg(1 << 30, fpexc32_el2);
isb();
}
+
+ if (val & HCR_RW) /* for AArch64 only: */
+ val |= HCR_TID3; /* TID3: trap feature register accesses */
+
write_sysreg(val, hcr_el2);
+
/* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
write_sysreg(1 << 15, hstr_el2);
/*
write_sysreg(mdcr_el2, mdcr_el2);
write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
- write_sysreg(CPACR_EL1_FPEN, cpacr_el1);
+ write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
write_sysreg(vectors, vbar_el1);
}
__activate_vm(vcpu);
__vgic_restore_state(vcpu);
- __timer_restore_state(vcpu);
+ __timer_enable_traps(vcpu);
/*
* We must restore the 32-bit state before the sysregs, thanks
__sysreg_save_guest_state(guest_ctxt);
__sysreg32_save_state(vcpu);
- __timer_save_state(vcpu);
+ __timer_disable_traps(vcpu);
__vgic_save_state(vcpu);
__deactivate_traps(vcpu);
vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
- __timer_save_state(vcpu);
+ __timer_disable_traps(vcpu);
__deactivate_traps(vcpu);
__deactivate_vm(vcpu);
__sysreg_restore_host_state(host_ctxt);
#include <linux/bsearch.h>
#include <linux/kvm_host.h>
#include <linux/mm.h>
+#include <linux/printk.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
u64 now = kvm_phys_timer_read();
+ u64 cval;
- if (p->is_write)
- ptimer->cnt_cval = p->regval + now;
- else
- p->regval = ptimer->cnt_cval - now;
+ if (p->is_write) {
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL,
+ p->regval + now);
+ } else {
+ cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
+ p->regval = cval - now;
+ }
return true;
}
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
- if (p->is_write) {
- /* ISTATUS bit is read-only */
- ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT;
- } else {
- u64 now = kvm_phys_timer_read();
-
- p->regval = ptimer->cnt_ctl;
- /*
- * Set ISTATUS bit if it's expired.
- * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
- * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
- * regardless of ENABLE bit for our implementation convenience.
- */
- if (ptimer->cnt_cval <= now)
- p->regval |= ARCH_TIMER_CTRL_IT_STAT;
- }
+ if (p->is_write)
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval);
+ else
+ p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
return true;
}
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
if (p->is_write)
- ptimer->cnt_cval = p->regval;
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval);
else
- p->regval = ptimer->cnt_cval;
+ p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
return true;
}
+/* Read a sanitised cpufeature ID register by sys_reg_desc */
+static u64 read_id_reg(struct sys_reg_desc const *r, bool raz)
+{
+ u32 id = sys_reg((u32)r->Op0, (u32)r->Op1,
+ (u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
+ u64 val = raz ? 0 : read_sanitised_ftr_reg(id);
+
+ if (id == SYS_ID_AA64PFR0_EL1) {
+ if (val & (0xfUL << ID_AA64PFR0_SVE_SHIFT))
+ pr_err_once("kvm [%i]: SVE unsupported for guests, suppressing\n",
+ task_pid_nr(current));
+
+ val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
+ }
+
+ return val;
+}
+
+/* cpufeature ID register access trap handlers */
+
+static bool __access_id_reg(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r,
+ bool raz)
+{
+ if (p->is_write)
+ return write_to_read_only(vcpu, p, r);
+
+ p->regval = read_id_reg(r, raz);
+ return true;
+}
+
+static bool access_id_reg(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ return __access_id_reg(vcpu, p, r, false);
+}
+
+static bool access_raz_id_reg(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ return __access_id_reg(vcpu, p, r, true);
+}
+
+static int reg_from_user(u64 *val, const void __user *uaddr, u64 id);
+static int reg_to_user(void __user *uaddr, const u64 *val, u64 id);
+static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
+
+/*
+ * cpufeature ID register user accessors
+ *
+ * For now, these registers are immutable for userspace, so no values
+ * are stored, and for set_id_reg() we don't allow the effective value
+ * to be changed.
+ */
+static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
+ bool raz)
+{
+ const u64 id = sys_reg_to_index(rd);
+ const u64 val = read_id_reg(rd, raz);
+
+ return reg_to_user(uaddr, &val, id);
+}
+
+static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
+ bool raz)
+{
+ const u64 id = sys_reg_to_index(rd);
+ int err;
+ u64 val;
+
+ err = reg_from_user(&val, uaddr, id);
+ if (err)
+ return err;
+
+ /* This is what we mean by invariant: you can't change it. */
+ if (val != read_id_reg(rd, raz))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ const struct kvm_one_reg *reg, void __user *uaddr)
+{
+ return __get_id_reg(rd, uaddr, false);
+}
+
+static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ const struct kvm_one_reg *reg, void __user *uaddr)
+{
+ return __set_id_reg(rd, uaddr, false);
+}
+
+static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ const struct kvm_one_reg *reg, void __user *uaddr)
+{
+ return __get_id_reg(rd, uaddr, true);
+}
+
+static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ const struct kvm_one_reg *reg, void __user *uaddr)
+{
+ return __set_id_reg(rd, uaddr, true);
+}
+
+/* sys_reg_desc initialiser for known cpufeature ID registers */
+#define ID_SANITISED(name) { \
+ SYS_DESC(SYS_##name), \
+ .access = access_id_reg, \
+ .get_user = get_id_reg, \
+ .set_user = set_id_reg, \
+}
+
+/*
+ * sys_reg_desc initialiser for architecturally unallocated cpufeature ID
+ * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
+ * (1 <= crm < 8, 0 <= Op2 < 8).
+ */
+#define ID_UNALLOCATED(crm, op2) { \
+ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \
+ .access = access_raz_id_reg, \
+ .get_user = get_raz_id_reg, \
+ .set_user = set_raz_id_reg, \
+}
+
+/*
+ * sys_reg_desc initialiser for known ID registers that we hide from guests.
+ * For now, these are exposed just like unallocated ID regs: they appear
+ * RAZ for the guest.
+ */
+#define ID_HIDDEN(name) { \
+ SYS_DESC(SYS_##name), \
+ .access = access_raz_id_reg, \
+ .get_user = get_raz_id_reg, \
+ .set_user = set_raz_id_reg, \
+}
+
/*
* Architected system registers.
* Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
{ SYS_DESC(SYS_DBGVCR32_EL2), NULL, reset_val, DBGVCR32_EL2, 0 },
{ SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 },
+
+ /*
+ * ID regs: all ID_SANITISED() entries here must have corresponding
+ * entries in arm64_ftr_regs[].
+ */
+
+ /* AArch64 mappings of the AArch32 ID registers */
+ /* CRm=1 */
+ ID_SANITISED(ID_PFR0_EL1),
+ ID_SANITISED(ID_PFR1_EL1),
+ ID_SANITISED(ID_DFR0_EL1),
+ ID_HIDDEN(ID_AFR0_EL1),
+ ID_SANITISED(ID_MMFR0_EL1),
+ ID_SANITISED(ID_MMFR1_EL1),
+ ID_SANITISED(ID_MMFR2_EL1),
+ ID_SANITISED(ID_MMFR3_EL1),
+
+ /* CRm=2 */
+ ID_SANITISED(ID_ISAR0_EL1),
+ ID_SANITISED(ID_ISAR1_EL1),
+ ID_SANITISED(ID_ISAR2_EL1),
+ ID_SANITISED(ID_ISAR3_EL1),
+ ID_SANITISED(ID_ISAR4_EL1),
+ ID_SANITISED(ID_ISAR5_EL1),
+ ID_SANITISED(ID_MMFR4_EL1),
+ ID_UNALLOCATED(2,7),
+
+ /* CRm=3 */
+ ID_SANITISED(MVFR0_EL1),
+ ID_SANITISED(MVFR1_EL1),
+ ID_SANITISED(MVFR2_EL1),
+ ID_UNALLOCATED(3,3),
+ ID_UNALLOCATED(3,4),
+ ID_UNALLOCATED(3,5),
+ ID_UNALLOCATED(3,6),
+ ID_UNALLOCATED(3,7),
+
+ /* AArch64 ID registers */
+ /* CRm=4 */
+ ID_SANITISED(ID_AA64PFR0_EL1),
+ ID_SANITISED(ID_AA64PFR1_EL1),
+ ID_UNALLOCATED(4,2),
+ ID_UNALLOCATED(4,3),
+ ID_UNALLOCATED(4,4),
+ ID_UNALLOCATED(4,5),
+ ID_UNALLOCATED(4,6),
+ ID_UNALLOCATED(4,7),
+
+ /* CRm=5 */
+ ID_SANITISED(ID_AA64DFR0_EL1),
+ ID_SANITISED(ID_AA64DFR1_EL1),
+ ID_UNALLOCATED(5,2),
+ ID_UNALLOCATED(5,3),
+ ID_HIDDEN(ID_AA64AFR0_EL1),
+ ID_HIDDEN(ID_AA64AFR1_EL1),
+ ID_UNALLOCATED(5,6),
+ ID_UNALLOCATED(5,7),
+
+ /* CRm=6 */
+ ID_SANITISED(ID_AA64ISAR0_EL1),
+ ID_SANITISED(ID_AA64ISAR1_EL1),
+ ID_UNALLOCATED(6,2),
+ ID_UNALLOCATED(6,3),
+ ID_UNALLOCATED(6,4),
+ ID_UNALLOCATED(6,5),
+ ID_UNALLOCATED(6,6),
+ ID_UNALLOCATED(6,7),
+
+ /* CRm=7 */
+ ID_SANITISED(ID_AA64MMFR0_EL1),
+ ID_SANITISED(ID_AA64MMFR1_EL1),
+ ID_SANITISED(ID_AA64MMFR2_EL1),
+ ID_UNALLOCATED(7,3),
+ ID_UNALLOCATED(7,4),
+ ID_UNALLOCATED(7,5),
+ ID_UNALLOCATED(7,6),
+ ID_UNALLOCATED(7,7),
+
{ SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
{ SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
{ SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
if (!r)
r = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
- /* Not saved in the sys_reg array? */
- if (r && !r->reg)
+ /* Not saved in the sys_reg array and not otherwise accessible? */
+ if (r && !(r->reg || r->get_user))
r = NULL;
return r;
FUNCTION_INVARIANT(midr_el1)
FUNCTION_INVARIANT(ctr_el0)
FUNCTION_INVARIANT(revidr_el1)
-FUNCTION_INVARIANT(id_pfr0_el1)
-FUNCTION_INVARIANT(id_pfr1_el1)
-FUNCTION_INVARIANT(id_dfr0_el1)
-FUNCTION_INVARIANT(id_afr0_el1)
-FUNCTION_INVARIANT(id_mmfr0_el1)
-FUNCTION_INVARIANT(id_mmfr1_el1)
-FUNCTION_INVARIANT(id_mmfr2_el1)
-FUNCTION_INVARIANT(id_mmfr3_el1)
-FUNCTION_INVARIANT(id_isar0_el1)
-FUNCTION_INVARIANT(id_isar1_el1)
-FUNCTION_INVARIANT(id_isar2_el1)
-FUNCTION_INVARIANT(id_isar3_el1)
-FUNCTION_INVARIANT(id_isar4_el1)
-FUNCTION_INVARIANT(id_isar5_el1)
FUNCTION_INVARIANT(clidr_el1)
FUNCTION_INVARIANT(aidr_el1)
static struct sys_reg_desc invariant_sys_regs[] = {
{ SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 },
{ SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 },
- { SYS_DESC(SYS_ID_PFR0_EL1), NULL, get_id_pfr0_el1 },
- { SYS_DESC(SYS_ID_PFR1_EL1), NULL, get_id_pfr1_el1 },
- { SYS_DESC(SYS_ID_DFR0_EL1), NULL, get_id_dfr0_el1 },
- { SYS_DESC(SYS_ID_AFR0_EL1), NULL, get_id_afr0_el1 },
- { SYS_DESC(SYS_ID_MMFR0_EL1), NULL, get_id_mmfr0_el1 },
- { SYS_DESC(SYS_ID_MMFR1_EL1), NULL, get_id_mmfr1_el1 },
- { SYS_DESC(SYS_ID_MMFR2_EL1), NULL, get_id_mmfr2_el1 },
- { SYS_DESC(SYS_ID_MMFR3_EL1), NULL, get_id_mmfr3_el1 },
- { SYS_DESC(SYS_ID_ISAR0_EL1), NULL, get_id_isar0_el1 },
- { SYS_DESC(SYS_ID_ISAR1_EL1), NULL, get_id_isar1_el1 },
- { SYS_DESC(SYS_ID_ISAR2_EL1), NULL, get_id_isar2_el1 },
- { SYS_DESC(SYS_ID_ISAR3_EL1), NULL, get_id_isar3_el1 },
- { SYS_DESC(SYS_ID_ISAR4_EL1), NULL, get_id_isar4_el1 },
- { SYS_DESC(SYS_ID_ISAR5_EL1), NULL, get_id_isar5_el1 },
{ SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 },
{ SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 },
{ SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 },
return true;
}
+static int walk_one_sys_reg(const struct sys_reg_desc *rd,
+ u64 __user **uind,
+ unsigned int *total)
+{
+ /*
+ * Ignore registers we trap but don't save,
+ * and for which no custom user accessor is provided.
+ */
+ if (!(rd->reg || rd->get_user))
+ return 0;
+
+ if (!copy_reg_to_user(rd, uind))
+ return -EFAULT;
+
+ (*total)++;
+ return 0;
+}
+
/* Assumed ordered tables, see kvm_sys_reg_table_init. */
static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
{
const struct sys_reg_desc *i1, *i2, *end1, *end2;
unsigned int total = 0;
size_t num;
+ int err;
/* We check for duplicates here, to allow arch-specific overrides. */
i1 = get_target_table(vcpu->arch.target, true, &num);
while (i1 || i2) {
int cmp = cmp_sys_reg(i1, i2);
/* target-specific overrides generic entry. */
- if (cmp <= 0) {
- /* Ignore registers we trap but don't save. */
- if (i1->reg) {
- if (!copy_reg_to_user(i1, &uind))
- return -EFAULT;
- total++;
- }
- } else {
- /* Ignore registers we trap but don't save. */
- if (i2->reg) {
- if (!copy_reg_to_user(i2, &uind))
- return -EFAULT;
- total++;
- }
- }
+ if (cmp <= 0)
+ err = walk_one_sys_reg(i1, &uind, &total);
+ else
+ err = walk_one_sys_reg(i2, &uind, &total);
+
+ if (err)
+ return err;
if (cmp <= 0 && ++i1 == end1)
i1 = NULL;
#ifdef CONFIG_PPC_MM_SLICES
OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
- DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
+ OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit);
DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
#endif /* CONFIG_PPC_MM_SLICES */
#endif
OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first);
#endif /* CONFIG_PPC_BOOK3E */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
OFFSET(PACASLBCACHE, paca_struct, slb_cache);
OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr);
OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp);
OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx);
OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count);
OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx);
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
#ifdef CONFIG_PPC_BOOK3S_64
OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
HSTATE_FIELD(HSTATE_PTID, ptid);
+ HSTATE_FIELD(HSTATE_TID, tid);
HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+ OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+ OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_PPC_BOOK3S_64
*/
#include <linux/kvm_host.h>
+ #include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <asm/reg.h>
#include <asm/ppc-opcode.h>
+#include <asm/asm-prototypes.h>
#include <asm/disassemble.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
+ static bool indep_threads_mode = true;
+ module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
+ MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
+
#ifdef CONFIG_KVM_XICS
static struct kernel_param_ops module_param_ops = {
.set = param_set_int,
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+ static void kvmppc_setup_partition_table(struct kvm *kvm);
static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
int *ip)
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST;
break;
- /* HMI is hypervisor interrupt and host has handled it. Resume guest.*/
+ /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
case BOOK3S_INTERRUPT_HMI:
case BOOK3S_INTERRUPT_PERFMON:
+ case BOOK3S_INTERRUPT_SYSTEM_RESET:
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
* MMU mode (radix or HPT), unfortunately, but since we only support
* HPT guests on a HPT host so far, that isn't an impediment yet.
*/
- static int threads_per_vcore(void)
+ static int threads_per_vcore(struct kvm *kvm)
{
- if (cpu_has_feature(CPU_FTR_ARCH_300))
+ if (kvm->arch.threads_indep)
return 1;
return threads_per_subcore;
}
{"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
};
- #define N_TIMINGS (sizeof(timings) / sizeof(timings[0]))
+ #define N_TIMINGS (ARRAY_SIZE(timings))
struct debugfs_timings_state {
struct kvm_vcpu *vcpu;
kvmppc_ipi_thread(cpu);
}
- static void kvmppc_wait_for_nap(void)
+ static void kvmppc_wait_for_nap(int n_threads)
{
int cpu = smp_processor_id();
int i, loops;
- int n_threads = threads_per_vcore();
if (n_threads <= 1)
return;
vc->vcore_state = VCORE_PREEMPT;
vc->pcpu = smp_processor_id();
- if (vc->num_threads < threads_per_vcore()) {
+ if (vc->num_threads < threads_per_vcore(vc->kvm)) {
spin_lock(&lp->lock);
list_add_tail(&vc->preempt_list, &lp->list);
spin_unlock(&lp->lock);
/*
* This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
- * respectively in 2-way micro-threading (split-core) mode.
+ * respectively in 2-way micro-threading (split-core) mode on POWER8.
*/
static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
static bool subcore_config_ok(int n_subcores, int n_threads)
{
- /* Can only dynamically split if unsplit to begin with */
+ /*
+ * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core
+ * mode, with one thread per subcore.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return n_subcores <= 4 && n_threads == 1;
+
+ /* On POWER8, can only dynamically split if unsplit to begin with */
if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
return false;
if (n_subcores > MAX_SUBCORES)
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return false;
+ /* POWER9 currently requires all threads to be in the same MMU mode */
+ if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+ kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
+ return false;
+
if (n_threads < cip->max_subcore_threads)
n_threads = cip->max_subcore_threads;
if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
case BOOK3S_INTERRUPT_HMI:
local_paca->irq_happened |= PACA_IRQ_HMI;
break;
+ case BOOK3S_INTERRUPT_SYSTEM_RESET:
+ replay_system_reset();
+ break;
}
}
int target_threads;
int controlled_threads;
int trap;
+ bool is_power8;
+ bool hpt_on_radix;
/*
* Remove from the list any threads that have a signal pending
* the number of threads per subcore, except on POWER9,
* where it's 1 because the threads are (mostly) independent.
*/
- controlled_threads = threads_per_vcore();
+ controlled_threads = threads_per_vcore(vc->kvm);
/*
* Make sure we are running on primary threads, and that secondary
* threads are offline. Also check if the number of threads in this
* guest are greater than the current system threads per guest.
+ * On POWER9, we need to be not in independent-threads mode if
+ * this is a HPT guest on a radix host.
*/
- if ((controlled_threads > 1) &&
- ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+ hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
+ if (((controlled_threads > 1) &&
+ ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
+ (hpt_on_radix && vc->kvm->arch.threads_indep)) {
for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
kvmppc_remove_runnable(vc, vcpu);
* Hard-disable interrupts, and check resched flag and signals.
* If we need to reschedule or deliver a signal, clean up
* and return without going into the guest(s).
- * If the hpte_setup_done flag has been cleared, don't go into the
+ * If the mmu_ready flag has been cleared, don't go into the
* guest because that means a HPT resize operation is in progress.
*/
local_irq_disable();
hard_irq_disable();
if (lazy_irq_pending() || need_resched() ||
- recheck_signals(&core_info) ||
- (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) {
+ recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) {
local_irq_enable();
vc->vcore_state = VCORE_INACTIVE;
/* Unlock all except the primary vcore */
cmd_bit = stat_bit = 0;
split = core_info.n_subcores;
sip = NULL;
- if (split > 1) {
- /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
- if (split == 2 && (dynamic_mt_modes & 2)) {
- cmd_bit = HID0_POWER8_1TO2LPAR;
- stat_bit = HID0_POWER8_2LPARMODE;
- } else {
- split = 4;
- cmd_bit = HID0_POWER8_1TO4LPAR;
- stat_bit = HID0_POWER8_4LPARMODE;
- }
- subcore_size = MAX_SMT_THREADS / split;
+ is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
+ && !cpu_has_feature(CPU_FTR_ARCH_300);
+
+ if (split > 1 || hpt_on_radix) {
sip = &split_info;
memset(&split_info, 0, sizeof(split_info));
- split_info.rpr = mfspr(SPRN_RPR);
- split_info.pmmar = mfspr(SPRN_PMMAR);
- split_info.ldbar = mfspr(SPRN_LDBAR);
- split_info.subcore_size = subcore_size;
for (sub = 0; sub < core_info.n_subcores; ++sub)
split_info.vc[sub] = core_info.vc[sub];
+
+ if (is_power8) {
+ if (split == 2 && (dynamic_mt_modes & 2)) {
+ cmd_bit = HID0_POWER8_1TO2LPAR;
+ stat_bit = HID0_POWER8_2LPARMODE;
+ } else {
+ split = 4;
+ cmd_bit = HID0_POWER8_1TO4LPAR;
+ stat_bit = HID0_POWER8_4LPARMODE;
+ }
+ subcore_size = MAX_SMT_THREADS / split;
+ split_info.rpr = mfspr(SPRN_RPR);
+ split_info.pmmar = mfspr(SPRN_PMMAR);
+ split_info.ldbar = mfspr(SPRN_LDBAR);
+ split_info.subcore_size = subcore_size;
+ } else {
+ split_info.subcore_size = 1;
+ if (hpt_on_radix) {
+ /* Use the split_info for LPCR/LPIDR changes */
+ split_info.lpcr_req = vc->lpcr;
+ split_info.lpidr_req = vc->kvm->arch.lpid;
+ split_info.host_lpcr = vc->kvm->arch.host_lpcr;
+ split_info.do_set = 1;
+ }
+ }
+
/* order writes to split_info before kvm_split_mode pointer */
smp_wmb();
}
- for (thr = 0; thr < controlled_threads; ++thr)
+
+ for (thr = 0; thr < controlled_threads; ++thr) {
+ paca[pcpu + thr].kvm_hstate.tid = thr;
+ paca[pcpu + thr].kvm_hstate.napping = 0;
paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+ }
- /* Initiate micro-threading (split-core) if required */
+ /* Initiate micro-threading (split-core) on POWER8 if required */
if (cmd_bit) {
unsigned long hid0 = mfspr(SPRN_HID0);
/* Start all the threads */
active = 0;
for (sub = 0; sub < core_info.n_subcores; ++sub) {
- thr = subcore_thread_map[sub];
+ thr = is_power8 ? subcore_thread_map[sub] : sub;
thr0_done = false;
active |= 1 << thr;
pvc = core_info.vc[sub];
* the vcore pointer in the PACA of the secondaries.
*/
smp_mb();
- if (cmd_bit)
- split_info.do_nap = 1; /* ask secondaries to nap when done */
/*
* When doing micro-threading, poke the inactive threads as well.
* This gets them to the nap instruction after kvm_do_nap,
* which reduces the time taken to unsplit later.
+ * For POWER9 HPT guest on radix host, we need all the secondary
+ * threads woken up so they can do the LPCR/LPIDR change.
*/
- if (split > 1)
+ if (cmd_bit || hpt_on_radix) {
+ split_info.do_nap = 1; /* ask secondaries to nap when done */
for (thr = 1; thr < threads_per_subcore; ++thr)
if (!(active & (1 << thr)))
kvmppc_ipi_thread(pcpu + thr);
+ }
vc->vcore_state = VCORE_RUNNING;
preempt_disable();
vc->vcore_state = VCORE_EXITING;
/* wait for secondary threads to finish writing their state to memory */
- kvmppc_wait_for_nap();
+ kvmppc_wait_for_nap(controlled_threads);
/* Return to whole-core mode if we split the core earlier */
- if (split > 1) {
+ if (cmd_bit) {
unsigned long hid0 = mfspr(SPRN_HID0);
unsigned long loops = 0;
cpu_relax();
++loops;
}
- split_info.do_nap = 0;
+ } else if (hpt_on_radix) {
+ /* Wait for all threads to have seen final sync */
+ for (thr = 1; thr < controlled_threads; ++thr) {
+ while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
+ HMT_low();
+ barrier();
+ }
+ HMT_medium();
+ }
}
+ split_info.do_nap = 0;
kvmppc_set_host_core(pcpu);
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
+ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
+ {
+ int r = 0;
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->arch.mmu_ready) {
+ if (!kvm_is_radix(kvm))
+ r = kvmppc_hv_setup_htab_rma(vcpu);
+ if (!r) {
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ kvmppc_setup_partition_table(kvm);
+ kvm->arch.mmu_ready = 1;
+ }
+ }
+ mutex_unlock(&kvm->lock);
+ return r;
+ }
+
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
int n_ceded, i, r;
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
!signal_pending(current)) {
- /* See if the HPT and VRMA are ready to go */
- if (!kvm_is_radix(vcpu->kvm) &&
- !vcpu->kvm->arch.hpte_setup_done) {
+ /* See if the MMU is ready to go */
+ if (!vcpu->kvm->arch.mmu_ready) {
spin_unlock(&vc->lock);
- r = kvmppc_hv_setup_htab_rma(vcpu);
+ r = kvmhv_setup_mmu(vcpu);
spin_lock(&vc->lock);
if (r) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason = 0;
+ kvm_run->fail_entry.
+ hardware_entry_failure_reason = 0;
vcpu->arch.ret = r;
break;
}
unsigned long ebb_regs[3] = {}; /* shut up GCC */
unsigned long user_tar = 0;
unsigned int user_vrsave;
+ struct kvm *kvm;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
return -EINTR;
}
- atomic_inc(&vcpu->kvm->arch.vcpus_running);
- /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
+ kvm = vcpu->kvm;
+ atomic_inc(&kvm->arch.vcpus_running);
+ /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
smp_mb();
flush_all_to_thread(current);
trace_kvm_hcall_exit(vcpu, r);
kvmppc_core_prepare_to_enter(vcpu);
} else if (r == RESUME_PAGE_FAULT) {
- srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ srcu_idx = srcu_read_lock(&kvm->srcu);
r = kvmppc_book3s_hv_page_fault(run, vcpu,
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
- srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
} else if (r == RESUME_PASSTHROUGH) {
if (WARN_ON(xive_enabled()))
r = H_SUCCESS;
mtspr(SPRN_VRSAVE, user_vrsave);
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
- atomic_dec(&vcpu->kvm->arch.vcpus_running);
+ atomic_dec(&kvm->arch.vcpus_running);
return r;
}
static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
- int linux_psize)
+ int shift, int sllp)
{
- struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
-
- if (!def->shift)
- return;
- (*sps)->page_shift = def->shift;
- (*sps)->slb_enc = def->sllp;
- (*sps)->enc[0].page_shift = def->shift;
- (*sps)->enc[0].pte_enc = def->penc[linux_psize];
+ (*sps)->page_shift = shift;
+ (*sps)->slb_enc = sllp;
+ (*sps)->enc[0].page_shift = shift;
+ (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
/*
- * Add 16MB MPSS support if host supports it
+ * Add 16MB MPSS support (may get filtered out by userspace)
*/
- if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
- (*sps)->enc[1].page_shift = 24;
- (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+ if (shift != 24) {
+ int penc = kvmppc_pgsize_lp_encoding(shift, 24);
+ if (penc != -1) {
+ (*sps)->enc[1].page_shift = 24;
+ (*sps)->enc[1].pte_enc = penc;
+ }
}
(*sps)++;
}
{
struct kvm_ppc_one_seg_page_size *sps;
- /*
- * Since we don't yet support HPT guests on a radix host,
- * return an error if the host uses radix.
- */
- if (radix_enabled())
- return -EINVAL;
-
/*
* POWER7, POWER8 and POWER9 all support 32 storage keys for data.
* POWER7 doesn't support keys for instruction accesses,
info->data_keys = 32;
info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
- info->flags = KVM_PPC_PAGE_SIZES_REAL;
- if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
- info->flags |= KVM_PPC_1T_SEGMENTS;
- info->slb_size = mmu_slb_size;
+ /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
+ info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
+ info->slb_size = 32;
/* We only support these sizes for now, and no muti-size segments */
sps = &info->sps[0];
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
+ kvmppc_add_seg_page_size(&sps, 12, 0);
+ kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
+ kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
return 0;
}
struct kvm_memory_slot *memslot;
int i, r;
unsigned long n;
- unsigned long *buf;
+ unsigned long *buf, *p;
struct kvm_vcpu *vcpu;
mutex_lock(&kvm->slots_lock);
goto out;
/*
- * Use second half of bitmap area because radix accumulates
- * bits in the first half.
+ * Use second half of bitmap area because both HPT and radix
+ * accumulate bits in the first half.
*/
n = kvm_dirty_bitmap_bytes(memslot);
buf = memslot->dirty_bitmap + n / sizeof(long);
if (r)
goto out;
+ /*
+ * We accumulate dirty bits in the first half of the
+ * memslot's dirty_bitmap area, for when pages are paged
+ * out or modified by the host directly. Pick up these
+ * bits and add them to the map.
+ */
+ p = memslot->dirty_bitmap;
+ for (i = 0; i < n / sizeof(long); ++i)
+ buf[i] |= xchg(&p[i], 0);
+
/* Harvest dirty bits from VPA and DTL updates */
/* Note: we never modify the SLB shadow buffer areas */
kvm_for_each_vcpu(i, vcpu, kvm) {
static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
unsigned long npages)
{
- /*
- * For now, if radix_enabled() then we only support radix guests,
- * and in that case we don't need the rmap array.
- */
- if (radix_enabled()) {
- slot->arch.rmap = NULL;
- return 0;
- }
-
slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
if (!slot->arch.rmap)
return -ENOMEM;
const struct kvm_memory_slot *new)
{
unsigned long npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
/*
* If we are making a new memslot, it might make
*/
if (npages)
atomic64_inc(&kvm->arch.mmio_update);
-
- if (npages && old->npages && !kvm_is_radix(kvm)) {
- /*
- * If modifying a memslot, reset all the rmap dirty bits.
- * If this is a new memslot, we don't need to do anything
- * since the rmap array starts out as all zeroes,
- * i.e. no pages are dirty.
- */
- slots = kvm_memslots(kvm);
- memslot = id_to_memslot(slots, mem->slot);
- kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
- }
}
/*
mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
}
+ /*
+ * Set up HPT (hashed page table) and RMA (real-mode area).
+ * Must be called with kvm->lock held.
+ */
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
int err = 0;
unsigned long psize, porder;
int srcu_idx;
- mutex_lock(&kvm->lock);
- if (kvm->arch.hpte_setup_done)
- goto out; /* another vcpu beat us to it */
-
/* Allocate hashed page table (if not done already) and reset it */
if (!kvm->arch.hpt.virt) {
int order = KVM_DEFAULT_HPT_ORDER;
/* the -4 is to account for senc values starting at 0x10 */
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
- } else {
- kvmppc_setup_partition_table(kvm);
}
- /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
+ /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
smp_wmb();
- kvm->arch.hpte_setup_done = 1;
err = 0;
out_srcu:
srcu_read_unlock(&kvm->srcu, srcu_idx);
out:
- mutex_unlock(&kvm->lock);
return err;
up_out:
goto out_srcu;
}
+ /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
+ {
+ kvmppc_free_radix(kvm);
+ kvmppc_update_lpcr(kvm, LPCR_VPM1,
+ LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+ kvmppc_rmap_reset(kvm);
+ kvm->arch.radix = 0;
+ kvm->arch.process_table = 0;
+ return 0;
+ }
+
+ /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
+ {
+ int err;
+
+ err = kvmppc_init_vm_radix(kvm);
+ if (err)
+ return err;
+
+ kvmppc_free_hpt(&kvm->arch.hpt);
+ kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
+ LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+ kvm->arch.radix = 1;
+ return 0;
+ }
+
#ifdef CONFIG_KVM_XICS
/*
* Allocate a per-core structure for managing state about which cores are
}
/*
- * For now, if the host uses radix, the guest must be radix.
+ * If the host uses radix, the guest starts out as radix.
*/
if (radix_enabled()) {
kvm->arch.radix = 1;
+ kvm->arch.mmu_ready = 1;
lpcr &= ~LPCR_VPM1;
lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
ret = kvmppc_init_vm_radix(kvm);
* Work out how many sets the TLB has, for the use of
* the TLB invalidation loop in book3s_hv_rmhandlers.S.
*/
- if (kvm_is_radix(kvm))
+ if (radix_enabled())
kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */
else if (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
/*
* Track that we now have a HV mode VM active. This blocks secondary
* CPU threads from coming online.
- * On POWER9, we only need to do this for HPT guests on a radix
- * host, which is not yet supported.
+ * On POWER9, we only need to do this if the "indep_threads_mode"
+ * module parameter has been set to N.
*/
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ kvm->arch.threads_indep = indep_threads_mode;
+ if (!kvm->arch.threads_indep)
kvm_hv_vm_activated();
/*
{
debugfs_remove_recursive(kvm->arch.debugfs_dir);
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ if (!kvm->arch.threads_indep)
kvm_hv_vm_deactivated();
kvmppc_free_vcores(kvm);
{
unsigned long lpcr;
int radix;
+ int err;
/* If not on a POWER9, reject it */
if (!cpu_has_feature(CPU_FTR_ARCH_300))
if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
return -EINVAL;
- /* We can't change a guest to/from radix yet */
- radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
- if (radix != kvm_is_radix(kvm))
- return -EINVAL;
-
/* GR (guest radix) bit in process_table field must match */
+ radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
if (!!(cfg->process_table & PATB_GR) != radix)
return -EINVAL;
if ((cfg->process_table & PRTS_MASK) > 24)
return -EINVAL;
+ /* We can change a guest to/from radix now, if the host is radix */
+ if (radix && !radix_enabled())
+ return -EINVAL;
+
mutex_lock(&kvm->lock);
+ if (radix != kvm_is_radix(kvm)) {
+ if (kvm->arch.mmu_ready) {
+ kvm->arch.mmu_ready = 0;
+ /* order mmu_ready vs. vcpus_running */
+ smp_mb();
+ if (atomic_read(&kvm->arch.vcpus_running)) {
+ kvm->arch.mmu_ready = 1;
+ err = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ if (radix)
+ err = kvmppc_switch_mmu_to_radix(kvm);
+ else
+ err = kvmppc_switch_mmu_to_hpt(kvm);
+ if (err)
+ goto out_unlock;
+ }
+
kvm->arch.process_table = cfg->process_table;
kvmppc_setup_partition_table(kvm);
lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
- mutex_unlock(&kvm->lock);
+ err = 0;
- return 0;
+ out_unlock:
+ mutex_unlock(&kvm->lock);
+ return err;
}
static struct kvmppc_ops kvm_ops_hv = {
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");
-
vcpu->arch.local_int.pending_irqs;
}
+ static inline int isc_to_irq_type(unsigned long isc)
+ {
+ return IRQ_PEND_IO_ISC_0 + isc;
+ }
+
+ static inline int irq_type_to_isc(unsigned long irq_type)
+ {
+ return irq_type - IRQ_PEND_IO_ISC_0;
+ }
+
static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
unsigned long active_mask)
{
for (i = 0; i <= MAX_ISC; i++)
if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i)))
- active_mask &= ~(1UL << (IRQ_PEND_IO_ISC_0 + i));
+ active_mask &= ~(1UL << (isc_to_irq_type(i)));
return active_mask;
}
fi = &vcpu->kvm->arch.float_int;
spin_lock(&fi->lock);
- isc_list = &fi->lists[irq_type - IRQ_PEND_IO_ISC_0];
+ isc_list = &fi->lists[irq_type_to_isc(irq_type)];
inti = list_first_entry_or_null(isc_list,
struct kvm_s390_interrupt_info,
list);
* in kvm_vcpu_block without having the waitqueue set (polling)
*/
vcpu->valid_wakeup = true;
+ /*
+ * This is mostly to document, that the read in swait_active could
+ * be moved before other stores, leading to subtle races.
+ * All current users do not store or use an atomic like update
+ */
+ smp_mb__after_atomic();
if (swait_active(&vcpu->wq)) {
/*
* The vcpu gave up the cpu voluntarily, mark it as a good
list_del_init(&iter->list);
fi->counters[FIRQ_CNTR_IO] -= 1;
if (list_empty(isc_list))
- clear_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+ clear_bit(isc_to_irq_type(isc), &fi->pending_irqs);
spin_unlock(&fi->lock);
return iter;
}
isc = int_word_to_isc(inti->io.io_int_word);
list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
list_add_tail(&inti->list, list);
- set_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+ set_bit(isc_to_irq_type(isc), &fi->pending_irqs);
spin_unlock(&fi->lock);
return 0;
}
return -EINVAL;
if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid)))
return -EFAULT;
+ if (!schid)
+ return -EINVAL;
kfree(kvm_s390_get_io_int(kvm, isc_mask, schid));
/*
* If userspace is conforming to the architecture, we can have at most
mci.val = mcck_info->mcic;
if (mci.sr)
- cr14 |= MCCK_CR14_RECOVERY_SUB_MASK;
+ cr14 |= CR14_RECOVERY_SUBMASK;
if (mci.dg)
- cr14 |= MCCK_CR14_DEGRAD_SUB_MASK;
+ cr14 |= CR14_DEGRADATION_SUBMASK;
if (mci.w)
- cr14 |= MCCK_CR14_WARN_SUB_MASK;
+ cr14 |= CR14_WARNING_SUBMASK;
mchk = mci.ck ? &inti.mchk : &irq.u.mchk;
mchk->cr14 = cr14;
case KVM_CAP_S390_USER_INSTR0:
case KVM_CAP_S390_CMMA_MIGRATION:
case KVM_CAP_S390_AIS:
+ case KVM_CAP_S390_AIS_MIGRATION:
r = 1;
break;
case KVM_CAP_S390_MEM_OP:
*/
if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
test_kvm_facility(vcpu->kvm, 64) &&
- riccb->valid &&
+ riccb->v &&
!(vcpu->arch.sie_block->ecb3 & ECB3_RI)) {
VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)");
vcpu->arch.sie_block->ecb3 |= ECB3_RI;
+/* SPDX-License-Identifier: GPL-2.0 */
/******************************************************************************
* x86_emulate.h
*
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
+ int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+ int (*smi_allowed)(struct kvm_vcpu *vcpu);
+ int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+ int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
+ int (*enable_smi_window)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
static inline int kvm_cpu_get_apicid(int mps_cpu)
{
#ifdef CONFIG_X86_LOCAL_APIC
- return __default_cpu_present_to_apicid(mps_cpu);
+ return default_cpu_present_to_apicid(mps_cpu);
#else
WARN_ON_ONCE(1);
return BAD_APICID;
#endif
}
+ #define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
#endif /* _ASM_X86_KVM_HOST_H */
apic->divide_count);
}
+ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+ {
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+ }
+
static void apic_update_lvtt(struct kvm_lapic *apic)
{
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask;
if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
apic->lapic_timer.timer_mode = timer_mode;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ limit_periodic_timer_frequency(apic);
}
}
HRTIMER_MODE_ABS_PINNED);
}
+ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+ {
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+ }
+
static bool set_target_expiration(struct kvm_lapic *apic)
{
ktime_t now;
apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
- if (!apic->lapic_timer.period)
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
return false;
-
- /*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
- */
- if (apic_lvtt_period(apic)) {
- s64 min_period = min_timer_period_us * 1000LL;
-
- if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
- "lapic timer period limited to %lld ns\n",
- apic->vcpu->vcpu_id,
- apic->lapic_timer.period, min_period);
- apic->lapic_timer.period = min_period;
- }
}
+ limit_periodic_timer_frequency(apic);
+
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
PRIx64 ", "
"timer initial count 0x%x, period %lldns, "
if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
return false;
+ if (!ktimer->tscdeadline)
+ return false;
+
r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
if (r < 0)
return false;
start_apic_timer(apic);
break;
- case APIC_TDCR:
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
if (val & 4)
apic_debug("KVM_WRITE:TDCR %x\n", val);
kvm_lapic_set_reg(apic, APIC_TDCR, val);
update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
break;
-
+ }
case APIC_ESR:
if (apic_x2apic_mode(apic) && val != 0) {
apic_debug("KVM_WRITE:ESR not zero %x\n", val);
vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
+ if (vcpu->arch.apicv_active) {
+ kvm_x86_ops->apicv_post_state_restore(vcpu);
+ kvm_x86_ops->hwapic_irr_update(vcpu, -1);
+ kvm_x86_ops->hwapic_isr_update(vcpu, -1);
+ }
vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
+ /*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+ enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+ };
+
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
static u64 __get_spte_lockless(u64 *sptep)
{
- return ACCESS_ONCE(*sptep);
+ return READ_ONCE(*sptep);
}
#else
union split_spte {
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
- return __shadow_walk_next(iterator, *iterator->sptep);
+ __shadow_walk_next(iterator, *iterator->sptep);
}
static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
return ret;
}
- static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- bool emulate = false;
+ int ret = RET_PF_RETRY;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- emulate = true;
+ ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep)))
- emulate = true;
+ ret = RET_PF_EMULATE;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
kvm_release_pfn_clean(pfn);
- return emulate;
+ return ret;
}
static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
- * Return 1 to tell kvm to emulate it.
*/
if (pfn == KVM_PFN_ERR_RO_FAULT)
- return 1;
+ return RET_PF_EMULATE;
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
- return 0;
+ return RET_PF_RETRY;
}
return -EFAULT;
}
if (fast_page_fault(vcpu, v, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
return r;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
return reserved;
}
- /*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- * directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- * fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
- enum {
- RET_MMIO_PF_EMULATE = 1,
- RET_MMIO_PF_INVALID = 2,
- RET_MMIO_PF_RETRY = 0,
- RET_MMIO_PF_BUG = -1
- };
-
static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
if (mmio_info_in_cache(vcpu, addr, direct))
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
- return RET_MMIO_PF_BUG;
+ return -EINVAL;
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
- return RET_MMIO_PF_INVALID;
+ return RET_PF_INVALID;
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
}
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return RET_MMIO_PF_RETRY;
+ return RET_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect)
+ u64 fault_address, char *insn, int insn_len)
{
int r = 1;
default:
trace_kvm_page_fault(fault_address, error_code);
- if (need_unprotect && kvm_event_needs_reinjection(vcpu))
+ if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
}
if (fast_page_fault(vcpu, gpa, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
* If we don't have indirect shadow pages, it means no page is
* write-protected, so we can exit simply.
*/
- if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
remote_flush = local_flush = false;
vcpu->arch.gpa_val = cr2;
}
+ r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
- if (r == RET_MMIO_PF_EMULATE) {
+ if (r == RET_PF_EMULATE) {
emulation_type = 0;
goto emulate;
}
- if (r == RET_MMIO_PF_RETRY)
- return 1;
- if (r < 0)
- return r;
- /* Must be RET_MMIO_PF_INVALID. */
}
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
- false);
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+ false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
if (r < 0)
return r;
- if (!r)
- return 1;
/*
* Before emulating the instruction, check if the error code
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu.pae_root);
- if (vcpu->arch.mmu.lm_root != NULL)
- free_page((unsigned long)vcpu->arch.mmu.lm_root);
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
static void mmu_destroy_caches(void)
{
- if (pte_list_desc_cache)
- kmem_cache_destroy(pte_list_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
}
int kvm_mmu_module_init(void)
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!pte_list_desc_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!mmu_page_header_cache)
goto nomem;
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_MMU_H
#define __KVM_X86_MMU_H
bool accessed_dirty);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect);
+ u64 fault_address, char *insn, int insn_len);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
u64 nested_vmx_cr4_fixed1;
u64 nested_vmx_vmcs_enum;
u64 nested_vmx_vmfunc_controls;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
};
#define POSTED_INTR_ON 0
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
- static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
- static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
- static int alloc_identity_pagetable(struct kvm *kvm);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
static inline void ept_sync_global(void)
{
- if (cpu_has_vmx_invept_global())
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}
static inline void ept_sync_context(u64 eptp)
{
- if (enable_ept) {
- if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
- else
- ept_sync_global();
- }
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
}
static __always_inline void vmcs_check16(unsigned long field)
SECONDARY_EXEC_ENABLE_PML;
vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
}
- } else
- vmx->nested.nested_vmx_ept_caps = 0;
+ }
if (cpu_has_vmx_vmfunc()) {
vmx->nested.nested_vmx_secondary_ctls_high |=
* Advertise EPTP switching unconditionally
* since we emulate it
*/
- vmx->nested.nested_vmx_vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
+ if (enable_ept)
+ vmx->nested.nested_vmx_vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
}
/*
SECONDARY_EXEC_ENABLE_VPID;
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
- } else
- vmx->nested.nested_vmx_vpid_caps = 0;
+ }
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
kvm_cpu_vmxon(phys_addr);
- ept_sync_global();
+ if (enable_ept)
+ ept_sync_global();
return 0;
}
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_RDSEED |
- SECONDARY_EXEC_RDRAND |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_VMFUNC;
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_capability.ept, &vmx_capability.vpid);
+
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- vmx_capability.ept, vmx_capability.vpid);
+ } else if (vmx_capability.ept) {
+ vmx_capability.ept = 0;
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_capability.vpid) {
+ vmx_capability.vpid = 0;
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
kvm_pfn_t identity_map_pfn;
u32 tmp;
- if (!enable_ept)
- return 0;
-
/* Protect kvm->arch.ept_identity_pagetable_done. */
mutex_lock(&kvm->slots_lock);
if (likely(kvm->arch.ept_identity_pagetable_done))
goto out2;
+ if (!kvm->arch.ept_identity_map_addr)
+ kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
- r = alloc_identity_pagetable(kvm);
+ r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm->arch.ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
goto out2;
return r;
}
- static int alloc_identity_pagetable(struct kvm *kvm)
- {
- /* Called with kvm->slots_lock held. */
-
- int r = 0;
-
- BUG_ON(kvm->arch.ept_identity_pagetable_done);
-
- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm->arch.ept_identity_map_addr, PAGE_SIZE);
-
- return r;
- }
-
static int allocate_vpid(void)
{
int vpid;
static bool vmx_rdrand_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
}
static bool vmx_rdseed_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
}
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (vmx_rdrand_supported()) {
bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND;
+ exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
if (nested) {
if (rdrand_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND;
+ ~SECONDARY_EXEC_RDRAND_EXITING;
}
}
if (vmx_rdseed_supported()) {
bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED;
+ exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
if (nested) {
if (rdseed_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED;
+ ~SECONDARY_EXEC_RDSEED_EXITING;
}
}
/*
* Sets up the vmcs for emulated real mode.
*/
- static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
unsigned long a;
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
-
- return 0;
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
setup_msrs(vmx);
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (kvm_vcpu_apicv_active(vcpu))
- memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
-
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
cr2 = vmcs_readl(EXIT_QUALIFICATION);
/* EPT won't cause page fault directly */
WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
- true);
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
if (!cpu_has_vmx_ept() ||
!cpu_has_vmx_ept_4levels() ||
- !cpu_has_vmx_ept_mt_wb()) {
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
enable_ept = 0;
- enable_unrestricted_guest = 0;
- enable_ept_ad_bits = 0;
- }
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
- if (!cpu_has_vmx_unrestricted_guest())
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
- if (!cpu_has_vmx_ple())
+ if (!cpu_has_vmx_ple()) {
ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
case EXIT_REASON_RDPMC:
return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
case EXIT_REASON_RDRAND:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
case EXIT_REASON_RDSEED:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
vmx->loaded_vmcs = vmcs;
vmx_vcpu_put(vcpu);
vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
put_cpu();
}
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
- err = vmx_vcpu_setup(vmx);
+ vmx_vcpu_setup(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
- if (err)
- goto free_vmcs;
if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
}
if (enable_ept) {
- if (!kvm->arch.ept_identity_map_addr)
- kvm->arch.ept_identity_map_addr =
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = init_rmode_identity_map(kvm);
if (err)
goto free_vmcs;
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
leave_guest_mode(vcpu);
if (likely(!vmx->fail)) {
- prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
- exit_qualification);
+ if (exit_reason == -1)
+ sync_vmcs12(vcpu, vmcs12);
+ else
+ prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+ exit_qualification);
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
*/
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (enable_shadow_vmcs)
+ if (enable_shadow_vmcs && exit_reason != -1)
vmx->nested.sync_shadow_vmcs = true;
/* in case we halted in L2 */
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
- trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
- vmcs12->exit_qualification,
- vmcs12->idt_vectoring_info_field,
- vmcs12->vm_exit_intr_info,
- vmcs12->vm_exit_intr_error_code,
- KVM_ISA_VMX);
+ if (exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
load_vmcs12_host_state(vcpu, vmcs12);
~FEATURE_CONTROL_LMCE;
}
+ static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+ {
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+ return 1;
+ }
+
+ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ return 0;
+ }
+
+ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ ret = enter_vmx_non_root_mode(vcpu, false);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ if (ret)
+ return ret;
+
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+ }
+
+ static int enable_smi_window(struct kvm_vcpu *vcpu)
+ {
+ return 0;
+ }
+
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
#endif
.setup_mce = vmx_setup_mce,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init vmx_init(void)
static bool arch_counter_suspend_stop;
static bool vdso_default = true;
+static cpumask_t evtstrm_available = CPU_MASK_NONE;
static bool evtstrm_enable = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);
static int __init early_evtstrm_cfg(char *buf)
* if we don't have the cp15 accessors we won't have a problem.
*/
u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct;
+ EXPORT_SYMBOL_GPL(arch_timer_read_counter);
static u64 arch_counter_read(struct clocksource *cs)
{
return __fsl_a008585_read_reg(cntv_tval_el0);
}
+ static u64 notrace fsl_a008585_read_cntpct_el0(void)
+ {
+ return __fsl_a008585_read_reg(cntpct_el0);
+ }
+
static u64 notrace fsl_a008585_read_cntvct_el0(void)
{
return __fsl_a008585_read_reg(cntvct_el0);
return __hisi_161010101_read_reg(cntv_tval_el0);
}
+ static u64 notrace hisi_161010101_read_cntpct_el0(void)
+ {
+ return __hisi_161010101_read_reg(cntpct_el0);
+ }
+
static u64 notrace hisi_161010101_read_cntvct_el0(void)
{
return __hisi_161010101_read_reg(cntvct_el0);
#endif
#ifdef CONFIG_ARM64_ERRATUM_858921
+ static u64 notrace arm64_858921_read_cntpct_el0(void)
+ {
+ u64 old, new;
+
+ old = read_sysreg(cntpct_el0);
+ new = read_sysreg(cntpct_el0);
+ return (((old ^ new) >> 32) & 1) ? old : new;
+ }
+
static u64 notrace arm64_858921_read_cntvct_el0(void)
{
u64 old, new;
#endif
#ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
-DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *,
- timer_unstable_counter_workaround);
+DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround);
EXPORT_SYMBOL_GPL(timer_unstable_counter_workaround);
DEFINE_STATIC_KEY_FALSE(arch_timer_read_ool_enabled);
struct clock_event_device *clk)
{
unsigned long ctrl;
- u64 cval = evt + arch_counter_get_cntvct();
+ u64 cval;
ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);
ctrl |= ARCH_TIMER_CTRL_ENABLE;
ctrl &= ~ARCH_TIMER_CTRL_IT_MASK;
- if (access == ARCH_TIMER_PHYS_ACCESS)
+ if (access == ARCH_TIMER_PHYS_ACCESS) {
+ cval = evt + arch_counter_get_cntpct();
write_sysreg(cval, cntp_cval_el0);
- else
+ } else {
+ cval = evt + arch_counter_get_cntvct();
write_sysreg(cval, cntv_cval_el0);
+ }
arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
}
.desc = "Freescale erratum a005858",
.read_cntp_tval_el0 = fsl_a008585_read_cntp_tval_el0,
.read_cntv_tval_el0 = fsl_a008585_read_cntv_tval_el0,
+ .read_cntpct_el0 = fsl_a008585_read_cntpct_el0,
.read_cntvct_el0 = fsl_a008585_read_cntvct_el0,
.set_next_event_phys = erratum_set_next_event_tval_phys,
.set_next_event_virt = erratum_set_next_event_tval_virt,
.desc = "HiSilicon erratum 161010101",
.read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0,
.read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0,
+ .read_cntpct_el0 = hisi_161010101_read_cntpct_el0,
.read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
.set_next_event_phys = erratum_set_next_event_tval_phys,
.set_next_event_virt = erratum_set_next_event_tval_virt,
.desc = "HiSilicon erratum 161010101",
.read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0,
.read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0,
+ .read_cntpct_el0 = hisi_161010101_read_cntpct_el0,
.read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
.set_next_event_phys = erratum_set_next_event_tval_phys,
.set_next_event_virt = erratum_set_next_event_tval_virt,
.match_type = ate_match_local_cap_id,
.id = (void *)ARM64_WORKAROUND_858921,
.desc = "ARM erratum 858921",
+ .read_cntpct_el0 = arm64_858921_read_cntpct_el0,
.read_cntvct_el0 = arm64_858921_read_cntvct_el0,
},
#endif
#ifdef CONFIG_COMPAT
compat_elf_hwcap |= COMPAT_HWCAP_EVTSTRM;
#endif
+ cpumask_set_cpu(smp_processor_id(), &evtstrm_available);
}
static void arch_timer_configure_evtstream(void)
return arch_timer_rate;
}
+bool arch_timer_evtstrm_available(void)
+{
+ /*
+ * We might get called from a preemptible context. This is fine
+ * because availability of the event stream should be always the same
+ * for a preemptible context and context where we might resume a task.
+ */
+ return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available);
+}
+
static u64 arch_counter_get_cntvct_mem(void)
{
u32 vct_lo, vct_hi, tmp_hi;
/* Register the CP15 based counter if we have one */
if (type & ARCH_TIMER_TYPE_CP15) {
- if (IS_ENABLED(CONFIG_ARM64) ||
+ if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) ||
arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
arch_timer_read_counter = arch_counter_get_cntvct;
else
{
struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
+ cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
+
arch_timer_stop(clk);
return 0;
}
static int arch_timer_cpu_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
- if (action == CPU_PM_ENTER)
+ if (action == CPU_PM_ENTER) {
__this_cpu_write(saved_cntkctl, arch_timer_get_cntkctl());
- else if (action == CPU_PM_ENTER_FAILED || action == CPU_PM_EXIT)
+
+ cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
+ } else if (action == CPU_PM_ENTER_FAILED || action == CPU_PM_EXIT) {
arch_timer_set_cntkctl(__this_cpu_read(saved_cntkctl));
+
+ if (elf_hwcap & HWCAP_EVTSTRM)
+ cpumask_set_cpu(smp_processor_id(), &evtstrm_available);
+ }
return NOTIFY_OK;
}
if (err)
goto out_unreg_notify;
-
/* Register and immediately configure the timer on the boot CPU */
err = cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_STARTING,
"clockevents/arm/arch_timer:starting",
iounmap(cntctlbase);
- if (!best_frame)
- pr_err("Unable to find a suitable frame in timer @ %pa\n",
- &timer_mem->cntctlbase);
-
return best_frame;
}
frame = arch_timer_mem_find_best_frame(timer_mem);
if (!frame) {
+ pr_err("Unable to find a suitable frame in timer @ %pa\n",
+ &timer_mem->cntctlbase);
ret = -EINVAL;
goto out;
}
static int __init arch_timer_mem_acpi_init(int platform_timer_count)
{
struct arch_timer_mem *timers, *timer;
- struct arch_timer_mem_frame *frame;
+ struct arch_timer_mem_frame *frame, *best_frame = NULL;
int timer_count, i, ret = 0;
timers = kcalloc(platform_timer_count, sizeof(*timers),
if (ret || !timer_count)
goto out;
- for (i = 0; i < timer_count; i++) {
- ret = arch_timer_mem_verify_cntfrq(&timers[i]);
- if (ret) {
- pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n");
- goto out;
- }
- }
-
/*
* While unlikely, it's theoretically possible that none of the frames
* in a timer expose the combination of feature we want.
timer = &timers[i];
frame = arch_timer_mem_find_best_frame(timer);
- if (frame)
- break;
+ if (!best_frame)
+ best_frame = frame;
+
+ ret = arch_timer_mem_verify_cntfrq(timer);
+ if (ret) {
+ pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n");
+ goto out;
+ }
+
+ if (!best_frame) /* implies !frame */
+ /*
+ * Only complain about missing suitable frames if we
+ * haven't already found one in a previous iteration.
+ */
+ pr_err("Unable to find a suitable frame in timer @ %pa\n",
+ &timer->cntctlbase);
}
- if (frame)
- ret = arch_timer_mem_frame_register(frame);
+ if (best_frame)
+ ret = arch_timer_mem_frame_register(best_frame);
out:
kfree(timers);
return ret;
struct irq_domain *domain;
u64 redist_stride;
u32 nr_redist_regions;
+ bool has_rss;
unsigned int irq_nr;
struct partition_desc *ppi_descs[16];
};
static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
static struct gic_kvm_info gic_v3_kvm_info;
+static DEFINE_PER_CPU(bool, has_rss);
+#define MPIDR_RS(mpidr) (((mpidr) & 0xF0UL) >> 4)
#define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist))
#define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base)
#define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K)
static void gic_cpu_sys_reg_init(void)
{
+ int i, cpu = smp_processor_id();
+ u64 mpidr = cpu_logical_map(cpu);
+ u64 need_rss = MPIDR_RS(mpidr);
+
/*
* Need to check that the SRE bit has actually been set. If
* not, it means that SRE is disabled at EL2. We're going to
/* ... and let's hit the road... */
gic_write_grpen1(1);
+
+ /* Keep the RSS capability status in per_cpu variable */
+ per_cpu(has_rss, cpu) = !!(gic_read_ctlr() & ICC_CTLR_EL1_RSS);
+
+ /* Check all the CPUs have capable of sending SGIs to other CPUs */
+ for_each_online_cpu(i) {
+ bool have_rss = per_cpu(has_rss, i) && per_cpu(has_rss, cpu);
+
+ need_rss |= MPIDR_RS(cpu_logical_map(i));
+ if (need_rss && (!have_rss))
+ pr_crit("CPU%d (%lx) can't SGI CPU%d (%lx), no RSS\n",
+ cpu, (unsigned long)mpidr,
+ i, (unsigned long)cpu_logical_map(i));
+ }
+
+ /**
+ * GIC spec says, when ICC_CTLR_EL1.RSS==1 and GICD_TYPER.RSS==0,
+ * writing ICC_ASGI1R_EL1 register with RS != 0 is a CONSTRAINED
+ * UNPREDICTABLE choice of :
+ * - The write is ignored.
+ * - The RS field is treated as 0.
+ */
+ if (need_rss && (!gic_data.has_rss))
+ pr_crit_once("RSS is required but GICD doesn't support it\n");
}
static int gic_dist_supports_lpis(void)
#ifdef CONFIG_SMP
+#define MPIDR_TO_SGI_RS(mpidr) (MPIDR_RS(mpidr) << ICC_SGI1R_RS_SHIFT)
+#define MPIDR_TO_SGI_CLUSTER_ID(mpidr) ((mpidr) & ~0xFUL)
+
static int gic_starting_cpu(unsigned int cpu)
{
gic_cpu_init();
u16 tlist = 0;
while (cpu < nr_cpu_ids) {
- /*
- * If we ever get a cluster of more than 16 CPUs, just
- * scream and skip that CPU.
- */
- if (WARN_ON((mpidr & 0xff) >= 16))
- goto out;
-
tlist |= 1 << (mpidr & 0xf);
next_cpu = cpumask_next(cpu, mask);
mpidr = cpu_logical_map(cpu);
- if (cluster_id != (mpidr & ~0xffUL)) {
+ if (cluster_id != MPIDR_TO_SGI_CLUSTER_ID(mpidr)) {
cpu--;
goto out;
}
MPIDR_TO_SGI_AFFINITY(cluster_id, 2) |
irq << ICC_SGI1R_SGI_ID_SHIFT |
MPIDR_TO_SGI_AFFINITY(cluster_id, 1) |
+ MPIDR_TO_SGI_RS(cluster_id) |
tlist << ICC_SGI1R_TARGET_LIST_SHIFT);
pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val);
smp_wmb();
for_each_cpu(cpu, mask) {
- unsigned long cluster_id = cpu_logical_map(cpu) & ~0xffUL;
+ u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(cpu_logical_map(cpu));
u16 tlist;
tlist = gic_compute_target_list(&cpu, mask, cluster_id);
goto out_free;
}
+ gic_data.has_rss = !!(typer & GICD_TYPER_RSS);
+ pr_info("Distributor has %sRange Selector support\n",
+ gic_data.has_rss ? "" : "no ");
+
set_handle_irq(gic_handle_irq);
gic_update_vlpi_properties();
goto out_unmap_rdist;
gic_populate_ppi_partitions(node);
- gic_of_setup_kvm_info(node);
+
+ if (static_key_true(&supports_deactivate))
+ gic_of_setup_kvm_info(node);
return 0;
out_unmap_rdist:
goto out_fwhandle_free;
acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
- gic_acpi_setup_kvm_info();
+
+ if (static_key_true(&supports_deactivate))
+ gic_acpi_setup_kvm_info();
return 0;
#ifdef CONFIG_OF
static int gic_cnt __initdata;
+static bool gicv2_force_probe;
+
+static int __init gicv2_force_probe_cfg(char *buf)
+{
+ return strtobool(buf, &gicv2_force_probe);
+}
+early_param("irqchip.gicv2_force_probe", gicv2_force_probe_cfg);
+
+static bool gic_check_gicv2(void __iomem *base)
+{
+ u32 val = readl_relaxed(base + GIC_CPU_IDENT);
+ return (val & 0xff0fff) == 0x02043B;
+}
static bool gic_check_eoimode(struct device_node *node, void __iomem **base)
{
if (!is_hyp_mode_available())
return false;
- if (resource_size(&cpuif_res) < SZ_8K)
- return false;
- if (resource_size(&cpuif_res) == SZ_128K) {
- u32 val_low, val_high;
+ if (resource_size(&cpuif_res) < SZ_8K) {
+ void __iomem *alt;
+ /*
+ * Check for a stupid firmware that only exposes the
+ * first page of a GICv2.
+ */
+ if (!gic_check_gicv2(*base))
+ return false;
+
+ if (!gicv2_force_probe) {
+ pr_warn("GIC: GICv2 detected, but range too small and irqchip.gicv2_force_probe not set\n");
+ return false;
+ }
+
+ alt = ioremap(cpuif_res.start, SZ_8K);
+ if (!alt)
+ return false;
+ if (!gic_check_gicv2(alt + SZ_4K)) {
+ /*
+ * The first page was that of a GICv2, and
+ * the second was *something*. Let's trust it
+ * to be a GICv2, and update the mapping.
+ */
+ pr_warn("GIC: GICv2 at %pa, but range is too small (broken DT?), assuming 8kB\n",
+ &cpuif_res.start);
+ iounmap(*base);
+ *base = alt;
+ return true;
+ }
/*
- * Verify that we have the first 4kB of a GIC400
+ * We detected *two* initial GICv2 pages in a
+ * row. Could be a GICv2 aliased over two 64kB
+ * pages. Update the resource, map the iospace, and
+ * pray.
+ */
+ iounmap(alt);
+ alt = ioremap(cpuif_res.start, SZ_128K);
+ if (!alt)
+ return false;
+ pr_warn("GIC: Aliased GICv2 at %pa, trying to find the canonical range over 128kB\n",
+ &cpuif_res.start);
+ cpuif_res.end = cpuif_res.start + SZ_128K -1;
+ iounmap(*base);
+ *base = alt;
+ }
+ if (resource_size(&cpuif_res) == SZ_128K) {
+ /*
+ * Verify that we have the first 4kB of a GICv2
* aliased over the first 64kB by checking the
* GICC_IIDR register on both ends.
*/
- val_low = readl_relaxed(*base + GIC_CPU_IDENT);
- val_high = readl_relaxed(*base + GIC_CPU_IDENT + 0xf000);
- if ((val_low & 0xffff0fff) != 0x0202043B ||
- val_low != val_high)
+ if (!gic_check_gicv2(*base) ||
+ !gic_check_gicv2(*base + 0xf000))
return false;
/*
if (ret)
return;
- gic_set_kvm_info(&gic_v2_kvm_info);
+ if (static_key_true(&supports_deactivate))
+ gic_set_kvm_info(&gic_v2_kvm_info);
}
int __init
if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
gicv2m_init(NULL, gic_data[0].domain);
- gic_acpi_setup_kvm_info();
+ if (static_key_true(&supports_deactivate))
+ gic_acpi_setup_kvm_info();
return 0;
}
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef __LINUX_KVM_H
#define __LINUX_KVM_H
#define KVM_CAP_PPC_SMT_POSSIBLE 147
#define KVM_CAP_HYPERV_SYNIC2 148
#define KVM_CAP_HYPERV_VP_INDEX 149
+ #define KVM_CAP_S390_AIS_MIGRATION 150
#ifdef KVM_CAP_IRQ_ROUTING
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- return kvm_timer_should_fire(vcpu_vtimer(vcpu)) ||
- kvm_timer_should_fire(vcpu_ptimer(vcpu));
+ return kvm_timer_is_pending(vcpu);
}
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
kvm_arm_set_running_vcpu(vcpu);
-
kvm_vgic_load(vcpu);
+ kvm_timer_vcpu_load(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
+ kvm_timer_vcpu_put(vcpu);
kvm_vgic_put(vcpu);
vcpu->cpu = -1;
kvm_arm_set_running_vcpu(NULL);
- kvm_timer_vcpu_put(vcpu);
}
static void vcpu_power_off(struct kvm_vcpu *vcpu)
*/
preempt_disable();
+ /* Flush FP/SIMD state that can't survive guest entry/exit */
+ kvm_fpsimd_flush_cpu_state();
+
kvm_pmu_flush_hwstate(vcpu);
- kvm_timer_flush_hwstate(vcpu);
- kvm_vgic_flush_hwstate(vcpu);
-
local_irq_disable();
+ kvm_vgic_flush_hwstate(vcpu);
+
/*
* If we have a singal pending, or need to notify a userspace
* irqchip about timer or PMU level changes, then we exit (and
if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
kvm_request_pending(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
- local_irq_enable();
kvm_pmu_sync_hwstate(vcpu);
kvm_timer_sync_hwstate(vcpu);
kvm_vgic_sync_hwstate(vcpu);
+ local_irq_enable();
preempt_enable();
continue;
}
kvm_arm_clear_debug(vcpu);
+ /*
+ * We must sync the PMU state before the vgic state so
+ * that the vgic can properly sample the updated state of the
+ * interrupt line.
+ */
+ kvm_pmu_sync_hwstate(vcpu);
+
+ /*
+ * Sync the vgic state before syncing the timer state because
+ * the timer code needs to know if the virtual timer
+ * interrupts are active.
+ */
+ kvm_vgic_sync_hwstate(vcpu);
+
+ /*
+ * Sync the timer hardware state before enabling interrupts as
+ * we don't want vtimer interrupts to race with syncing the
+ * timer virtual interrupt state.
+ */
+ kvm_timer_sync_hwstate(vcpu);
+
/*
* We may have taken a host interrupt in HYP mode (ie
* while executing the guest). This interrupt is still
guest_exit();
trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
- /*
- * We must sync the PMU and timer state before the vgic state so
- * that the vgic can properly sample the updated state of the
- * interrupt line.
- */
- kvm_pmu_sync_hwstate(vcpu);
- kvm_timer_sync_hwstate(vcpu);
-
- kvm_vgic_sync_hwstate(vcpu);
-
preempt_enable();
ret = handle_exit(vcpu, run, ret);
{
int cpu;
- if (is_kernel_in_hyp_mode())
- return;
-
free_hyp_pgds();
for_each_possible_cpu(cpu)
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
hyp_cpu_pm_exit();
}
-static int init_vhe_mode(void)
-{
- kvm_info("VHE mode initialized successfully\n");
- return 0;
-}
-
/**
* Inits Hyp-mode on all online CPUs
*/
}
}
- kvm_info("Hyp mode initialized successfully\n");
-
return 0;
out_err:
{
int err;
int ret, cpu;
+ bool in_hyp_mode;
if (!is_hyp_mode_available()) {
kvm_err("HYP mode not available\n");
if (err)
return err;
- if (is_kernel_in_hyp_mode())
- err = init_vhe_mode();
- else
+ in_hyp_mode = is_kernel_in_hyp_mode();
+
+ if (!in_hyp_mode) {
err = init_hyp_mode();
- if (err)
- goto out_err;
+ if (err)
+ goto out_err;
+ }
err = init_subsystems();
if (err)
goto out_hyp;
+ if (in_hyp_mode)
+ kvm_info("VHE mode initialized successfully\n");
+ else
+ kvm_info("Hyp mode initialized successfully\n");
+
return 0;
out_hyp:
- teardown_hyp_mode();
+ if (!in_hyp_mode)
+ teardown_hyp_mode();
out_err:
teardown_common_resources();
return err;
u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
u8 prop;
int ret;
+ unsigned long flags;
ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
&prop, 1);
if (ret)
return ret;
- spin_lock(&irq->irq_lock);
+ spin_lock_irqsave(&irq->irq_lock, flags);
if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
irq->priority = LPI_PROP_PRIORITY(prop);
irq->enabled = LPI_PROP_ENABLE_BIT(prop);
- vgic_queue_irq_unlock(kvm, irq);
+ vgic_queue_irq_unlock(kvm, irq, flags);
} else {
- spin_unlock(&irq->irq_lock);
+ spin_unlock_irqrestore(&irq->irq_lock, flags);
}
return 0;
int ret = 0;
u32 *intids;
int nr_irqs, i;
+ unsigned long flags;
nr_irqs = vgic_copy_lpi_list(vcpu, &intids);
if (nr_irqs < 0)
}
irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
- spin_lock(&irq->irq_lock);
+ spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = pendmask & (1U << bit_nr);
- vgic_queue_irq_unlock(vcpu->kvm, irq);
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
vgic_put_irq(vcpu->kvm, irq);
}
{
struct kvm_vcpu *vcpu;
struct its_ite *ite;
+ unsigned long flags;
if (!its->enabled)
return -EBUSY;
if (!vcpu->arch.vgic_cpu.lpis_enabled)
return -EBUSY;
- spin_lock(&ite->irq->irq_lock);
+ spin_lock_irqsave(&ite->irq->irq_lock, flags);
ite->irq->pending_latch = true;
- vgic_queue_irq_unlock(kvm, ite->irq);
+ vgic_queue_irq_unlock(kvm, ite->irq, flags);
return 0;
}
}
/* Requires the its_lock to be held. */
- static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
+ static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
{
struct its_ite *ite, *temp;
kfree(device);
}
+ /* its lock must be held */
+ static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
+ {
+ struct its_device *cur, *temp;
+
+ list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
+ vgic_its_free_device(kvm, cur);
+ }
+
+ /* its lock must be held */
+ static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
+ {
+ struct its_collection *cur, *temp;
+
+ list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
+ vgic_its_free_collection(its, cur->collection_id);
+ }
+
/* Must be called with its_lock mutex held */
static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
u32 device_id, gpa_t itt_addr,
* by removing the mapping and re-establishing it.
*/
if (device)
- vgic_its_unmap_device(kvm, device);
+ vgic_its_free_device(kvm, device);
/*
* The spec does not say whether unmapping a not-mapped device
unsigned long val)
{
const struct vgic_its_abi *abi = vgic_its_get_abi(its);
- u64 entry_size, device_type;
+ u64 entry_size, table_type;
u64 reg, *regptr, clearbits = 0;
/* When GITS_CTLR.Enable is 1, we ignore write accesses. */
case 0:
regptr = &its->baser_device_table;
entry_size = abi->dte_esz;
- device_type = GITS_BASER_TYPE_DEVICE;
+ table_type = GITS_BASER_TYPE_DEVICE;
break;
case 1:
regptr = &its->baser_coll_table;
entry_size = abi->cte_esz;
- device_type = GITS_BASER_TYPE_COLLECTION;
+ table_type = GITS_BASER_TYPE_COLLECTION;
clearbits = GITS_BASER_INDIRECT;
break;
default:
reg &= ~clearbits;
reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
- reg |= device_type << GITS_BASER_TYPE_SHIFT;
+ reg |= table_type << GITS_BASER_TYPE_SHIFT;
reg = vgic_sanitise_its_baser(reg);
*regptr = reg;
+
+ if (!(reg & GITS_BASER_VALID)) {
+ /* Take the its_lock to prevent a race with a save/restore */
+ mutex_lock(&its->its_lock);
+ switch (table_type) {
+ case GITS_BASER_TYPE_DEVICE:
+ vgic_its_free_device_list(kvm, its);
+ break;
+ case GITS_BASER_TYPE_COLLECTION:
+ vgic_its_free_collection_list(kvm, its);
+ break;
+ }
+ mutex_unlock(&its->its_lock);
+ }
}
static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
{
mutex_lock(&its->cmd_lock);
+ /*
+ * It is UNPREDICTABLE to enable the ITS if any of the CBASER or
+ * device/collection BASER are invalid
+ */
+ if (!its->enabled && (val & GITS_CTLR_ENABLE) &&
+ (!(its->baser_device_table & GITS_BASER_VALID) ||
+ !(its->baser_coll_table & GITS_BASER_VALID) ||
+ !(its->cbaser & GITS_CBASER_VALID)))
+ goto out;
+
its->enabled = !!(val & GITS_CTLR_ENABLE);
/*
*/
vgic_its_process_commands(kvm, its);
+out:
mutex_unlock(&its->cmd_lock);
}
return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
}
- static void vgic_its_free_device(struct kvm *kvm, struct its_device *dev)
- {
- struct its_ite *ite, *tmp;
-
- list_for_each_entry_safe(ite, tmp, &dev->itt_head, ite_list)
- its_free_ite(kvm, ite);
- list_del(&dev->dev_list);
- kfree(dev);
- }
-
static void vgic_its_destroy(struct kvm_device *kvm_dev)
{
struct kvm *kvm = kvm_dev->kvm;
struct vgic_its *its = kvm_dev->private;
- struct list_head *cur, *temp;
-
- /*
- * We may end up here without the lists ever having been initialized.
- * Check this and bail out early to avoid dereferencing a NULL pointer.
- */
- if (!its->device_list.next)
- return;
mutex_lock(&its->its_lock);
- list_for_each_safe(cur, temp, &its->device_list) {
- struct its_device *dev;
- dev = list_entry(cur, struct its_device, dev_list);
- vgic_its_free_device(kvm, dev);
- }
+ vgic_its_free_device_list(kvm, its);
+ vgic_its_free_collection_list(kvm, its);
- list_for_each_safe(cur, temp, &its->collection_list) {
- struct its_collection *coll;
-
- coll = list_entry(cur, struct its_collection, coll_list);
- list_del(cur);
- kfree(coll);
- }
mutex_unlock(&its->its_lock);
-
kfree(its);
}
static int scan_its_table(struct vgic_its *its, gpa_t base, int size, int esz,
int start_id, entry_fn_t fn, void *opaque)
{
- void *entry = kzalloc(esz, GFP_KERNEL);
struct kvm *kvm = its->dev->kvm;
unsigned long len = size;
int id = start_id;
gpa_t gpa = base;
+ char entry[esz];
int ret;
+ memset(entry, 0, esz);
+
while (len > 0) {
int next_offset;
size_t byte_offset;
ret = kvm_read_guest(kvm, gpa, entry, esz);
if (ret)
- goto out;
+ return ret;
next_offset = fn(its, id, entry, opaque);
- if (next_offset <= 0) {
- ret = next_offset;
- goto out;
- }
+ if (next_offset <= 0)
+ return next_offset;
byte_offset = next_offset * esz;
id += next_offset;
gpa += byte_offset;
len -= byte_offset;
}
- ret = 1;
-
-out:
- kfree(entry);
- return ret;
+ return 1;
}
/**
return 0;
}
+/**
+ * vgic_its_restore_itt - restore the ITT of a device
+ *
+ * @its: its handle
+ * @dev: device handle
+ *
+ * Return 0 on success, < 0 on error
+ */
static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
{
const struct vgic_its_abi *abi = vgic_its_get_abi(its);
ret = scan_its_table(its, base, max_size, ite_esz, 0,
vgic_its_restore_ite, dev);
+ /* scan_its_table returns +1 if all ITEs are invalid */
+ if (ret > 0)
+ ret = 0;
+
return ret;
}
static int vgic_its_save_device_tables(struct vgic_its *its)
{
const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 baser = its->baser_device_table;
struct its_device *dev;
int dte_esz = abi->dte_esz;
- u64 baser;
- baser = its->baser_device_table;
+ if (!(baser & GITS_BASER_VALID))
+ return 0;
list_sort(NULL, &its->device_list, vgic_its_device_cmp);
ret = scan_its_table(its, gpa, SZ_64K, dte_esz,
l2_start_id, vgic_its_restore_dte, NULL);
- if (ret <= 0)
- return ret;
-
- return 1;
+ return ret;
}
/**
vgic_its_restore_dte, NULL);
}
+ /* scan_its_table returns +1 if all entries are invalid */
if (ret > 0)
- ret = -EINVAL;
+ ret = 0;
return ret;
}
static int vgic_its_save_collection_table(struct vgic_its *its)
{
const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 baser = its->baser_coll_table;
+ gpa_t gpa = BASER_ADDRESS(baser);
struct its_collection *collection;
u64 val;
- gpa_t gpa;
size_t max_size, filled = 0;
int ret, cte_esz = abi->cte_esz;
- gpa = BASER_ADDRESS(its->baser_coll_table);
- if (!gpa)
+ if (!(baser & GITS_BASER_VALID))
return 0;
- max_size = GITS_BASER_NR_PAGES(its->baser_coll_table) * SZ_64K;
+ max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
list_for_each_entry(collection, &its->collection_list, coll_list) {
ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
static int vgic_its_restore_collection_table(struct vgic_its *its)
{
const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 baser = its->baser_coll_table;
int cte_esz = abi->cte_esz;
size_t max_size, read = 0;
gpa_t gpa;
int ret;
- if (!(its->baser_coll_table & GITS_BASER_VALID))
+ if (!(baser & GITS_BASER_VALID))
return 0;
- gpa = BASER_ADDRESS(its->baser_coll_table);
+ gpa = BASER_ADDRESS(baser);
- max_size = GITS_BASER_NR_PAGES(its->baser_coll_table) * SZ_64K;
+ max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
while (read < max_size) {
ret = vgic_its_restore_cte(its, gpa, cte_esz);
gpa += cte_esz;
read += cte_esz;
}
+
+ if (ret > 0)
+ return 0;
+
return ret;
}
*/
static int vgic_its_save_tables_v0(struct vgic_its *its)
{
- struct kvm *kvm = its->dev->kvm;
int ret;
- mutex_lock(&kvm->lock);
- mutex_lock(&its->its_lock);
-
- if (!lock_all_vcpus(kvm)) {
- mutex_unlock(&its->its_lock);
- mutex_unlock(&kvm->lock);
- return -EBUSY;
- }
-
ret = vgic_its_save_device_tables(its);
if (ret)
- goto out;
-
- ret = vgic_its_save_collection_table(its);
+ return ret;
- out:
- unlock_all_vcpus(kvm);
- mutex_unlock(&its->its_lock);
- mutex_unlock(&kvm->lock);
- return ret;
+ return vgic_its_save_collection_table(its);
}
/**
*/
static int vgic_its_restore_tables_v0(struct vgic_its *its)
{
- struct kvm *kvm = its->dev->kvm;
int ret;
- mutex_lock(&kvm->lock);
- mutex_lock(&its->its_lock);
-
- if (!lock_all_vcpus(kvm)) {
- mutex_unlock(&its->its_lock);
- mutex_unlock(&kvm->lock);
- return -EBUSY;
- }
-
ret = vgic_its_restore_collection_table(its);
if (ret)
- goto out;
-
- ret = vgic_its_restore_device_tables(its);
- out:
- unlock_all_vcpus(kvm);
- mutex_unlock(&its->its_lock);
- mutex_unlock(&kvm->lock);
+ return ret;
- return ret;
+ return vgic_its_restore_device_tables(its);
}
static int vgic_its_commit_v0(struct vgic_its *its)
return 0;
}
+ static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
+ {
+ /* We need to keep the ABI specific field values */
+ its->baser_coll_table &= ~GITS_BASER_VALID;
+ its->baser_device_table &= ~GITS_BASER_VALID;
+ its->cbaser = 0;
+ its->creadr = 0;
+ its->cwriter = 0;
+ its->enabled = 0;
+ vgic_its_free_device_list(kvm, its);
+ vgic_its_free_collection_list(kvm, its);
+ }
+
static int vgic_its_has_attr(struct kvm_device *dev,
struct kvm_device_attr *attr)
{
switch (attr->attr) {
case KVM_DEV_ARM_VGIC_CTRL_INIT:
return 0;
+ case KVM_DEV_ARM_ITS_CTRL_RESET:
+ return 0;
case KVM_DEV_ARM_ITS_SAVE_TABLES:
return 0;
case KVM_DEV_ARM_ITS_RESTORE_TABLES:
return -ENXIO;
}
+ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
+ {
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ int ret = 0;
+
+ if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
+ return 0;
+
+ mutex_lock(&kvm->lock);
+ mutex_lock(&its->its_lock);
+
+ if (!lock_all_vcpus(kvm)) {
+ mutex_unlock(&its->its_lock);
+ mutex_unlock(&kvm->lock);
+ return -EBUSY;
+ }
+
+ switch (attr) {
+ case KVM_DEV_ARM_ITS_CTRL_RESET:
+ vgic_its_reset(kvm, its);
+ break;
+ case KVM_DEV_ARM_ITS_SAVE_TABLES:
+ ret = abi->save_tables(its);
+ break;
+ case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+ ret = abi->restore_tables(its);
+ break;
+ }
+
+ unlock_all_vcpus(kvm);
+ mutex_unlock(&its->its_lock);
+ mutex_unlock(&kvm->lock);
+ return ret;
+ }
+
static int vgic_its_set_attr(struct kvm_device *dev,
struct kvm_device_attr *attr)
{
return vgic_register_its_iodev(dev->kvm, its, addr);
}
- case KVM_DEV_ARM_VGIC_GRP_CTRL: {
- const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-
- switch (attr->attr) {
- case KVM_DEV_ARM_VGIC_CTRL_INIT:
- /* Nothing to do */
- return 0;
- case KVM_DEV_ARM_ITS_SAVE_TABLES:
- return abi->save_tables(its);
- case KVM_DEV_ARM_ITS_RESTORE_TABLES:
- return abi->restore_tables(its);
- }
- }
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ return vgic_its_ctrl(dev->kvm, its, attr->attr);
case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
u64 __user *uaddr = (u64 __user *)(long)attr->addr;
u64 reg;
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
- static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
__visible bool kvm_rebooting;
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
- static void kvm_release_pfn_dirty(kvm_pfn_t pfn)
+ void kvm_release_pfn_dirty(kvm_pfn_t pfn)
{
kvm_set_pfn_dirty(pfn);
kvm_release_pfn_clean(pfn);
}
+ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
void kvm_set_pfn_dirty(kvm_pfn_t pfn)
{
continue;
} else if (pass && i > last_boosted_vcpu)
break;
- if (!ACCESS_ONCE(vcpu->preempted))
+ if (!READ_ONCE(vcpu->preempted))
continue;
if (vcpu == me)
continue;
if (!vcpu_align)
vcpu_align = __alignof__(struct kvm_vcpu);
kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
- 0, NULL);
+ SLAB_ACCOUNT, NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
goto out_free_3;