Merge tag 'kvm-arm-for-v4.16' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm...
author Radim Krčmář <rkrcmar@redhat.com>
Wed, 31 Jan 2018 12:34:41 +0000 (13:34 +0100)
committer Radim Krčmář <rkrcmar@redhat.com>
Wed, 31 Jan 2018 12:34:41 +0000 (13:34 +0100)
KVM/ARM Changes for v4.16

The changes for this version include icache invalidation optimizations
(improving VM startup time), support for forwarded level-triggered
interrupts (improved performance for timers and passthrough platform
devices), a small fix for power-management notifiers, and some cosmetic
changes.

66 files changed:
Documentation/virtual/kvm/00-INDEX
Documentation/virtual/kvm/amd-memory-encryption.rst [new file with mode: 0644]
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/cpuid.txt
MAINTAINERS
arch/arm64/kvm/guest.c
arch/mips/kvm/Kconfig
arch/mips/kvm/mips.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/powerpc.c
arch/s390/include/asm/bitops.h
arch/s390/include/asm/css_chars.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/sclp.h
arch/s390/kvm/Kconfig
arch/s390/kvm/diag.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/kvm/sigp.c
arch/s390/kvm/vsie.c
arch/s390/mm/gmap.c
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/pat.h
arch/x86/include/asm/svm.h
arch/x86/include/uapi/asm/kvm_para.h
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/scattered.c
arch/x86/kernel/kvm.c
arch/x86/kvm/Kconfig
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/irq.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/vmx_shadow_fields.h [new file with mode: 0644]
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/mm/pat.c
drivers/crypto/ccp/Kconfig
drivers/crypto/ccp/Makefile
drivers/crypto/ccp/psp-dev.c [new file with mode: 0644]
drivers/crypto/ccp/psp-dev.h [new file with mode: 0644]
drivers/crypto/ccp/sp-dev.c
drivers/crypto/ccp/sp-dev.h
drivers/crypto/ccp/sp-pci.c
drivers/s390/char/sclp_early.c
include/linux/kvm_host.h
include/linux/psp-sev.h [new file with mode: 0644]
include/uapi/linux/kvm.h
include/uapi/linux/psp-sev.h [new file with mode: 0644]
kernel/configs/kvm_guest.config
tools/kvm/kvm_stat/kvm_stat
tools/kvm/kvm_stat/kvm_stat.txt
virt/kvm/Kconfig
virt/kvm/arm/arm.c
virt/kvm/kvm_main.c

index 69fe1a8b7ad16ed34311a4676dfc8858ae3532d1..3da73aabff5ac0b126dce945e0e13f77289ccb1b 100644 (file)
@@ -26,3 +26,6 @@ s390-diag.txt
        - Diagnose hypercall description (for IBM S/390)
 timekeeping.txt
        - timekeeping virtualization for x86-based architectures.
+amd-memory-encryption.rst
+       - notes on the AMD Secure Encrypted Virtualization (SEV) feature and
+         SEV firmware command description
diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst
new file mode 100644 (file)
index 0000000..71d6d25
--- /dev/null
@@ -0,0 +1,247 @@
+======================================
+Secure Encrypted Virtualization (SEV)
+======================================
+
+Overview
+========
+
+Secure Encrypted Virtualization (SEV) is a feature found on AMD processors.
+
+SEV is an extension to the AMD-V architecture which supports running
+virtual machines (VMs) under the control of a hypervisor. When enabled,
+the memory contents of a VM will be transparently encrypted with a key
+unique to that VM.
+
+The hypervisor can determine SEV support through the CPUID instruction.
+CPUID function 0x8000001f reports information related to SEV::
+
+       0x8000001f[eax]:
+                       Bit[1]  indicates support for SEV
+           ...
+                 [ecx]:
+                       Bits[31:0]  Number of encrypted guests supported simultaneously
+
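+For illustration only, a minimal user-space sketch of this CPUID check (using
+the GCC/clang <cpuid.h> helpers; not part of the SEV API itself) could look
+like::
+
+        #include <cpuid.h>
+        #include <stdio.h>
+
+        int main(void)
+        {
+                unsigned int eax, ebx, ecx, edx;
+
+                /* make sure the extended leaf 0x8000001f exists at all */
+                if (__get_cpuid_max(0x80000000, NULL) < 0x8000001f)
+                        return 1;
+
+                __cpuid(0x8000001f, eax, ebx, ecx, edx);
+                printf("SEV supported: %s, simultaneous encrypted guests: %u\n",
+                       (eax & (1 << 1)) ? "yes" : "no", ecx);
+                return 0;
+        }
+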
+If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015
+(MSR_K7_HWCR) can be used to determine if it can be enabled::
+
+       0xc001_0010:
+               Bit[23]    1 = memory encryption can be enabled
+                          0 = memory encryption can not be enabled
+
+       0xc001_0015:
+               Bit[0]     1 = memory encryption can be enabled
+                          0 = memory encryption can not be enabled
+
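+As a sketch, these MSRs can be read from user space through the msr module
+(the device path and required privileges are host-configuration dependent;
+the bit positions follow the table above)::
+
+        #include <fcntl.h>
+        #include <stdint.h>
+        #include <stdio.h>
+        #include <unistd.h>
+
+        static int rdmsr(int fd, off_t msr, uint64_t *val)
+        {
+                return pread(fd, val, sizeof(*val), msr) == sizeof(*val) ? 0 : -1;
+        }
+
+        int main(void)
+        {
+                uint64_t syscfg, hwcr;
+                int fd = open("/dev/cpu/0/msr", O_RDONLY); /* needs root and the msr module */
+
+                if (fd < 0)
+                        return 1;
+                if (rdmsr(fd, 0xc0010010, &syscfg) || rdmsr(fd, 0xc0010015, &hwcr))
+                        return 1;
+                printf("SYSCFG[23]=%llu HWCR[0]=%llu\n",
+                       (unsigned long long)((syscfg >> 23) & 1),
+                       (unsigned long long)(hwcr & 1));
+                close(fd);
+                return 0;
+        }
+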
+When SEV support is available, it can be enabled in a specific VM by
+setting the SEV bit before executing VMRUN.::
+
+       VMCB[0x90]:
+               Bit[1]      1 = SEV is enabled
+                           0 = SEV is disabled
+
+SEV hardware uses ASIDs to associate a memory encryption key with a VM.
+Hence, the ASID for an SEV-enabled guest must be in the range 1 to the maximum
+value reported in the CPUID 0x8000001f[ecx] field.
+
+SEV Key Management
+==================
+
+The SEV guest key management is handled by a separate processor called the AMD
+Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure
+key management interface to perform common hypervisor activities such as
+encrypting bootstrap code, taking snapshots, migrating and debugging the guest.
+For more information, see the SEV Key Management spec [api-spec]_.
+
+KVM implements the following commands to support common lifecycle events of SEV
+guests, such as launching, running, snapshotting, migrating and decommissioning.
+
+1. KVM_SEV_INIT
+---------------
+
+The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform
+context. In a typical workflow, this command should be the first command issued.
+
+Returns: 0 on success, -negative on error
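+
+All SEV sub-commands described below are passed to KVM through the
+KVM_MEMORY_ENCRYPT_OP ioctl (see api.txt). As a minimal sketch, assuming the
+struct kvm_sev_cmd and command id definitions from the uapi <linux/kvm.h>
+touched by this series, that sev_fd is an open handle to the SEV device
+exposed by the CCP/PSP driver, and that the ioctl is issued on the VM file
+descriptor, KVM_SEV_INIT could be driven like this::
+
+        #include <string.h>
+        #include <sys/ioctl.h>
+        #include <linux/kvm.h>
+
+        static int sev_init(int vm_fd, int sev_fd)
+        {
+                struct kvm_sev_cmd cmd;
+
+                memset(&cmd, 0, sizeof(cmd));
+                cmd.id = KVM_SEV_INIT;
+                cmd.sev_fd = sev_fd;
+
+                if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0)
+                        return -1;
+                /* on failure, cmd.error carries the SEV firmware status code */
+                return 0;
+        }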
+
+2. KVM_SEV_LAUNCH_START
+-----------------------
+
+The KVM_SEV_LAUNCH_START command is used for creating the memory encryption
+context. To create the encryption context, the user must provide a guest policy,
+the owner's public Diffie-Hellman (PDH) key and session information.
+
+Parameters: struct  kvm_sev_launch_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_start {
+                __u32 handle;           /* if zero then firmware creates a new handle */
+                __u32 policy;           /* guest's policy */
+
+                __u64 dh_uaddr;         /* userspace address pointing to the guest owner's PDH key */
+                __u32 dh_len;
+
+                __u64 session_addr;     /* userspace address which points to the guest session information */
+                __u32 session_len;
+        };
+
+On success, the 'handle' field contains the new guest handle; on error, a
+negative value is returned.
+
+For more details, see SEV spec Section 6.2.
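+
+As an illustration of how KVM_SEV_LAUNCH_START might be wrapped from user
+space (the parameter struct is declared locally with the layout documented
+above; if your uapi <linux/kvm.h> already provides an equivalent definition,
+use that instead)::
+
+        #include <string.h>
+        #include <sys/ioctl.h>
+        #include <linux/kvm.h>
+
+        struct sev_launch_start_params {        /* layout as documented above */
+                __u32 handle;
+                __u32 policy;
+                __u64 dh_uaddr;
+                __u32 dh_len;
+                __u64 session_addr;
+                __u32 session_len;
+        };
+
+        static int sev_launch_start(int vm_fd, int sev_fd, __u32 policy,
+                                    void *pdh, __u32 pdh_len,
+                                    void *session, __u32 session_len)
+        {
+                struct sev_launch_start_params start;
+                struct kvm_sev_cmd cmd;
+
+                memset(&start, 0, sizeof(start));
+                start.policy = policy;
+                start.dh_uaddr = (__u64)(unsigned long)pdh;
+                start.dh_len = pdh_len;
+                start.session_addr = (__u64)(unsigned long)session;
+                start.session_len = session_len;
+
+                memset(&cmd, 0, sizeof(cmd));
+                cmd.id = KVM_SEV_LAUNCH_START;
+                cmd.data = (__u64)(unsigned long)&start;
+                cmd.sev_fd = sev_fd;
+
+                if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0)
+                        return -1;
+                /* on success, the firmware-assigned guest handle is in start.handle */
+                return 0;
+        }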
+
+3. KVM_SEV_LAUNCH_UPDATE_DATA
+-----------------------------
+
+The KVM_SEV_LAUNCH_UPDATE_DATA command is used for encrypting a memory region.
+It also calculates a measurement of the memory contents. The measurement is a
+signature of the memory contents that can be sent to the guest owner as an
+attestation that the memory was encrypted correctly by the firmware.
+
+Parameters (in): struct  kvm_sev_launch_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_update_data {
+                __u64 uaddr;    /* userspace address to be encrypted (must be 16-byte aligned) */
+                __u32 len;      /* length of the data to be encrypted (must be 16-byte aligned) */
+        };
+
+For more details, see SEV spec Section 6.3.
+
+4. KVM_SEV_LAUNCH_MEASURE
+-------------------------
+
+The KVM_SEV_LAUNCH_MEASURE command is used to retrieve the measurement of the
+data encrypted by the KVM_SEV_LAUNCH_UPDATE_DATA command. The guest owner may
+wait to provide the guest with confidential information until it can verify the
+measurement. Since the guest owner knows the initial contents of the guest at
+boot, the measurement can be verified by comparing it to what the guest owner
+expects.
+
+Parameters (in): struct  kvm_sev_launch_measure
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_measure {
+                __u64 uaddr;    /* where to copy the measurement */
+                __u32 len;      /* length of measurement blob */
+        };
+
+For more details on the measurement verification flow, see SEV spec Section 6.4.
+
+5. KVM_SEV_LAUNCH_FINISH
+------------------------
+
+After completion of the launch flow, the KVM_SEV_LAUNCH_FINISH command can be
+issued to make the guest ready for execution.
+
+Returns: 0 on success, -negative on error
+
+6. KVM_SEV_GUEST_STATUS
+-----------------------
+
+The KVM_SEV_GUEST_STATUS command is used to retrieve status information about a
+SEV-enabled guest.
+
+Parameters (out): struct kvm_sev_guest_status
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_guest_status {
+                __u32 handle;   /* guest handle */
+                __u32 policy;   /* guest policy */
+                __u8 state;     /* guest state (see enum below) */
+        };
+
+SEV guest state:
+
+::
+
+        enum {
+                SEV_STATE_INVALID = 0,
+                SEV_STATE_LAUNCHING,    /* guest is currently being launched */
+                SEV_STATE_SECRET,       /* guest is being launched and ready to accept the ciphertext data */
+                SEV_STATE_RUNNING,      /* guest is fully launched and running */
+                SEV_STATE_RECEIVING,    /* guest is being migrated in from another SEV machine */
+                SEV_STATE_SENDING       /* guest is being migrated out to another SEV machine */
+        };
+
+7. KVM_SEV_DBG_DECRYPT
+----------------------
+
+The KVM_SEV_DBG_DECRYPT command can be used by the hypervisor to request the
+firmware to decrypt the data at the given memory region.
+
+Parameters (in): struct kvm_sev_dbg
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_dbg {
+                __u64 src_uaddr;        /* userspace address of data to decrypt */
+                __u64 dst_uaddr;        /* userspace address of destination */
+                __u32 len;              /* length of memory region to decrypt */
+        };
+
+The command returns an error if the guest policy does not allow debugging.
+
+8. KVM_SEV_DBG_ENCRYPT
+----------------------
+
+The KVM_SEV_DBG_ENCRYPT command can be used by the hypervisor to request the
+firmware to encrypt the data at the given memory region.
+
+Parameters (in): struct kvm_sev_dbg
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_dbg {
+                __u64 src_uaddr;        /* userspace address of data to encrypt */
+                __u64 dst_uaddr;        /* userspace address of destination */
+                __u32 len;              /* length of memory region to encrypt */
+        };
+
+The command returns an error if the guest policy does not allow debugging.
+
+9. KVM_SEV_LAUNCH_SECRET
+------------------------
+
+The KVM_SEV_LAUNCH_SECRET command can be used by the hypervisor to inject secret
+data after the measurement has been validated by the guest owner.
+
+Parameters (in): struct kvm_sev_launch_secret
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_secret {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the guest memory region where the secret should be injected */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the hypervisor memory region which contains the secret */
+                __u32 trans_len;
+        };
+
+References
+==========
+
+.. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf
+.. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf
+.. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34)
+.. [kvm-forum]  http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf
index 57d3ee9e4bde2a799715ca75871fd61b27858b0a..e5f1743e0b3eb4955357941766bb002da9ca2ba8 100644 (file)
@@ -3403,6 +3403,56 @@ invalid, if invalid pages are written to (e.g. after the end of memory)
 or if no page table is present for the addresses (e.g. when using
 hugepages).
 
+4.109 KVM_MEMORY_ENCRYPT_OP
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: an opaque platform specific structure (in/out)
+Returns: 0 on success; -1 on error
+
+If the platform supports creating encrypted VMs, then this ioctl can be used
+for issuing platform-specific memory encryption commands to manage those
+encrypted VMs.
+
+Currently, this ioctl is used for issuing Secure Encrypted Virtualization
+(SEV) commands on AMD processors. The SEV commands are defined in
+Documentation/virtual/kvm/amd-memory-encryption.rst.
+
+4.110 KVM_MEMORY_ENCRYPT_REG_REGION
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: struct kvm_enc_region (in)
+Returns: 0 on success; -1 on error
+
+This ioctl can be used to register a guest memory region which may
+contain encrypted data (e.g. guest RAM, SMRAM etc).
+
+It is used for SEV-enabled guests. When encryption is enabled, a guest
+memory region may contain encrypted data. The SEV memory encryption
+engine uses a tweak such that two identical plaintext pages at different
+locations have different ciphertexts, so swapping or moving the
+ciphertext of those pages will not result in the plaintext being
+swapped. Relocating (or migrating) the physical backing pages of an SEV
+guest therefore requires additional steps.
+
+Note: The current SEV key management spec does not provide commands to
+swap or migrate (move) ciphertext pages. Hence, for now we pin the guest
+memory region registered with the ioctl.
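+
+As an illustration only (assuming struct kvm_enc_region from the uapi header
+with 'addr' and 'size' members, and that the ioctl is issued on the VM file
+descriptor), registering the region backing guest RAM could look like:
+
+        #include <sys/ioctl.h>
+        #include <linux/kvm.h>
+
+        /* pin/register a guest RAM range so its encrypted pages are not moved */
+        static int sev_reg_region(int vm_fd, void *hva, unsigned long long size)
+        {
+                struct kvm_enc_region region = {
+                        .addr = (unsigned long long)hva, /* userspace address backing guest RAM */
+                        .size = size,
+                };
+
+                return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region);
+        }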
+
+4.111 KVM_MEMORY_ENCRYPT_UNREG_REGION
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: struct kvm_enc_region (in)
+Returns: 0 on success; -1 on error
+
+This ioctl can be used to unregister the guest memory region registered
+with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
+
 5. The kvm_run structure
 ------------------------
 
index 3c65feb83010133de17382c4fe4f21d7b602ff16..dcab6dc11e3b08117456f10903ad3ac2fa29eb99 100644 (file)
@@ -54,6 +54,10 @@ KVM_FEATURE_PV_UNHALT              ||     7 || guest checks this feature bit
                                    ||       || before enabling paravirtualized
                                    ||       || spinlock support.
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_TLB_FLUSH           ||     9 || guest checks this feature bit
+                                   ||       || before enabling paravirtualized
+                                   ||       || tlb flush.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
index 82ad0eabce4f3e3955dfab54c7df2cae03644268..71781ad422f0a72c9aeba7d904b180b8685264d2 100644 (file)
@@ -7692,7 +7692,9 @@ F:        arch/powerpc/kernel/kvm*
 
 KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
 M:     Christian Borntraeger <borntraeger@de.ibm.com>
-M:     Cornelia Huck <cohuck@redhat.com>
+M:     Janosch Frank <frankja@linux.vnet.ibm.com>
+R:     David Hildenbrand <david@redhat.com>
+R:     Cornelia Huck <cohuck@redhat.com>
 L:     linux-s390@vger.kernel.org
 W:     http://www.ibm.com/developerworks/linux/linux390/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
@@ -11862,6 +11864,7 @@ F:      drivers/pci/hotplug/s390_pci_hpc.c
 S390 VFIO-CCW DRIVER
 M:     Cornelia Huck <cohuck@redhat.com>
 M:     Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+M:     Halil Pasic <pasic@linux.vnet.ibm.com>
 L:     linux-s390@vger.kernel.org
 L:     kvm@vger.kernel.org
 S:     Supported
index 5c7f657dd20740c1b21e75d13cf711f4b7bfbdf8..d7e3299a773460fcd3b39930864e078e72453475 100644 (file)
@@ -361,10 +361,16 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                        struct kvm_guest_debug *dbg)
 {
+       int ret = 0;
+
+       vcpu_load(vcpu);
+
        trace_kvm_set_guest_debug(vcpu, dbg->control);
 
-       if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
-               return -EINVAL;
+       if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) {
+               ret = -EINVAL;
+               goto out;
+       }
 
        if (dbg->control & KVM_GUESTDBG_ENABLE) {
                vcpu->guest_debug = dbg->control;
@@ -378,7 +384,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                /* If not enabled clear all flags */
                vcpu->guest_debug = 0;
        }
-       return 0;
+
+out:
+       vcpu_put(vcpu);
+       return ret;
 }
 
 int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
index b17447ce887314f486772c0ebde81c46ec655bd5..76b93a9c8c9b2578b9950767d9b02f31309e1efa 100644 (file)
@@ -22,6 +22,7 @@ config KVM
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+       select HAVE_KVM_VCPU_ASYNC_IOCTL
        select KVM_MMIO
        select MMU_NOTIFIER
        select SRCU
index 75fdeaa8c62f21a5420c963968c0188bbb459f49..2549fdd27ee16842c1ce7dd2bd422f27a2d3a769 100644 (file)
@@ -446,6 +446,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        int r = -EINTR;
 
+       vcpu_load(vcpu);
+
        kvm_sigset_activate(vcpu);
 
        if (vcpu->mmio_needed) {
@@ -480,6 +482,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 out:
        kvm_sigset_deactivate(vcpu);
 
+       vcpu_put(vcpu);
        return r;
 }
 
@@ -900,6 +903,26 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
        return r;
 }
 
+long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
+                              unsigned long arg)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = (void __user *)arg;
+
+       if (ioctl == KVM_INTERRUPT) {
+               struct kvm_mips_interrupt irq;
+
+               if (copy_from_user(&irq, argp, sizeof(irq)))
+                       return -EFAULT;
+               kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__,
+                         irq.irq);
+
+               return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+       }
+
+       return -ENOIOCTLCMD;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
                         unsigned long arg)
 {
@@ -907,56 +930,54 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
        void __user *argp = (void __user *)arg;
        long r;
 
+       vcpu_load(vcpu);
+
        switch (ioctl) {
        case KVM_SET_ONE_REG:
        case KVM_GET_ONE_REG: {
                struct kvm_one_reg reg;
 
+               r = -EFAULT;
                if (copy_from_user(&reg, argp, sizeof(reg)))
-                       return -EFAULT;
+                       break;
                if (ioctl == KVM_SET_ONE_REG)
-                       return kvm_mips_set_reg(vcpu, &reg);
+                       r = kvm_mips_set_reg(vcpu, &reg);
                else
-                       return kvm_mips_get_reg(vcpu, &reg);
+                       r = kvm_mips_get_reg(vcpu, &reg);
+               break;
        }
        case KVM_GET_REG_LIST: {
                struct kvm_reg_list __user *user_list = argp;
                struct kvm_reg_list reg_list;
                unsigned n;
 
+               r = -EFAULT;
                if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
-                       return -EFAULT;
+                       break;
                n = reg_list.n;
                reg_list.n = kvm_mips_num_regs(vcpu);
                if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
-                       return -EFAULT;
+                       break;
+               r = -E2BIG;
                if (n < reg_list.n)
-                       return -E2BIG;
-               return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
-       }
-       case KVM_INTERRUPT:
-               {
-                       struct kvm_mips_interrupt irq;
-
-                       if (copy_from_user(&irq, argp, sizeof(irq)))
-                               return -EFAULT;
-                       kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__,
-                                 irq.irq);
-
-                       r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
                        break;
-               }
+               r = kvm_mips_copy_reg_indices(vcpu, user_list->reg);
+               break;
+       }
        case KVM_ENABLE_CAP: {
                struct kvm_enable_cap cap;
 
+               r = -EFAULT;
                if (copy_from_user(&cap, argp, sizeof(cap)))
-                       return -EFAULT;
+                       break;
                r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
                break;
        }
        default:
                r = -ENOIOCTLCMD;
        }
+
+       vcpu_put(vcpu);
        return r;
 }
 
@@ -1145,6 +1166,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
+       vcpu_load(vcpu);
+
        for (i = 1; i < ARRAY_SIZE(vcpu->arch.gprs); i++)
                vcpu->arch.gprs[i] = regs->gpr[i];
        vcpu->arch.gprs[0] = 0; /* zero is special, and cannot be set. */
@@ -1152,6 +1175,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        vcpu->arch.lo = regs->lo;
        vcpu->arch.pc = regs->pc;
 
+       vcpu_put(vcpu);
        return 0;
 }
 
@@ -1159,6 +1183,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
+       vcpu_load(vcpu);
+
        for (i = 0; i < ARRAY_SIZE(vcpu->arch.gprs); i++)
                regs->gpr[i] = vcpu->arch.gprs[i];
 
@@ -1166,6 +1192,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        regs->lo = vcpu->arch.lo;
        regs->pc = vcpu->arch.pc;
 
+       vcpu_put(vcpu);
        return 0;
 }
 
index b12b8eb39c29785a755d608cc2649ea7f3f1017f..f884a0529dfeb088c69464a0b596094c6728a295 100644 (file)
@@ -22,6 +22,7 @@ config KVM
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select HAVE_KVM_EVENTFD
+       select HAVE_KVM_VCPU_ASYNC_IOCTL
        select SRCU
        select KVM_VFIO
        select IRQ_BYPASS_MANAGER
index 72d977e309523f9c1720f32da241b29ad6e24755..234531d1bee1e24052e23c100b3856d6c55fd590 100644 (file)
@@ -484,19 +484,33 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
-       return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+       int ret;
+
+       vcpu_load(vcpu);
+       ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+       vcpu_put(vcpu);
+
+       return ret;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
-       return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+       int ret;
+
+       vcpu_load(vcpu);
+       ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+       vcpu_put(vcpu);
+
+       return ret;
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
+       vcpu_load(vcpu);
+
        regs->pc = kvmppc_get_pc(vcpu);
        regs->cr = kvmppc_get_cr(vcpu);
        regs->ctr = kvmppc_get_ctr(vcpu);
@@ -518,6 +532,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
+       vcpu_put(vcpu);
        return 0;
 }
 
@@ -525,6 +540,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
+       vcpu_load(vcpu);
+
        kvmppc_set_pc(vcpu, regs->pc);
        kvmppc_set_cr(vcpu, regs->cr);
        kvmppc_set_ctr(vcpu, regs->ctr);
@@ -545,6 +562,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
+       vcpu_put(vcpu);
        return 0;
 }
 
@@ -737,7 +755,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                        struct kvm_guest_debug *dbg)
 {
+       vcpu_load(vcpu);
        vcpu->guest_debug = dbg->control;
+       vcpu_put(vcpu);
        return 0;
 }
 
index 83b485810aea2fbfccc01718d1823f30b18e9398..6038e2e7aee03c2b29edb65624be370f01a6c928 100644 (file)
@@ -1431,6 +1431,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
+       vcpu_load(vcpu);
+
        regs->pc = vcpu->arch.pc;
        regs->cr = kvmppc_get_cr(vcpu);
        regs->ctr = vcpu->arch.ctr;
@@ -1452,6 +1454,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
+       vcpu_put(vcpu);
        return 0;
 }
 
@@ -1459,6 +1462,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
+       vcpu_load(vcpu);
+
        vcpu->arch.pc = regs->pc;
        kvmppc_set_cr(vcpu, regs->cr);
        vcpu->arch.ctr = regs->ctr;
@@ -1480,6 +1485,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
+       vcpu_put(vcpu);
        return 0;
 }
 
@@ -1607,30 +1613,42 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
 {
+       int ret;
+
+       vcpu_load(vcpu);
+
        sregs->pvr = vcpu->arch.pvr;
 
        get_sregs_base(vcpu, sregs);
        get_sregs_arch206(vcpu, sregs);
-       return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+       ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+
+       vcpu_put(vcpu);
+       return ret;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
 {
-       int ret;
+       int ret = -EINVAL;
 
+       vcpu_load(vcpu);
        if (vcpu->arch.pvr != sregs->pvr)
-               return -EINVAL;
+               goto out;
 
        ret = set_sregs_base(vcpu, sregs);
        if (ret < 0)
-               return ret;
+               goto out;
 
        ret = set_sregs_arch206(vcpu, sregs);
        if (ret < 0)
-               return ret;
+               goto out;
 
-       return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+       ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+
+out:
+       vcpu_put(vcpu);
+       return ret;
 }
 
 int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
@@ -1773,7 +1791,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
        int r;
 
+       vcpu_load(vcpu);
        r = kvmppc_core_vcpu_translate(vcpu, tr);
+       vcpu_put(vcpu);
        return r;
 }
 
@@ -1996,12 +2016,15 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 {
        struct debug_reg *dbg_reg;
        int n, b = 0, w = 0;
+       int ret = 0;
+
+       vcpu_load(vcpu);
 
        if (!(dbg->control & KVM_GUESTDBG_ENABLE)) {
                vcpu->arch.dbg_reg.dbcr0 = 0;
                vcpu->guest_debug = 0;
                kvm_guest_protect_msr(vcpu, MSR_DE, false);
-               return 0;
+               goto out;
        }
 
        kvm_guest_protect_msr(vcpu, MSR_DE, true);
@@ -2033,8 +2056,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 #endif
 
        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-               return 0;
+               goto out;
 
+       ret = -EINVAL;
        for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) {
                uint64_t addr = dbg->arch.bp[n].addr;
                uint32_t type = dbg->arch.bp[n].type;
@@ -2045,21 +2069,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                if (type & ~(KVMPPC_DEBUG_WATCH_READ |
                             KVMPPC_DEBUG_WATCH_WRITE |
                             KVMPPC_DEBUG_BREAKPOINT))
-                       return -EINVAL;
+                       goto out;
 
                if (type & KVMPPC_DEBUG_BREAKPOINT) {
                        /* Setting H/W breakpoint */
                        if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++))
-                               return -EINVAL;
+                               goto out;
                } else {
                        /* Setting H/W watchpoint */
                        if (kvmppc_booke_add_watchpoint(dbg_reg, addr,
                                                        type, w++))
-                               return -EINVAL;
+                               goto out;
                }
        }
 
-       return 0;
+       ret = 0;
+out:
+       vcpu_put(vcpu);
+       return ret;
 }
 
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
index 1915e86cef6f8fc2e05852ddc7a0867eca1c560b..66a310779de54768f9b6a3ecfd748de93232db8e 100644 (file)
@@ -1408,6 +1408,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        int r;
 
+       vcpu_load(vcpu);
+
        if (vcpu->mmio_needed) {
                vcpu->mmio_needed = 0;
                if (!vcpu->mmio_is_write)
@@ -1422,7 +1424,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                        r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run);
                        if (r == RESUME_HOST) {
                                vcpu->mmio_needed = 1;
-                               return r;
+                               goto out;
                        }
                }
 #endif
@@ -1456,6 +1458,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
        kvm_sigset_deactivate(vcpu);
 
+out:
+       vcpu_put(vcpu);
        return r;
 }
 
@@ -1603,23 +1607,31 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
        return -EINVAL;
 }
 
-long kvm_arch_vcpu_ioctl(struct file *filp,
-                         unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_async_ioctl(struct file *filp,
+                              unsigned int ioctl, unsigned long arg)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
-       long r;
 
-       switch (ioctl) {
-       case KVM_INTERRUPT: {
+       if (ioctl == KVM_INTERRUPT) {
                struct kvm_interrupt irq;
-               r = -EFAULT;
                if (copy_from_user(&irq, argp, sizeof(irq)))
-                       goto out;
-               r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-               goto out;
+                       return -EFAULT;
+               return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
        }
+       return -ENOIOCTLCMD;
+}
 
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                         unsigned int ioctl, unsigned long arg)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       long r;
+
+       vcpu_load(vcpu);
+
+       switch (ioctl) {
        case KVM_ENABLE_CAP:
        {
                struct kvm_enable_cap cap;
@@ -1659,6 +1671,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        }
 
 out:
+       vcpu_put(vcpu);
        return r;
 }
 
index 31e400c1a1f35426a9cbcf8d94349d247e866557..86e5b2fdee3c8ee1b6da3f290de5be4b4460734c 100644 (file)
@@ -261,6 +261,11 @@ static inline void clear_bit_inv(unsigned long nr, volatile unsigned long *ptr)
        return clear_bit(nr ^ (BITS_PER_LONG - 1), ptr);
 }
 
+static inline int test_and_clear_bit_inv(unsigned long nr, volatile unsigned long *ptr)
+{
+       return test_and_clear_bit(nr ^ (BITS_PER_LONG - 1), ptr);
+}
+
 static inline void __set_bit_inv(unsigned long nr, volatile unsigned long *ptr)
 {
        return __set_bit(nr ^ (BITS_PER_LONG - 1), ptr);
index a478eb61aaf7f34e272c50101e7ff3596b643dc3..fb56fa3283a2c35cc79930df704e397265e56383 100644 (file)
@@ -20,7 +20,9 @@ struct css_general_char {
        u32 aif_tdd : 1; /* bit 56 */
        u32 : 1;
        u32 qebsm : 1;   /* bit 58 */
-       u32 : 8;
+       u32 : 2;
+       u32 aiv : 1;     /* bit 61 */
+       u32 : 5;
        u32 aif_osa : 1; /* bit 67 */
        u32 : 12;
        u32 eadm_rf : 1; /* bit 80 */
index e14f381757f67b6c0111c78c491c2c1078a7f177..59dd46adf0e862ccb3da778d35cb3790a7bb91db 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * definition for kernel virtual machines on s390
  *
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2018
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  */
@@ -183,6 +183,7 @@ struct kvm_s390_sie_block {
 #define ECA_IB         0x40000000
 #define ECA_SIGPI      0x10000000
 #define ECA_MVPGI      0x01000000
+#define ECA_AIV                0x00200000
 #define ECA_VX         0x00020000
 #define ECA_PROTEXCI   0x00002000
 #define ECA_SII                0x00000001
@@ -227,7 +228,9 @@ struct kvm_s390_sie_block {
        __u8    epdx;                   /* 0x0069 */
        __u8    reserved6a[2];          /* 0x006a */
        __u32   todpr;                  /* 0x006c */
-       __u8    reserved70[16];         /* 0x0070 */
+#define GISA_FORMAT1 0x00000001
+       __u32   gd;                     /* 0x0070 */
+       __u8    reserved74[12];         /* 0x0074 */
        __u64   mso;                    /* 0x0080 */
        __u64   msl;                    /* 0x0088 */
        psw_t   gpsw;                   /* 0x0090 */
@@ -316,18 +319,30 @@ struct kvm_vcpu_stat {
        u64 deliver_program_int;
        u64 deliver_io_int;
        u64 exit_wait_state;
+       u64 instruction_epsw;
+       u64 instruction_gs;
+       u64 instruction_io_other;
+       u64 instruction_lpsw;
+       u64 instruction_lpswe;
        u64 instruction_pfmf;
+       u64 instruction_ptff;
+       u64 instruction_sck;
+       u64 instruction_sckpf;
        u64 instruction_stidp;
        u64 instruction_spx;
        u64 instruction_stpx;
        u64 instruction_stap;
-       u64 instruction_storage_key;
+       u64 instruction_iske;
+       u64 instruction_ri;
+       u64 instruction_rrbe;
+       u64 instruction_sske;
        u64 instruction_ipte_interlock;
-       u64 instruction_stsch;
-       u64 instruction_chsc;
        u64 instruction_stsi;
        u64 instruction_stfl;
+       u64 instruction_tb;
+       u64 instruction_tpi;
        u64 instruction_tprot;
+       u64 instruction_tsch;
        u64 instruction_sie;
        u64 instruction_essa;
        u64 instruction_sthyi;
@@ -353,6 +368,7 @@ struct kvm_vcpu_stat {
        u64 diagnose_258;
        u64 diagnose_308;
        u64 diagnose_500;
+       u64 diagnose_other;
 };
 
 #define PGM_OPERATION                  0x01
@@ -409,35 +425,35 @@ struct kvm_vcpu_stat {
 #define PGM_PER                                0x80
 #define PGM_CRYPTO_OPERATION           0x119
 
-/* irq types in order of priority */
+/* irq types in ascending order of priority */
 enum irq_types {
-       IRQ_PEND_MCHK_EX = 0,
-       IRQ_PEND_SVC,
-       IRQ_PEND_PROG,
-       IRQ_PEND_MCHK_REP,
-       IRQ_PEND_EXT_IRQ_KEY,
-       IRQ_PEND_EXT_MALFUNC,
-       IRQ_PEND_EXT_EMERGENCY,
-       IRQ_PEND_EXT_EXTERNAL,
-       IRQ_PEND_EXT_CLOCK_COMP,
-       IRQ_PEND_EXT_CPU_TIMER,
-       IRQ_PEND_EXT_TIMING,
-       IRQ_PEND_EXT_SERVICE,
-       IRQ_PEND_EXT_HOST,
-       IRQ_PEND_PFAULT_INIT,
-       IRQ_PEND_PFAULT_DONE,
-       IRQ_PEND_VIRTIO,
-       IRQ_PEND_IO_ISC_0,
-       IRQ_PEND_IO_ISC_1,
-       IRQ_PEND_IO_ISC_2,
-       IRQ_PEND_IO_ISC_3,
-       IRQ_PEND_IO_ISC_4,
-       IRQ_PEND_IO_ISC_5,
-       IRQ_PEND_IO_ISC_6,
-       IRQ_PEND_IO_ISC_7,
-       IRQ_PEND_SIGP_STOP,
+       IRQ_PEND_SET_PREFIX = 0,
        IRQ_PEND_RESTART,
-       IRQ_PEND_SET_PREFIX,
+       IRQ_PEND_SIGP_STOP,
+       IRQ_PEND_IO_ISC_7,
+       IRQ_PEND_IO_ISC_6,
+       IRQ_PEND_IO_ISC_5,
+       IRQ_PEND_IO_ISC_4,
+       IRQ_PEND_IO_ISC_3,
+       IRQ_PEND_IO_ISC_2,
+       IRQ_PEND_IO_ISC_1,
+       IRQ_PEND_IO_ISC_0,
+       IRQ_PEND_VIRTIO,
+       IRQ_PEND_PFAULT_DONE,
+       IRQ_PEND_PFAULT_INIT,
+       IRQ_PEND_EXT_HOST,
+       IRQ_PEND_EXT_SERVICE,
+       IRQ_PEND_EXT_TIMING,
+       IRQ_PEND_EXT_CPU_TIMER,
+       IRQ_PEND_EXT_CLOCK_COMP,
+       IRQ_PEND_EXT_EXTERNAL,
+       IRQ_PEND_EXT_EMERGENCY,
+       IRQ_PEND_EXT_MALFUNC,
+       IRQ_PEND_EXT_IRQ_KEY,
+       IRQ_PEND_MCHK_REP,
+       IRQ_PEND_PROG,
+       IRQ_PEND_SVC,
+       IRQ_PEND_MCHK_EX,
        IRQ_PEND_COUNT
 };
 
@@ -515,9 +531,6 @@ struct kvm_s390_irq_payload {
 
 struct kvm_s390_local_interrupt {
        spinlock_t lock;
-       struct kvm_s390_float_interrupt *float_int;
-       struct swait_queue_head *wq;
-       atomic_t *cpuflags;
        DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
        struct kvm_s390_irq_payload irq;
        unsigned long pending_irqs;
@@ -706,14 +719,50 @@ struct kvm_s390_crypto_cb {
        struct kvm_s390_apcb1 apcb1;            /* 0x0080 */
 };
 
+struct kvm_s390_gisa {
+       union {
+               struct { /* common to all formats */
+                       u32 next_alert;
+                       u8  ipm;
+                       u8  reserved01[2];
+                       u8  iam;
+               };
+               struct { /* format 0 */
+                       u32 next_alert;
+                       u8  ipm;
+                       u8  reserved01;
+                       u8  : 6;
+                       u8  g : 1;
+                       u8  c : 1;
+                       u8  iam;
+                       u8  reserved02[4];
+                       u32 airq_count;
+               } g0;
+               struct { /* format 1 */
+                       u32 next_alert;
+                       u8  ipm;
+                       u8  simm;
+                       u8  nimm;
+                       u8  iam;
+                       u8  aism[8];
+                       u8  : 6;
+                       u8  g : 1;
+                       u8  c : 1;
+                       u8  reserved03[11];
+                       u32 airq_count;
+               } g1;
+       };
+};
+
 /*
- * sie_page2 has to be allocated as DMA because fac_list and crycb need
- * 31bit addresses in the sie control block.
+ * sie_page2 has to be allocated as DMA because fac_list, crycb and
+ * gisa need 31bit addresses in the sie control block.
  */
 struct sie_page2 {
        __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64];    /* 0x0000 */
        struct kvm_s390_crypto_cb crycb;                /* 0x0800 */
-       u8 reserved900[0x1000 - 0x900];                 /* 0x0900 */
+       struct kvm_s390_gisa gisa;                      /* 0x0900 */
+       u8 reserved920[0x1000 - 0x920];                 /* 0x0920 */
 };
 
 struct kvm_s390_vsie {
@@ -760,6 +809,7 @@ struct kvm_arch{
        struct kvm_s390_migration_state *migration_state;
        /* subset of available cpu features enabled by user space */
        DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+       struct kvm_s390_gisa *gisa;
 };
 
 #define KVM_HVA_ERR_BAD                (-1UL)
index d3c1a8a2e3ad4be4d7c4fb7d2d5f30dbbe52bc27..3cae9168f63c4f5070fd659ee93ab0c82b6a26a4 100644 (file)
@@ -77,6 +77,7 @@ struct sclp_info {
        unsigned char has_ibs : 1;
        unsigned char has_skey : 1;
        unsigned char has_kss : 1;
+       unsigned char has_gisaf : 1;
        unsigned int ibc;
        unsigned int mtid;
        unsigned int mtid_cp;
index 9a4594e0a1ffe2a79b8f8233351f2c2b96b1b438..a3dbd459cce91bf2941222477bec5a085eff10af 100644 (file)
@@ -23,6 +23,7 @@ config KVM
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select HAVE_KVM_CPU_RELAX_INTERCEPT
+       select HAVE_KVM_VCPU_ASYNC_IOCTL
        select HAVE_KVM_EVENTFD
        select KVM_ASYNC_PF
        select KVM_ASYNC_PF_SYNC
index 89aa114a2cbada0989cec25757ec93daed36d064..45634b3d2e0aedf90ece3f2c715e6ab9e53d9f00 100644 (file)
@@ -257,6 +257,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
        case 0x500:
                return __diag_virtio_hypercall(vcpu);
        default:
+               vcpu->stat.diagnose_other++;
                return -EOPNOTSUPP;
        }
 }
index 024ad8bcc51655e98ffed300817bc0e06e051cf7..aabf46f5f883d44d71cddc88b5ef28ec677ff200 100644 (file)
@@ -36,7 +36,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
 {
        int c, scn;
 
-       if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
+       if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND))
                return 0;
 
        BUG_ON(!kvm_s390_use_sca_entries());
@@ -101,18 +101,17 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
                /* another external call is pending */
                return -EBUSY;
        }
-       atomic_or(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_ECALL_PEND);
        return 0;
 }
 
 static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
 {
-       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        int rc, expect;
 
        if (!kvm_s390_use_sca_entries())
                return;
-       atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags);
+       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND);
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -190,8 +189,8 @@ static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu)
 
 static inline int is_ioirq(unsigned long irq_type)
 {
-       return ((irq_type >= IRQ_PEND_IO_ISC_0) &&
-               (irq_type <= IRQ_PEND_IO_ISC_7));
+       return ((irq_type >= IRQ_PEND_IO_ISC_7) &&
+               (irq_type <= IRQ_PEND_IO_ISC_0));
 }
 
 static uint64_t isc_to_isc_bits(int isc)
@@ -199,25 +198,59 @@ static uint64_t isc_to_isc_bits(int isc)
        return (0x80 >> isc) << 24;
 }
 
+static inline u32 isc_to_int_word(u8 isc)
+{
+       return ((u32)isc << 27) | 0x80000000;
+}
+
 static inline u8 int_word_to_isc(u32 int_word)
 {
        return (int_word & 0x38000000) >> 27;
 }
 
+/*
+ * To use atomic bitmap functions, we have to provide a bitmap address
+ * that is u64 aligned. However, the ipm might be u32 aligned.
+ * Therefore, we logically start the bitmap at the very beginning of the
+ * struct and fixup the bit number.
+ */
+#define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE)
+
+static inline void kvm_s390_gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
+{
+       set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
+}
+
+static inline u8 kvm_s390_gisa_get_ipm(struct kvm_s390_gisa *gisa)
+{
+       return READ_ONCE(gisa->ipm);
+}
+
+static inline void kvm_s390_gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
+{
+       clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
+}
+
+static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
+{
+       return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
+}
+
 static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
 {
        return vcpu->kvm->arch.float_int.pending_irqs |
-              vcpu->arch.local_int.pending_irqs;
+               vcpu->arch.local_int.pending_irqs |
+               kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7;
 }
 
 static inline int isc_to_irq_type(unsigned long isc)
 {
-       return IRQ_PEND_IO_ISC_0 + isc;
+       return IRQ_PEND_IO_ISC_0 - isc;
 }
 
 static inline int irq_type_to_isc(unsigned long irq_type)
 {
-       return irq_type - IRQ_PEND_IO_ISC_0;
+       return IRQ_PEND_IO_ISC_0 - irq_type;
 }
 
 static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
@@ -278,20 +311,20 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 
 static void __set_cpu_idle(struct kvm_vcpu *vcpu)
 {
-       atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
-       set_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
+       set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask);
 }
 
 static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
 {
-       atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
-       clear_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
+       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
+       clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask);
 }
 
 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
 {
-       atomic_andnot(CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
-                   &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IO_INT | CPUSTAT_EXT_INT |
+                                     CPUSTAT_STOP_INT);
        vcpu->arch.sie_block->lctl = 0x0000;
        vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT);
 
@@ -302,17 +335,12 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
        }
 }
 
-static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
-{
-       atomic_or(flag, &vcpu->arch.sie_block->cpuflags);
-}
-
 static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
 {
        if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK))
                return;
        else if (psw_ioint_disabled(vcpu))
-               __set_cpuflag(vcpu, CPUSTAT_IO_INT);
+               kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT);
        else
                vcpu->arch.sie_block->lctl |= LCTL_CR6;
 }
@@ -322,7 +350,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
        if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK))
                return;
        if (psw_extint_disabled(vcpu))
-               __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
+               kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
        else
                vcpu->arch.sie_block->lctl |= LCTL_CR0;
 }
@@ -340,7 +368,7 @@ static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
 static void set_intercept_indicators_stop(struct kvm_vcpu *vcpu)
 {
        if (kvm_s390_is_stop_irq_pending(vcpu))
-               __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
+               kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
 }
 
 /* Set interception request for non-deliverable interrupts */
@@ -897,18 +925,38 @@ static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu)
        return rc ? -EFAULT : 0;
 }
 
+static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io)
+{
+       int rc;
+
+       rc  = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID);
+       rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR);
+       rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM);
+       rc |= put_guest_lc(vcpu, io->io_int_word, (u32 *)__LC_IO_INT_WORD);
+       rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW,
+                            &vcpu->arch.sie_block->gpsw,
+                            sizeof(psw_t));
+       rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW,
+                           &vcpu->arch.sie_block->gpsw,
+                           sizeof(psw_t));
+       return rc ? -EFAULT : 0;
+}
+
 static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
                                     unsigned long irq_type)
 {
        struct list_head *isc_list;
        struct kvm_s390_float_interrupt *fi;
        struct kvm_s390_interrupt_info *inti = NULL;
+       struct kvm_s390_io_info io;
+       u32 isc;
        int rc = 0;
 
        fi = &vcpu->kvm->arch.float_int;
 
        spin_lock(&fi->lock);
-       isc_list = &fi->lists[irq_type_to_isc(irq_type)];
+       isc = irq_type_to_isc(irq_type);
+       isc_list = &fi->lists[isc];
        inti = list_first_entry_or_null(isc_list,
                                        struct kvm_s390_interrupt_info,
                                        list);
@@ -936,24 +984,31 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
        spin_unlock(&fi->lock);
 
        if (inti) {
-               rc  = put_guest_lc(vcpu, inti->io.subchannel_id,
-                               (u16 *)__LC_SUBCHANNEL_ID);
-               rc |= put_guest_lc(vcpu, inti->io.subchannel_nr,
-                               (u16 *)__LC_SUBCHANNEL_NR);
-               rc |= put_guest_lc(vcpu, inti->io.io_int_parm,
-                               (u32 *)__LC_IO_INT_PARM);
-               rc |= put_guest_lc(vcpu, inti->io.io_int_word,
-                               (u32 *)__LC_IO_INT_WORD);
-               rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW,
-                               &vcpu->arch.sie_block->gpsw,
-                               sizeof(psw_t));
-               rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW,
-                               &vcpu->arch.sie_block->gpsw,
-                               sizeof(psw_t));
+               rc = __do_deliver_io(vcpu, &(inti->io));
                kfree(inti);
+               goto out;
        }
 
-       return rc ? -EFAULT : 0;
+       if (vcpu->kvm->arch.gisa &&
+           kvm_s390_gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) {
+               /*
+                * in case an adapter interrupt was not delivered
+                * in SIE context, KVM will handle the delivery
+                */
+               VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc);
+               memset(&io, 0, sizeof(io));
+               io.io_int_word = isc_to_int_word(isc);
+               vcpu->stat.deliver_io_int++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
+                       KVM_S390_INT_IO(1, 0, 0, 0),
+                       ((__u32)io.subchannel_id << 16) |
+                       io.subchannel_nr,
+                       ((__u64)io.io_int_parm << 32) |
+                       io.io_int_word);
+               rc = __do_deliver_io(vcpu, &io);
+       }
+out:
+       return rc;
 }
 
 typedef int (*deliver_irq_t)(struct kvm_vcpu *vcpu);
@@ -1155,8 +1210,8 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
                set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
 
        while ((irqs = deliverable_irqs(vcpu)) && !rc) {
-               /* bits are in the order of interrupt priority */
-               irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT);
+               /* bits are in the reverse order of interrupt priority */
+               irq_type = find_last_bit(&irqs, IRQ_PEND_COUNT);
                if (is_ioirq(irq_type)) {
                        rc = __deliver_io(vcpu, irq_type);
                } else {
@@ -1228,7 +1283,7 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 
        li->irq.ext = irq->u.ext;
        set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs);
-       atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
        return 0;
 }
 
@@ -1253,7 +1308,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
        if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
                return -EBUSY;
        *extcall = irq->u.extcall;
-       atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
        return 0;
 }
 
@@ -1297,7 +1352,7 @@ static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
        if (test_and_set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs))
                return -EBUSY;
        stop->flags = irq->u.stop.flags;
-       __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
        return 0;
 }
 
@@ -1329,7 +1384,7 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu,
 
        set_bit(irq->u.emerg.code, li->sigp_emerg_pending);
        set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs);
-       atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
        return 0;
 }
 
@@ -1373,7 +1428,7 @@ static int __inject_ckc(struct kvm_vcpu *vcpu)
                                   0, 0);
 
        set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs);
-       atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
        return 0;
 }
 
@@ -1386,7 +1441,7 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu)
                                   0, 0);
 
        set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
-       atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
        return 0;
 }
 
@@ -1416,20 +1471,86 @@ static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm,
        return NULL;
 }
 
+static struct kvm_s390_interrupt_info *get_top_io_int(struct kvm *kvm,
+                                                     u64 isc_mask, u32 schid)
+{
+       struct kvm_s390_interrupt_info *inti = NULL;
+       int isc;
+
+       for (isc = 0; isc <= MAX_ISC && !inti; isc++) {
+               if (isc_mask & isc_to_isc_bits(isc))
+                       inti = get_io_int(kvm, isc, schid);
+       }
+       return inti;
+}
+
+static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid)
+{
+       unsigned long active_mask;
+       int isc;
+
+       if (schid)
+               goto out;
+       if (!kvm->arch.gisa)
+               goto out;
+
+       active_mask = (isc_mask & kvm_s390_gisa_get_ipm(kvm->arch.gisa) << 24) << 32;
+       while (active_mask) {
+               isc = __fls(active_mask) ^ (BITS_PER_LONG - 1);
+               if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, isc))
+                       return isc;
+               clear_bit_inv(isc, &active_mask);
+       }
+out:
+       return -EINVAL;
+}
+
 /*
  * Dequeue and return an I/O interrupt matching any of the interruption
  * subclasses as designated by the isc mask in cr6 and the schid (if != 0).
+ * Take into account the interrupts pending in the interrupt list and in GISA.
+ *
+ * Note that for a guest that does not enable I/O interrupts
+ * but relies on TPI, a flood of classic interrupts may starve
+ * out adapter interrupts on the same isc. Linux does not do
+ * that, and it is possible to work around the issue by configuring
+ * different iscs for classic and adapter interrupts in the guest,
+ * but we may want to revisit this in the future.
  */
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
                                                    u64 isc_mask, u32 schid)
 {
-       struct kvm_s390_interrupt_info *inti = NULL;
+       struct kvm_s390_interrupt_info *inti, *tmp_inti;
        int isc;
 
-       for (isc = 0; isc <= MAX_ISC && !inti; isc++) {
-               if (isc_mask & isc_to_isc_bits(isc))
-                       inti = get_io_int(kvm, isc, schid);
+       inti = get_top_io_int(kvm, isc_mask, schid);
+
+       isc = get_top_gisa_isc(kvm, isc_mask, schid);
+       if (isc < 0)
+               /* no AI in GISA */
+               goto out;
+
+       if (!inti)
+               /* AI in GISA but no classical IO int */
+               goto gisa_out;
+
+       /* both types of interrupts present */
+       if (int_word_to_isc(inti->io.io_int_word) <= isc) {
+               /* classical IO int with higher priority */
+               kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc);
+               goto out;
        }
+gisa_out:
+       tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+       if (tmp_inti) {
+               tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0);
+               tmp_inti->io.io_int_word = isc_to_int_word(isc);
+               if (inti)
+                       kvm_s390_reinject_io_int(kvm, inti);
+               inti = tmp_inti;
+       } else
+               kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc);
+out:
        return inti;
 }
 
@@ -1517,6 +1638,15 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
        struct list_head *list;
        int isc;
 
+       isc = int_word_to_isc(inti->io.io_int_word);
+
+       if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) {
+               VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc);
+               kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc);
+               kfree(inti);
+               return 0;
+       }
+
        fi = &kvm->arch.float_int;
        spin_lock(&fi->lock);
        if (fi->counters[FIRQ_CNTR_IO] >= KVM_S390_MAX_FLOAT_IRQS) {
@@ -1532,7 +1662,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
                        inti->io.subchannel_id >> 8,
                        inti->io.subchannel_id >> 1 & 0x3,
                        inti->io.subchannel_nr);
-       isc = int_word_to_isc(inti->io.io_int_word);
        list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
        list_add_tail(&inti->list, list);
        set_bit(isc_to_irq_type(isc), &fi->pending_irqs);
@@ -1546,7 +1675,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 static void __floating_irq_kick(struct kvm *kvm, u64 type)
 {
        struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
-       struct kvm_s390_local_interrupt *li;
        struct kvm_vcpu *dst_vcpu;
        int sigcpu, online_vcpus, nr_tries = 0;
 
@@ -1568,20 +1696,17 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
        dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
 
        /* make the VCPU drop out of the SIE, or wake it up if sleeping */
-       li = &dst_vcpu->arch.local_int;
-       spin_lock(&li->lock);
        switch (type) {
        case KVM_S390_MCHK:
-               atomic_or(CPUSTAT_STOP_INT, li->cpuflags);
+               kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT);
                break;
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               atomic_or(CPUSTAT_IO_INT, li->cpuflags);
+               kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT);
                break;
        default:
-               atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+               kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_EXT_INT);
                break;
        }
-       spin_unlock(&li->lock);
        kvm_s390_vcpu_wakeup(dst_vcpu);
 }
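/*
 * Rationale sketch for the change above: cpuflags is an atomic_t in the SIE
 * block, so setting a bit through the new kvm_s390_set_cpuflags() helper
 * needs no extra serialization; the local_int lock that used to bracket the
 * atomic_or() is presumably no longer required here.
 */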
 
@@ -1820,6 +1945,7 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
        for (i = 0; i < FIRQ_MAX_COUNT; i++)
                fi->counters[i] = 0;
        spin_unlock(&fi->lock);
+       kvm_s390_gisa_clear(kvm);
 };
 
 static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
@@ -1847,6 +1973,22 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
 
        max_irqs = len / sizeof(struct kvm_s390_irq);
 
+       if (kvm->arch.gisa &&
+           kvm_s390_gisa_get_ipm(kvm->arch.gisa)) {
+               for (i = 0; i <= MAX_ISC; i++) {
+                       if (n == max_irqs) {
+                               /* signal userspace to try again */
+                               ret = -ENOMEM;
+                               goto out_nolock;
+                       }
+                       if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, i)) {
+                               irq = (struct kvm_s390_irq *) &buf[n];
+                               irq->type = KVM_S390_INT_IO(1, 0, 0, 0);
+                               irq->u.io.io_int_word = isc_to_int_word(i);
+                               n++;
+                       }
+               }
+       }
        fi = &kvm->arch.float_int;
        spin_lock(&fi->lock);
        for (i = 0; i < FIRQ_LIST_COUNT; i++) {
@@ -1885,6 +2027,7 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
 
 out:
        spin_unlock(&fi->lock);
+out_nolock:
        if (!ret && n > 0) {
                if (copy_to_user(usrbuf, buf, sizeof(struct kvm_s390_irq) * n))
                        ret = -EFAULT;
@@ -2245,7 +2388,7 @@ static int kvm_s390_inject_airq(struct kvm *kvm,
        struct kvm_s390_interrupt s390int = {
                .type = KVM_S390_INT_IO(1, 0, 0, 0),
                .parm = 0,
-               .parm64 = (adapter->isc << 27) | 0x80000000,
+               .parm64 = isc_to_int_word(adapter->isc),
        };
        int ret = 0;
 
@@ -2687,3 +2830,28 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
 
        return n;
 }
+
+void kvm_s390_gisa_clear(struct kvm *kvm)
+{
+       if (kvm->arch.gisa) {
+               memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa));
+               kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa;
+               VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa);
+       }
+}
+
+void kvm_s390_gisa_init(struct kvm *kvm)
+{
+       if (css_general_characteristics.aiv) {
+               kvm->arch.gisa = &kvm->arch.sie_page2->gisa;
+               VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa);
+               kvm_s390_gisa_clear(kvm);
+       }
+}
+
+void kvm_s390_gisa_destroy(struct kvm *kvm)
+{
+       if (!kvm->arch.gisa)
+               return;
+       kvm->arch.gisa = NULL;
+}
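/*
 * Call-graph sketch for the three helpers above, as wired up by the
 * kvm-s390.c hunks below:
 *
 *	kvm_arch_init_vm()          -> kvm_s390_gisa_init()    (only with AIV)
 *	kvm_s390_clear_float_irqs() -> kvm_s390_gisa_clear()
 *	kvm_arch_destroy_vm()       -> kvm_s390_gisa_destroy()
 */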
index ec8b68e97d3cd4755074463e467a82474471e7c6..58bee42d7a9de0360980add641f1c304e4ea3873 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * hosting IBM Z kernel virtual machines (s390x)
  *
- * Copyright IBM Corp. 2008, 2017
+ * Copyright IBM Corp. 2008, 2018
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -87,19 +87,31 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
        { "deliver_program_interruption", VCPU_STAT(deliver_program_int) },
        { "exit_wait_state", VCPU_STAT(exit_wait_state) },
+       { "instruction_epsw", VCPU_STAT(instruction_epsw) },
+       { "instruction_gs", VCPU_STAT(instruction_gs) },
+       { "instruction_io_other", VCPU_STAT(instruction_io_other) },
+       { "instruction_lpsw", VCPU_STAT(instruction_lpsw) },
+       { "instruction_lpswe", VCPU_STAT(instruction_lpswe) },
        { "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
+       { "instruction_ptff", VCPU_STAT(instruction_ptff) },
        { "instruction_stidp", VCPU_STAT(instruction_stidp) },
+       { "instruction_sck", VCPU_STAT(instruction_sck) },
+       { "instruction_sckpf", VCPU_STAT(instruction_sckpf) },
        { "instruction_spx", VCPU_STAT(instruction_spx) },
        { "instruction_stpx", VCPU_STAT(instruction_stpx) },
        { "instruction_stap", VCPU_STAT(instruction_stap) },
-       { "instruction_storage_key", VCPU_STAT(instruction_storage_key) },
+       { "instruction_iske", VCPU_STAT(instruction_iske) },
+       { "instruction_ri", VCPU_STAT(instruction_ri) },
+       { "instruction_rrbe", VCPU_STAT(instruction_rrbe) },
+       { "instruction_sske", VCPU_STAT(instruction_sske) },
        { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
-       { "instruction_stsch", VCPU_STAT(instruction_stsch) },
-       { "instruction_chsc", VCPU_STAT(instruction_chsc) },
        { "instruction_essa", VCPU_STAT(instruction_essa) },
        { "instruction_stsi", VCPU_STAT(instruction_stsi) },
        { "instruction_stfl", VCPU_STAT(instruction_stfl) },
+       { "instruction_tb", VCPU_STAT(instruction_tb) },
+       { "instruction_tpi", VCPU_STAT(instruction_tpi) },
        { "instruction_tprot", VCPU_STAT(instruction_tprot) },
+       { "instruction_tsch", VCPU_STAT(instruction_tsch) },
        { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
        { "instruction_sie", VCPU_STAT(instruction_sie) },
        { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
@@ -118,12 +130,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) },
        { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) },
        { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) },
-       { "diagnose_10", VCPU_STAT(diagnose_10) },
-       { "diagnose_44", VCPU_STAT(diagnose_44) },
-       { "diagnose_9c", VCPU_STAT(diagnose_9c) },
-       { "diagnose_258", VCPU_STAT(diagnose_258) },
-       { "diagnose_308", VCPU_STAT(diagnose_308) },
-       { "diagnose_500", VCPU_STAT(diagnose_500) },
+       { "instruction_diag_10", VCPU_STAT(diagnose_10) },
+       { "instruction_diag_44", VCPU_STAT(diagnose_44) },
+       { "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
+       { "instruction_diag_258", VCPU_STAT(diagnose_258) },
+       { "instruction_diag_308", VCPU_STAT(diagnose_308) },
+       { "instruction_diag_500", VCPU_STAT(diagnose_500) },
+       { "instruction_diag_other", VCPU_STAT(diagnose_other) },
        { NULL }
 };
 
@@ -573,7 +586,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
        case KVM_CAP_S390_GS:
                r = -EINVAL;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (test_facility(133)) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 133);
@@ -1084,7 +1097,6 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
                                       struct kvm_device_attr *attr)
 {
        struct kvm_s390_vm_cpu_feat data;
-       int ret = -EBUSY;
 
        if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
                return -EFAULT;
@@ -1094,13 +1106,18 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
                return -EINVAL;
 
        mutex_lock(&kvm->lock);
-       if (!atomic_read(&kvm->online_vcpus)) {
-               bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
-                           KVM_S390_VM_CPU_FEAT_NR_BITS);
-               ret = 0;
+       if (kvm->created_vcpus) {
+               mutex_unlock(&kvm->lock);
+               return -EBUSY;
        }
+       bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
        mutex_unlock(&kvm->lock);
-       return ret;
+       VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
+                        data.feat[0],
+                        data.feat[1],
+                        data.feat[2]);
+       return 0;
 }
 
 static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
@@ -1202,6 +1219,10 @@ static int kvm_s390_get_processor_feat(struct kvm *kvm,
                    KVM_S390_VM_CPU_FEAT_NR_BITS);
        if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
                return -EFAULT;
+       VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
+                        data.feat[0],
+                        data.feat[1],
+                        data.feat[2]);
        return 0;
 }
 
@@ -1215,6 +1236,10 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm,
                    KVM_S390_VM_CPU_FEAT_NR_BITS);
        if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
                return -EFAULT;
+       VM_EVENT(kvm, 3, "GET: host feat:  0x%16.16llx.0x%16.16llx.0x%16.16llx",
+                        data.feat[0],
+                        data.feat[1],
+                        data.feat[2]);
        return 0;
 }
 
@@ -1903,6 +1928,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (!kvm->arch.dbf)
                goto out_err;
 
+       BUILD_BUG_ON(sizeof(struct sie_page2) != 4096);
        kvm->arch.sie_page2 =
             (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
        if (!kvm->arch.sie_page2)
@@ -1973,6 +1999,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        spin_lock_init(&kvm->arch.start_stop_lock);
        kvm_s390_vsie_init(kvm);
+       kvm_s390_gisa_init(kvm);
        KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
 
        return 0;
@@ -2035,6 +2062,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_free_vcpus(kvm);
        sca_dispose(kvm);
        debug_unregister(kvm->arch.dbf);
+       kvm_s390_gisa_destroy(kvm);
        free_page((unsigned long)kvm->arch.sie_page2);
        if (!kvm_is_ucontrol(kvm))
                gmap_remove(kvm->arch.gmap);
@@ -2304,7 +2332,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 
        gmap_enable(vcpu->arch.enabled_gmap);
-       atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING);
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __start_cpu_timer_accounting(vcpu);
        vcpu->cpu = cpu;
@@ -2315,7 +2343,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        vcpu->cpu = -1;
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __stop_cpu_timer_accounting(vcpu);
-       atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING);
        vcpu->arch.enabled_gmap = gmap_get_enabled();
        gmap_disable(vcpu->arch.enabled_gmap);
 
@@ -2411,9 +2439,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
                                                    CPUSTAT_STOPPED);
 
        if (test_kvm_facility(vcpu->kvm, 78))
-               atomic_or(CPUSTAT_GED2, &vcpu->arch.sie_block->cpuflags);
+               kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED2);
        else if (test_kvm_facility(vcpu->kvm, 8))
-               atomic_or(CPUSTAT_GED, &vcpu->arch.sie_block->cpuflags);
+               kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED);
 
        kvm_s390_vcpu_setup_model(vcpu);
 
@@ -2445,12 +2473,17 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        if (test_kvm_facility(vcpu->kvm, 139))
                vcpu->arch.sie_block->ecd |= ECD_MEF;
 
+       if (vcpu->arch.sie_block->gd) {
+               vcpu->arch.sie_block->eca |= ECA_AIV;
+               VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
+                          vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id);
+       }
        vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
                                        | SDNXC;
        vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
 
        if (sclp.has_kss)
-               atomic_or(CPUSTAT_KSS, &vcpu->arch.sie_block->cpuflags);
+               kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS);
        else
                vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
 
@@ -2497,9 +2530,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
        vcpu->arch.sie_block->icpua = id;
        spin_lock_init(&vcpu->arch.local_int.lock);
-       vcpu->arch.local_int.float_int = &kvm->arch.float_int;
-       vcpu->arch.local_int.wq = &vcpu->wq;
-       vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
+       vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa;
+       if (vcpu->arch.sie_block->gd && sclp.has_gisaf)
+               vcpu->arch.sie_block->gd |= GISA_FORMAT1;
        seqcount_init(&vcpu->arch.cputm_seqcount);
 
        rc = kvm_vcpu_init(vcpu, kvm, id);
@@ -2556,7 +2589,7 @@ static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
  * return immediately. */
 void exit_sie(struct kvm_vcpu *vcpu)
 {
-       atomic_or(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
        while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
                cpu_relax();
 }
@@ -2709,47 +2742,70 @@ static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+       vcpu_load(vcpu);
        memcpy(&vcpu->run->s.regs.gprs, &regs->gprs, sizeof(regs->gprs));
+       vcpu_put(vcpu);
        return 0;
 }
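/*
 * Pattern applied to the vcpu ioctl handlers from here on (sketch; it
 * assumes the common-code change this series builds on, which moves
 * vcpu_load()/vcpu_put() out of the generic vcpu ioctl path and into the
 * architecture handlers):
 *
 *	vcpu_load(vcpu);	// make this vCPU's state current on this CPU
 *	... access vcpu->run / sie_block state ...
 *	vcpu_put(vcpu);		// hand the state back before returning
 */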
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+       vcpu_load(vcpu);
        memcpy(&regs->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs));
+       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
+       vcpu_load(vcpu);
+
        memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs));
        memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
+
+       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
+       vcpu_load(vcpu);
+
        memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs));
        memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
+
+       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       if (test_fp_ctl(fpu->fpc))
-               return -EINVAL;
+       int ret = 0;
+
+       vcpu_load(vcpu);
+
+       if (test_fp_ctl(fpu->fpc)) {
+               ret = -EINVAL;
+               goto out;
+       }
        vcpu->run->s.regs.fpc = fpu->fpc;
        if (MACHINE_HAS_VX)
                convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs,
                                 (freg_t *) fpu->fprs);
        else
                memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs));
-       return 0;
+
+out:
+       vcpu_put(vcpu);
+       return ret;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
+       vcpu_load(vcpu);
+
        /* make sure we have the latest values */
        save_fpu_regs();
        if (MACHINE_HAS_VX)
@@ -2758,6 +2814,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
        else
                memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs));
        fpu->fpc = vcpu->run->s.regs.fpc;
+
+       vcpu_put(vcpu);
        return 0;
 }
 
@@ -2789,41 +2847,56 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 {
        int rc = 0;
 
+       vcpu_load(vcpu);
+
        vcpu->guest_debug = 0;
        kvm_s390_clear_bp_data(vcpu);
 
-       if (dbg->control & ~VALID_GUESTDBG_FLAGS)
-               return -EINVAL;
-       if (!sclp.has_gpere)
-               return -EINVAL;
+       if (dbg->control & ~VALID_GUESTDBG_FLAGS) {
+               rc = -EINVAL;
+               goto out;
+       }
+       if (!sclp.has_gpere) {
+               rc = -EINVAL;
+               goto out;
+       }
 
        if (dbg->control & KVM_GUESTDBG_ENABLE) {
                vcpu->guest_debug = dbg->control;
                /* enforce guest PER */
-               atomic_or(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+               kvm_s390_set_cpuflags(vcpu, CPUSTAT_P);
 
                if (dbg->control & KVM_GUESTDBG_USE_HW_BP)
                        rc = kvm_s390_import_bp_data(vcpu, dbg);
        } else {
-               atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+               kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P);
                vcpu->arch.guestdbg.last_bp = 0;
        }
 
        if (rc) {
                vcpu->guest_debug = 0;
                kvm_s390_clear_bp_data(vcpu);
-               atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+               kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P);
        }
 
+out:
+       vcpu_put(vcpu);
        return rc;
 }
 
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
+       int ret;
+
+       vcpu_load(vcpu);
+
        /* CHECK_STOP and LOAD are not supported yet */
-       return is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED :
-                                      KVM_MP_STATE_OPERATING;
+       ret = is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED :
+                                     KVM_MP_STATE_OPERATING;
+
+       vcpu_put(vcpu);
+       return ret;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
@@ -2831,6 +2904,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 {
        int rc = 0;
 
+       vcpu_load(vcpu);
+
        /* user space knows about this interface - let it control the state */
        vcpu->kvm->arch.user_cpu_state_ctrl = 1;
 
@@ -2848,12 +2923,13 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                rc = -ENXIO;
        }
 
+       vcpu_put(vcpu);
        return rc;
 }
 
 static bool ibs_enabled(struct kvm_vcpu *vcpu)
 {
-       return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS;
+       return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS);
 }
 
 static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
@@ -2889,8 +2965,7 @@ retry:
        if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) {
                if (!ibs_enabled(vcpu)) {
                        trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1);
-                       atomic_or(CPUSTAT_IBS,
-                                       &vcpu->arch.sie_block->cpuflags);
+                       kvm_s390_set_cpuflags(vcpu, CPUSTAT_IBS);
                }
                goto retry;
        }
@@ -2898,8 +2973,7 @@ retry:
        if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) {
                if (ibs_enabled(vcpu)) {
                        trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0);
-                       atomic_andnot(CPUSTAT_IBS,
-                                         &vcpu->arch.sie_block->cpuflags);
+                       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IBS);
                }
                goto retry;
        }
@@ -3373,9 +3447,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if (kvm_run->immediate_exit)
                return -EINTR;
 
+       vcpu_load(vcpu);
+
        if (guestdbg_exit_pending(vcpu)) {
                kvm_s390_prepare_debug_exit(vcpu);
-               return 0;
+               rc = 0;
+               goto out;
        }
 
        kvm_sigset_activate(vcpu);
@@ -3385,7 +3462,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        } else if (is_vcpu_stopped(vcpu)) {
                pr_err_ratelimited("can't run stopped vcpu %d\n",
                                   vcpu->vcpu_id);
-               return -EINVAL;
+               rc = -EINVAL;
+               goto out;
        }
 
        sync_regs(vcpu, kvm_run);
@@ -3415,6 +3493,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        kvm_sigset_deactivate(vcpu);
 
        vcpu->stat.exit_userspace++;
+out:
+       vcpu_put(vcpu);
        return rc;
 }
 
@@ -3543,7 +3623,7 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
                __disable_ibs_on_all_vcpus(vcpu->kvm);
        }
 
-       atomic_andnot(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED);
        /*
         * Another VCPU might have used IBS while we were offline.
         * Let's play safe and flush the VCPU at startup.
@@ -3569,7 +3649,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
        /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */
        kvm_s390_clear_stop_irq(vcpu);
 
-       atomic_or(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED);
        __disable_ibs_on_vcpu(vcpu);
 
        for (i = 0; i < online_vcpus; i++) {
@@ -3676,36 +3756,45 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
        return r;
 }
 
-long kvm_arch_vcpu_ioctl(struct file *filp,
-                        unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_async_ioctl(struct file *filp,
+                              unsigned int ioctl, unsigned long arg)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
-       int idx;
-       long r;
 
        switch (ioctl) {
        case KVM_S390_IRQ: {
                struct kvm_s390_irq s390irq;
 
-               r = -EFAULT;
                if (copy_from_user(&s390irq, argp, sizeof(s390irq)))
-                       break;
-               r = kvm_s390_inject_vcpu(vcpu, &s390irq);
-               break;
+                       return -EFAULT;
+               return kvm_s390_inject_vcpu(vcpu, &s390irq);
        }
        case KVM_S390_INTERRUPT: {
                struct kvm_s390_interrupt s390int;
                struct kvm_s390_irq s390irq;
 
-               r = -EFAULT;
                if (copy_from_user(&s390int, argp, sizeof(s390int)))
-                       break;
+                       return -EFAULT;
                if (s390int_to_s390irq(&s390int, &s390irq))
                        return -EINVAL;
-               r = kvm_s390_inject_vcpu(vcpu, &s390irq);
-               break;
+               return kvm_s390_inject_vcpu(vcpu, &s390irq);
+       }
        }
+       return -ENOIOCTLCMD;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       int idx;
+       long r;
+
+       vcpu_load(vcpu);
+
+       switch (ioctl) {
        case KVM_S390_STORE_STATUS:
                idx = srcu_read_lock(&vcpu->kvm->srcu);
                r = kvm_s390_vcpu_store_status(vcpu, arg);
@@ -3830,6 +3919,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        default:
                r = -ENOTTY;
        }
+
+       vcpu_put(vcpu);
        return r;
 }
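/*
 * Note on the split above (hedged rationale, assuming the generic
 * kvm_arch_vcpu_async_ioctl() hook this is built on): KVM_S390_IRQ and
 * KVM_S390_INTERRUPT inject interrupts and may target a vCPU that is
 * currently running, so they are handled without vcpu_load()/vcpu_put(),
 * while every other vcpu ioctl now runs with the vCPU loaded.
 */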
 
index 5e46ba429bcb4dfe4345f531339557b6af71ef40..bd31b37b0e6f83905e7204b2eb439050aaeb1187 100644 (file)
@@ -47,14 +47,29 @@ do { \
          d_args); \
 } while (0)
 
+static inline void kvm_s390_set_cpuflags(struct kvm_vcpu *vcpu, u32 flags)
+{
+       atomic_or(flags, &vcpu->arch.sie_block->cpuflags);
+}
+
+static inline void kvm_s390_clear_cpuflags(struct kvm_vcpu *vcpu, u32 flags)
+{
+       atomic_andnot(flags, &vcpu->arch.sie_block->cpuflags);
+}
+
+static inline bool kvm_s390_test_cpuflags(struct kvm_vcpu *vcpu, u32 flags)
+{
+       return (atomic_read(&vcpu->arch.sie_block->cpuflags) & flags) == flags;
+}
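/*
 * Semantics sketch for the helper above: with more than one bit set in
 * @flags it returns true only if *all* of them are set, e.g.
 *
 *	kvm_s390_test_cpuflags(vcpu, CPUSTAT_STOPPED | CPUSTAT_IBS)
 *
 * requires both flags.  The callers converted in this diff all pass a single
 * flag, so behaviour matches the old open-coded atomic_read() & flag tests.
 */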
+
 static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 {
-       return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOPPED;
+       return kvm_s390_test_cpuflags(vcpu, CPUSTAT_STOPPED);
 }
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 {
-       return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
+       return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask);
 }
 
 static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -367,6 +382,9 @@ int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu,
                           void __user *buf, int len);
 int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu,
                           __u8 __user *buf, int len);
+void kvm_s390_gisa_init(struct kvm *kvm);
+void kvm_s390_gisa_clear(struct kvm *kvm);
+void kvm_s390_gisa_destroy(struct kvm *kvm);
 
 /* implemented in guestdbg.c */
 void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu);
index 572496c688cc0c647bd220310bfdc6e9635d4723..125a7ff98e2ad8170472cc4fa94015b117d0978c 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * handling privileged instructions
  *
- * Copyright IBM Corp. 2008, 2013
+ * Copyright IBM Corp. 2008, 2018
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -34,6 +34,8 @@
 
 static int handle_ri(struct kvm_vcpu *vcpu)
 {
+       vcpu->stat.instruction_ri++;
+
        if (test_kvm_facility(vcpu->kvm, 64)) {
                VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (lazy)");
                vcpu->arch.sie_block->ecb3 |= ECB3_RI;
@@ -53,6 +55,8 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
 
 static int handle_gs(struct kvm_vcpu *vcpu)
 {
+       vcpu->stat.instruction_gs++;
+
        if (test_kvm_facility(vcpu->kvm, 133)) {
                VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)");
                preempt_disable();
@@ -85,6 +89,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
        u8 ar;
        u64 op2, val;
 
+       vcpu->stat.instruction_sck++;
+
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -203,14 +209,14 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
 
        trace_kvm_s390_skey_related_inst(vcpu);
        if (!(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) &&
-           !(atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS))
+           !kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS))
                return rc;
 
        rc = s390_enable_skey();
        VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
        if (!rc) {
-               if (atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS)
-                       atomic_andnot(CPUSTAT_KSS, &sie_block->cpuflags);
+               if (kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS))
+                       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS);
                else
                        sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE |
                                             ICTL_RRBE);
@@ -222,7 +228,6 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
 {
        int rc;
 
-       vcpu->stat.instruction_storage_key++;
        rc = kvm_s390_skey_check_enable(vcpu);
        if (rc)
                return rc;
@@ -242,6 +247,8 @@ static int handle_iske(struct kvm_vcpu *vcpu)
        int reg1, reg2;
        int rc;
 
+       vcpu->stat.instruction_iske++;
+
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -274,6 +281,8 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
        int reg1, reg2;
        int rc;
 
+       vcpu->stat.instruction_rrbe++;
+
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -312,6 +321,8 @@ static int handle_sske(struct kvm_vcpu *vcpu)
        int reg1, reg2;
        int rc;
 
+       vcpu->stat.instruction_sske++;
+
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -392,6 +403,8 @@ static int handle_test_block(struct kvm_vcpu *vcpu)
        gpa_t addr;
        int reg2;
 
+       vcpu->stat.instruction_tb++;
+
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -424,6 +437,8 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
        u64 addr;
        u8 ar;
 
+       vcpu->stat.instruction_tpi++;
+
        addr = kvm_s390_get_base_disp_s(vcpu, &ar);
        if (addr & 3)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -484,6 +499,8 @@ static int handle_tsch(struct kvm_vcpu *vcpu)
        struct kvm_s390_interrupt_info *inti = NULL;
        const u64 isc_mask = 0xffUL << 24; /* all iscs set */
 
+       vcpu->stat.instruction_tsch++;
+
        /* a valid schid has at least one bit set */
        if (vcpu->run->s.regs.gprs[1])
                inti = kvm_s390_get_io_int(vcpu->kvm, isc_mask,
@@ -527,6 +544,7 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
                if (vcpu->arch.sie_block->ipa == 0xb235)
                        return handle_tsch(vcpu);
                /* Handle in userspace. */
+               vcpu->stat.instruction_io_other++;
                return -EOPNOTSUPP;
        } else {
                /*
@@ -592,6 +610,8 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
        int rc;
        u8 ar;
 
+       vcpu->stat.instruction_lpsw++;
+
        if (gpsw->mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -619,6 +639,8 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
        int rc;
        u8 ar;
 
+       vcpu->stat.instruction_lpswe++;
+
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -828,6 +850,8 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
 {
        int reg1, reg2;
 
+       vcpu->stat.instruction_epsw++;
+
        kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
 
        /* This basically extracts the mask half of the psw. */
@@ -1332,6 +1356,8 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
 {
        u32 value;
 
+       vcpu->stat.instruction_sckpf++;
+
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -1347,6 +1373,8 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
 
 static int handle_ptff(struct kvm_vcpu *vcpu)
 {
+       vcpu->stat.instruction_ptff++;
+
        /* we don't emulate any control instructions yet */
        kvm_s390_set_psw_cc(vcpu, 3);
        return 0;
index c1f5cde2c878e63e32d44a47a10e3c77ccfc32ae..683036c1c92a8f9428622c8ba5b897b86cd3e6ae 100644 (file)
 static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
                        u64 *reg)
 {
-       struct kvm_s390_local_interrupt *li;
-       int cpuflags;
+       const bool stopped = kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_STOPPED);
        int rc;
        int ext_call_pending;
 
-       li = &dst_vcpu->arch.local_int;
-
-       cpuflags = atomic_read(li->cpuflags);
        ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu);
-       if (!(cpuflags & CPUSTAT_STOPPED) && !ext_call_pending)
+       if (!stopped && !ext_call_pending)
                rc = SIGP_CC_ORDER_CODE_ACCEPTED;
        else {
                *reg &= 0xffffffff00000000UL;
                if (ext_call_pending)
                        *reg |= SIGP_STATUS_EXT_CALL_PENDING;
-               if (cpuflags & CPUSTAT_STOPPED)
+               if (stopped)
                        *reg |= SIGP_STATUS_STOPPED;
                rc = SIGP_CC_STATUS_STORED;
        }
@@ -208,11 +204,9 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu,
                                       struct kvm_vcpu *dst_vcpu,
                                       u32 addr, u64 *reg)
 {
-       int flags;
        int rc;
 
-       flags = atomic_read(dst_vcpu->arch.local_int.cpuflags);
-       if (!(flags & CPUSTAT_STOPPED)) {
+       if (!kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_STOPPED)) {
                *reg &= 0xffffffff00000000UL;
                *reg |= SIGP_STATUS_INCORRECT_STATE;
                return SIGP_CC_STATUS_STORED;
@@ -231,7 +225,6 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu,
 static int __sigp_sense_running(struct kvm_vcpu *vcpu,
                                struct kvm_vcpu *dst_vcpu, u64 *reg)
 {
-       struct kvm_s390_local_interrupt *li;
        int rc;
 
        if (!test_kvm_facility(vcpu->kvm, 9)) {
@@ -240,8 +233,7 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu,
                return SIGP_CC_STATUS_STORED;
        }
 
-       li = &dst_vcpu->arch.local_int;
-       if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) {
+       if (kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_RUNNING)) {
                /* running */
                rc = SIGP_CC_ORDER_CODE_ACCEPTED;
        } else {
index 5d6ae0326d9e8fa2707e8d066488c8313dfef879..6d494ed5907ee1f9718240a2c6072dd2ca9bf60b 100644 (file)
@@ -28,13 +28,23 @@ struct vsie_page {
         * the same offset as that in struct sie_page!
         */
        struct mcck_volatile_info mcck_info;    /* 0x0200 */
-       /* the pinned originial scb */
+       /*
+        * The pinned original scb. Be aware that other VCPUs can modify
+        * it while we read from it. Values that are used for conditions or
+        * are reused conditionally should be accessed via READ_ONCE().
+        */
        struct kvm_s390_sie_block *scb_o;       /* 0x0218 */
        /* the shadow gmap in use by the vsie_page */
        struct gmap *gmap;                      /* 0x0220 */
        /* address of the last reported fault to guest2 */
        unsigned long fault_addr;               /* 0x0228 */
-       __u8 reserved[0x0700 - 0x0230];         /* 0x0230 */
+       /* calculated guest addresses of satellite control blocks */
+       gpa_t sca_gpa;                          /* 0x0230 */
+       gpa_t itdba_gpa;                        /* 0x0238 */
+       gpa_t gvrd_gpa;                         /* 0x0240 */
+       gpa_t riccbd_gpa;                       /* 0x0248 */
+       gpa_t sdnx_gpa;                         /* 0x0250 */
+       __u8 reserved[0x0700 - 0x0258];         /* 0x0258 */
        struct kvm_s390_crypto_cb crycb;        /* 0x0700 */
        __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
 };
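/*
 * Design note (sketch): pin_blocks() below records the guest addresses it
 * pinned in these new fields, so unpin_blocks() can unpin without re-reading
 * scb_o, which another vCPU of the nested guest could have modified in the
 * meantime (see the comment on scb_o above).
 */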
@@ -140,12 +150,13 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
-       u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+       const uint32_t crycbd_o = READ_ONCE(scb_o->crycbd);
+       const u32 crycb_addr = crycbd_o & 0x7ffffff8U;
        unsigned long *b1, *b2;
        u8 ecb3_flags;
 
        scb_s->crycbd = 0;
-       if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+       if (!(crycbd_o & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
                return 0;
        /* format-1 is supported with message-security-assist extension 3 */
        if (!test_kvm_facility(vcpu->kvm, 76))
@@ -183,12 +194,15 @@ static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       /* READ_ONCE does not work on bitfields - use a temporary variable */
+       const uint32_t __new_ibc = scb_o->ibc;
+       const uint32_t new_ibc = READ_ONCE(__new_ibc) & 0x0fffU;
        __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
 
        scb_s->ibc = 0;
        /* ibc installed in g2 and requested for g3 */
-       if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
-               scb_s->ibc = scb_o->ibc & 0x0fffU;
+       if (vcpu->kvm->arch.model.ibc && new_ibc) {
+               scb_s->ibc = new_ibc;
                /* take care of the minimum ibc level of the machine */
                if (scb_s->ibc < min_ibc)
                        scb_s->ibc = min_ibc;
@@ -253,6 +267,10 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       /* READ_ONCE does not work on bitfields - use a temporary variable */
+       const uint32_t __new_prefix = scb_o->prefix;
+       const uint32_t new_prefix = READ_ONCE(__new_prefix);
+       const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE;
        bool had_tx = scb_s->ecb & ECB_TE;
        unsigned long new_mso = 0;
        int rc;
@@ -299,14 +317,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        scb_s->icpua = scb_o->icpua;
 
        if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
-               new_mso = scb_o->mso & 0xfffffffffff00000UL;
+               new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL;
        /* if the hva of the prefix changes, we have to remap the prefix */
-       if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+       if (scb_s->mso != new_mso || scb_s->prefix != new_prefix)
                prefix_unmapped(vsie_page);
        /* SIE will do mso/msl validity and exception checks for us */
        scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
        scb_s->mso = new_mso;
-       scb_s->prefix = scb_o->prefix;
+       scb_s->prefix = new_prefix;
 
        /* We have to definitely flush the tlb if this scb never ran */
        if (scb_s->ihcpu != 0xffffU)
@@ -318,11 +336,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
                scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
        /* transactional execution */
-       if (test_kvm_facility(vcpu->kvm, 73)) {
+       if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
                /* remap the prefix if tx is toggled on */
-               if ((scb_o->ecb & ECB_TE) && !had_tx)
+               if (!had_tx)
                        prefix_unmapped(vsie_page);
-               scb_s->ecb |= scb_o->ecb & ECB_TE;
+               scb_s->ecb |= ECB_TE;
        }
        /* SIMD */
        if (test_kvm_facility(vcpu->kvm, 129)) {
@@ -463,46 +481,42 @@ static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
 /* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
 static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
-       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
        hpa_t hpa;
-       gpa_t gpa;
 
        hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
        if (hpa) {
-               gpa = scb_o->scaol & ~0xfUL;
-               if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
-                       gpa |= (u64) scb_o->scaoh << 32;
-               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               unpin_guest_page(vcpu->kvm, vsie_page->sca_gpa, hpa);
+               vsie_page->sca_gpa = 0;
                scb_s->scaol = 0;
                scb_s->scaoh = 0;
        }
 
        hpa = scb_s->itdba;
        if (hpa) {
-               gpa = scb_o->itdba & ~0xffUL;
-               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               unpin_guest_page(vcpu->kvm, vsie_page->itdba_gpa, hpa);
+               vsie_page->itdba_gpa = 0;
                scb_s->itdba = 0;
        }
 
        hpa = scb_s->gvrd;
        if (hpa) {
-               gpa = scb_o->gvrd & ~0x1ffUL;
-               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               unpin_guest_page(vcpu->kvm, vsie_page->gvrd_gpa, hpa);
+               vsie_page->gvrd_gpa = 0;
                scb_s->gvrd = 0;
        }
 
        hpa = scb_s->riccbd;
        if (hpa) {
-               gpa = scb_o->riccbd & ~0x3fUL;
-               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               unpin_guest_page(vcpu->kvm, vsie_page->riccbd_gpa, hpa);
+               vsie_page->riccbd_gpa = 0;
                scb_s->riccbd = 0;
        }
 
        hpa = scb_s->sdnxo;
        if (hpa) {
-               gpa = scb_o->sdnxo;
-               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               unpin_guest_page(vcpu->kvm, vsie_page->sdnx_gpa, hpa);
+               vsie_page->sdnx_gpa = 0;
                scb_s->sdnxo = 0;
        }
 }
@@ -529,9 +543,9 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        gpa_t gpa;
        int rc = 0;
 
-       gpa = scb_o->scaol & ~0xfUL;
+       gpa = READ_ONCE(scb_o->scaol) & ~0xfUL;
        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
-               gpa |= (u64) scb_o->scaoh << 32;
+               gpa |= (u64) READ_ONCE(scb_o->scaoh) << 32;
        if (gpa) {
                if (!(gpa & ~0x1fffUL))
                        rc = set_validity_icpt(scb_s, 0x0038U);
@@ -547,11 +561,12 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                }
                if (rc)
                        goto unpin;
+               vsie_page->sca_gpa = gpa;
                scb_s->scaoh = (u32)((u64)hpa >> 32);
                scb_s->scaol = (u32)(u64)hpa;
        }
 
-       gpa = scb_o->itdba & ~0xffUL;
+       gpa = READ_ONCE(scb_o->itdba) & ~0xffUL;
        if (gpa && (scb_s->ecb & ECB_TE)) {
                if (!(gpa & ~0x1fffU)) {
                        rc = set_validity_icpt(scb_s, 0x0080U);
@@ -563,10 +578,11 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                        rc = set_validity_icpt(scb_s, 0x0080U);
                        goto unpin;
                }
+               vsie_page->itdba_gpa = gpa;
                scb_s->itdba = hpa;
        }
 
-       gpa = scb_o->gvrd & ~0x1ffUL;
+       gpa = READ_ONCE(scb_o->gvrd) & ~0x1ffUL;
        if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
                if (!(gpa & ~0x1fffUL)) {
                        rc = set_validity_icpt(scb_s, 0x1310U);
@@ -581,10 +597,11 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                        rc = set_validity_icpt(scb_s, 0x1310U);
                        goto unpin;
                }
+               vsie_page->gvrd_gpa = gpa;
                scb_s->gvrd = hpa;
        }
 
-       gpa = scb_o->riccbd & ~0x3fUL;
+       gpa = READ_ONCE(scb_o->riccbd) & ~0x3fUL;
        if (gpa && (scb_s->ecb3 & ECB3_RI)) {
                if (!(gpa & ~0x1fffUL)) {
                        rc = set_validity_icpt(scb_s, 0x0043U);
@@ -597,13 +614,14 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                        goto unpin;
                }
                /* Validity 0x0044 will be checked by SIE */
+               vsie_page->riccbd_gpa = gpa;
                scb_s->riccbd = hpa;
        }
        if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
                unsigned long sdnxc;
 
-               gpa = scb_o->sdnxo & ~0xfUL;
-               sdnxc = scb_o->sdnxo & 0xfUL;
+               gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL;
+               sdnxc = READ_ONCE(scb_o->sdnxo) & 0xfUL;
                if (!gpa || !(gpa & ~0x1fffUL)) {
                        rc = set_validity_icpt(scb_s, 0x10b0U);
                        goto unpin;
@@ -624,6 +642,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                        rc = set_validity_icpt(scb_s, 0x10b0U);
                        goto unpin;
                }
+               vsie_page->sdnx_gpa = gpa;
                scb_s->sdnxo = hpa | sdnxc;
        }
        return 0;
@@ -768,7 +787,7 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page)
 static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
-       __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+       __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U;
 
        if (fac && test_kvm_facility(vcpu->kvm, 7)) {
                retry_vsie_icpt(vsie_page);
@@ -894,7 +913,7 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu,
         * External calls have to lead to a kick of the vcpu and
         * therefore the vsie -> Simulate Wait state.
         */
-       atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
        /*
         * We have to adjust the g3 epoch by the g2 epoch. The epoch will
         * automatically be adjusted on tod clock changes via kvm_sync_clock.
@@ -916,7 +935,7 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu,
  */
 static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
 {
-       atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
        WRITE_ONCE(vcpu->arch.vsie_block, NULL);
 }
 
index 05d459b638f55d563d479eb978a5d60f0e421e1b..2c55a2b9d6c65bde78efacf77c71826001b3d4b1 100644 (file)
@@ -815,27 +815,17 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
  * @ptl: pointer to the spinlock pointer
  *
  * Returns a pointer to the locked pte for a guest address, or NULL
- *
- * Note: Can also be called for shadow gmaps.
  */
 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
                               spinlock_t **ptl)
 {
        unsigned long *table;
 
-       if (gmap_is_shadow(gmap))
-               spin_lock(&gmap->guest_table_lock);
+       BUG_ON(gmap_is_shadow(gmap));
        /* Walk the gmap page table, lock and get pte pointer */
        table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
-       if (!table || *table & _SEGMENT_ENTRY_INVALID) {
-               if (gmap_is_shadow(gmap))
-                       spin_unlock(&gmap->guest_table_lock);
+       if (!table || *table & _SEGMENT_ENTRY_INVALID)
                return NULL;
-       }
-       if (gmap_is_shadow(gmap)) {
-               *ptl = &gmap->guest_table_lock;
-               return pte_offset_map((pmd_t *) table, gaddr);
-       }
        return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
 }
 
@@ -889,8 +879,6 @@ static void gmap_pte_op_end(spinlock_t *ptl)
  * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
  *
  * Called with sg->mm->mmap_sem in read.
- *
- * Note: Can also be called for shadow gmaps.
  */
 static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
                              unsigned long len, int prot, unsigned long bits)
@@ -900,6 +888,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
        pte_t *ptep;
        int rc;
 
+       BUG_ON(gmap_is_shadow(gmap));
        while (len) {
                rc = -EAGAIN;
                ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
@@ -960,7 +949,8 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
  * @val: pointer to the unsigned long value to return
  *
  * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
- * if reading using the virtual address failed.
+ * if reading using the virtual address failed, and -EINVAL if called on a
+ * gmap shadow.
  *
  * Called with gmap->mm->mmap_sem in read.
  */
@@ -971,6 +961,9 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
        pte_t *ptep, pte;
        int rc;
 
+       if (gmap_is_shadow(gmap))
+               return -EINVAL;
+
        while (1) {
                rc = -EAGAIN;
                ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
@@ -1028,18 +1021,17 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
 }
 
 /**
- * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
  * @sg: pointer to the shadow guest address space structure
  * @raddr: rmap address in the shadow gmap
  * @paddr: address in the parent guest address space
  * @len: length of the memory area to protect
- * @prot: indicates access rights: none, read-only or read-write
  *
  * Returns 0 if successfully protected and the rmap was created, -ENOMEM
  * if out of memory and -EFAULT if paddr is invalid.
  */
 static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
-                            unsigned long paddr, unsigned long len, int prot)
+                            unsigned long paddr, unsigned long len)
 {
        struct gmap *parent;
        struct gmap_rmap *rmap;
@@ -1067,7 +1059,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
                ptep = gmap_pte_op_walk(parent, paddr, &ptl);
                if (ptep) {
                        spin_lock(&sg->guest_table_lock);
-                       rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+                       rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
                                             PGSTE_VSIE_BIT);
                        if (!rc)
                                gmap_insert_rmap(sg, vmaddr, rmap);
@@ -1077,7 +1069,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
                radix_tree_preload_end();
                if (rc) {
                        kfree(rmap);
-                       rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+                       rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
                        if (rc)
                                return rc;
                        continue;
@@ -1616,7 +1608,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
        origin = r2t & _REGION_ENTRY_ORIGIN;
        offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
        len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 4);
@@ -1699,7 +1691,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
        origin = r3t & _REGION_ENTRY_ORIGIN;
        offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
        len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 3);
@@ -1783,7 +1775,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
        origin = sgt & _REGION_ENTRY_ORIGIN;
        offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
        len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 2);
@@ -1902,7 +1894,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
        /* Make pgt read-only in parent gmap page table (not the pgste) */
        raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
        origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
-       rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+       rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 1);
@@ -2005,7 +1997,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_page);
  * Called with sg->parent->shadow_lock.
  */
 static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
-                              unsigned long gaddr, pte_t *pte)
+                              unsigned long gaddr)
 {
        struct gmap_rmap *rmap, *rnext, *head;
        unsigned long start, end, bits, raddr;
@@ -2090,7 +2082,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
                        spin_lock(&gmap->shadow_lock);
                        list_for_each_entry_safe(sg, next,
                                                 &gmap->children, list)
-                               gmap_shadow_notify(sg, vmaddr, gaddr, pte);
+                               gmap_shadow_notify(sg, vmaddr, gaddr);
                        spin_unlock(&gmap->shadow_lock);
                }
                if (bits & PGSTE_IN_BIT)
index 800104c8a3edfee7f4f52a33b8451a51ee0ed90a..19f35be95f168dc2cec30e07127520e14efe8ac5 100644 (file)
 #define X86_FEATURE_HW_PSTATE          ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK      ( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME                        ( 7*32+10) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_SEV                        ( 7*32+11) /* AMD Secure Encrypted Virtualization */
 
 #define X86_FEATURE_INTEL_PPIN         ( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT           ( 7*32+15) /* Intel Processor Trace */
index 51679843132829e38ed204eda60d50379a4fabb5..ea7e40e9c1f0f48ed1f15d0238a2ff6c025c7c09 100644 (file)
@@ -86,7 +86,7 @@
                          | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
                          | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
                          | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
-                         | X86_CR4_SMAP | X86_CR4_PKE))
+                         | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
@@ -504,6 +504,7 @@ struct kvm_vcpu_arch {
        int mp_state;
        u64 ia32_misc_enable_msr;
        u64 smbase;
+       u64 smi_count;
        bool tpr_access_reporting;
        u64 ia32_xss;
 
@@ -760,6 +761,15 @@ enum kvm_irqchip_mode {
        KVM_IRQCHIP_SPLIT,        /* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+struct kvm_sev_info {
+       bool active;            /* SEV enabled guest */
+       unsigned int asid;      /* ASID used for this guest */
+       unsigned int handle;    /* SEV firmware handle */
+       int fd;                 /* SEV device fd */
+       unsigned long pages_locked; /* Number of pages locked */
+       struct list_head regions_list;  /* List of registered regions */
+};
+
 struct kvm_arch {
        unsigned int n_used_mmu_pages;
        unsigned int n_requested_mmu_pages;
@@ -847,6 +857,8 @@ struct kvm_arch {
 
        bool x2apic_format;
        bool x2apic_broadcast_quirk_disabled;
+
+       struct kvm_sev_info sev_info;
 };
 
 struct kvm_vm_stat {
@@ -965,7 +977,7 @@ struct kvm_x86_ops {
        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
        void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
-       void (*tlb_flush)(struct kvm_vcpu *vcpu);
+       void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
 
        void (*run)(struct kvm_vcpu *vcpu);
        int (*handle_exit)(struct kvm_vcpu *vcpu);
@@ -1017,6 +1029,7 @@ struct kvm_x86_ops {
        void (*handle_external_intr)(struct kvm_vcpu *vcpu);
        bool (*mpx_supported)(void);
        bool (*xsaves_supported)(void);
+       bool (*umip_emulated)(void);
 
        int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
@@ -1079,6 +1092,10 @@ struct kvm_x86_ops {
        int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
        int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
        int (*enable_smi_window)(struct kvm_vcpu *vcpu);
+
+       int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
+       int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+       int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 };
 
 struct kvm_arch_async_pf {
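
The three mem_enc hooks above are what the new SEV ioctls land on. As a rough illustration (not part of the patch), a VMM reaches them through the KVM_MEMORY_ENCRYPT_OP ioctl on the VM fd; the sketch below assumes the struct kvm_sev_cmd and KVM_SEV_INIT definitions added to include/uapi/linux/kvm.h elsewhere in this series, and elides all error handling beyond the firmware error code.

/*
 * Minimal userspace sketch (not part of the patch): how a VMM might reach
 * the new ->mem_enc_op() hook. Assumes the KVM_MEMORY_ENCRYPT_OP ioctl,
 * struct kvm_sev_cmd and the KVM_SEV_INIT command id added elsewhere in
 * this series.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int sev_vm_init(int vm_fd, int sev_fd)
{
	struct kvm_sev_cmd cmd = {
		.id     = KVM_SEV_INIT,	/* create the SEV context for this VM */
		.sev_fd = sev_fd,	/* fd of the PSP device, /dev/sev */
	};

	/* Routed through kvm_arch_vm_ioctl() into svm.c's mem_enc_op hook. */
	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0) {
		fprintf(stderr, "KVM_SEV_INIT failed, fw error %u\n", cmd.error);
		return -1;
	}
	return 0;
}

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm  = ioctl(kvm, KVM_CREATE_VM, 0);
	int sev = open("/dev/sev", O_RDWR);

	return sev_vm_init(vm, sev) ? 1 : 0;
}
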
index 34c4922bbc3fb5ed95cb0819d7ac6b9cd37a7f41..507d3e30f7fe8ca2bd155a098c8b307994e00604 100644 (file)
 #define MSR_K7_PERFCTR3                        0xc0010007
 #define MSR_K7_CLK_CTL                 0xc001001b
 #define MSR_K7_HWCR                    0xc0010015
+#define MSR_K7_HWCR_SMMLOCK_BIT                0
+#define MSR_K7_HWCR_SMMLOCK            BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
 #define MSR_K7_FID_VID_CTL             0xc0010041
 #define MSR_K7_FID_VID_STATUS          0xc0010042
 
index 8a3ee355b4222e59415cbea01b51beea161733db..92015c65fa2ac95a040a745c5632870530010596 100644 (file)
@@ -22,4 +22,6 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end,
 
 void io_free_memtype(resource_size_t start, resource_size_t end);
 
+bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
+
 #endif /* _ASM_X86_PAT_H */
index 78dd9df881577dc1bc1b2bddf84729ff2fe5fd40..0487ac0548704da1d0bbb6149aa25ffc0d0ce7d6 100644 (file)
@@ -146,6 +146,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
 #define SVM_VM_CR_SVM_DIS_MASK  0x0010ULL
 
+#define SVM_NESTED_CTL_NP_ENABLE       BIT(0)
+#define SVM_NESTED_CTL_SEV_ENABLE      BIT(1)
+
 struct __attribute__ ((__packed__)) vmcb_seg {
        u16 selector;
        u16 attrib;
index 09cc06483bed426ba4e487067258d1016059aba4..7a2ade4aa235380a8c28af6934d30566bb24de73 100644 (file)
@@ -25,6 +25,7 @@
 #define KVM_FEATURE_STEAL_TIME         5
 #define KVM_FEATURE_PV_EOI             6
 #define KVM_FEATURE_PV_UNHALT          7
+#define KVM_FEATURE_PV_TLB_FLUSH       9
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -51,6 +52,9 @@ struct kvm_steal_time {
        __u32 pad[11];
 };
 
+#define KVM_VCPU_PREEMPTED          (1 << 0)
+#define KVM_VCPU_FLUSH_TLB          (1 << 1)
+
 #define KVM_CLOCK_PAIRING_WALLCLOCK 0
 struct kvm_clock_pairing {
        __s64 sec;
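
For illustration only (not part of the patch): the new KVM_FEATURE_PV_TLB_FLUSH bit is advertised to guests in EAX of CPUID leaf 0x40000001, the same leaf kvm_para_has_feature() consults in-kernel. A hand-rolled check could look like the sketch below (it assumes the caller has already verified it is running on KVM via the 0x40000000 signature leaf).

/* Illustrative guest-side sketch, not part of the patch. */
#include <stdint.h>
#include <stdio.h>

#define KVM_CPUID_FEATURES		0x40000001
#define KVM_FEATURE_PV_TLB_FLUSH	9

static uint32_t kvm_features(void)
{
	uint32_t eax = KVM_CPUID_FEATURES, ebx, ecx, edx;

	asm volatile("cpuid"
		     : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	return eax;
}

int main(void)
{
	if (kvm_features() & (1 << KVM_FEATURE_PV_TLB_FLUSH))
		puts("host offers paravirtual remote TLB flush");
	return 0;
}
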
index bcb75dc97d44075d2eecb3137b91f934072352b0..df8a2418aadfcf7f40c19742de45bfc278ad0dbd 100644 (file)
@@ -556,6 +556,51 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
        }
 }
 
+static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
+{
+       u64 msr;
+
+       /*
+        * BIOS support is required for SME and SEV.
+        *   For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+        *            the SME physical address space reduction value.
+        *            If BIOS has not enabled SME then don't advertise the
+        *            SME feature (set in scattered.c).
+        *   For SEV: If BIOS has not enabled SEV then don't advertise the
+        *            SEV feature (set in scattered.c).
+        *
+        *   In all cases, since support for SME and SEV requires long mode,
+        *   don't advertise the feature under CONFIG_X86_32.
+        */
+       if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
+               /* Check if memory encryption is enabled */
+               rdmsrl(MSR_K8_SYSCFG, msr);
+               if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+                       goto clear_all;
+
+               /*
+                * Always adjust physical address bits. Even though this
+                * will be a value above 32-bits this is still done for
+                * CONFIG_X86_32 so that accurate values are reported.
+                */
+               c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+
+               if (IS_ENABLED(CONFIG_X86_32))
+                       goto clear_all;
+
+               rdmsrl(MSR_K7_HWCR, msr);
+               if (!(msr & MSR_K7_HWCR_SMMLOCK))
+                       goto clear_sev;
+
+               return;
+
+clear_all:
+               clear_cpu_cap(c, X86_FEATURE_SME);
+clear_sev:
+               clear_cpu_cap(c, X86_FEATURE_SEV);
+       }
+}
+
 static void early_init_amd(struct cpuinfo_x86 *c)
 {
        u32 dummy;
@@ -627,26 +672,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
        if (cpu_has_amd_erratum(c, amd_erratum_400))
                set_cpu_bug(c, X86_BUG_AMD_E400);
 
-       /*
-        * BIOS support is required for SME. If BIOS has enabled SME then
-        * adjust x86_phys_bits by the SME physical address space reduction
-        * value. If BIOS has not enabled SME then don't advertise the
-        * feature (set in scattered.c). Also, since the SME support requires
-        * long mode, don't advertise the feature under CONFIG_X86_32.
-        */
-       if (cpu_has(c, X86_FEATURE_SME)) {
-               u64 msr;
-
-               /* Check if SME is enabled */
-               rdmsrl(MSR_K8_SYSCFG, msr);
-               if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
-                       c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
-                       if (IS_ENABLED(CONFIG_X86_32))
-                               clear_cpu_cap(c, X86_FEATURE_SME);
-               } else {
-                       clear_cpu_cap(c, X86_FEATURE_SME);
-               }
-       }
+       early_detect_mem_encrypt(c);
 }
 
 static void init_amd_k8(struct cpuinfo_x86 *c)
index 05459ad3db46e2139b7d97514899d398c321c541..63a78d5fe505bd57326581676bfdbfc282ba4d5f 100644 (file)
@@ -32,6 +32,7 @@ static const struct cpuid_bit cpuid_bits[] = {
        { X86_FEATURE_CPB,              CPUID_EDX,  9, 0x80000007, 0 },
        { X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
        { X86_FEATURE_SME,              CPUID_EAX,  0, 0x8000001f, 0 },
+       { X86_FEATURE_SEV,              CPUID_EAX,  1, 0x8000001f, 0 },
        { 0, 0, 0, 0, 0 }
 };
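
For reference (not part of the patch): the scattered SEV bit corresponds to CPUID Fn8000_001F EAX bit 1, and the same leaf carries the C-bit position and the ASID ranges that sev_hardware_setup() reads later in this series. A userspace query might look like this sketch.

/* Sketch only: querying the same CPUID leaf from userspace. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("SME: %u  SEV: %u\n", eax & 1, (eax >> 1) & 1);
	printf("C-bit position: %u\n", ebx & 0x3f);
	printf("encrypted-guest ASIDs: %u (min SEV ASID %u)\n", ecx, edx);
	return 0;
}
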
 
index b40ffbf156c181f69601ed82a09a2292783c524d..4e37d1a851a62df3f9f841f3bbd66827af0c1920 100644 (file)
@@ -498,6 +498,34 @@ static void __init kvm_apf_trap_init(void)
        update_intr_gate(X86_TRAP_PF, async_page_fault);
 }
 
+static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
+
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+                       const struct flush_tlb_info *info)
+{
+       u8 state;
+       int cpu;
+       struct kvm_steal_time *src;
+       struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
+
+       cpumask_copy(flushmask, cpumask);
+       /*
+        * We only need to flush online vCPUs; for preempted vCPUs the
+        * flush is deferred by setting KVM_VCPU_FLUSH_TLB, so it happens
+        * on their next VM entry instead.
+        */
+       for_each_cpu(cpu, flushmask) {
+               src = &per_cpu(steal_time, cpu);
+               state = READ_ONCE(src->preempted);
+               if ((state & KVM_VCPU_PREEMPTED)) {
+                       if (try_cmpxchg(&src->preempted, &state,
+                                       state | KVM_VCPU_FLUSH_TLB))
+                               __cpumask_clear_cpu(cpu, flushmask);
+               }
+       }
+
+       native_flush_tlb_others(flushmask, info);
+}
+
 static void __init kvm_guest_init(void)
 {
        int i;
@@ -517,6 +545,9 @@ static void __init kvm_guest_init(void)
                pv_time_ops.steal_clock = kvm_steal_clock;
        }
 
+       if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH))
+               pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
@@ -598,6 +629,22 @@ static __init int activate_jump_labels(void)
 }
 arch_initcall(activate_jump_labels);
 
+static __init int kvm_setup_pv_tlb_flush(void)
+{
+       int cpu;
+
+       if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) {
+               for_each_possible_cpu(cpu) {
+                       zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
+                               GFP_KERNEL, cpu_to_node(cpu));
+               }
+               pr_info("KVM setup pv remote TLB flush\n");
+       }
+
+       return 0;
+}
+arch_initcall(kvm_setup_pv_tlb_flush);
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
@@ -643,7 +690,7 @@ __visible bool __kvm_vcpu_is_preempted(long cpu)
 {
        struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
 
-       return !!src->preempted;
+       return !!(src->preempted & KVM_VCPU_PREEMPTED);
 }
 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
 
index 3df51c28784428ed111e065595ed9e59ef376933..92fd433c50b9b5135e4ada92dc8968f4c5ed75d4 100644 (file)
@@ -81,6 +81,14 @@ config KVM_AMD
          To compile this as a module, choose M here: the module
          will be called kvm-amd.
 
+config KVM_AMD_SEV
+       def_bool y
+       bool "AMD Secure Encrypted Virtualization (SEV) support"
+       depends on KVM_AMD && X86_64
+       depends on CRYPTO_DEV_CCP && CRYPTO_DEV_CCP_DD && CRYPTO_DEV_SP_PSP
+       ---help---
+       Provides support for launching Encrypted VMs on AMD processors.
+
 config KVM_MMU_AUDIT
        bool "Audit KVM MMU"
        depends on KVM && TRACEPOINTS
index 0099e10eb045253f2be9ec19603c55007d5789d1..ac0041c2f5afe7faa9cb6cb86937a3e06277a097 100644 (file)
@@ -293,13 +293,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
 {
        switch (func) {
        case 0:
-               entry->eax = 1;         /* only one leaf currently */
+               entry->eax = 7;
                ++*nent;
                break;
        case 1:
                entry->ecx = F(MOVBE);
                ++*nent;
                break;
+       case 7:
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               if (index == 0)
+                       entry->ecx = F(RDPID);
+               ++*nent;
        default:
                break;
        }
@@ -327,6 +332,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
        unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
        unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
+       unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
 
        /* cpuid 1.edx */
        const u32 kvm_cpuid_1_edx_x86_features =
@@ -387,8 +393,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
        /* cpuid 7.0.ecx*/
        const u32 kvm_cpuid_7_0_ecx_x86_features =
-               F(AVX512VBMI) | F(LA57) | F(PKU) |
-               0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
+               F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+               F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+               F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG);
 
        /* cpuid 7.0.edx*/
        const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -473,6 +480,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                        entry->ebx |= F(TSC_ADJUST);
                        entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
                        cpuid_mask(&entry->ecx, CPUID_7_ECX);
+                       entry->ecx |= f_umip;
                        /* PKU is not yet implemented for shadow paging. */
                        if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
                                entry->ecx &= ~F(PKU);
@@ -594,7 +602,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                             (1 << KVM_FEATURE_ASYNC_PF) |
                             (1 << KVM_FEATURE_PV_EOI) |
                             (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
-                            (1 << KVM_FEATURE_PV_UNHALT);
+                            (1 << KVM_FEATURE_PV_UNHALT) |
+                            (1 << KVM_FEATURE_PV_TLB_FLUSH);
 
                if (sched_info_on())
                        entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
@@ -604,7 +613,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->edx = 0;
                break;
        case 0x80000000:
-               entry->eax = min(entry->eax, 0x8000001a);
+               entry->eax = min(entry->eax, 0x8000001f);
                break;
        case 0x80000001:
                entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
index abe74f779f9d793e9a6c2f19417f23b5aa7ce484..cb929d0bb1bd115270b3577c77de6cfccaf51274 100644 (file)
@@ -3514,6 +3514,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+       u64 tsc_aux = 0;
+
+       if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+               return emulate_gp(ctxt, 0);
+       ctxt->dst.val = tsc_aux;
+       return X86EMUL_CONTINUE;
+}
+
 static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 {
        u64 tsc = 0;
@@ -3633,17 +3643,27 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
-static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
 {
-       if (ctxt->modrm_reg > VCPU_SREG_GS)
-               return emulate_ud(ctxt);
+       if (segment > VCPU_SREG_GS &&
+           (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+           ctxt->ops->cpl(ctxt) > 0)
+               return emulate_gp(ctxt, 0);
 
-       ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
+       ctxt->dst.val = get_segment_selector(ctxt, segment);
        if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
                ctxt->dst.bytes = 2;
        return X86EMUL_CONTINUE;
 }
 
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+       if (ctxt->modrm_reg > VCPU_SREG_GS)
+               return emulate_ud(ctxt);
+
+       return em_store_sreg(ctxt, ctxt->modrm_reg);
+}
+
 static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
 {
        u16 sel = ctxt->src.val;
@@ -3659,6 +3679,11 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
        return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
 }
 
+static int em_sldt(struct x86_emulate_ctxt *ctxt)
+{
+       return em_store_sreg(ctxt, VCPU_SREG_LDTR);
+}
+
 static int em_lldt(struct x86_emulate_ctxt *ctxt)
 {
        u16 sel = ctxt->src.val;
@@ -3668,6 +3693,11 @@ static int em_lldt(struct x86_emulate_ctxt *ctxt)
        return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
 }
 
+static int em_str(struct x86_emulate_ctxt *ctxt)
+{
+       return em_store_sreg(ctxt, VCPU_SREG_TR);
+}
+
 static int em_ltr(struct x86_emulate_ctxt *ctxt)
 {
        u16 sel = ctxt->src.val;
@@ -3720,6 +3750,10 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
 {
        struct desc_ptr desc_ptr;
 
+       if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+           ctxt->ops->cpl(ctxt) > 0)
+               return emulate_gp(ctxt, 0);
+
        if (ctxt->mode == X86EMUL_MODE_PROT64)
                ctxt->op_bytes = 8;
        get(ctxt, &desc_ptr);
@@ -3779,6 +3813,10 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
 
 static int em_smsw(struct x86_emulate_ctxt *ctxt)
 {
+       if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+           ctxt->ops->cpl(ctxt) > 0)
+               return emulate_gp(ctxt, 0);
+
        if (ctxt->dst.type == OP_MEM)
                ctxt->dst.bytes = 2;
        ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
@@ -4364,8 +4402,8 @@ static const struct opcode group5[] = {
 };
 
 static const struct opcode group6[] = {
-       DI(Prot | DstMem,       sldt),
-       DI(Prot | DstMem,       str),
+       II(Prot | DstMem,          em_sldt, sldt),
+       II(Prot | DstMem,          em_str, str),
        II(Prot | Priv | SrcMem16, em_lldt, lldt),
        II(Prot | Priv | SrcMem16, em_ltr, ltr),
        N, N, N, N,
@@ -4396,10 +4434,20 @@ static const struct opcode group8[] = {
        F(DstMem | SrcImmByte | Lock | PageTable,       em_btc),
 };
 
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+       N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+};
+
 static const struct group_dual group9 = { {
        N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
-       N, N, N, N, N, N, N, N,
+       N, N, N, N, N, N, N,
+       GP(0, &pfx_0f_c7_7),
 } };
 
 static const struct opcode group11[] = {
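
The descriptor-table and SMSW/SLDT/STR changes above give the emulator UMIP semantics. As a hedged guest-side illustration (not part of the patch), the sketch below probes whether UMIP, native or emulated, is in effect; it assumes the guest kernel does not itself emulate STR when the #GP arrives, in which case the process observes SIGSEGV.

/* Guest userspace sketch, not part of the patch. */
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf env;

static void on_sigsegv(int sig)
{
	(void)sig;
	siglongjmp(env, 1);
}

int main(void)
{
	unsigned short tr = 0;

	signal(SIGSEGV, on_sigsegv);
	if (sigsetjmp(env, 1) == 0) {
		/* With CR4.UMIP set, STR at CPL 3 raises #GP(0). */
		asm volatile("str %0" : "=m"(tr));
		printf("STR executed, TR selector %#x visible at CPL 3\n", tr);
	} else {
		puts("STR faulted: UMIP is in effect");
	}
	return 0;
}
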
index 5c24811e8b0bcac141dfd1e58dcc052b2fca69d7..f171051eecf3473f529abeebeef80e6893ed672a 100644 (file)
@@ -79,7 +79,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
        if (kvm_cpu_has_extint(v))
                return 1;
 
-       if (kvm_vcpu_apicv_active(v))
+       if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v))
                return 0;
 
        return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
index e2c1fb8d35cea28af684d4ba76d70a5e2e12e9a5..924ac8ce9d5004f9db4126a81f178c2bf0e6ff40 100644 (file)
@@ -364,32 +364,41 @@ static u8 count_vectors(void *bitmap)
        return count;
 }
 
-int __kvm_apic_update_irr(u32 *pir, void *regs)
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
 {
        u32 i, vec;
-       u32 pir_val, irr_val;
-       int max_irr = -1;
+       u32 pir_val, irr_val, prev_irr_val;
+       int max_updated_irr;
+
+       max_updated_irr = -1;
+       *max_irr = -1;
 
        for (i = vec = 0; i <= 7; i++, vec += 32) {
                pir_val = READ_ONCE(pir[i]);
                irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
                if (pir_val) {
+                       prev_irr_val = irr_val;
                        irr_val |= xchg(&pir[i], 0);
                        *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
+                       if (prev_irr_val != irr_val) {
+                               max_updated_irr =
+                                       __fls(irr_val ^ prev_irr_val) + vec;
+                       }
                }
                if (irr_val)
-                       max_irr = __fls(irr_val) + vec;
+                       *max_irr = __fls(irr_val) + vec;
        }
 
-       return max_irr;
+       return ((max_updated_irr != -1) &&
+               (max_updated_irr == *max_irr));
 }
 EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
 
-int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       return __kvm_apic_update_irr(pir, apic->regs);
+       return __kvm_apic_update_irr(pir, apic->regs, max_irr);
 }
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
@@ -581,7 +590,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
 {
        int highest_irr;
-       if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active)
+       if (apic->vcpu->arch.apicv_active)
                highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
        else
                highest_irr = apic_find_highest_irr(apic);
index 4b9935a38347927ed36048e499cd13a10580f8d4..56c36014f7b76006fbb921da411e611715c901a6 100644 (file)
@@ -75,8 +75,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int short_hand, unsigned int dest, int dest_mode);
 
-int __kvm_apic_update_irr(u32 *pir, void *regs);
-int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
 void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
                     struct dest_map *dest_map);
index e5e66e5c664057bb5cc5ad2660008ccbf19b69e5..1f1da400fcdeff04345736826787a5fea9b75ff4 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/kern_levels.h>
 
 #include <asm/page.h>
+#include <asm/pat.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 #include <asm/vmx.h>
@@ -381,7 +382,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-void kvm_mmu_clear_all_pte_masks(void)
+static void kvm_mmu_clear_all_pte_masks(void)
 {
        shadow_user_mask = 0;
        shadow_accessed_mask = 0;
@@ -2708,7 +2709,18 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
 {
        if (pfn_valid(pfn))
-               return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+               return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+                       /*
+                        * Some reserved pages, such as those from NVDIMM
+                        * DAX devices, are not for MMIO, and can be mapped
+                        * with cached memory type for better performance.
+                        * However, the above check misconceives those pages
+                        * as MMIO, and results in KVM mapping them with UC
+                        * memory type, which would hurt the performance.
+                        * Therefore, we check the host memory type in addition
+                        * and only treat UC/UC-/WC pages as MMIO.
+                        */
+                       (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
 
        return true;
 }
@@ -3395,7 +3407,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
                spin_lock(&vcpu->kvm->mmu_lock);
                if(make_mmu_pages_available(vcpu) < 0) {
                        spin_unlock(&vcpu->kvm->mmu_lock);
-                       return 1;
+                       return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, 0, 0,
                                vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
@@ -3410,7 +3422,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
                        spin_lock(&vcpu->kvm->mmu_lock);
                        if (make_mmu_pages_available(vcpu) < 0) {
                                spin_unlock(&vcpu->kvm->mmu_lock);
-                               return 1;
+                               return -ENOSPC;
                        }
                        sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
                                        i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
@@ -3450,7 +3462,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                spin_lock(&vcpu->kvm->mmu_lock);
                if (make_mmu_pages_available(vcpu) < 0) {
                        spin_unlock(&vcpu->kvm->mmu_lock);
-                       return 1;
+                       return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
                                vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
@@ -3487,7 +3499,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                spin_lock(&vcpu->kvm->mmu_lock);
                if (make_mmu_pages_available(vcpu) < 0) {
                        spin_unlock(&vcpu->kvm->mmu_lock);
-                       return 1;
+                       return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
                                      0, ACC_ALL);
@@ -4950,6 +4962,16 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
        if (mmio_info_in_cache(vcpu, cr2, direct))
                emulation_type = 0;
 emulate:
+       /*
+        * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
+        * This can happen if a guest gets a page-fault on data access but the HW
+        * table walker is not able to read the instruction page (e.g. the
+        * instruction page is not present in memory). In those cases we simply
+        * restart the guest.
+        */
+       if (unlikely(insn && !insn_len))
+               return 1;
+
        er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
        switch (er) {
index d22ddbdf5e6ed5bdab10fc73445f88542d38143a..1272861e77b9ec9868ea3b3f640a962d3cb18631 100644 (file)
@@ -19,7 +19,7 @@
 
 #include <linux/ratelimit.h>
 
-char const *audit_point_name[] = {
+static char const *audit_point_name[] = {
        "pre page fault",
        "post page fault",
        "pre pte write",
index eb714f1cdf7eee4ca9036005c3ab72ef9228ae9b..b613d331d0310e1d9acdb30a01d9182236cc761a 100644 (file)
 #include <linux/amd-iommu.h>
 #include <linux/hashtable.h>
 #include <linux/frame.h>
+#include <linux/psp-sev.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -211,6 +215,9 @@ struct vcpu_svm {
         */
        struct list_head ir_list;
        spinlock_t ir_list_lock;
+
+       /* which host CPU was used for running this vcpu */
+       unsigned int last_cpu;
 };
 
 /*
@@ -284,8 +291,12 @@ module_param(vls, int, 0444);
 static int vgif = true;
 module_param(vgif, int, 0444);
 
+/* enable/disable SEV support */
+static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+module_param(sev, int, 0444);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm);
@@ -319,6 +330,38 @@ enum {
 
 #define VMCB_AVIC_APIC_BAR_MASK                0xFFFFFFFFFF000ULL
 
+static unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long *sev_asid_bitmap;
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+struct enc_region {
+       struct list_head list;
+       unsigned long npages;
+       struct page **pages;
+       unsigned long uaddr;
+       unsigned long size;
+};
+
+static inline bool svm_sev_enabled(void)
+{
+       return max_sev_asid;
+}
+
+static inline bool sev_guest(struct kvm *kvm)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+       return sev->active;
+}
+
+static inline int sev_get_asid(struct kvm *kvm)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+       return sev->asid;
+}
+
 static inline void mark_all_dirty(struct vmcb *vmcb)
 {
        vmcb->control.clean = 0;
@@ -531,9 +574,13 @@ struct svm_cpu_data {
        u64 asid_generation;
        u32 max_asid;
        u32 next_asid;
+       u32 min_asid;
        struct kvm_ldttss_desc *tss_desc;
 
        struct page *save_area;
+
+       /* index = sev_asid, value = vmcb pointer */
+       struct vmcb **sev_vmcbs;
 };
 
 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
@@ -788,6 +835,7 @@ static int svm_hardware_enable(void)
        sd->asid_generation = 1;
        sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
        sd->next_asid = sd->max_asid + 1;
+       sd->min_asid = max_sev_asid + 1;
 
        gdt = get_current_gdt_rw();
        sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
@@ -846,6 +894,7 @@ static void svm_cpu_uninit(int cpu)
                return;
 
        per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+       kfree(sd->sev_vmcbs);
        __free_page(sd->save_area);
        kfree(sd);
 }
@@ -859,11 +908,18 @@ static int svm_cpu_init(int cpu)
        if (!sd)
                return -ENOMEM;
        sd->cpu = cpu;
-       sd->save_area = alloc_page(GFP_KERNEL);
        r = -ENOMEM;
+       sd->save_area = alloc_page(GFP_KERNEL);
        if (!sd->save_area)
                goto err_1;
 
+       if (svm_sev_enabled()) {
+               r = -ENOMEM;
+               sd->sev_vmcbs = kmalloc((max_sev_asid + 1) * sizeof(void *), GFP_KERNEL);
+               if (!sd->sev_vmcbs)
+                       goto err_1;
+       }
+
        per_cpu(svm_data, cpu) = sd;
 
        return 0;
@@ -1051,6 +1107,48 @@ static int avic_ga_log_notifier(u32 ga_tag)
        return 0;
 }
 
+static __init int sev_hardware_setup(void)
+{
+       struct sev_user_data_status *status;
+       int rc;
+
+       /* Maximum number of encrypted guests supported simultaneously */
+       max_sev_asid = cpuid_ecx(0x8000001F);
+
+       if (!max_sev_asid)
+               return 1;
+
+       /* Minimum ASID value that should be used for SEV guest */
+       min_sev_asid = cpuid_edx(0x8000001F);
+
+       /* Initialize SEV ASID bitmap */
+       sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid),
+                               sizeof(unsigned long), GFP_KERNEL);
+       if (!sev_asid_bitmap)
+               return 1;
+
+       status = kmalloc(sizeof(*status), GFP_KERNEL);
+       if (!status)
+               return 1;
+
+       /*
+        * Check SEV platform status.
+        *
+        * PLATFORM_STATUS can be called in any state. If the query fails,
+        * then either the PSP firmware does not support the SEV feature or
+        * the SEV firmware is dead.
+        */
+       rc = sev_platform_status(status, NULL);
+       if (rc)
+               goto err;
+
+       pr_info("SEV supported\n");
+
+err:
+       kfree(status);
+       return rc;
+}
+
 static __init int svm_hardware_setup(void)
 {
        int cpu;
@@ -1086,6 +1184,17 @@ static __init int svm_hardware_setup(void)
                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
        }
 
+       if (sev) {
+               if (boot_cpu_has(X86_FEATURE_SEV) &&
+                   IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
+                       r = sev_hardware_setup();
+                       if (r)
+                               sev = false;
+               } else {
+                       sev = false;
+               }
+       }
+
        for_each_possible_cpu(cpu) {
                r = svm_cpu_init(cpu);
                if (r)
@@ -1147,6 +1256,9 @@ static __exit void svm_hardware_unsetup(void)
 {
        int cpu;
 
+       if (svm_sev_enabled())
+               kfree(sev_asid_bitmap);
+
        for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);
 
@@ -1299,7 +1411,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
-               control->nested_ctl = 1;
+               control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
                clr_intercept(svm, INTERCEPT_INVLPG);
                clr_exception_intercept(svm, PF_VECTOR);
                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
@@ -1337,6 +1449,11 @@ static void init_vmcb(struct vcpu_svm *svm)
                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
        }
 
+       if (sev_guest(svm->vcpu.kvm)) {
+               svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+               clr_exception_intercept(svm, UD_VECTOR);
+       }
+
        mark_all_dirty(svm->vmcb);
 
        enable_gif(svm);
@@ -1419,6 +1536,179 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static void __sev_asid_free(int asid)
+{
+       struct svm_cpu_data *sd;
+       int cpu, pos;
+
+       pos = asid - 1;
+       clear_bit(pos, sev_asid_bitmap);
+
+       for_each_possible_cpu(cpu) {
+               sd = per_cpu(svm_data, cpu);
+               sd->sev_vmcbs[pos] = NULL;
+       }
+}
+
+static void sev_asid_free(struct kvm *kvm)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+       __sev_asid_free(sev->asid);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+       struct sev_data_decommission *decommission;
+       struct sev_data_deactivate *data;
+
+       if (!handle)
+               return;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return;
+
+       /* deactivate handle */
+       data->handle = handle;
+       sev_guest_deactivate(data, NULL);
+
+       wbinvd_on_all_cpus();
+       sev_guest_df_flush(NULL);
+       kfree(data);
+
+       decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
+       if (!decommission)
+               return;
+
+       /* decommission handle */
+       decommission->handle = handle;
+       sev_guest_decommission(decommission, NULL);
+
+       kfree(decommission);
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+                                   unsigned long ulen, unsigned long *n,
+                                   int write)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       unsigned long npages, npinned, size;
+       unsigned long locked, lock_limit;
+       struct page **pages;
+       int first, last;
+
+       /* Calculate number of pages. */
+       first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+       last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+       npages = (last - first + 1);
+
+       locked = sev->pages_locked + npages;
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+       if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+               pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+               return NULL;
+       }
+
+       /* Avoid using vmalloc for smaller buffers. */
+       size = npages * sizeof(struct page *);
+       if (size > PAGE_SIZE)
+               pages = vmalloc(size);
+       else
+               pages = kmalloc(size, GFP_KERNEL);
+
+       if (!pages)
+               return NULL;
+
+       /* Pin the user virtual address. */
+       npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+       if (npinned != npages) {
+               pr_err("SEV: Failure locking %lu pages.\n", npages);
+               goto err;
+       }
+
+       *n = npages;
+       sev->pages_locked = locked;
+
+       return pages;
+
+err:
+       if (npinned > 0)
+               release_pages(pages, npinned);
+
+       kvfree(pages);
+       return NULL;
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+                            unsigned long npages)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+       release_pages(pages, npages);
+       kvfree(pages);
+       sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+       uint8_t *page_virtual;
+       unsigned long i;
+
+       if (npages == 0 || pages == NULL)
+               return;
+
+       for (i = 0; i < npages; i++) {
+               page_virtual = kmap_atomic(pages[i]);
+               clflush_cache_range(page_virtual, PAGE_SIZE);
+               kunmap_atomic(page_virtual);
+       }
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+                                          struct enc_region *region)
+{
+       /*
+        * The guest may change the memory encryption attribute from C=0 -> C=1
+        * or vice versa for this memory range. Let's make sure caches are
+        * flushed so that guest data gets written back to memory with the
+        * correct C-bit.
+        */
+       sev_clflush_pages(region->pages, region->npages);
+
+       sev_unpin_memory(kvm, region->pages, region->npages);
+       list_del(&region->list);
+       kfree(region);
+}
+
+static void sev_vm_destroy(struct kvm *kvm)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct list_head *head = &sev->regions_list;
+       struct list_head *pos, *q;
+
+       if (!sev_guest(kvm))
+               return;
+
+       mutex_lock(&kvm->lock);
+
+       /*
+        * If userspace was terminated before unregistering the memory regions,
+        * unpin all the registered memory.
+        */
+       if (!list_empty(head)) {
+               list_for_each_safe(pos, q, head) {
+                       __unregister_enc_region_locked(kvm,
+                               list_entry(pos, struct enc_region, list));
+               }
+       }
+
+       mutex_unlock(&kvm->lock);
+
+       sev_unbind_asid(kvm, sev->handle);
+       sev_asid_free(kvm);
+}
+
 static void avic_vm_destroy(struct kvm *kvm)
 {
        unsigned long flags;
@@ -1437,6 +1727,12 @@ static void avic_vm_destroy(struct kvm *kvm)
        spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }
 
+static void svm_vm_destroy(struct kvm *kvm)
+{
+       avic_vm_destroy(kvm);
+       sev_vm_destroy(kvm);
+}
+
 static int avic_vm_init(struct kvm *kvm)
 {
        unsigned long flags;
@@ -2035,7 +2331,7 @@ static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                return 1;
 
        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
-               svm_flush_tlb(vcpu);
+               svm_flush_tlb(vcpu, true);
 
        vcpu->arch.cr4 = cr4;
        if (!npt_enabled)
@@ -2094,7 +2390,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 {
        if (sd->next_asid > sd->max_asid) {
                ++sd->asid_generation;
-               sd->next_asid = 1;
+               sd->next_asid = sd->min_asid;
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
        }
 
@@ -2142,22 +2438,24 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 
 static int pf_interception(struct vcpu_svm *svm)
 {
-       u64 fault_address = svm->vmcb->control.exit_info_2;
+       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
-                       svm->vmcb->control.insn_bytes,
+                       static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+                       svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
 static int npf_interception(struct vcpu_svm *svm)
 {
-       u64 fault_address = svm->vmcb->control.exit_info_2;
+       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
        return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
-                       svm->vmcb->control.insn_bytes,
+                       static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+                       svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
@@ -2385,7 +2683,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 
        svm->vmcb->control.nested_cr3 = __sme_set(root);
        mark_dirty(svm->vmcb, VMCB_NPT);
-       svm_flush_tlb(vcpu);
+       svm_flush_tlb(vcpu, true);
 }
 
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
@@ -2927,7 +3225,8 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
        if (vmcb->control.asid == 0)
                return false;
 
-       if (vmcb->control.nested_ctl && !npt_enabled)
+       if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
+           !npt_enabled)
                return false;
 
        return true;
@@ -2941,7 +3240,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
        else
                svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
 
-       if (nested_vmcb->control.nested_ctl) {
+       if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
                kvm_mmu_unload(&svm->vcpu);
                svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
                nested_svm_init_mmu_context(&svm->vcpu);
@@ -2989,7 +3288,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
        svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
        svm->nested.intercept            = nested_vmcb->control.intercept;
 
-       svm_flush_tlb(&svm->vcpu);
+       svm_flush_tlb(&svm->vcpu, true);
        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -4362,12 +4661,39 @@ static void reload_tss(struct kvm_vcpu *vcpu)
        load_TR_desc();
 }
 
+static void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+       int asid = sev_get_asid(svm->vcpu.kvm);
+
+       /* Assign the asid allocated with this SEV guest */
+       svm->vmcb->control.asid = asid;
+
+       /*
+        * Flush guest TLB:
+        *
+        * 1) when a different VMCB for the same ASID is to be run on the same host CPU, or
+        * 2) when this VMCB was executed on a different host CPU in previous VMRUNs.
+        */
+       if (sd->sev_vmcbs[asid] == svm->vmcb &&
+           svm->last_cpu == cpu)
+               return;
+
+       svm->last_cpu = cpu;
+       sd->sev_vmcbs[asid] = svm->vmcb;
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+       mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
 static void pre_svm_run(struct vcpu_svm *svm)
 {
        int cpu = raw_smp_processor_id();
 
        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
+       if (sev_guest(svm->vcpu.kvm))
+               return pre_sev_run(svm, cpu);
+
        /* FIXME: handle wraparound of asid_generation */
        if (svm->asid_generation != sd->asid_generation)
                new_asid(svm, sd);
@@ -4785,7 +5111,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
        return 0;
 }
 
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -5076,7 +5402,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 
        svm->vmcb->save.cr3 = __sme_set(root);
        mark_dirty(svm->vmcb, VMCB_CR);
-       svm_flush_tlb(vcpu);
+       svm_flush_tlb(vcpu, true);
 }
 
 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -5090,7 +5416,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
        svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
        mark_dirty(svm->vmcb, VMCB_CR);
 
-       svm_flush_tlb(vcpu);
+       svm_flush_tlb(vcpu, true);
 }
 
 static int is_disabled(void)
@@ -5176,6 +5502,12 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
                        entry->edx |= SVM_FEATURE_NPT;
 
                break;
+       case 0x8000001F:
+               /* Support memory encryption cpuid if host supports it */
+               if (boot_cpu_has(X86_FEATURE_SEV))
+                       cpuid(0x8000001f, &entry->eax, &entry->ebx,
+                               &entry->ecx, &entry->edx);
+
        }
 }
 
@@ -5204,6 +5536,11 @@ static bool svm_xsaves_supported(void)
        return false;
 }
 
+static bool svm_umip_emulated(void)
+{
+       return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
        return true;
@@ -5505,6 +5842,828 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int sev_asid_new(void)
+{
+       int pos;
+
+       /*
+        * An SEV-enabled guest must use an ASID in the range
+        * min_sev_asid to max_sev_asid.
+        */
+       pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
+       if (pos >= max_sev_asid)
+               return -EBUSY;
+
+       set_bit(pos, sev_asid_bitmap);
+       return pos + 1;
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       int asid, ret;
+
+       ret = -EBUSY;
+       asid = sev_asid_new();
+       if (asid < 0)
+               return ret;
+
+       ret = sev_platform_init(&argp->error);
+       if (ret)
+               goto e_free;
+
+       sev->active = true;
+       sev->asid = asid;
+       INIT_LIST_HEAD(&sev->regions_list);
+
+       return 0;
+
+e_free:
+       __sev_asid_free(asid);
+       return ret;
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+       struct sev_data_activate *data;
+       int asid = sev_get_asid(kvm);
+       int ret;
+
+       wbinvd_on_all_cpus();
+
+       ret = sev_guest_df_flush(error);
+       if (ret)
+               return ret;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       /* activate ASID on the given handle */
+       data->handle = handle;
+       data->asid   = asid;
+       ret = sev_guest_activate(data, error);
+       kfree(data);
+
+       return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+       struct fd f;
+       int ret;
+
+       f = fdget(fd);
+       if (!f.file)
+               return -EBADF;
+
+       ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+       fdput(f);
+       return ret;
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+       return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct sev_data_launch_start *start;
+       struct kvm_sev_launch_start params;
+       void *dh_blob, *session_blob;
+       int *error = &argp->error;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+               return -EFAULT;
+
+       start = kzalloc(sizeof(*start), GFP_KERNEL);
+       if (!start)
+               return -ENOMEM;
+
+       dh_blob = NULL;
+       if (params.dh_uaddr) {
+               dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+               if (IS_ERR(dh_blob)) {
+                       ret = PTR_ERR(dh_blob);
+                       goto e_free;
+               }
+
+               start->dh_cert_address = __sme_set(__pa(dh_blob));
+               start->dh_cert_len = params.dh_len;
+       }
+
+       session_blob = NULL;
+       if (params.session_uaddr) {
+               session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+               if (IS_ERR(session_blob)) {
+                       ret = PTR_ERR(session_blob);
+                       goto e_free_dh;
+               }
+
+               start->session_address = __sme_set(__pa(session_blob));
+               start->session_len = params.session_len;
+       }
+
+       start->handle = params.handle;
+       start->policy = params.policy;
+
+       /* create memory encryption context */
+       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+       if (ret)
+               goto e_free_session;
+
+       /* Bind ASID to this guest */
+       ret = sev_bind_asid(kvm, start->handle, error);
+       if (ret)
+               goto e_free_session;
+
+       /* return handle to userspace */
+       params.handle = start->handle;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+               sev_unbind_asid(kvm, start->handle);
+               ret = -EFAULT;
+               goto e_free_session;
+       }
+
+       sev->handle = start->handle;
+       sev->fd = argp->sev_fd;
+
+e_free_session:
+       kfree(session_blob);
+e_free_dh:
+       kfree(dh_blob);
+e_free:
+       kfree(start);
+       return ret;
+}
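
A rough userspace counterpart (not part of the patch) to sev_launch_start() above: it assumes struct kvm_sev_launch_start and the KVM_SEV_LAUNCH_START command id from the uapi header added elsewhere in this series, and skips the DH certificate and session blobs that an attested launch would pass.

/* Userspace sketch, not part of the patch. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int vmm_sev_launch_start(int vm_fd, int sev_fd, unsigned int policy)
{
	struct kvm_sev_launch_start start;
	struct kvm_sev_cmd cmd;

	memset(&start, 0, sizeof(start));
	start.policy = policy;		/* e.g. 0 leaves debugging allowed */

	memset(&cmd, 0, sizeof(cmd));
	cmd.id     = KVM_SEV_LAUNCH_START;
	cmd.data   = (unsigned long)&start;
	cmd.sev_fd = sev_fd;

	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0)
		return -1;

	/* The kernel wrote the firmware handle back into start.handle. */
	return start.handle;
}
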
+
+static int get_num_contig_pages(int idx, struct page **inpages,
+                               unsigned long npages)
+{
+       unsigned long paddr, next_paddr;
+       int i = idx + 1, pages = 1;
+
+       /* find the number of contiguous pages starting from idx */
+       paddr = __sme_page_pa(inpages[idx]);
+       while (i < npages) {
+               next_paddr = __sme_page_pa(inpages[i++]);
+               if ((paddr + PAGE_SIZE) == next_paddr) {
+                       pages++;
+                       paddr = next_paddr;
+                       continue;
+               }
+               break;
+       }
+
+       return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct kvm_sev_launch_update_data params;
+       struct sev_data_launch_update_data *data;
+       struct page **inpages;
+       int i, ret, pages;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+               return -EFAULT;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       vaddr = params.uaddr;
+       size = params.len;
+       vaddr_end = vaddr + size;
+
+       /* Lock the user memory. */
+       inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+       if (!inpages) {
+               ret = -ENOMEM;
+               goto e_free;
+       }
+
+       /*
+        * The LAUNCH_UPDATE command will perform in-place encryption of the
+        * memory content (i.e. it will write the same memory region with C=1).
+        * It's possible that the cache may contain the data with C=0, i.e.,
+        * unencrypted, so invalidate it first.
+        */
+       sev_clflush_pages(inpages, npages);
+
+       for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+               int offset, len;
+
+               /*
+                * If the user buffer is not page-aligned, calculate the offset
+                * within the page.
+                */
+               offset = vaddr & (PAGE_SIZE - 1);
+
+               /* Calculate the number of pages that can be encrypted in one go. */
+               pages = get_num_contig_pages(i, inpages, npages);
+
+               len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+               data->handle = sev->handle;
+               data->len = len;
+               data->address = __sme_page_pa(inpages[i]) + offset;
+               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+               if (ret)
+                       goto e_unpin;
+
+               size -= len;
+               next_vaddr = vaddr + len;
+       }
+
+e_unpin:
+       /* content of memory is updated, mark pages dirty */
+       for (i = 0; i < npages; i++) {
+               set_page_dirty_lock(inpages[i]);
+               mark_page_accessed(inpages[i]);
+       }
+       /* unlock the user pages */
+       sev_unpin_memory(kvm, inpages, npages);
+e_free:
+       kfree(data);
+       return ret;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct sev_data_launch_measure *data;
+       struct kvm_sev_launch_measure params;
+       void *blob = NULL;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+               return -EFAULT;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       /* User wants to query the blob length */
+       if (!params.len)
+               goto cmd;
+
+       if (params.uaddr) {
+               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+                       ret = -EINVAL;
+                       goto e_free;
+               }
+
+               if (!access_ok(VERIFY_WRITE, params.uaddr, params.len)) {
+                       ret = -EFAULT;
+                       goto e_free;
+               }
+
+               ret = -ENOMEM;
+               blob = kmalloc(params.len, GFP_KERNEL);
+               if (!blob)
+                       goto e_free;
+
+               data->address = __psp_pa(blob);
+               data->len = params.len;
+       }
+
+cmd:
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+
+       /*
+        * If userspace only asked for the measurement blob length, the FW
+        * has already filled in the expected length, so skip the error check.
+        */
+       if (!params.len)
+               goto done;
+
+       if (ret)
+               goto e_free_blob;
+
+       if (blob) {
+               if (copy_to_user((void __user *)(uintptr_t)params.uaddr, blob, params.len))
+                       ret = -EFAULT;
+       }
+
+done:
+       params.len = data->len;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+               ret = -EFAULT;
+e_free_blob:
+       kfree(blob);
+e_free:
+       kfree(data);
+       return ret;
+}
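
The !params.len early-out above enables a query-then-fetch pattern in userspace. A hedged sketch (not part of the patch), assuming struct kvm_sev_launch_measure from this series' uapi header:

/* Userspace sketch, not part of the patch. */
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void *sev_get_measurement(int vm_fd, int sev_fd, unsigned int *len)
{
	struct kvm_sev_launch_measure m;
	struct kvm_sev_cmd cmd;
	void *blob;

	memset(&m, 0, sizeof(m));
	memset(&cmd, 0, sizeof(cmd));
	cmd.id     = KVM_SEV_LAUNCH_MEASURE;
	cmd.data   = (unsigned long)&m;
	cmd.sev_fd = sev_fd;

	/* First call: m.len == 0, so only the required length is reported. */
	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	if (!m.len)
		return NULL;

	blob = malloc(m.len);
	if (!blob)
		return NULL;
	m.uaddr = (unsigned long)blob;

	/* Second call: the firmware writes the launch measurement into blob. */
	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0) {
		free(blob);
		return NULL;
	}

	*len = m.len;
	return blob;
}
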
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct sev_data_launch_finish *data;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
+
+       kfree(data);
+       return ret;
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct kvm_sev_guest_status params;
+       struct sev_data_guest_status *data;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+       if (ret)
+               goto e_free;
+
+       params.policy = data->policy;
+       params.state = data->state;
+       params.handle = data->handle;
+
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+               ret = -EFAULT;
+e_free:
+       kfree(data);
+       return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+                              unsigned long dst, int size,
+                              int *error, bool enc)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct sev_data_dbg *data;
+       int ret;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       data->handle = sev->handle;
+       data->dst_addr = dst;
+       data->src_addr = src;
+       data->len = size;
+
+       ret = sev_issue_cmd(kvm,
+                           enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+                           data, error);
+       kfree(data);
+       return ret;
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+                            unsigned long dst_paddr, int sz, int *err)
+{
+       int offset;
+
+       /*
+        * It's safe to read more than we were asked for; the caller should
+        * ensure that the destination has enough space.
+        */
+       src_paddr = round_down(src_paddr, 16);
+       offset = src_paddr & 15;
+       sz = round_up(sz + offset, 16);
+
+       return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+                                 unsigned long __user dst_uaddr,
+                                 unsigned long dst_paddr,
+                                 int size, int *err)
+{
+       struct page *tpage = NULL;
+       int ret, offset;
+
+       /* If the inputs are not 16-byte aligned, use an intermediate buffer */
+       if (!IS_ALIGNED(dst_paddr, 16) ||
+           !IS_ALIGNED(paddr,     16) ||
+           !IS_ALIGNED(size,      16)) {
+               tpage = alloc_page(GFP_KERNEL);
+               if (!tpage)
+                       return -ENOMEM;
+
+               dst_paddr = __sme_page_pa(tpage);
+       }
+
+       ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+       if (ret)
+               goto e_free;
+
+       if (tpage) {
+               offset = paddr & 15;
+               if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
+                                page_address(tpage) + offset, size))
+                       ret = -EFAULT;
+       }
+
+e_free:
+       if (tpage)
+               __free_page(tpage);
+
+       return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+                                 unsigned long __user vaddr,
+                                 unsigned long dst_paddr,
+                                 unsigned long __user dst_vaddr,
+                                 int size, int *error)
+{
+       struct page *src_tpage = NULL;
+       struct page *dst_tpage = NULL;
+       int ret, len = size;
+
+       /* If the source buffer is not 16-byte aligned then use an intermediate buffer */
+       if (!IS_ALIGNED(vaddr, 16)) {
+               src_tpage = alloc_page(GFP_KERNEL);
+               if (!src_tpage)
+                       return -ENOMEM;
+
+               if (copy_from_user(page_address(src_tpage),
+                               (void __user *)(uintptr_t)vaddr, size)) {
+                       __free_page(src_tpage);
+                       return -EFAULT;
+               }
+
+               paddr = __sme_page_pa(src_tpage);
+       }
+
+       /*
+        * If the destination buffer or the length is not 16-byte aligned then
+        * do a read-modify-write:
+        *  - decrypt the destination into an intermediate buffer
+        *  - copy the source buffer into the intermediate buffer
+        *  - use the intermediate buffer as the source buffer
+        */
+       if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+               int dst_offset;
+
+               dst_tpage = alloc_page(GFP_KERNEL);
+               if (!dst_tpage) {
+                       ret = -ENOMEM;
+                       goto e_free;
+               }
+
+               ret = __sev_dbg_decrypt(kvm, dst_paddr,
+                                       __sme_page_pa(dst_tpage), size, error);
+               if (ret)
+                       goto e_free;
+
+               /*
+                * If the source is a kernel buffer then use memcpy(),
+                * otherwise copy_from_user().
+                */
+               dst_offset = dst_paddr & 15;
+
+               if (src_tpage)
+                       memcpy(page_address(dst_tpage) + dst_offset,
+                              page_address(src_tpage), size);
+               else {
+                       if (copy_from_user(page_address(dst_tpage) + dst_offset,
+                                          (void __user *)(uintptr_t)vaddr, size)) {
+                               ret = -EFAULT;
+                               goto e_free;
+                       }
+               }
+
+               paddr = __sme_page_pa(dst_tpage);
+               dst_paddr = round_down(dst_paddr, 16);
+               len = round_up(size, 16);
+       }
+
+       ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+       if (src_tpage)
+               __free_page(src_tpage);
+       if (dst_tpage)
+               __free_page(dst_tpage);
+       return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+       unsigned long vaddr, vaddr_end, next_vaddr;
+       unsigned long dst_vaddr, dst_vaddr_end;
+       struct page **src_p, **dst_p;
+       struct kvm_sev_dbg debug;
+       unsigned long n;
+       int ret = 0, size;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+               return -EFAULT;
+
+       vaddr = debug.src_uaddr;
+       size = debug.len;
+       vaddr_end = vaddr + size;
+       dst_vaddr = debug.dst_uaddr;
+       dst_vaddr_end = dst_vaddr + size;
+
+       for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+               int len, s_off, d_off;
+
+               /* lock userspace source and destination page */
+               src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+               if (!src_p)
+                       return -EFAULT;
+
+               dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+               if (!dst_p) {
+                       sev_unpin_memory(kvm, src_p, n);
+                       return -EFAULT;
+               }
+
+               /*
+                * The DBG_{DE,EN}CRYPT commands will perform {de,en}cryption of
+                * the memory content (i.e. they will write the same memory
+                * region with C=1). It's possible that the cache may contain
+                * the data with C=0, i.e. unencrypted, so invalidate it first.
+                */
+               sev_clflush_pages(src_p, 1);
+               sev_clflush_pages(dst_p, 1);
+
+               /*
+                * Since the user buffer may not be page-aligned, calculate the
+                * offset within the page.
+                */
+               s_off = vaddr & ~PAGE_MASK;
+               d_off = dst_vaddr & ~PAGE_MASK;
+               len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+               if (dec)
+                       ret = __sev_dbg_decrypt_user(kvm,
+                                                    __sme_page_pa(src_p[0]) + s_off,
+                                                    dst_vaddr,
+                                                    __sme_page_pa(dst_p[0]) + d_off,
+                                                    len, &argp->error);
+               else
+                       ret = __sev_dbg_encrypt_user(kvm,
+                                                    __sme_page_pa(src_p[0]) + s_off,
+                                                    vaddr,
+                                                    __sme_page_pa(dst_p[0]) + d_off,
+                                                    dst_vaddr,
+                                                    len, &argp->error);
+
+               sev_unpin_memory(kvm, src_p, 1);
+               sev_unpin_memory(kvm, dst_p, 1);
+
+               if (ret)
+                       goto err;
+
+               next_vaddr = vaddr + len;
+               dst_vaddr = dst_vaddr + len;
+               size -= len;
+       }
+err:
+       return ret;
+}
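
For illustration only (not part of this patch): a minimal userspace sketch of driving this path via KVM_SEV_DBG_DECRYPT, assuming the kvm_sev_dbg layout (src_uaddr, dst_uaddr, len) this series adds to include/uapi/linux/kvm.h and the same headers as the launch-measure sketch above. The kernel loop walks the range page by page, so the buffers may span several pages.

/* Decrypt len bytes of encrypted guest memory into a plaintext buffer;
 * guest_hva is the VMM's own mapping of the guest pages. */
static int sev_dbg_decrypt(int vm_fd, int sev_fd, void *guest_hva,
                           void *plaintext, uint32_t len)
{
        struct kvm_sev_dbg dbg = {
                .src_uaddr = (uint64_t)(uintptr_t)guest_hva,
                .dst_uaddr = (uint64_t)(uintptr_t)plaintext,
                .len = len,
        };
        struct kvm_sev_cmd cmd = {
                .id = KVM_SEV_DBG_DECRYPT,
                .data = (uint64_t)(uintptr_t)&dbg,
                .sev_fd = sev_fd,
        };

        return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
}
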
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct sev_data_launch_secret *data;
+       struct kvm_sev_launch_secret params;
+       struct page **pages;
+       void *blob, *hdr;
+       unsigned long n;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+               return -EFAULT;
+
+       pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+       if (!pages)
+               return -ENOMEM;
+
+       /*
+        * The secret must be copied into a contiguous memory region, so verify
+        * that the userspace memory pages are contiguous before issuing the
+        * command.
+        */
+       if (get_num_contig_pages(0, pages, n) != n) {
+               ret = -EINVAL;
+               goto e_unpin_memory;
+       }
+
+       ret = -ENOMEM;
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               goto e_unpin_memory;
+
+       blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+       if (IS_ERR(blob)) {
+               ret = PTR_ERR(blob);
+               goto e_free;
+       }
+
+       data->trans_address = __psp_pa(blob);
+       data->trans_len = params.trans_len;
+
+       hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+       if (IS_ERR(hdr)) {
+               ret = PTR_ERR(hdr);
+               goto e_free_blob;
+       }
+       data->hdr_address = __psp_pa(hdr);
+       data->hdr_len = params.hdr_len;
+
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+
+       kfree(hdr);
+
+e_free_blob:
+       kfree(blob);
+e_free:
+       kfree(data);
+e_unpin_memory:
+       sev_unpin_memory(kvm, pages, n);
+       return ret;
+}
+
+static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+{
+       struct kvm_sev_cmd sev_cmd;
+       int r;
+
+       if (!svm_sev_enabled())
+               return -ENOTTY;
+
+       if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+               return -EFAULT;
+
+       mutex_lock(&kvm->lock);
+
+       switch (sev_cmd.id) {
+       case KVM_SEV_INIT:
+               r = sev_guest_init(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_LAUNCH_START:
+               r = sev_launch_start(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_LAUNCH_UPDATE_DATA:
+               r = sev_launch_update_data(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_LAUNCH_MEASURE:
+               r = sev_launch_measure(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_LAUNCH_FINISH:
+               r = sev_launch_finish(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_GUEST_STATUS:
+               r = sev_guest_status(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_DBG_DECRYPT:
+               r = sev_dbg_crypt(kvm, &sev_cmd, true);
+               break;
+       case KVM_SEV_DBG_ENCRYPT:
+               r = sev_dbg_crypt(kvm, &sev_cmd, false);
+               break;
+       case KVM_SEV_LAUNCH_SECRET:
+               r = sev_launch_secret(kvm, &sev_cmd);
+               break;
+       default:
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+               r = -EFAULT;
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
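
For illustration only (not part of this patch): because the dispatcher copies sev_cmd back for every recognized command, even when it fails, userspace can always retrieve the SEV firmware status from the error field. A minimal wrapper sketch, assuming the same uapi definitions as the sketches above:

static int sev_ioctl(int vm_fd, int sev_fd, uint32_t id, void *data,
                     uint32_t *fw_error)
{
        struct kvm_sev_cmd cmd = {
                .id = id,
                .data = (uint64_t)(uintptr_t)data,
                .sev_fd = sev_fd,
        };
        /* On failure, errno holds the kernel error and cmd.error the
         * SEV firmware status code filled in by sev_issue_cmd(). */
        int r = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);

        if (fw_error)
                *fw_error = cmd.error;
        return r;
}
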
+
+static int svm_register_enc_region(struct kvm *kvm,
+                                  struct kvm_enc_region *range)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct enc_region *region;
+       int ret = 0;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       region = kzalloc(sizeof(*region), GFP_KERNEL);
+       if (!region)
+               return -ENOMEM;
+
+       region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+       if (!region->pages) {
+               ret = -ENOMEM;
+               goto e_free;
+       }
+
+       /*
+        * The guest may change the memory encryption attribute from C=0 -> C=1
+        * or vice versa for this memory range. Make sure the caches are
+        * flushed so that guest data gets written into memory with the
+        * correct C-bit.
+        */
+       sev_clflush_pages(region->pages, region->npages);
+
+       region->uaddr = range->addr;
+       region->size = range->size;
+
+       mutex_lock(&kvm->lock);
+       list_add_tail(&region->list, &sev->regions_list);
+       mutex_unlock(&kvm->lock);
+
+       return ret;
+
+e_free:
+       kfree(region);
+       return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+       struct kvm_sev_info *sev = &kvm->arch.sev_info;
+       struct list_head *head = &sev->regions_list;
+       struct enc_region *i;
+
+       list_for_each_entry(i, head, list) {
+               if (i->uaddr == range->addr &&
+                   i->size == range->size)
+                       return i;
+       }
+
+       return NULL;
+}
+
+static int svm_unregister_enc_region(struct kvm *kvm,
+                                    struct kvm_enc_region *range)
+{
+       struct enc_region *region;
+       int ret;
+
+       mutex_lock(&kvm->lock);
+
+       if (!sev_guest(kvm)) {
+               ret = -ENOTTY;
+               goto failed;
+       }
+
+       region = find_enc_region(kvm, range);
+       if (!region) {
+               ret = -EINVAL;
+               goto failed;
+       }
+
+       __unregister_enc_region_locked(kvm, region);
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+
+failed:
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
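
For illustration only (not part of this patch): a minimal userspace sketch of region registration, assuming the kvm_enc_region layout and the KVM_MEMORY_ENCRYPT_REG_REGION / KVM_MEMORY_ENCRYPT_UNREG_REGION ioctls this series adds. Note that find_enc_region() above matches on the exact addr/size pair, so a region must be unregistered with the same values used to register it.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Pin the VMM's mapping of guest RAM that may hold encrypted (C=1) data. */
static int sev_pin_guest_ram(int vm_fd, void *hva, uint64_t size)
{
        struct kvm_enc_region region = {
                .addr = (uint64_t)(uintptr_t)hva,
                .size = size,
        };

        return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region);
}
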
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -5521,7 +6680,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .vcpu_reset = svm_vcpu_reset,
 
        .vm_init = avic_vm_init,
-       .vm_destroy = avic_vm_destroy,
+       .vm_destroy = svm_vm_destroy,
 
        .prepare_guest_switch = svm_prepare_guest_switch,
        .vcpu_load = svm_vcpu_load,
@@ -5581,6 +6740,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .load_eoi_exitmap = svm_load_eoi_exitmap,
        .hwapic_irr_update = svm_hwapic_irr_update,
        .hwapic_isr_update = svm_hwapic_isr_update,
+       .sync_pir_to_irr = kvm_lapic_find_highest_irr,
        .apicv_post_state_restore = avic_post_state_restore,
 
        .set_tss_addr = svm_set_tss_addr,
@@ -5597,6 +6757,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .invpcid_supported = svm_invpcid_supported,
        .mpx_supported = svm_mpx_supported,
        .xsaves_supported = svm_xsaves_supported,
+       .umip_emulated = svm_umip_emulated,
 
        .set_supported_cpuid = svm_set_supported_cpuid,
 
@@ -5620,6 +6781,10 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .pre_enter_smm = svm_pre_enter_smm,
        .pre_leave_smm = svm_pre_leave_smm,
        .enable_smi_window = enable_smi_window,
+
+       .mem_enc_op = svm_mem_enc_op,
+       .mem_enc_reg_region = svm_register_enc_region,
+       .mem_enc_unreg_region = svm_unregister_enc_region,
 };
 
 static int __init svm_init(void)
index 8eba631c4dbd509d8687c6135e8dba267042f5e0..1e2ca9e8662ff2bdd4ea673b031ad96f1eda0af4 100644 (file)
@@ -184,7 +184,6 @@ module_param(ple_window_max, int, S_IRUGO);
 extern const ulong vmx_return;
 
 #define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
 
 struct vmcs {
        u32 revision_id;
@@ -225,7 +224,7 @@ struct shared_msr_entry {
  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
  * More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
  * underlying hardware which will be used to run L2.
  * This structure is packed to ensure that its layout is identical across
  * machines (necessary for live migration).
@@ -408,12 +407,11 @@ struct __packed vmcs12 {
  */
 #define VMCS12_SIZE 0x1000
 
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
-       struct list_head list;
-       gpa_t vmptr;
-       struct loaded_vmcs vmcs02;
-};
+/*
+ * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
+ * supported VMCS12 field encoding.
+ */
+#define VMCS12_MAX_FIELD_INDEX 0x17
 
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
@@ -438,16 +436,17 @@ struct nested_vmx {
         * data hold by vmcs12
         */
        bool sync_shadow_vmcs;
+       bool dirty_vmcs12;
 
-       /* vmcs02_list cache of VMCSs recently used to run L2 guests */
-       struct list_head vmcs02_pool;
-       int vmcs02_num;
        bool change_vmcs01_virtual_x2apic_mode;
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
+
+       struct loaded_vmcs vmcs02;
+
        /*
-        * Guest pages referred to in vmcs02 with host-physical pointers, so
-        * we must keep them pinned while L2 runs.
+        * Guest pages referred to in the vmcs02 with host-physical
+        * pointers, so we must keep them pinned while L2 runs.
         */
        struct page *apic_access_page;
        struct page *virtual_apic_page;
@@ -658,6 +657,8 @@ struct vcpu_vmx {
 
        u32 host_pkru;
 
+       unsigned long host_debugctlmsr;
+
        /*
         * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
         * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
@@ -686,67 +687,24 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
        return &(to_vmx(vcpu)->pi_desc);
 }
 
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
-#define FIELD(number, name)    [number] = VMCS12_OFFSET(name)
-#define FIELD64(number, name)  [number] = VMCS12_OFFSET(name), \
-                               [number##_HIGH] = VMCS12_OFFSET(name)+4
+#define FIELD(number, name)    [ROL16(number, 6)] = VMCS12_OFFSET(name)
+#define FIELD64(number, name)                                          \
+       FIELD(number, name),                                            \
+       [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
 
 
-static unsigned long shadow_read_only_fields[] = {
-       /*
-        * We do NOT shadow fields that are modified when L0
-        * traps and emulates any vmx instruction (e.g. VMPTRLD,
-        * VMXON...) executed by L1.
-        * For example, VM_INSTRUCTION_ERROR is read
-        * by L1 if a vmx instruction fails (part of the error path).
-        * Note the code assumes this logic. If for some reason
-        * we start shadowing these fields then we need to
-        * force a shadow sync when L0 emulates vmx instructions
-        * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
-        * by nested_vmx_failValid)
-        */
-       VM_EXIT_REASON,
-       VM_EXIT_INTR_INFO,
-       VM_EXIT_INSTRUCTION_LEN,
-       IDT_VECTORING_INFO_FIELD,
-       IDT_VECTORING_ERROR_CODE,
-       VM_EXIT_INTR_ERROR_CODE,
-       EXIT_QUALIFICATION,
-       GUEST_LINEAR_ADDRESS,
-       GUEST_PHYSICAL_ADDRESS
+static u16 shadow_read_only_fields[] = {
+#define SHADOW_FIELD_RO(x) x,
+#include "vmx_shadow_fields.h"
 };
 static int max_shadow_read_only_fields =
        ARRAY_SIZE(shadow_read_only_fields);
 
-static unsigned long shadow_read_write_fields[] = {
-       TPR_THRESHOLD,
-       GUEST_RIP,
-       GUEST_RSP,
-       GUEST_CR0,
-       GUEST_CR3,
-       GUEST_CR4,
-       GUEST_INTERRUPTIBILITY_INFO,
-       GUEST_RFLAGS,
-       GUEST_CS_SELECTOR,
-       GUEST_CS_AR_BYTES,
-       GUEST_CS_LIMIT,
-       GUEST_CS_BASE,
-       GUEST_ES_BASE,
-       GUEST_BNDCFGS,
-       CR0_GUEST_HOST_MASK,
-       CR0_READ_SHADOW,
-       CR4_READ_SHADOW,
-       TSC_OFFSET,
-       EXCEPTION_BITMAP,
-       CPU_BASED_VM_EXEC_CONTROL,
-       VM_ENTRY_EXCEPTION_ERROR_CODE,
-       VM_ENTRY_INTR_INFO_FIELD,
-       VM_ENTRY_INSTRUCTION_LEN,
-       VM_ENTRY_EXCEPTION_ERROR_CODE,
-       HOST_FS_BASE,
-       HOST_GS_BASE,
-       HOST_FS_SELECTOR,
-       HOST_GS_SELECTOR
+static u16 shadow_read_write_fields[] = {
+#define SHADOW_FIELD_RW(x) x,
+#include "vmx_shadow_fields.h"
 };
 static int max_shadow_read_write_fields =
        ARRAY_SIZE(shadow_read_write_fields);
@@ -897,13 +855,25 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 
 static inline short vmcs_field_to_offset(unsigned long field)
 {
-       BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+       unsigned index;
 
-       if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
-           vmcs_field_to_offset_table[field] == 0)
+       if (field >> 15)
                return -ENOENT;
 
-       return vmcs_field_to_offset_table[field];
+       index = ROL16(field, 6);
+       if (index >= ARRAY_SIZE(vmcs_field_to_offset_table))
+               return -ENOENT;
+
+       /*
+        * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
+        * generic mechanism.
+        */
+       asm("lfence");
+
+       if (vmcs_field_to_offset_table[index] == 0)
+               return -ENOENT;
+
+       return vmcs_field_to_offset_table[index];
 }
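
For reference: in the VMCS field encoding, bit 0 is the access type, bits 9:1 the field index, bits 11:10 the field type and bits 14:13 the width, so rotating the encoding left by 6 packs those pieces into a small dense value that can index a compact offset table instead of one sized by the raw encodings (which run past 0x6c00). Two worked examples using the standard SDM encodings:

        ROL16(0x0800 /* GUEST_ES_SELECTOR */, 6) = 0x0002
        ROL16(0x4402 /* VM_EXIT_REASON    */, 6) = 0x0091
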
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
@@ -943,8 +913,6 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
 
 enum {
-       VMX_IO_BITMAP_A,
-       VMX_IO_BITMAP_B,
        VMX_MSR_BITMAP_LEGACY,
        VMX_MSR_BITMAP_LONGMODE,
        VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
@@ -958,8 +926,6 @@ enum {
 
 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
 
-#define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
-#define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
 #define vmx_msr_bitmap_legacy                (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
 #define vmx_msr_bitmap_longmode              (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
 #define vmx_msr_bitmap_legacy_x2apic_apicv   (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
@@ -2326,6 +2292,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        vmx_vcpu_pi_load(vcpu, cpu);
        vmx->host_pkru = read_pkru();
+       vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -2913,7 +2880,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
        rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
 
        /* highest index: VMX_PREEMPTION_TIMER_VALUE */
-       vmx->nested.nested_vmx_vmcs_enum = 0x2e;
+       vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
 }
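
For reference: bits 9:1 of IA32_VMX_VMCS_ENUM report the highest field index supported, so the value written here is the index shifted left by one: VMCS12_MAX_FIELD_INDEX << 1 = 0x17 << 1 = 0x2e, exactly the constant that was previously hard-coded.
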
 
 /*
@@ -3249,6 +3216,7 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
  */
 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct shared_msr_entry *msr;
 
        switch (msr_info->index) {
@@ -3260,8 +3228,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vmcs_readl(GUEST_GS_BASE);
                break;
        case MSR_KERNEL_GS_BASE:
-               vmx_load_host_state(to_vmx(vcpu));
-               msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+               vmx_load_host_state(vmx);
+               msr_info->data = vmx->msr_guest_kernel_gs_base;
                break;
 #endif
        case MSR_EFER:
@@ -3287,13 +3255,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_MCG_EXT_CTL:
                if (!msr_info->host_initiated &&
-                   !(to_vmx(vcpu)->msr_ia32_feature_control &
+                   !(vmx->msr_ia32_feature_control &
                      FEATURE_CONTROL_LMCE))
                        return 1;
                msr_info->data = vcpu->arch.mcg_ext_ctl;
                break;
        case MSR_IA32_FEATURE_CONTROL:
-               msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
+               msr_info->data = vmx->msr_ia32_feature_control;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
@@ -3310,7 +3278,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                /* Otherwise falls through */
        default:
-               msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
+               msr = find_msr_entry(vmx, msr_info->index);
                if (msr) {
                        msr_info->data = msr->data;
                        break;
@@ -3632,7 +3600,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #endif
              CPU_BASED_CR3_LOAD_EXITING |
              CPU_BASED_CR3_STORE_EXITING |
-             CPU_BASED_USE_IO_BITMAPS |
+             CPU_BASED_UNCOND_IO_EXITING |
              CPU_BASED_MOV_DR_EXITING |
              CPU_BASED_USE_TSC_OFFSETING |
              CPU_BASED_INVLPG_EXITING |
@@ -3662,6 +3630,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_ENABLE_EPT |
                        SECONDARY_EXEC_UNRESTRICTED_GUEST |
                        SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+                       SECONDARY_EXEC_DESC |
                        SECONDARY_EXEC_RDTSCP |
                        SECONDARY_EXEC_ENABLE_INVPCID |
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -3853,6 +3822,19 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
        WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
 }
 
+static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx)
+{
+       struct loaded_vmcs *loaded_vmcs = &vmx->nested.vmcs02;
+
+       /*
+        * Just leak the VMCS02 if the WARN triggers. Better than
+        * a use-after-free.
+        */
+       if (WARN_ON(vmx->loaded_vmcs == loaded_vmcs))
+               return;
+       free_loaded_vmcs(loaded_vmcs);
+}
+
 static void free_kvm_area(void)
 {
        int cpu;
@@ -3863,17 +3845,17 @@ static void free_kvm_area(void)
        }
 }
 
-enum vmcs_field_type {
-       VMCS_FIELD_TYPE_U16 = 0,
-       VMCS_FIELD_TYPE_U64 = 1,
-       VMCS_FIELD_TYPE_U32 = 2,
-       VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
+enum vmcs_field_width {
+       VMCS_FIELD_WIDTH_U16 = 0,
+       VMCS_FIELD_WIDTH_U64 = 1,
+       VMCS_FIELD_WIDTH_U32 = 2,
+       VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
 };
 
-static inline int vmcs_field_type(unsigned long field)
+static inline int vmcs_field_width(unsigned long field)
 {
        if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
-               return VMCS_FIELD_TYPE_U32;
+               return VMCS_FIELD_WIDTH_U32;
        return (field >> 13) & 0x3 ;
 }
 
@@ -3886,43 +3868,66 @@ static void init_vmcs_shadow_fields(void)
 {
        int i, j;
 
-       /* No checks for read only fields yet */
+       for (i = j = 0; i < max_shadow_read_only_fields; i++) {
+               u16 field = shadow_read_only_fields[i];
+               if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+                   (i + 1 == max_shadow_read_only_fields ||
+                    shadow_read_only_fields[i + 1] != field + 1))
+                       pr_err("Missing field from shadow_read_only_field %x\n",
+                              field + 1);
+
+               clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+               if (field & 1)
+                       continue;
+#endif
+               if (j < i)
+                       shadow_read_only_fields[j] = field;
+               j++;
+       }
+       max_shadow_read_only_fields = j;
 
        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
-               switch (shadow_read_write_fields[i]) {
-               case GUEST_BNDCFGS:
-                       if (!kvm_mpx_supported())
+               u16 field = shadow_read_write_fields[i];
+               if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+                   (i + 1 == max_shadow_read_write_fields ||
+                    shadow_read_write_fields[i + 1] != field + 1))
+                       pr_err("Missing field from shadow_read_write_field %x\n",
+                              field + 1);
+
+               /*
+                * PML and the preemption timer can be emulated, but the
+                * processor cannot vmwrite to fields that don't exist
+                * on bare metal.
+                */
+               switch (field) {
+               case GUEST_PML_INDEX:
+                       if (!cpu_has_vmx_pml())
+                               continue;
+                       break;
+               case VMX_PREEMPTION_TIMER_VALUE:
+                       if (!cpu_has_vmx_preemption_timer())
+                               continue;
+                       break;
+               case GUEST_INTR_STATUS:
+                       if (!cpu_has_vmx_apicv())
                                continue;
                        break;
                default:
                        break;
                }
 
+               clear_bit(field, vmx_vmwrite_bitmap);
+               clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+               if (field & 1)
+                       continue;
+#endif
                if (j < i)
-                       shadow_read_write_fields[j] =
-                               shadow_read_write_fields[i];
+                       shadow_read_write_fields[j] = field;
                j++;
        }
        max_shadow_read_write_fields = j;
-
-       /* shadowed fields guest access without vmexit */
-       for (i = 0; i < max_shadow_read_write_fields; i++) {
-               unsigned long field = shadow_read_write_fields[i];
-
-               clear_bit(field, vmx_vmwrite_bitmap);
-               clear_bit(field, vmx_vmread_bitmap);
-               if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
-                       clear_bit(field + 1, vmx_vmwrite_bitmap);
-                       clear_bit(field + 1, vmx_vmread_bitmap);
-               }
-       }
-       for (i = 0; i < max_shadow_read_only_fields; i++) {
-               unsigned long field = shadow_read_only_fields[i];
-
-               clear_bit(field, vmx_vmread_bitmap);
-               if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
-                       clear_bit(field + 1, vmx_vmread_bitmap);
-       }
 }
 
 static __init int alloc_kvm_area(void)
@@ -4135,9 +4140,10 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
-static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
+                               bool invalidate_gpa)
 {
-       if (enable_ept) {
+       if (enable_ept && (invalidate_gpa || !enable_vpid)) {
                if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                        return;
                ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
@@ -4146,15 +4152,15 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
        }
 }
 
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 {
-       __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
+       __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
 }
 
 static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
 {
        if (enable_ept)
-               vmx_flush_tlb(vcpu);
+               vmx_flush_tlb(vcpu, true);
 }
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -4352,7 +4358,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                ept_load_pdptrs(vcpu);
        }
 
-       vmx_flush_tlb(vcpu);
+       vmx_flush_tlb(vcpu, true);
        vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
@@ -4369,6 +4375,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                (to_vmx(vcpu)->rmode.vm86_active ?
                 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
+       if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
+               vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+                             SECONDARY_EXEC_DESC);
+               hw_cr4 &= ~X86_CR4_UMIP;
+       } else
+               vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+                               SECONDARY_EXEC_DESC);
+
        if (cr4 & X86_CR4_VMXE) {
                /*
                 * To use VMXON (and later other VMX instructions), a guest
@@ -4958,11 +4972,6 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 {
        int f = sizeof(unsigned long);
 
-       if (!cpu_has_vmx_msr_bitmap()) {
-               WARN_ON(1);
-               return;
-       }
-
        /*
         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
         * have the write-low and read-high bitmap offsets the wrong way round.
@@ -5003,14 +5012,15 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
                                                msr, MSR_TYPE_R | MSR_TYPE_W);
 }
 
-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only)
 {
-       if (apicv_active) {
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
-                               msr, type);
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
-                               msr, type);
-       } else {
+       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
+                                       msr, type);
+       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
+                                       msr, type);
+       if (!apicv_only) {
                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
                                msr, type);
                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
@@ -5062,7 +5072,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
        max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
        if (max_irr != 256) {
                vapic_page = kmap(vmx->nested.virtual_apic_page);
-               __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
+               __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
+                       vapic_page, &max_irr);
                kunmap(vmx->nested.virtual_apic_page);
 
                status = vmcs_read16(GUEST_INTR_STATUS);
@@ -5122,14 +5133,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 
        if (is_guest_mode(vcpu) &&
            vector == vmx->nested.posted_intr_nv) {
-               /* the PIR and ON have been set by L1. */
-               kvm_vcpu_trigger_posted_interrupt(vcpu, true);
                /*
                 * If a posted intr is not recognized by hardware,
                 * we will accomplish it in the next vmentry.
                 */
                vmx->nested.pi_pending = true;
                kvm_make_request(KVM_REQ_EVENT, vcpu);
+               /* the PIR and ON have been set by L1. */
+               if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
+                       kvm_vcpu_kick(vcpu);
                return 0;
        }
        return -1;
@@ -5308,6 +5320,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        struct kvm_vcpu *vcpu = &vmx->vcpu;
 
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
        if (vmx->vpid == 0)
@@ -5326,6 +5339,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
                exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
        exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+
+       /*
+        * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP
+        * in vmx_set_cr4.
+        */
+       exec_control &= ~SECONDARY_EXEC_DESC;
+
        /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
           (handle_vmptrld).
           We can NOT enable shadow_vmcs here because we don't have yet
@@ -5445,10 +5463,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 #endif
        int i;
 
-       /* I/O */
-       vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
-       vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
-
        if (enable_shadow_vmcs) {
                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
                vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
@@ -6101,6 +6115,12 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
                return kvm_set_cr4(vcpu, val);
 }
 
+static int handle_desc(struct kvm_vcpu *vcpu)
+{
+       WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
+       return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification, val;
@@ -6754,10 +6774,6 @@ static __init int hardware_setup(void)
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
 
-       memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
-
-       memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
 
@@ -6773,11 +6789,6 @@ static __init int hardware_setup(void)
                !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
 
-       if (!cpu_has_vmx_shadow_vmcs())
-               enable_shadow_vmcs = 0;
-       if (enable_shadow_vmcs)
-               init_vmcs_shadow_fields();
-
        if (!cpu_has_vmx_ept() ||
            !cpu_has_vmx_ept_4levels() ||
            !cpu_has_vmx_ept_mt_wb() ||
@@ -6848,7 +6859,7 @@ static __init int hardware_setup(void)
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
        for (msr = 0x800; msr <= 0x8ff; msr++) {
-               if (msr == 0x839 /* TMCCT */)
+               if (msr == X2APIC_MSR(APIC_TMCCT))
                        continue;
                vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
        }
@@ -6857,13 +6868,9 @@ static __init int hardware_setup(void)
         * TPR reads and writes can be virtualized even if virtual interrupt
         * delivery is not in use.
         */
-       vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
-       vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
-
-       /* EOI */
-       vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
-       /* SELF-IPI */
-       vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
+       vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W, false);
+       vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_EOI), MSR_TYPE_W, true);
+       vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, true);
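
For reference: x2APIC registers live at MSR 0x800 plus the MMIO register offset divided by 16, which is what X2APIC_MSR() computes: APIC_TASKPRI (0x80) maps to 0x808, APIC_EOI (0xb0) to 0x80b, APIC_SELF_IPI (0x3f0) to 0x83f and APIC_TMCCT (0x390) to 0x839, matching the literals the old code used.
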
 
        if (enable_ept)
                vmx_enable_tdp();
@@ -6897,6 +6904,11 @@ static __init int hardware_setup(void)
                kvm_x86_ops->cancel_hv_timer = NULL;
        }
 
+       if (!cpu_has_vmx_shadow_vmcs())
+               enable_shadow_vmcs = 0;
+       if (enable_shadow_vmcs)
+               init_vmcs_shadow_fields();
+
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
        kvm_mce_cap_supported |= MCG_LMCE_P;
@@ -6967,94 +6979,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
        return handle_nop(vcpu);
 }
 
-/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
-       struct vmcs02_list *item;
-       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-               if (item->vmptr == vmx->nested.current_vmptr) {
-                       list_move(&item->list, &vmx->nested.vmcs02_pool);
-                       return &item->vmcs02;
-               }
-
-       if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
-               /* Recycle the least recently used VMCS. */
-               item = list_last_entry(&vmx->nested.vmcs02_pool,
-                                      struct vmcs02_list, list);
-               item->vmptr = vmx->nested.current_vmptr;
-               list_move(&item->list, &vmx->nested.vmcs02_pool);
-               return &item->vmcs02;
-       }
-
-       /* Create a new VMCS */
-       item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
-       if (!item)
-               return NULL;
-       item->vmcs02.vmcs = alloc_vmcs();
-       item->vmcs02.shadow_vmcs = NULL;
-       if (!item->vmcs02.vmcs) {
-               kfree(item);
-               return NULL;
-       }
-       loaded_vmcs_init(&item->vmcs02);
-       item->vmptr = vmx->nested.current_vmptr;
-       list_add(&(item->list), &(vmx->nested.vmcs02_pool));
-       vmx->nested.vmcs02_num++;
-       return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
-       struct vmcs02_list *item;
-       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-               if (item->vmptr == vmptr) {
-                       free_loaded_vmcs(&item->vmcs02);
-                       list_del(&item->list);
-                       kfree(item);
-                       vmx->nested.vmcs02_num--;
-                       return;
-               }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
-       struct vmcs02_list *item, *n;
-
-       WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
-       list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
-               /*
-                * Something will leak if the above WARN triggers.  Better than
-                * a use-after-free.
-                */
-               if (vmx->loaded_vmcs == &item->vmcs02)
-                       continue;
-
-               free_loaded_vmcs(&item->vmcs02);
-               list_del(&item->list);
-               kfree(item);
-               vmx->nested.vmcs02_num--;
-       }
-}
-
 /*
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
  * set the success or error code of an emulated VMX instruction, as specified
@@ -7236,11 +7160,18 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs *shadow_vmcs;
 
+       vmx->nested.vmcs02.vmcs = alloc_vmcs();
+       vmx->nested.vmcs02.shadow_vmcs = NULL;
+       if (!vmx->nested.vmcs02.vmcs)
+               goto out_vmcs02;
+       loaded_vmcs_init(&vmx->nested.vmcs02);
+
        if (cpu_has_vmx_msr_bitmap()) {
                vmx->nested.msr_bitmap =
                                (unsigned long *)__get_free_page(GFP_KERNEL);
                if (!vmx->nested.msr_bitmap)
                        goto out_msr_bitmap;
+               memset(vmx->nested.msr_bitmap, 0xff, PAGE_SIZE);
        }
 
        vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
@@ -7258,9 +7189,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
                vmx->vmcs01.shadow_vmcs = shadow_vmcs;
        }
 
-       INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
-       vmx->nested.vmcs02_num = 0;
-
        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_REL_PINNED);
        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7275,6 +7203,9 @@ out_cached_vmcs12:
        free_page((unsigned long)vmx->nested.msr_bitmap);
 
 out_msr_bitmap:
+       vmx_nested_free_vmcs02(vmx);
+
+out_vmcs02:
        return -ENOMEM;
 }
 
@@ -7428,7 +7359,7 @@ static void free_nested(struct vcpu_vmx *vmx)
                vmx->vmcs01.shadow_vmcs = NULL;
        }
        kfree(vmx->nested.cached_vmcs12);
-       /* Unpin physical memory we referred to in current vmcs02 */
+       /* Unpin physical memory we referred to in the vmcs02 */
        if (vmx->nested.apic_access_page) {
                kvm_release_page_dirty(vmx->nested.apic_access_page);
                vmx->nested.apic_access_page = NULL;
@@ -7444,7 +7375,7 @@ static void free_nested(struct vcpu_vmx *vmx)
                vmx->nested.pi_desc = NULL;
        }
 
-       nested_free_all_saved_vmcss(vmx);
+       vmx_nested_free_vmcs02(vmx);
 }
 
 /* Emulate the VMXOFF instruction */
@@ -7487,8 +7418,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
                        vmptr + offsetof(struct vmcs12, launch_state),
                        &zero, sizeof(zero));
 
-       nested_free_vmcs02(vmx, vmptr);
-
        nested_vmx_succeed(vcpu);
        return kvm_skip_emulated_instruction(vcpu);
 }
@@ -7526,17 +7455,17 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
 
        p = ((char *)(get_vmcs12(vcpu))) + offset;
 
-       switch (vmcs_field_type(field)) {
-       case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+       switch (vmcs_field_width(field)) {
+       case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
                *ret = *((natural_width *)p);
                return 0;
-       case VMCS_FIELD_TYPE_U16:
+       case VMCS_FIELD_WIDTH_U16:
                *ret = *((u16 *)p);
                return 0;
-       case VMCS_FIELD_TYPE_U32:
+       case VMCS_FIELD_WIDTH_U32:
                *ret = *((u32 *)p);
                return 0;
-       case VMCS_FIELD_TYPE_U64:
+       case VMCS_FIELD_WIDTH_U64:
                *ret = *((u64 *)p);
                return 0;
        default:
@@ -7553,17 +7482,17 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
        if (offset < 0)
                return offset;
 
-       switch (vmcs_field_type(field)) {
-       case VMCS_FIELD_TYPE_U16:
+       switch (vmcs_field_width(field)) {
+       case VMCS_FIELD_WIDTH_U16:
                *(u16 *)p = field_value;
                return 0;
-       case VMCS_FIELD_TYPE_U32:
+       case VMCS_FIELD_WIDTH_U32:
                *(u32 *)p = field_value;
                return 0;
-       case VMCS_FIELD_TYPE_U64:
+       case VMCS_FIELD_WIDTH_U64:
                *(u64 *)p = field_value;
                return 0;
-       case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+       case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
                *(natural_width *)p = field_value;
                return 0;
        default:
@@ -7579,7 +7508,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
        unsigned long field;
        u64 field_value;
        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
-       const unsigned long *fields = shadow_read_write_fields;
+       const u16 *fields = shadow_read_write_fields;
        const int num_fields = max_shadow_read_write_fields;
 
        preempt_disable();
@@ -7588,23 +7517,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 
        for (i = 0; i < num_fields; i++) {
                field = fields[i];
-               switch (vmcs_field_type(field)) {
-               case VMCS_FIELD_TYPE_U16:
-                       field_value = vmcs_read16(field);
-                       break;
-               case VMCS_FIELD_TYPE_U32:
-                       field_value = vmcs_read32(field);
-                       break;
-               case VMCS_FIELD_TYPE_U64:
-                       field_value = vmcs_read64(field);
-                       break;
-               case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-                       field_value = vmcs_readl(field);
-                       break;
-               default:
-                       WARN_ON(1);
-                       continue;
-               }
+               field_value = __vmcs_readl(field);
                vmcs12_write_any(&vmx->vcpu, field, field_value);
        }
 
@@ -7616,7 +7529,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 {
-       const unsigned long *fields[] = {
+       const u16 *fields[] = {
                shadow_read_write_fields,
                shadow_read_only_fields
        };
@@ -7635,24 +7548,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
                for (i = 0; i < max_fields[q]; i++) {
                        field = fields[q][i];
                        vmcs12_read_any(&vmx->vcpu, field, &field_value);
-
-                       switch (vmcs_field_type(field)) {
-                       case VMCS_FIELD_TYPE_U16:
-                               vmcs_write16(field, (u16)field_value);
-                               break;
-                       case VMCS_FIELD_TYPE_U32:
-                               vmcs_write32(field, (u32)field_value);
-                               break;
-                       case VMCS_FIELD_TYPE_U64:
-                               vmcs_write64(field, (u64)field_value);
-                               break;
-                       case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-                               vmcs_writel(field, (long)field_value);
-                               break;
-                       default:
-                               WARN_ON(1);
-                               break;
-                       }
+                       __vmcs_writel(field, field_value);
                }
        }
 
@@ -7721,8 +7617,10 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 {
        unsigned long field;
        gva_t gva;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
        /* The value to write might be 32 or 64 bits, depending on L1's long
         * mode, and eventually we need to write that into a field of several
         * possible lengths. The code below first zero-extends the value to 64
@@ -7765,6 +7663,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                return kvm_skip_emulated_instruction(vcpu);
        }
 
+       switch (field) {
+#define SHADOW_FIELD_RW(x) case x:
+#include "vmx_shadow_fields.h"
+               /*
+                * The fields that can be updated by L1 without a vmexit are
+                * always updated in the vmcs02; the others go down the slow
+                * path of prepare_vmcs02.
+                */
+               break;
+       default:
+               vmx->nested.dirty_vmcs12 = true;
+               break;
+       }
+
        nested_vmx_succeed(vcpu);
        return kvm_skip_emulated_instruction(vcpu);
 }
@@ -7779,6 +7691,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
                             __pa(vmx->vmcs01.shadow_vmcs));
                vmx->nested.sync_shadow_vmcs = true;
        }
+       vmx->nested.dirty_vmcs12 = true;
 }
 
 /* Emulate the VMPTRLD instruction */
@@ -7999,7 +7912,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                return kvm_skip_emulated_instruction(vcpu);
        }
 
-       __vmx_flush_tlb(vcpu, vmx->nested.vpid02);
+       __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
        nested_vmx_succeed(vcpu);
 
        return kvm_skip_emulated_instruction(vcpu);
@@ -8193,6 +8106,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_XSETBV]                  = handle_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
+       [EXIT_REASON_GDTR_IDTR]               = handle_desc,
+       [EXIT_REASON_LDTR_TR]                 = handle_desc,
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
@@ -8400,10 +8315,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 
        /*
         * The host physical addresses of some pages of guest memory
-        * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
-        * may write to these pages via their host physical address while
-        * L2 is running, bypassing any address-translation-based dirty
-        * tracking (e.g. EPT write protection).
+        * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+        * Page). The CPU may write to these pages via their host
+        * physical address while L2 is running, bypassing any
+        * address-translation-based dirty tracking (e.g. EPT write
+        * protection).
         *
         * Mark them dirty on every exit from L2 to prevent them from
         * getting out of sync with dirty tracking.
@@ -9001,36 +8917,23 @@ static void vmx_set_rvi(int vector)
 
 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 {
-       if (!is_guest_mode(vcpu)) {
-               vmx_set_rvi(max_irr);
-               return;
-       }
-
-       if (max_irr == -1)
-               return;
-
-       /*
-        * In guest mode.  If a vmexit is needed, vmx_check_nested_events
-        * handles it.
-        */
-       if (nested_exit_on_intr(vcpu))
-               return;
-
        /*
-        * Else, fall back to pre-APICv interrupt injection since L2
-        * is run without virtual interrupt delivery.
+        * When running L2, updating RVI is only relevant if
+        * vmcs12 virtual-interrupt-delivery is enabled.
+        * However, that can only be enabled if L1 also
+        * intercepts external interrupts, in which case we
+        * should not update the vmcs02 RVI but instead intercept
+        * the interrupt. Therefore, do nothing when running L2.
         */
-       if (!kvm_event_needs_reinjection(vcpu) &&
-           vmx_interrupt_allowed(vcpu)) {
-               kvm_queue_interrupt(vcpu, max_irr, false);
-               vmx_inject_irq(vcpu);
-       }
+       if (!is_guest_mode(vcpu))
+               vmx_set_rvi(max_irr);
 }
 
 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int max_irr;
+       bool max_irr_updated;
 
        WARN_ON(!vcpu->arch.apicv_active);
        if (pi_test_on(&vmx->pi_desc)) {
@@ -9040,7 +8943,23 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
                 * But on x86 this is just a compiler barrier anyway.
                 */
                smp_mb__after_atomic();
-               max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+               max_irr_updated =
+                       kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+
+               /*
+                * If we are running L2 and L1 has a new pending interrupt
+                * which can be injected, we should re-evaluate
+                * what should be done with this new L1 interrupt.
+                * If L1 intercepts external interrupts, we should
+                * exit from L2 to L1. Otherwise, the interrupt should be
+                * delivered directly to L2.
+                */
+               if (is_guest_mode(vcpu) && max_irr_updated) {
+                       if (nested_exit_on_intr(vcpu))
+                               kvm_vcpu_exiting_guest_mode(vcpu);
+                       else
+                               kvm_make_request(KVM_REQ_EVENT, vcpu);
+               }
        } else {
                max_irr = kvm_lapic_find_highest_irr(vcpu);
        }
@@ -9155,6 +9074,12 @@ static bool vmx_xsaves_supported(void)
                SECONDARY_EXEC_XSAVES;
 }
 
+static bool vmx_umip_emulated(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_DESC;
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -9310,7 +9235,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long debugctlmsr, cr3, cr4;
+       unsigned long cr3, cr4;
 
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!enable_vnmi &&
@@ -9363,7 +9288,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                __write_pkru(vcpu->arch.pkru);
 
        atomic_switch_perf_msrs(vmx);
-       debugctlmsr = get_debugctlmsr();
 
        vmx_arm_hv_timer(vcpu);
 
@@ -9474,8 +9398,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
              );
 
        /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
-       if (debugctlmsr)
-               update_debugctlmsr(debugctlmsr);
+       if (vmx->host_debugctlmsr)
+               update_debugctlmsr(vmx->host_debugctlmsr);
 
 #ifndef CONFIG_X86_64
        /*
@@ -9555,10 +9479,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int r;
 
-       r = vcpu_load(vcpu);
-       BUG_ON(r);
+       vcpu_load(vcpu);
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
        free_nested(vmx);
        vcpu_put(vcpu);
@@ -9750,7 +9672,8 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
        u32 mask =
                SECONDARY_EXEC_SHADOW_VMCS |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
-               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               SECONDARY_EXEC_DESC;
 
        u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 
@@ -9916,8 +9839,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
        }
 }
 
-static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
-                                              struct vmcs12 *vmcs12);
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+                                                struct vmcs12 *vmcs12);
 
 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                                        struct vmcs12 *vmcs12)
@@ -10006,11 +9929,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                        (unsigned long)(vmcs12->posted_intr_desc_addr &
                        (PAGE_SIZE - 1)));
        }
-       if (cpu_has_vmx_msr_bitmap() &&
-           nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
-           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
-               ;
-       else
+       if (!nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
                vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
                                CPU_BASED_USE_MSR_BITMAPS);
 }
@@ -10078,14 +9997,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
  * Merge L0's and L1's MSR bitmap; return false to indicate that
  * we do not use the hardware.
  */
-static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
-                                              struct vmcs12 *vmcs12)
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+                                                struct vmcs12 *vmcs12)
 {
        int msr;
        struct page *page;
        unsigned long *msr_bitmap_l1;
        unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
 
+       /* Nothing to do if the MSR bitmap is not in use.  */
+       if (!cpu_has_vmx_msr_bitmap() ||
+           !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+               return false;
+
        /* This shortcut is ok because we support only x2APIC MSRs so far. */
        if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
                return false;
@@ -10093,32 +10017,41 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
        page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
        if (is_error_page(page))
                return false;
-       msr_bitmap_l1 = (unsigned long *)kmap(page);
 
-       memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
+       msr_bitmap_l1 = (unsigned long *)kmap(page);
+       if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+               /*
+                * L0 need not intercept reads for MSRs between 0x800 and 0x8ff;
+                * it just lets the processor take the value from the virtual-APIC
+                * page, so take those 256 bits directly from the L1 bitmap.
+                */
+               for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+                       unsigned word = msr / BITS_PER_LONG;
+                       msr_bitmap_l0[word] = msr_bitmap_l1[word];
+                       msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
+               }
+       } else {
+               for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+                       unsigned word = msr / BITS_PER_LONG;
+                       msr_bitmap_l0[word] = ~0;
+                       msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
+               }
+       }
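
The word arithmetic in the loops above follows the VMX MSR-bitmap layout: within the 4 KiB bitmap page, read intercepts for low MSRs (indices 0x00000000-0x00001fff) start at byte 0x000 and write intercepts for the same range start at byte 0x800, one bit per MSR in each region. A minimal sketch of that mapping, with an invented helper name and assuming the standard layout:

#include <stddef.h>

#define BITS_PER_LONG (8 * sizeof(long))

/* Map a low MSR index (< 0x2000) to its word/bit position in the MSR bitmap
 * page; the write-intercept bit sits 0x800 bytes after the read-intercept
 * bit, which is the 0x800 / sizeof(long) offset used above. */
static void msr_low_bitmap_pos(unsigned int msr, size_t *read_word,
                               size_t *write_word, unsigned int *bit)
{
        *read_word  = msr / BITS_PER_LONG;
        *write_word = *read_word + 0x800 / sizeof(long);
        *bit        = msr % BITS_PER_LONG;
}

For the x2APIC base MSR 0x800, for example, this puts the read-intercept bit at byte 0x100 of the page and the write-intercept bit at byte 0x900.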
 
-       if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
-               if (nested_cpu_has_apic_reg_virt(vmcs12))
-                       for (msr = 0x800; msr <= 0x8ff; msr++)
-                               nested_vmx_disable_intercept_for_msr(
-                                       msr_bitmap_l1, msr_bitmap_l0,
-                                       msr, MSR_TYPE_R);
+       nested_vmx_disable_intercept_for_msr(
+               msr_bitmap_l1, msr_bitmap_l0,
+               X2APIC_MSR(APIC_TASKPRI),
+               MSR_TYPE_W);
 
+       if (nested_cpu_has_vid(vmcs12)) {
                nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap_l1, msr_bitmap_l0,
-                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-                               MSR_TYPE_R | MSR_TYPE_W);
-
-               if (nested_cpu_has_vid(vmcs12)) {
-                       nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap_l1, msr_bitmap_l0,
-                               APIC_BASE_MSR + (APIC_EOI >> 4),
-                               MSR_TYPE_W);
-                       nested_vmx_disable_intercept_for_msr(
-                               msr_bitmap_l1, msr_bitmap_l0,
-                               APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
-                               MSR_TYPE_W);
-               }
+                       msr_bitmap_l1, msr_bitmap_l0,
+                       X2APIC_MSR(APIC_EOI),
+                       MSR_TYPE_W);
+               nested_vmx_disable_intercept_for_msr(
+                       msr_bitmap_l1, msr_bitmap_l0,
+                       X2APIC_MSR(APIC_SELF_IPI),
+                       MSR_TYPE_W);
        }
        kunmap(page);
        kvm_release_page_clean(page);
@@ -10385,25 +10318,12 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
        return 0;
 }
 
-/*
- * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
- * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
- * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
- * guest in a way that will both be appropriate to L1's requests, and our
- * needs. In addition to modifying the active vmcs (which is vmcs02), this
- * function also has additional necessary side-effects, like setting various
- * vcpu->arch fields.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
- */
-static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                         bool from_vmentry, u32 *entry_failure_code)
+static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+                              bool from_vmentry)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 exec_control, vmcs12_exec_ctrl;
 
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
-       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
        vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
        vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
        vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
@@ -10411,7 +10331,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
        vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
        vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
-       vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
        vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
        vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
        vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
@@ -10421,15 +10340,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
        vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
        vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
-       vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
        vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
        vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
        vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
        vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
        vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
        vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
-       vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
-       vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
        vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
        vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
        vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
@@ -10439,6 +10355,122 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
 
+       vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+       vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+               vmcs12->guest_pending_dbg_exceptions);
+       vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+       vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+       if (nested_cpu_has_xsaves(vmcs12))
+               vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+       vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+       if (cpu_has_vmx_posted_intr())
+               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+
+       /*
+        * Whether page-faults are trapped is determined by a combination of
+        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+        * If enable_ept, L0 doesn't care about page faults and we should
+        * set all of these to L1's desires. However, if !enable_ept, L0 does
+        * care about (at least some) page faults, and because it is not easy
+        * (if at all possible?) to merge L0 and L1's desires, we simply ask
+        * to exit on each and every L2 page fault. This is done by setting
+        * MASK=MATCH=0 and (see below) EB.PF=1.
+        * Note that below we don't need special code to set EB.PF beyond the
+        * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+        * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+        * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+        */
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+               enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+               enable_ept ? vmcs12->page_fault_error_code_match : 0);
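
For reference, the architectural check the comment above describes can be written as a small predicate: a guest page fault causes a VM exit exactly when EB.PF agrees with the outcome of the PFEC mask/match comparison. This is a sketch of the rule, not kernel code:

#include <stdbool.h>
#include <stdint.h>

/* With PFEC_MASK = PFEC_MATCH = 0 every error code "matches", so EB.PF alone
 * decides -- which is why the !enable_ept case sets MASK = MATCH = 0 and
 * relies on EB.PF being 1 in the merged exception bitmap. */
static bool pf_causes_vmexit(bool eb_pf, uint32_t pfec,
                             uint32_t pfec_mask, uint32_t pfec_match)
{
        bool matches = (pfec & pfec_mask) == pfec_match;

        return eb_pf == matches;
}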
+
+       /* All VMFUNCs are currently emulated through L0 vmexits.  */
+       if (cpu_has_vmx_vmfunc())
+               vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+       if (cpu_has_vmx_apicv()) {
+               vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+               vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+               vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+               vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+       }
+
+       /*
+        * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+        * Some constant fields are set here by vmx_set_constant_host_state().
+        * Other fields are different per CPU, and will be set later when
+        * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+        */
+       vmx_set_constant_host_state(vmx);
+
+       /*
+        * Set the MSR load/store lists to match L0's settings.
+        */
+       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+
+       set_cr4_guest_host_mask(vmx);
+
+       if (vmx_mpx_supported())
+               vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
+       if (enable_vpid) {
+               if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
+                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+               else
+                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+       }
+
+       /*
+        * L1 may access L2's PDPTRs, so save them to construct vmcs12
+        */
+       if (enable_ept) {
+               vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+               vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+               vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+               vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+       }
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+                         bool from_vmentry, u32 *entry_failure_code)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 exec_control, vmcs12_exec_ctrl;
+
+       /*
+        * First, the fields that are shadowed.  This must be kept in sync
+        * with vmx_shadow_fields.h.
+        */
+
+       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+       vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+       vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+       vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+       vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+
+       /*
+        * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR,
+        * HOST_FS_BASE, HOST_GS_BASE.
+        */
+
        if (from_vmentry &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
@@ -10461,16 +10493,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        } else {
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
        }
-       vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
        vmx_set_rflags(vcpu, vmcs12->guest_rflags);
-       vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-               vmcs12->guest_pending_dbg_exceptions);
-       vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
-       vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
-
-       if (nested_cpu_has_xsaves(vmcs12))
-               vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
-       vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
        exec_control = vmcs12->pin_based_vm_exec_control;
 
@@ -10484,7 +10507,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        if (nested_cpu_has_posted_intr(vmcs12)) {
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
                vmx->nested.pi_pending = false;
-               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
        } else {
                exec_control &= ~PIN_BASED_POSTED_INTR;
        }
@@ -10495,25 +10517,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        if (nested_cpu_has_preemption_timer(vmcs12))
                vmx_start_preemption_timer(vcpu);
 
-       /*
-        * Whether page-faults are trapped is determined by a combination of
-        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-        * If enable_ept, L0 doesn't care about page faults and we should
-        * set all of these to L1's desires. However, if !enable_ept, L0 does
-        * care about (at least some) page faults, and because it is not easy
-        * (if at all possible?) to merge L0 and L1's desires, we simply ask
-        * to exit on each and every L2 page fault. This is done by setting
-        * MASK=MATCH=0 and (see below) EB.PF=1.
-        * Note that below we don't need special code to set EB.PF beyond the
-        * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
-        * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
-        * !enable_ept, EB.PF is 1, so the "or" will always be 1.
-        */
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
-               enable_ept ? vmcs12->page_fault_error_code_mask : 0);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
-               enable_ept ? vmcs12->page_fault_error_code_match : 0);
-
        if (cpu_has_secondary_exec_ctrls()) {
                exec_control = vmx->secondary_exec_control;
 
@@ -10532,22 +10535,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                        exec_control |= vmcs12_exec_ctrl;
                }
 
-               /* All VMFUNCs are currently emulated through L0 vmexits.  */
-               if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
-                       vmcs_write64(VM_FUNCTION_CONTROL, 0);
-
-               if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
-                       vmcs_write64(EOI_EXIT_BITMAP0,
-                               vmcs12->eoi_exit_bitmap0);
-                       vmcs_write64(EOI_EXIT_BITMAP1,
-                               vmcs12->eoi_exit_bitmap1);
-                       vmcs_write64(EOI_EXIT_BITMAP2,
-                               vmcs12->eoi_exit_bitmap2);
-                       vmcs_write64(EOI_EXIT_BITMAP3,
-                               vmcs12->eoi_exit_bitmap3);
+               if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
                        vmcs_write16(GUEST_INTR_STATUS,
                                vmcs12->guest_intr_status);
-               }
 
                /*
                 * Write an illegal value to APIC_ACCESS_ADDR. Later,
@@ -10560,24 +10550,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
 
-
-       /*
-        * Set host-state according to L0's settings (vmcs12 is irrelevant here)
-        * Some constant fields are set here by vmx_set_constant_host_state().
-        * Other fields are different per CPU, and will be set later when
-        * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
-        */
-       vmx_set_constant_host_state(vmx);
-
-       /*
-        * Set the MSR load/store lists to match L0's settings.
-        */
-       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
-
        /*
         * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
         * entry, but only if the current (host) sp changed from the value
@@ -10609,8 +10581,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        }
 
        /*
-        * Merging of IO bitmap not currently supported.
-        * Rather, exit every time.
+        * A vmexit (to either the L1 hypervisor or L0 userspace) is always needed
+        * for I/O port accesses.
         */
        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
@@ -10647,12 +10619,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
        }
 
-       set_cr4_guest_host_mask(vmx);
-
-       if (from_vmentry &&
-           vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
-               vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
-
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
                vmcs_write64(TSC_OFFSET,
                        vcpu->arch.tsc_offset + vmcs12->tsc_offset);
@@ -10671,16 +10637,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                 * even if we spawn a lot of nested vCPUs.
                 */
                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
-                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-                               __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+                               __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
                        }
                } else {
-                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-                       vmx_flush_tlb(vcpu);
+                       vmx_flush_tlb(vcpu, true);
                }
-
        }
 
        if (enable_pml) {
@@ -10729,6 +10692,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
        vmx_set_efer(vcpu, vcpu->arch.efer);
 
+       if (vmx->nested.dirty_vmcs12) {
+               prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
+               vmx->nested.dirty_vmcs12 = false;
+       }
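
The dirty_vmcs12 check is an instance of a simple dirty-flag cache: vmcs12 fields that L1 rarely rewrites are copied into vmcs02 only when vmcs12 has actually changed, while the few shadowed, frequently-written fields are refreshed on every nested entry. A generic sketch of the pattern, with invented names rather than the kernel's:

#include <stdbool.h>

struct mirror {
        bool dirty;     /* set whenever the source of truth is modified */
};

static void resync_hot_fields(struct mirror *m)
{
        (void)m;        /* cheap: the handful of fields that change every time */
}

static void resync_rarely_changing_fields(struct mirror *m)
{
        (void)m;        /* expensive: everything else */
}

static void resync(struct mirror *m)
{
        resync_hot_fields(m);
        if (m->dirty) {
                resync_rarely_changing_fields(m);
                m->dirty = false;
        }
}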
+
        /* Shadow page tables on either EPT or shadow page tables. */
        if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
                                entry_failure_code))
@@ -10737,16 +10705,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        if (!enable_ept)
                vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
 
-       /*
-        * L1 may access the L2's PDPTR, so save them to construct vmcs12
-        */
-       if (enable_ept) {
-               vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
-               vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
-               vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
-               vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
-       }
-
        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
        return 0;
@@ -10882,20 +10840,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       struct loaded_vmcs *vmcs02;
        u32 msr_entry_idx;
        u32 exit_qual;
 
-       vmcs02 = nested_get_current_vmcs02(vmx);
-       if (!vmcs02)
-               return -ENOMEM;
-
        enter_guest_mode(vcpu);
 
        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 
-       vmx_switch_vmcs(vcpu, vmcs02);
+       vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
        vmx_segment_cache_clear(vmx);
 
        if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11107,7 +11060,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
                if (block_nested_events)
                        return -EBUSY;
                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-               vcpu->arch.exception.pending = false;
                return 0;
        }
 
@@ -11388,11 +11340,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                 * L1's vpid. TODO: move to a more elaborate solution, giving
                 * each L2 its own vpid and exposing the vpid feature to L1.
                 */
-               vmx_flush_tlb(vcpu);
+               vmx_flush_tlb(vcpu, true);
        }
-       /* Restore posted intr vector. */
-       if (nested_cpu_has_posted_intr(vmcs12))
-               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
 
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -11513,10 +11462,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        vm_exit_controls_reset_shadow(vmx);
        vmx_segment_cache_clear(vmx);
 
-       /* if no vmcs02 cache requested, remove the one we used */
-       if (VMCS02_POOL_SIZE == 0)
-               nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
        /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
@@ -11657,6 +11602,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage)
 {
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+       /*
+        * RDPID causes #UD if disabled through secondary execution controls.
+        * Because it is marked as EmulateOnUD, we need to intercept it here.
+        */
+       if (info->intercept == x86_intercept_rdtscp &&
+           !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+               ctxt->exception.vector = UD_VECTOR;
+               ctxt->exception.error_code_valid = false;
+               return X86EMUL_PROPAGATE_FAULT;