Merge tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 00:57:35 +0000 (17:57 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 00:57:35 +0000 (17:57 -0700)
Pull KVM updates from Radim Krčmář:
 "ARM:
   - Improved guest IPA space support (32 to 52 bits)

   - RAS event delivery for 32-bit

   - PMU fixes

   - Guest entry hardening

   - Various cleanups

   - Port of dirty_log_test selftest

  PPC:
   - Nested HV KVM support for radix guests on POWER9. Performance is
     much better than with PR KVM; migration and arbitrary levels of
     nesting are supported.

   - Disable nested HV-KVM on early POWER9 chips that need a particular
     hardware bug workaround

   - A one-VM-per-core mode to prevent potential data leaks

   - PCI pass-through optimization

   - Merge the ppc-kvm topic branch and kvm-ppc-fixes to get a better base

  s390:
   - Initial version of AP crypto virtualization via vfio-mdev

   - Improvements for vfio-ap

   - Set the host program identifier

   - Optimize page table locking

  x86:
   - Enable nested virtualization by default

   - Implement Hyper-V IPI hypercalls

   - Improve #PF and #DB handling

   - Allow guests to use Enlightened VMCS

   - Add migration selftests for VMCS and Enlightened VMCS

   - Allow coalesced PIO accesses

   - Add an option to perform nested VMCS host-state consistency checks
     through hardware

   - Automatic tuning of lapic_timer_advance_ns

   - Many fixes, minor improvements, and cleanups"
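
As an illustration only (not part of this merge), several of the items
above surface to userspace as KVM capabilities, which a VMM can probe
with the standard KVM_CHECK_EXTENSION ioctl before relying on them.
The sketch below assumes a 4.20-era <linux/kvm.h> that defines
KVM_CAP_PPC_NESTED_HV, KVM_CAP_HYPERV_SEND_IPI,
KVM_CAP_HYPERV_ENLIGHTENED_VMCS, KVM_CAP_EXCEPTION_PAYLOAD and
KVM_CAP_COALESCED_PIO; the #ifdef guards keep it building against older
headers, and per-VM capabilities (such as the PPC nested-HV one) are
more precisely queried on a VM file descriptor.

/* kvm-cap-probe.c: hypothetical stand-alone helper, not shipped with this
 * merge.  Build with: cc -o kvm-cap-probe kvm-cap-probe.c */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

static void probe(int kvm_fd, const char *name, int cap)
{
	/* KVM_CHECK_EXTENSION returns > 0 when the capability is available */
	int ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);

	printf("%-34s %s\n", name, ret > 0 ? "yes" : "no");
}

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);

	if (kvm_fd < 0) {
		perror("/dev/kvm");
		return 1;
	}
#ifdef KVM_CAP_PPC_NESTED_HV
	probe(kvm_fd, "KVM_CAP_PPC_NESTED_HV", KVM_CAP_PPC_NESTED_HV);
#endif
#ifdef KVM_CAP_HYPERV_SEND_IPI
	probe(kvm_fd, "KVM_CAP_HYPERV_SEND_IPI", KVM_CAP_HYPERV_SEND_IPI);
#endif
#ifdef KVM_CAP_HYPERV_ENLIGHTENED_VMCS
	probe(kvm_fd, "KVM_CAP_HYPERV_ENLIGHTENED_VMCS",
	      KVM_CAP_HYPERV_ENLIGHTENED_VMCS);
#endif
#ifdef KVM_CAP_EXCEPTION_PAYLOAD
	probe(kvm_fd, "KVM_CAP_EXCEPTION_PAYLOAD", KVM_CAP_EXCEPTION_PAYLOAD);
#endif
#ifdef KVM_CAP_COALESCED_PIO
	probe(kvm_fd, "KVM_CAP_COALESCED_PIO", KVM_CAP_COALESCED_PIO);
#endif
	close(kvm_fd);
	return 0;
}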

* tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
  KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned
  Revert "kvm: x86: optimize dr6 restore"
  KVM: PPC: Optimize clearing TCEs for sparse tables
  x86/kvm/nVMX: tweak shadow fields
  selftests/kvm: add missing executables to .gitignore
  KVM: arm64: Safety check PSTATE when entering guest and handle IL
  KVM: PPC: Book3S HV: Don't use streamlined entry path on early POWER9 chips
  arm/arm64: KVM: Enable 32 bits kvm vcpu events support
  arm/arm64: KVM: Rename function kvm_arch_dev_ioctl_check_extension()
  KVM: arm64: Fix caching of host MDCR_EL2 value
  KVM: VMX: enable nested virtualization by default
  KVM/x86: Use 32bit xor to clear registers in svm.c
  kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD
  kvm: vmx: Defer setting of DR6 until #DB delivery
  kvm: x86: Defer setting of CR2 until #PF delivery
  kvm: x86: Add payload operands to kvm_multiple_exception
  kvm: x86: Add exception payload fields to kvm_vcpu_events
  kvm: x86: Add has_payload and payload to kvm_queued_exception
  KVM: Documentation: Fix omission in struct kvm_vcpu_events
  KVM: selftests: add Enlightened VMCS test
  ...

22 files changed:
MAINTAINERS
arch/arm/include/asm/kvm_arm.h
arch/arm/include/asm/kvm_mmu.h
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/ptrace.h
arch/arm64/kvm/guest.c
arch/arm64/kvm/hyp/sysreg-sr.c
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/s390/Kconfig
arch/x86/include/asm/virtext.h
arch/x86/kvm/mmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
drivers/s390/crypto/Makefile
include/linux/irqchip/arm-gic-v3.h
tools/arch/x86/include/uapi/asm/kvm.h
virt/kvm/arm/arm.c
virt/kvm/arm/mmu.c

diff --combined MAINTAINERS
index 8f22f6af3782104927b4fa37fbe3c2df27f98a86,86e019c7b0fa19a3fe8deadfc6cc99553bbb60b6..bd702ad56c7f6bff4e3f9f422f5ec452d58f2b95
@@@ -324,6 -324,7 +324,6 @@@ F: Documentation/ABI/testing/sysfs-bus-
  F:    Documentation/ABI/testing/configfs-acpi
  F:    drivers/pci/*acpi*
  F:    drivers/pci/*/*acpi*
 -F:    drivers/pci/*/*/*acpi*
  F:    tools/power/acpi/
  
  ACPI APEI
@@@ -839,7 -840,7 +839,7 @@@ ANALOG DEVICES INC ADGS1408 DRIVE
  M:    Mircea Caprioru <mircea.caprioru@analog.com>
  S:    Supported
  F:    drivers/mux/adgs1408.c
 -F:    Documentation/devicetree/bindings/mux/adgs1408.txt
 +F:    Documentation/devicetree/bindings/mux/adi,adgs1408.txt
  
  ANALOG DEVICES INC ADP5061 DRIVER
  M:    Stefan Popa <stefan.popa@analog.com>
@@@ -1180,7 -1181,7 +1180,7 @@@ N:      ow
  F:    arch/arm/mach-actions/
  F:    arch/arm/boot/dts/owl-*
  F:    arch/arm64/boot/dts/actions/
 -F:    drivers/clocksource/owl-*
 +F:    drivers/clocksource/timer-owl*
  F:    drivers/pinctrl/actions/*
  F:    drivers/soc/actions/
  F:    include/dt-bindings/power/owl-*
@@@ -1250,7 -1251,7 +1250,7 @@@ N:      meso
  
  ARM/Annapurna Labs ALPINE ARCHITECTURE
  M:    Tsahee Zidenberg <tsahee@annapurnalabs.com>
 -M:    Antoine Tenart <antoine.tenart@free-electrons.com>
 +M:    Antoine Tenart <antoine.tenart@bootlin.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    arch/arm/mach-alpine/
@@@ -1603,7 -1604,7 +1603,7 @@@ L:      linux-arm-kernel@lists.infradead.or
  S:    Maintained
  F:    arch/arm/boot/dts/lpc43*
  F:    drivers/clk/nxp/clk-lpc18xx*
 -F:    drivers/clocksource/time-lpc32xx.c
 +F:    drivers/clocksource/timer-lpc32xx.c
  F:    drivers/i2c/busses/i2c-lpc2k.c
  F:    drivers/memory/pl172.c
  F:    drivers/mtd/spi-nor/nxp-spifi.c
@@@ -2195,7 -2196,6 +2195,7 @@@ F:      drivers/clk/uniphier
  F:    drivers/gpio/gpio-uniphier.c
  F:    drivers/i2c/busses/i2c-uniphier*
  F:    drivers/irqchip/irq-uniphier-aidet.c
 +F:    drivers/mmc/host/uniphier-sd.c
  F:    drivers/pinctrl/uniphier/
  F:    drivers/reset/reset-uniphier.c
  F:    drivers/tty/serial/8250/8250_uniphier.c
@@@ -2220,7 -2220,7 +2220,7 @@@ F:      arch/arm/mach-vexpress
  F:    */*/vexpress*
  F:    */*/*/vexpress*
  F:    drivers/clk/versatile/clk-vexpress-osc.c
 -F:    drivers/clocksource/versatile.c
 +F:    drivers/clocksource/timer-versatile.c
  N:    mps2
  
  ARM/VFP SUPPORT
@@@ -2242,7 -2242,7 +2242,7 @@@ M:      Tony Prisk <linux@prisktech.co.nz
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    arch/arm/mach-vt8500/
 -F:    drivers/clocksource/vt8500_timer.c
 +F:    drivers/clocksource/timer-vt8500.c
  F:    drivers/i2c/busses/i2c-wmt.c
  F:    drivers/mmc/host/wmt-sdmmc.c
  F:    drivers/pwm/pwm-vt8500.c
@@@ -2307,7 -2307,7 +2307,7 @@@ F:      drivers/cpuidle/cpuidle-zynq.
  F:    drivers/block/xsysace.c
  N:    zynq
  N:    xilinx
 -F:    drivers/clocksource/cadence_ttc_timer.c
 +F:    drivers/clocksource/timer-cadence-ttc.c
  F:    drivers/i2c/busses/i2c-cadence.c
  F:    drivers/mmc/host/sdhci-of-arasan.c
  F:    drivers/edac/synopsys_edac.c
@@@ -2956,6 -2956,7 +2956,6 @@@ F:      include/linux/bcm963xx_tag.
  
  BROADCOM BNX2 GIGABIT ETHERNET DRIVER
  M:    Rasesh Mody <rasesh.mody@cavium.com>
 -M:    Harish Patil <harish.patil@cavium.com>
  M:    Dept-GELinuxNICDev@cavium.com
  L:    netdev@vger.kernel.org
  S:    Supported
@@@ -2976,7 -2977,6 +2976,7 @@@ F:      drivers/scsi/bnx2i
  
  BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER
  M:    Ariel Elior <ariel.elior@cavium.com>
 +M:    Sudarsana Kalluru <sudarsana.kalluru@cavium.com>
  M:    everest-linux-l2@cavium.com
  L:    netdev@vger.kernel.org
  S:    Supported
@@@ -3007,14 -3007,6 +3007,14 @@@ S:    Supporte
  F:    drivers/gpio/gpio-brcmstb.c
  F:    Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt
  
 +BROADCOM BRCMSTB I2C DRIVER
 +M:    Kamal Dasu <kdasu.kdev@gmail.com>
 +L:    linux-i2c@vger.kernel.org
 +L:    bcm-kernel-feedback-list@broadcom.com
 +S:    Supported
 +F:    drivers/i2c/busses/i2c-brcmstb.c
 +F:    Documentation/devicetree/bindings/i2c/i2c-brcmstb.txt
 +
  BROADCOM BRCMSTB USB2 and USB3 PHY DRIVER
  M:    Al Cooper <alcooperx@gmail.com>
  L:    linux-kernel@vger.kernel.org
@@@ -3122,15 -3114,6 +3122,15 @@@ S:    Maintaine
  F:    Documentation/devicetree/bindings/memory-controllers/brcm,dpfe-cpu.txt
  F:    drivers/memory/brcmstb_dpfe.c
  
 +BROADCOM SPI DRIVER
 +M:    Kamal Dasu <kdasu.kdev@gmail.com>
 +M:    bcm-kernel-feedback-list@broadcom.com
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/spi/brcm,spi-bcm-qspi.txt
 +F:    drivers/spi/spi-bcm-qspi.*
 +F:    drivers/spi/spi-brcmstb-qspi.c
 +F:    drivers/spi/spi-iproc-qspi.c
 +
  BROADCOM SYSTEMPORT ETHERNET DRIVER
  M:    Florian Fainelli <f.fainelli@gmail.com>
  L:    netdev@vger.kernel.org
@@@ -3691,12 -3674,6 +3691,12 @@@ S:    Maintaine
  F:    Documentation/devicetree/bindings/media/coda.txt
  F:    drivers/media/platform/coda/
  
 +CODE OF CONDUCT
 +M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 +S:    Supported
 +F:    Documentation/process/code-of-conduct.rst
 +F:    Documentation/process/code-of-conduct-interpretation.rst
 +
  COMMON CLK FRAMEWORK
  M:    Michael Turquette <mturquette@baylibre.com>
  M:    Stephen Boyd <sboyd@kernel.org>
@@@ -4055,7 -4032,7 +4055,7 @@@ M:      Uma Krishnan <ukrishn@linux.vnet.ibm
  L:    linux-scsi@vger.kernel.org
  S:    Supported
  F:    drivers/scsi/cxlflash/
 -F:    include/uapi/scsi/cxlflash_ioctls.h
 +F:    include/uapi/scsi/cxlflash_ioctl.h
  F:    Documentation/powerpc/cxlflash.txt
  
  CYBERPRO FB DRIVER
@@@ -4194,11 -4171,6 +4194,11 @@@ S:    Maintaine
  F:    drivers/platform/x86/dell-smbios-wmi.c
  F:    tools/wmi/dell-smbios-example.c
  
 +DEFZA FDDI NETWORK DRIVER
 +M:    "Maciej W. Rozycki" <macro@linux-mips.org>
 +S:    Maintained
 +F:    drivers/net/fddi/defza.*
 +
  DELL LAPTOP DRIVER
  M:    Matthew Garrett <mjg59@srcf.ucam.org>
  M:    Pali Rohár <pali.rohar@gmail.com>
@@@ -4514,12 -4486,11 +4514,12 @@@ S:   Maintaine
  F:    Documentation/
  F:    scripts/kernel-doc
  X:    Documentation/ABI/
 +X:    Documentation/acpi/
  X:    Documentation/devicetree/
 -X:    Documentation/acpi
 -X:    Documentation/power
 -X:    Documentation/spi
 -X:    Documentation/media
 +X:    Documentation/i2c/
 +X:    Documentation/media/
 +X:    Documentation/power/
 +X:    Documentation/spi/
  T:    git git://git.lwn.net/linux.git docs-next
  
  DOCUMENTATION/ITALIAN
@@@ -4557,13 -4528,9 +4557,13 @@@ F:    drivers/soc/fsl/dpi
  
  DPAA2 ETHERNET DRIVER
  M:    Ioana Radulescu <ruxandra.radulescu@nxp.com>
 -L:    linux-kernel@vger.kernel.org
 +L:    netdev@vger.kernel.org
  S:    Maintained
 -F:    drivers/staging/fsl-dpaa2/ethernet
 +F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-eth*
 +F:    drivers/net/ethernet/freescale/dpaa2/dpni*
 +F:    drivers/net/ethernet/freescale/dpaa2/dpkg.h
 +F:    drivers/net/ethernet/freescale/dpaa2/Makefile
 +F:    drivers/net/ethernet/freescale/dpaa2/Kconfig
  
  DPAA2 ETHERNET SWITCH DRIVER
  M:    Ioana Radulescu <ruxandra.radulescu@nxp.com>
@@@ -4574,10 -4541,9 +4574,10 @@@ F:    drivers/staging/fsl-dpaa2/eths
  
  DPAA2 PTP CLOCK DRIVER
  M:    Yangbo Lu <yangbo.lu@nxp.com>
 -L:    linux-kernel@vger.kernel.org
 +L:    netdev@vger.kernel.org
  S:    Maintained
 -F:    drivers/staging/fsl-dpaa2/rtc
 +F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp*
 +F:    drivers/net/ethernet/freescale/dpaa2/dprtc*
  
  DPT_I2O SCSI RAID DRIVER
  M:    Adaptec OEM Raid Solutions <aacraid@microsemi.com>
@@@ -5364,8 -5330,7 +5364,8 @@@ S:      Maintaine
  F:    drivers/edac/r82600_edac.c
  
  EDAC-SBRIDGE
 -M:    Mauro Carvalho Chehab <mchehab@kernel.org>
 +M:    Tony Luck <tony.luck@intel.com>
 +R:    Qiuxu Zhuo <qiuxu.zhuo@intel.com>
  L:    linux-edac@vger.kernel.org
  S:    Maintained
  F:    drivers/edac/sb_edac.c
@@@ -5505,8 -5470,7 +5505,8 @@@ S:      Odd Fixe
  F:    drivers/net/ethernet/agere/
  
  ETHERNET BRIDGE
 -M:    Stephen Hemminger <stephen@networkplumber.org>
 +M:    Roopa Prabhu <roopa@cumulusnetworks.com>
 +M:    Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
  L:    bridge@lists.linux-foundation.org (moderated for non-subscribers)
  L:    netdev@vger.kernel.org
  W:    http://www.linuxfoundation.org/en/Net:Bridge
@@@ -5550,7 -5514,7 +5550,7 @@@ W:      http://ext4.wiki.kernel.or
  Q:    http://patchwork.ozlabs.org/project/linux-ext4/list/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git
  S:    Maintained
 -F:    Documentation/filesystems/ext4.txt
 +F:    Documentation/filesystems/ext4/ext4.rst
  F:    fs/ext4/
  
  Extended Verification Module (EVM)
@@@ -6490,7 -6454,6 +6490,7 @@@ F:      Documentation/devicetree/bindings/hw
  F:    Documentation/hwmon/
  F:    drivers/hwmon/
  F:    include/linux/hwmon*.h
 +F:    include/trace/events/hwmon*.h
  
  HARDWARE RANDOM NUMBER GENERATOR CORE
  M:    Matt Mackall <mpm@selenic.com>
@@@ -6799,12 -6762,6 +6799,12 @@@ S:    Maintaine
  F:    mm/memory-failure.c
  F:    mm/hwpoison-inject.c
  
 +HYGON PROCESSOR SUPPORT
 +M:    Pu Wen <puwen@hygon.cn>
 +L:    linux-kernel@vger.kernel.org
 +S:    Maintained
 +F:    arch/x86/kernel/cpu/hygon.c
 +
  Hyper-V CORE AND DRIVERS
  M:    "K. Y. Srinivasan" <kys@microsoft.com>
  M:    Haiyang Zhang <haiyangz@microsoft.com>
@@@ -7384,16 -7341,15 +7384,16 @@@ T:   git git://git.kernel.org/pub/scm/lin
  S:    Supported
  F:    Documentation/networking/e100.rst
  F:    Documentation/networking/e1000.rst
 -F:    Documentation/networking/e1000e.txt
 -F:    Documentation/networking/igb.txt
 -F:    Documentation/networking/igbvf.txt
 -F:    Documentation/networking/ixgb.txt
 -F:    Documentation/networking/ixgbe.txt
 -F:    Documentation/networking/ixgbevf.txt
 -F:    Documentation/networking/i40e.txt
 -F:    Documentation/networking/i40evf.txt
 -F:    Documentation/networking/ice.txt
 +F:    Documentation/networking/e1000e.rst
 +F:    Documentation/networking/fm10k.rst
 +F:    Documentation/networking/igb.rst
 +F:    Documentation/networking/igbvf.rst
 +F:    Documentation/networking/ixgb.rst
 +F:    Documentation/networking/ixgbe.rst
 +F:    Documentation/networking/ixgbevf.rst
 +F:    Documentation/networking/i40e.rst
 +F:    Documentation/networking/iavf.rst
 +F:    Documentation/networking/ice.rst
  F:    drivers/net/ethernet/intel/
  F:    drivers/net/ethernet/intel/*/
  F:    include/linux/avf/virtchnl.h
@@@ -7415,12 -7371,6 +7415,12 @@@ T:    git https://github.com/intel/gvt-lin
  S:    Supported
  F:    drivers/gpu/drm/i915/gvt/
  
 +INTEL PMIC GPIO DRIVER
 +R:    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
 +S:    Maintained
 +F:    drivers/gpio/gpio-*cove.c
 +F:    drivers/gpio/gpio-msic.c
 +
  INTEL HID EVENT DRIVER
  M:    Alex Hung <alex.hung@canonical.com>
  L:    platform-driver-x86@vger.kernel.org
@@@ -7547,14 -7497,6 +7547,14 @@@ F:    drivers/platform/x86/intel_punit_ipc
  F:    arch/x86/include/asm/intel_pmc_ipc.h
  F:    arch/x86/include/asm/intel_punit_ipc.h
  
 +INTEL MULTIFUNCTION PMIC DEVICE DRIVERS
 +R:    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
 +S:    Maintained
 +F:    drivers/mfd/intel_msic.c
 +F:    drivers/mfd/intel_soc_pmic*
 +F:    include/linux/mfd/intel_msic.h
 +F:    include/linux/mfd/intel_soc_pmic*
 +
  INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT
  M:    Stanislav Yakovlev <stas.yakovlev@gmail.com>
  L:    linux-wireless@vger.kernel.org
@@@ -7578,6 -7520,14 +7578,6 @@@ S:     Supporte
  F:    drivers/infiniband/hw/i40iw/
  F:    include/uapi/rdma/i40iw-abi.h
  
 -INTEL SHA MULTIBUFFER DRIVER
 -M:    Megha Dey <megha.dey@linux.intel.com>
 -R:    Tim Chen <tim.c.chen@linux.intel.com>
 -L:    linux-crypto@vger.kernel.org
 -S:    Supported
 -F:    arch/x86/crypto/sha*-mb/
 -F:    crypto/mcryptd.c
 -
  INTEL TELEMETRY DRIVER
  M:    Souvik Kumar Chakravarty <souvik.k.chakravarty@intel.com>
  L:    platform-driver-x86@vger.kernel.org
@@@ -7685,7 -7635,6 +7685,7 @@@ M:      Corey Minyard <minyard@acm.org
  L:    openipmi-developer@lists.sourceforge.net (moderated for non-subscribers)
  W:    http://openipmi.sourceforge.net/
  S:    Supported
 +F:    Documentation/devicetree/bindings/ipmi/
  F:    Documentation/IPMI.txt
  F:    drivers/char/ipmi/
  F:    include/linux/ipmi*
@@@ -8157,7 -8106,6 +8157,7 @@@ F:      security/keys/encrypted-keys
  
  KEYS-TRUSTED
  M:    James Bottomley <jejb@linux.vnet.ibm.com>
 +M:      Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
  M:    Mimi Zohar <zohar@linux.vnet.ibm.com>
  L:    linux-integrity@vger.kernel.org
  L:    keyrings@vger.kernel.org
@@@ -8235,25 -8183,6 +8235,25 @@@ S:    Maintaine
  F:    net/l3mdev
  F:    include/net/l3mdev.h
  
 +L7 BPF FRAMEWORK
 +M:    John Fastabend <john.fastabend@gmail.com>
 +M:    Daniel Borkmann <daniel@iogearbox.net>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    include/linux/skmsg.h
 +F:    net/core/skmsg.c
 +F:    net/core/sock_map.c
 +F:    net/ipv4/tcp_bpf.c
 +
 +LANTIQ / INTEL Ethernet drivers
 +M:    Hauke Mehrtens <hauke@hauke-m.de>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    net/dsa/tag_gswip.c
 +F:    drivers/net/ethernet/lantiq_xrx200.c
 +F:    drivers/net/dsa/lantiq_pce.h
 +F:    drivers/net/dsa/lantiq_gswip.c
 +
  LANTIQ MIPS ARCHITECTURE
  M:    John Crispin <john@phrozen.org>
  L:    linux-mips@linux-mips.org
@@@ -8669,6 -8598,7 +8669,6 @@@ F:      include/linux/spinlock*.
  F:    arch/*/include/asm/spinlock*.h
  F:    include/linux/rwlock*.h
  F:    include/linux/mutex*.h
 -F:    arch/*/include/asm/mutex*.h
  F:    include/linux/rwsem*.h
  F:    arch/*/include/asm/rwsem.h
  F:    include/linux/seqlock.h
@@@ -8814,7 -8744,7 +8814,7 @@@ M:      Vivien Didelot <vivien.didelot@savoi
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/dsa/mv88e6xxx/
 -F:    linux/platform_data/mv88e6xxx.h
 +F:    include/linux/platform_data/mv88e6xxx.h
  F:    Documentation/devicetree/bindings/net/dsa/marvell.txt
  
  MARVELL ARMADA DRM SUPPORT
@@@ -8904,15 -8834,6 +8904,15 @@@ S:    Supporte
  F:    drivers/mmc/host/sdhci-xenon*
  F:    Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
  
 +MARVELL OCTEONTX2 RVU ADMIN FUNCTION DRIVER
 +M:    Sunil Goutham <sgoutham@marvell.com>
 +M:    Linu Cherian <lcherian@marvell.com>
 +M:    Geetha sowjanya <gakula@marvell.com>
 +M:    Jerin Jacob <jerinj@marvell.com>
 +L:    netdev@vger.kernel.org
 +S:    Supported
 +F:    drivers/net/ethernet/marvell/octeontx2/af/
 +
  MATROX FRAMEBUFFER DRIVER
  L:    linux-fbdev@vger.kernel.org
  S:    Orphan
@@@ -8926,6 -8847,13 +8926,6 @@@ S:     Maintaine
  F:    Documentation/hwmon/max16065
  F:    drivers/hwmon/max16065.c
  
 -MAX20751 HARDWARE MONITOR DRIVER
 -M:    Guenter Roeck <linux@roeck-us.net>
 -L:    linux-hwmon@vger.kernel.org
 -S:    Maintained
 -F:    Documentation/hwmon/max20751
 -F:    drivers/hwmon/max20751.c
 -
  MAX2175 SDR TUNER DRIVER
  M:    Ramesh Shanmugasundaram <ramesh.shanmugasundaram@bp.renesas.com>
  L:    linux-media@vger.kernel.org
@@@ -9591,7 -9519,6 +9591,7 @@@ M:      Richard Genoud <richard.genoud@gmail
  S:    Maintained
  F:    drivers/tty/serial/atmel_serial.c
  F:    drivers/tty/serial/atmel_serial.h
 +F:    Documentation/devicetree/bindings/mfd/atmel-usart.txt
  
  MICROCHIP / ATMEL DMA DRIVER
  M:    Ludovic Desroches <ludovic.desroches@microchip.com>
@@@ -9623,21 -9550,6 +9623,21 @@@ S:    Supporte
  F:    drivers/mtd/nand/raw/atmel/*
  F:    Documentation/devicetree/bindings/mtd/atmel-nand.txt
  
 +MICROCHIP AT91 USART MFD DRIVER
 +M:    Radu Pirea <radu_nicolae.pirea@upb.ro>
 +L:    linux-kernel@vger.kernel.org
 +S:    Supported
 +F:    drivers/mfd/at91-usart.c
 +F:    include/dt-bindings/mfd/at91-usart.h
 +F:    Documentation/devicetree/bindings/mfd/atmel-usart.txt
 +
 +MICROCHIP AT91 USART SPI DRIVER
 +M:    Radu Pirea <radu_nicolae.pirea@upb.ro>
 +L:    linux-spi@vger.kernel.org
 +S:    Supported
 +F:    drivers/spi/spi-at91-usart.c
 +F:    Documentation/devicetree/bindings/mfd/atmel-usart.txt
 +
  MICROCHIP KSZ SERIES ETHERNET SWITCH DRIVER
  M:    Woojung Huh <Woojung.Huh@microchip.com>
  M:    Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
@@@ -9746,8 -9658,7 +9746,8 @@@ MIPS/LOONGSON2 ARCHITECTUR
  M:    Jiaxun Yang <jiaxun.yang@flygoat.com>
  L:    linux-mips@linux-mips.org
  S:    Maintained
 -F:    arch/mips/loongson64/*{2e/2f}*
 +F:    arch/mips/loongson64/fuloong-2e/
 +F:    arch/mips/loongson64/lemote-2f/
  F:    arch/mips/include/asm/mach-loongson64/
  F:    drivers/*/*loongson2*
  F:    drivers/*/*/*loongson2*
@@@ -9787,19 -9698,6 +9787,19 @@@ S:    Maintaine
  F:    arch/arm/boot/dts/mmp*
  F:    arch/arm/mach-mmp/
  
 +MMU GATHER AND TLB INVALIDATION
 +M:    Will Deacon <will.deacon@arm.com>
 +M:    "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
 +M:    Andrew Morton <akpm@linux-foundation.org>
 +M:    Nick Piggin <npiggin@gmail.com>
 +M:    Peter Zijlstra <peterz@infradead.org>
 +L:    linux-arch@vger.kernel.org
 +L:    linux-mm@kvack.org
 +S:    Maintained
 +F:    arch/*/include/asm/tlb.h
 +F:    include/asm-generic/tlb.h
 +F:    mm/mmu_gather.c
 +
  MN88472 MEDIA DRIVER
  M:    Antti Palosaari <crope@iki.fi>
  L:    linux-media@vger.kernel.org
@@@ -9818,6 -9716,13 +9818,6 @@@ Q:     http://patchwork.linuxtv.org/project
  S:    Maintained
  F:    drivers/media/dvb-frontends/mn88473*
  
 -PCI DRIVER FOR MOBIVEIL PCIE IP
 -M:    Subrahmanya Lingappa <l.subrahmanya@mobiveil.co.in>
 -L:    linux-pci@vger.kernel.org
 -S:    Supported
 -F:    Documentation/devicetree/bindings/pci/mobiveil-pcie.txt
 -F:    drivers/pci/controller/pcie-mobiveil.c
 -
  MODULE SUPPORT
  M:    Jessica Yu <jeyu@kernel.org>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jeyu/linux.git modules-next
@@@ -9967,7 -9872,7 +9967,7 @@@ M:      Peter Rosin <peda@axentia.se
  S:    Maintained
  F:    Documentation/ABI/testing/sysfs-class-mux*
  F:    Documentation/devicetree/bindings/mux/
 -F:    include/linux/dt-bindings/mux/
 +F:    include/dt-bindings/mux/
  F:    include/linux/mux/
  F:    drivers/mux/
  
@@@ -10004,13 -9909,6 +10004,13 @@@ S:  Supporte
  F:    drivers/gpu/drm/mxsfb/
  F:    Documentation/devicetree/bindings/display/mxsfb.txt
  
 +MYLEX DAC960 PCI RAID Controller
 +M:    Hannes Reinecke <hare@kernel.org>
 +L:    linux-scsi@vger.kernel.org
 +S:    Supported
 +F:    drivers/scsi/myrb.*
 +F:    drivers/scsi/myrs.*
 +
  MYRICOM MYRI-10G 10GbE DRIVER (MYRI10GE)
  M:    Chris Lee <christopher.lee@cspi.com>
  L:    netdev@vger.kernel.org
@@@ -10231,6 -10129,7 +10231,6 @@@ L:   netdev@vger.kernel.or
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git
  S:    Maintained
 -F:    net/core/flow.c
  F:    net/xfrm/
  F:    net/key/
  F:    net/ipv4/xfrm*
@@@ -10293,8 -10192,6 +10293,8 @@@ NETWORKING [TLS
  M:    Boris Pismenny <borisp@mellanox.com>
  M:    Aviad Yehezkel <aviadye@mellanox.com>
  M:    Dave Watson <davejwatson@fb.com>
 +M:    John Fastabend <john.fastabend@gmail.com>
 +M:    Daniel Borkmann <daniel@iogearbox.net>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    net/tls/*
@@@ -11052,7 -10949,7 +11052,7 @@@ M:   Willy Tarreau <willy@haproxy.com
  M:    Ksenija Stanojevic <ksenija.stanojevic@gmail.com>
  S:    Odd Fixes
  F:    Documentation/auxdisplay/lcd-panel-cgram.txt
 -F:    drivers/misc/panel.c
 +F:    drivers/auxdisplay/panel.c
  
  PARALLEL PORT SUBSYSTEM
  M:    Sudip Mukherjee <sudipm.mukherjee@gmail.com>
@@@ -11240,13 -11137,6 +11240,13 @@@ F: include/uapi/linux/switchtec_ioctl.
  F:    include/linux/switchtec.h
  F:    drivers/ntb/hw/mscc/
  
 +PCI DRIVER FOR MOBIVEIL PCIE IP
 +M:    Subrahmanya Lingappa <l.subrahmanya@mobiveil.co.in>
 +L:    linux-pci@vger.kernel.org
 +S:    Supported
 +F:    Documentation/devicetree/bindings/pci/mobiveil-pcie.txt
 +F:    drivers/pci/controller/pcie-mobiveil.c
 +
  PCI DRIVER FOR MVEBU (Marvell Armada 370 and Armada XP SOC support)
  M:    Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
  M:    Jason Cooper <jason@lakedaemon.net>
@@@ -11299,7 -11189,7 +11299,7 @@@ M:   Murali Karicheri <m-karicheri2@ti.co
  L:    linux-pci@vger.kernel.org
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
 -F:    drivers/pci/controller/dwc/*keystone*
 +F:    drivers/pci/controller/dwc/pci-keystone.c
  
  PCI ENDPOINT SUBSYSTEM
  M:    Kishon Vijay Abraham I <kishon@ti.com>
@@@ -11313,14 -11203,8 +11313,14 @@@ F: tools/pci
  
  PCI ENHANCED ERROR HANDLING (EEH) FOR POWERPC
  M:    Russell Currey <ruscur@russell.cc>
 +M:    Sam Bobroff <sbobroff@linux.ibm.com>
 +M:    Oliver O'Halloran <oohall@gmail.com>
  L:    linuxppc-dev@lists.ozlabs.org
  S:    Supported
 +F:    Documentation/PCI/pci-error-recovery.txt
 +F:    drivers/pci/pcie/aer.c
 +F:    drivers/pci/pcie/dpc.c
 +F:    drivers/pci/pcie/err.c
  F:    Documentation/powerpc/eeh-pci-error-recovery.txt
  F:    arch/powerpc/kernel/eeh*.c
  F:    arch/powerpc/platforms/*/eeh*.c
@@@ -11599,12 -11483,15 +11599,12 @@@ S:        Maintaine
  F:    drivers/pinctrl/intel/
  
  PIN CONTROLLER - MEDIATEK
 -M:    Sean Wang <sean.wang@mediatek.com>
 +M:    Sean Wang <sean.wang@kernel.org>
  L:    linux-mediatek@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    Documentation/devicetree/bindings/pinctrl/pinctrl-mt65xx.txt
  F:    Documentation/devicetree/bindings/pinctrl/pinctrl-mt7622.txt
 -F:    drivers/pinctrl/mediatek/mtk-eint.*
 -F:    drivers/pinctrl/mediatek/pinctrl-mtk-common.*
 -F:    drivers/pinctrl/mediatek/pinctrl-mt2701.c
 -F:    drivers/pinctrl/mediatek/pinctrl-mt7622.c
 +F:    drivers/pinctrl/mediatek/
  
  PIN CONTROLLER - QUALCOMM
  M:    Bjorn Andersson <bjorn.andersson@linaro.org>
@@@ -11682,26 -11569,7 +11682,26 @@@ W: http://hwmon.wiki.kernel.org
  W:    http://www.roeck-us.net/linux/drivers/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git
  S:    Maintained
 +F:    Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt
 +F:    Documentation/devicetree/bindings/hwmon/max31785.txt
 +F:    Documentation/devicetree/bindings/hwmon/ltc2978.txt
 +F:    Documentation/hwmon/adm1275
 +F:    Documentation/hwmon/ibm-cffps
 +F:    Documentation/hwmon/ir35221
 +F:    Documentation/hwmon/lm25066
 +F:    Documentation/hwmon/ltc2978
 +F:    Documentation/hwmon/ltc3815
 +F:    Documentation/hwmon/max16064
 +F:    Documentation/hwmon/max20751
 +F:    Documentation/hwmon/max31785
 +F:    Documentation/hwmon/max34440
 +F:    Documentation/hwmon/max8688
  F:    Documentation/hwmon/pmbus
 +F:    Documentation/hwmon/pmbus-core
 +F:    Documentation/hwmon/tps40422
 +F:    Documentation/hwmon/ucd9000
 +F:    Documentation/hwmon/ucd9200
 +F:    Documentation/hwmon/zl6100
  F:    drivers/hwmon/pmbus/
  F:    include/linux/pmbus.h
  
@@@ -12105,7 -11973,7 +12105,7 @@@ F:   Documentation/scsi/LICENSE.qla4xx
  F:    drivers/scsi/qla4xxx/
  
  QLOGIC QLCNIC (1/10)Gb ETHERNET DRIVER
 -M:    Harish Patil <harish.patil@cavium.com>
 +M:    Shahed Shaikh <Shahed.Shaikh@cavium.com>
  M:    Manish Chopra <manish.chopra@cavium.com>
  M:    Dept-GELinuxNICDev@cavium.com
  L:    netdev@vger.kernel.org
@@@ -12113,6 -11981,7 +12113,6 @@@ S:   Supporte
  F:    drivers/net/ethernet/qlogic/qlcnic/
  
  QLOGIC QLGE 10Gb ETHERNET DRIVER
 -M:    Harish Patil <harish.patil@cavium.com>
  M:    Manish Chopra <manish.chopra@cavium.com>
  M:    Dept-GELinuxNICDev@cavium.com
  L:    netdev@vger.kernel.org
@@@ -12800,6 -12669,18 +12800,18 @@@ W: http://www.ibm.com/developerworks/li
  S:    Supported
  F:    drivers/s390/crypto/
  
+ S390 VFIO AP DRIVER
+ M:    Tony Krowiak <akrowiak@linux.ibm.com>
+ M:    Pierre Morel <pmorel@linux.ibm.com>
+ M:    Halil Pasic <pasic@linux.ibm.com>
+ L:    linux-s390@vger.kernel.org
+ W:    http://www.ibm.com/developerworks/linux/linux390/
+ S:    Supported
+ F:    drivers/s390/crypto/vfio_ap_drv.c
+ F:    drivers/s390/crypto/vfio_ap_private.h
+ F:    drivers/s390/crypto/vfio_ap_ops.c
+ F:    Documentation/s390/vfio-ap.txt
+ 
  S390 ZFCP DRIVER
  M:    Steffen Maier <maier@linux.ibm.com>
  M:    Benjamin Block <bblock@linux.ibm.com>
@@@ -13188,7 -13069,7 +13200,7 @@@ SELINUX SECURITY MODUL
  M:    Paul Moore <paul@paul-moore.com>
  M:    Stephen Smalley <sds@tycho.nsa.gov>
  M:    Eric Paris <eparis@parisplace.org>
 -L:    selinux@tycho.nsa.gov (moderated for non-subscribers)
 +L:    selinux@vger.kernel.org
  W:    https://selinuxproject.org
  W:    https://github.com/SELinuxProject
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git
@@@ -13432,7 -13313,6 +13444,7 @@@ M:   Uwe Kleine-König <u.kleine-koenig@p
  R:    Pengutronix Kernel Team <kernel@pengutronix.de>
  S:    Supported
  F:    drivers/siox/*
 +F:    drivers/gpio/gpio-siox.c
  F:    include/trace/events/siox.h
  
  SIS 190 ETHERNET DRIVER
@@@ -13616,8 -13496,8 +13628,8 @@@ L:   linux-arm-kernel@lists.infradead.or
  S:    Maintained
  F:    Documentation/devicetree/bindings/arm/firmware/sdei.txt
  F:    drivers/firmware/arm_sdei.c
 -F:    include/linux/sdei.h
 -F:    include/uapi/linux/sdei.h
 +F:    include/linux/arm_sdei.h
 +F:    include/uapi/linux/arm_sdei.h
  
  SOFTWARE RAID (Multiple Disks) SUPPORT
  M:    Shaohua Li <shli@kernel.org>
@@@ -14160,12 -14040,6 +14172,12 @@@ S: Supporte
  F:    drivers/reset/reset-axs10x.c
  F:    Documentation/devicetree/bindings/reset/snps,axs10x-reset.txt
  
 +SYNOPSYS CREG GPIO DRIVER
 +M:    Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
 +S:    Maintained
 +F:    drivers/gpio/gpio-creg-snps.c
 +F:    Documentation/devicetree/bindings/gpio/snps,creg-gpio.txt
 +
  SYNOPSYS DESIGNWARE 8250 UART DRIVER
  R:    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
  S:    Maintained
@@@ -14752,13 -14626,6 +14764,13 @@@ L: netdev@vger.kernel.or
  S:    Maintained
  F:    drivers/net/ethernet/ti/netcp*
  
 +TI PCM3060 ASoC CODEC DRIVER
 +M:    Kirill Marinushkin <kmarinushkin@birdec.tech>
 +L:    alsa-devel@alsa-project.org (moderated for non-subscribers)
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/sound/pcm3060.txt
 +F:    sound/soc/codecs/pcm3060*
 +
  TI TAS571X FAMILY ASoC CODEC DRIVER
  M:    Kevin Cernekee <cernekee@chromium.org>
  L:    alsa-devel@alsa-project.org (moderated for non-subscribers)
@@@ -15534,7 -15401,7 +15546,7 @@@ S:   Maintaine
  UVESAFB DRIVER
  M:    Michal Januszewski <spock@gentoo.org>
  L:    linux-fbdev@vger.kernel.org
 -W:    http://dev.gentoo.org/~spock/projects/uvesafb/
 +W:    https://github.com/mjanusz/v86d
  S:    Maintained
  F:    Documentation/fb/uvesafb.txt
  F:    drivers/video/fbdev/uvesafb.*
@@@ -15847,7 -15714,7 +15859,7 @@@ F:   include/linux/regulator
  
  VRF
  M:    David Ahern <dsa@cumulusnetworks.com>
 -M:    Shrijeet Mukherjee <shm@cumulusnetworks.com>
 +M:    Shrijeet Mukherjee <shrijeet@gmail.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/vrf.c
index 2d43dca29c722c716c8375e7bf5bff58ba523376,c3f1f9b304b7d4f0a7c6b809c6096311922696b0..b95f8d0d9f1783ccd931fccd52662baf6b379c93
   * space.
   */
  #define KVM_PHYS_SHIFT        (40)
- #define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT)
- #define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL))
  #define PTRS_PER_S2_PGD       (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30))
  
  /* Virtualization Translation Control Register (VTCR) bits */
  #else
  #define VTTBR_X               (5 - KVM_T0SZ)
  #endif
 +#define VTTBR_CNP_BIT     _AC(1, UL)
  #define VTTBR_BADDR_MASK  (((_AC(1, ULL) << (40 - VTTBR_X)) - 1) << VTTBR_X)
  #define VTTBR_VMID_SHIFT  _AC(48, ULL)
  #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
index 847f01fa429dd4a90cf77249bff97266a8c5dd52,5ad1a54f98dcacbf5534617993af3ff875763e6c..1098ffc3d54b358d19c2e69b8df9ae9eb5fba031
                addr;                                                   \
        })
  
- /*
-  * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
-  */
- #define KVM_MMU_CACHE_MIN_PAGES       2
  #ifndef __ASSEMBLY__
  
  #include <linux/highmem.h>
  #include <asm/cacheflush.h>
  #include <asm/cputype.h>
+ #include <asm/kvm_arm.h>
  #include <asm/kvm_hyp.h>
  #include <asm/pgalloc.h>
  #include <asm/stage2_pgtable.h>
  /* Ensure compatibility with arm64 */
  #define VA_BITS                       32
  
+ #define kvm_phys_shift(kvm)           KVM_PHYS_SHIFT
+ #define kvm_phys_size(kvm)            (1ULL << kvm_phys_shift(kvm))
+ #define kvm_phys_mask(kvm)            (kvm_phys_size(kvm) - 1ULL)
+ #define kvm_vttbr_baddr_mask(kvm)     VTTBR_BADDR_MASK
+ #define stage2_pgd_size(kvm)          (PTRS_PER_S2_PGD * sizeof(pgd_t))
  int create_hyp_mappings(void *from, void *to, pgprot_t prot);
  int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
                           void __iomem **kaddr,
@@@ -355,11 -358,8 +358,13 @@@ static inline int hyp_map_aux_data(void
  
  #define kvm_phys_to_vttbr(addr)               (addr)
  
+ static inline void kvm_set_ipa_limit(void) {}
 +static inline bool kvm_cpu_has_cnp(void)
 +{
 +      return false;
 +}
 +
  #endif        /* !__ASSEMBLY__ */
  
  #endif /* __ARM_KVM_MMU_H__ */
index 6db48d90ad63ddb3cf347ee0f757b889bf4dfe92,072cc1c970c22b6b4f79adb83a5c6928ca7c4c00..7e2ec64aa414aad5c774a42f9d9c748122d56b20
@@@ -262,7 -262,7 +262,7 @@@ extern struct arm64_ftr_reg arm64_ftr_r
  /*
   * CPU feature detected at boot time based on system-wide value of a
   * feature. It is safe for a late CPU to have this feature even though
 - * the system hasn't enabled it, although the featuer will not be used
 + * the system hasn't enabled it, although the feature will not be used
   * by Linux in this case. If the system has enabled this feature already,
   * then every late CPU must have it.
   */
@@@ -508,12 -508,6 +508,12 @@@ static inline bool system_supports_sve(
                cpus_have_const_cap(ARM64_SVE);
  }
  
 +static inline bool system_supports_cnp(void)
 +{
 +      return IS_ENABLED(CONFIG_ARM64_CNP) &&
 +              cpus_have_const_cap(ARM64_HAS_CNP);
 +}
 +
  #define ARM64_SSBD_UNKNOWN            -1
  #define ARM64_SSBD_FORCE_DISABLE      0
  #define ARM64_SSBD_KERNEL             1
@@@ -536,7 -530,26 +536,28 @@@ void arm64_set_ssbd_mitigation(bool sta
  static inline void arm64_set_ssbd_mitigation(bool state) {}
  #endif
  
 +extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
++
+ static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
+ {
+       switch (parange) {
+       case 0: return 32;
+       case 1: return 36;
+       case 2: return 40;
+       case 3: return 42;
+       case 4: return 44;
+       case 5: return 48;
+       case 6: return 52;
+       /*
+        * A future PE could use a value unknown to the kernel.
+        * However, by the "D10.1.4 Principles of the ID scheme
+        * for fields in ID registers", ARM DDI 0487C.a, any new
+        * value is guaranteed to be higher than what we know already.
+        * As a safe limit, we return the limit supported by the kernel.
+        */
+       default: return CONFIG_ARM64_PA_BITS;
+       }
+ }
  #endif /* __ASSEMBLY__ */
  
  #endif
index b476bc46f0abe2f2bf61002fcd4aa07e5ce697b4,6e324d1f12316600e0d433191162b745e42ba448..6f602af5263c20b6d1ded0cd4232c4b8b840b95f
  #define VTCR_EL2_RES1         (1 << 31)
  #define VTCR_EL2_HD           (1 << 22)
  #define VTCR_EL2_HA           (1 << 21)
+ #define VTCR_EL2_PS_SHIFT     TCR_EL2_PS_SHIFT
  #define VTCR_EL2_PS_MASK      TCR_EL2_PS_MASK
  #define VTCR_EL2_TG0_MASK     TCR_TG0_MASK
  #define VTCR_EL2_TG0_4K               TCR_TG0_4K
  #define VTCR_EL2_IRGN0_WBWA   TCR_IRGN0_WBWA
  #define VTCR_EL2_SL0_SHIFT    6
  #define VTCR_EL2_SL0_MASK     (3 << VTCR_EL2_SL0_SHIFT)
- #define VTCR_EL2_SL0_LVL1     (1 << VTCR_EL2_SL0_SHIFT)
  #define VTCR_EL2_T0SZ_MASK    0x3f
- #define VTCR_EL2_T0SZ_40B     24
  #define VTCR_EL2_VS_SHIFT     19
  #define VTCR_EL2_VS_8BIT      (0 << VTCR_EL2_VS_SHIFT)
  #define VTCR_EL2_VS_16BIT     (1 << VTCR_EL2_VS_SHIFT)
  
+ #define VTCR_EL2_T0SZ(x)      TCR_T0SZ(x)
  /*
   * We configure the Stage-2 page tables to always restrict the IPA space to be
   * 40 bits wide (T0SZ = 24).  Systems with a PARange smaller than 40 bits are
   * not known to exist and will break with this configuration.
   *
-  * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time
-  * (see hyp-init.S).
+  * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
   *
   * Note that when using 4K pages, we concatenate two first level page tables
   * together. With 16K pages, we concatenate 16 first level page tables.
   *
-  * The magic numbers used for VTTBR_X in this patch can be found in Tables
-  * D4-23 and D4-25 in ARM DDI 0487A.b.
   */
  
- #define VTCR_EL2_T0SZ_IPA     VTCR_EL2_T0SZ_40B
  #define VTCR_EL2_COMMON_BITS  (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
                                 VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
  
- #ifdef CONFIG_ARM64_64K_PAGES
  /*
-  * Stage2 translation configuration:
-  * 64kB pages (TG0 = 1)
-  * 2 level page tables (SL = 1)
+  * VTCR_EL2:SL0 indicates the entry level for Stage2 translation.
+  * Interestingly, it depends on the page size.
+  * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a
+  *
+  *    -----------------------------------------
+  *    | Entry level           |  4K  | 16K/64K |
+  *    ------------------------------------------
+  *    | Level: 0              |  2   |   -     |
+  *    ------------------------------------------
+  *    | Level: 1              |  1   |   2     |
+  *    ------------------------------------------
+  *    | Level: 2              |  0   |   1     |
+  *    ------------------------------------------
+  *    | Level: 3              |  -   |   0     |
+  *    ------------------------------------------
+  *
+  * The table roughly translates to :
+  *
+  *    SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level
+  *
+  * Where TGRAN_SL0_BASE is a magic number depending on the page size:
+  *    TGRAN_SL0_BASE(4K) = 2
+  *    TGRAN_SL0_BASE(16K) = 3
+  *    TGRAN_SL0_BASE(64K) = 3
+  * provided we take care of ruling out the unsupported cases and
+  * Entry_Level = 4 - Number_of_levels.
+  *
   */
- #define VTCR_EL2_TGRAN_FLAGS          (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
- #define VTTBR_X_TGRAN_MAGIC           38
+ #ifdef CONFIG_ARM64_64K_PAGES
+ #define VTCR_EL2_TGRAN                        VTCR_EL2_TG0_64K
+ #define VTCR_EL2_TGRAN_SL0_BASE               3UL
  #elif defined(CONFIG_ARM64_16K_PAGES)
- /*
-  * Stage2 translation configuration:
-  * 16kB pages (TG0 = 2)
-  * 2 level page tables (SL = 1)
-  */
- #define VTCR_EL2_TGRAN_FLAGS          (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
- #define VTTBR_X_TGRAN_MAGIC           42
+ #define VTCR_EL2_TGRAN                        VTCR_EL2_TG0_16K
+ #define VTCR_EL2_TGRAN_SL0_BASE               3UL
  #else /* 4K */
- /*
-  * Stage2 translation configuration:
-  * 4kB pages (TG0 = 0)
-  * 3 level page tables (SL = 1)
-  */
- #define VTCR_EL2_TGRAN_FLAGS          (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
- #define VTTBR_X_TGRAN_MAGIC           37
+ #define VTCR_EL2_TGRAN                        VTCR_EL2_TG0_4K
+ #define VTCR_EL2_TGRAN_SL0_BASE               2UL
  #endif
  
- #define VTCR_EL2_FLAGS                        (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
- #define VTTBR_X                               (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
+ #define VTCR_EL2_LVLS_TO_SL0(levels)  \
+       ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT)
+ #define VTCR_EL2_SL0_TO_LVLS(sl0)     \
+       ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE)
+ #define VTCR_EL2_LVLS(vtcr)           \
+       VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT)
+ #define VTCR_EL2_FLAGS                        (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN)
+ #define VTCR_EL2_IPA(vtcr)            (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK))
+ /*
+  * ARM VMSAv8-64 defines an algorithm for finding the translation table
+  * descriptors in section D4.2.8 in ARM DDI 0487C.a.
+  *
+  * The algorithm defines the expectations on the translation table
+  * addresses for each level, based on PAGE_SIZE, entry level
+  * and the translation table size (T0SZ). The variable "x" in the
+  * algorithm determines the alignment of a table base address at a given
+  * level and thus determines the alignment of VTTBR:BADDR for stage2
+  * page table entry level.
+  * Since the number of bits resolved at the entry level could vary
+  * depending on the T0SZ, the value of "x" is defined based on a
+  * Magic constant for a given PAGE_SIZE and Entry Level. The
+  * intermediate levels must be always aligned to the PAGE_SIZE (i.e,
+  * x = PAGE_SHIFT).
+  *
+  * The value of "x" for entry level is calculated as :
+  *    x = Magic_N - T0SZ
+  *
+  * where Magic_N is an integer depending on the page size and the entry
+  * level of the page table as below:
+  *
+  *    --------------------------------------------
+  *    | Entry level           |  4K    16K   64K |
+  *    --------------------------------------------
+  *    | Level: 0 (4 levels)   | 28   |  -  |  -  |
+  *    --------------------------------------------
+  *    | Level: 1 (3 levels)   | 37   | 31  | 25  |
+  *    --------------------------------------------
+  *    | Level: 2 (2 levels)   | 46   | 42  | 38  |
+  *    --------------------------------------------
+  *    | Level: 3 (1 level)    | -    | 53  | 51  |
+  *    --------------------------------------------
+  *
+  * We have a magic formula for the Magic_N below:
+  *
+  *  Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels)
+  *
+  * where Number_of_levels = (4 - Level). We are only interested in the
+  * value for Entry_Level for the stage2 page table.
+  *
+  * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows:
+  *
+  *    x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT)
+  *      = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels)
+  *
+  * Here is one way to explain the Magic Formula:
+  *
+  *  x = log2(Size_of_Entry_Level_Table)
+  *
+  * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another
+  * PAGE_SHIFT bits in the PTE, we have :
+  *
+  *  Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT)
+  *                 = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3
+  *  where n = number of levels, and since each pointer is 8bytes, we have:
+  *
+  *  x = Bits_Entry_Level + 3
+  *    = IPA_SHIFT - (PAGE_SHIFT - 3) * n
+  *
+  * The only constraint here is that, we have to find the number of page table
+  * levels for a given IPA size (which we do, see stage2_pt_levels())
+  */
+ #define ARM64_VTTBR_X(ipa, levels)    ((ipa) - ((levels) * (PAGE_SHIFT - 3)))
  
- #define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X)
 +#define VTTBR_CNP_BIT     (UL(1))
  #define VTTBR_VMID_SHIFT  (UL(48))
  #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
  
  
  /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
  #define HPFAR_MASK    (~UL(0xf))
+ /*
+  * We have
+  *    PAR     [PA_Shift - 1   : 12] = PA      [PA_Shift - 1 : 12]
+  *    HPFAR   [PA_Shift - 9   : 4]  = FIPA    [PA_Shift - 1 : 12]
+  */
+ #define PAR_TO_HPFAR(par)             \
+       (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
  
  #define kvm_arm_exception_type        \
        {0, "IRQ" },            \
index 2842bf149029becc92b20383d5c0894f465b2dc4,f84052f306af89fed02dc779e4512658769ffe95..52fbc823ff8c7f52dcd924fe94bb0be603150e1f
@@@ -53,7 -53,7 +53,7 @@@ DECLARE_STATIC_KEY_FALSE(userspace_irqc
  
  int __attribute_const__ kvm_target_cpu(void);
  int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
- int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
+ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
  void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
  
  struct kvm_arch {
        u64    vmid_gen;
        u32    vmid;
  
-       /* 1-level 2nd stage table, protected by kvm->mmu_lock */
+       /* stage2 entry level table */
        pgd_t *pgd;
  
        /* VTTBR value associated with above pgd and vmid */
        u64    vttbr;
+       /* VTCR_EL2 value for this VM */
+       u64    vtcr;
  
        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;
@@@ -387,8 -389,6 +389,8 @@@ struct kvm_vcpu *kvm_mpidr_to_vcpu(stru
  
  DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state);
  
 +void __kvm_enable_ssbs(void);
 +
  static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
                                       unsigned long hyp_stack_ptr,
                                       unsigned long vector_ptr)
         */
        BUG_ON(!static_branch_likely(&arm64_const_caps_ready));
        __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
 +
 +      /*
 +       * Disabling SSBD on a non-VHE system requires us to enable SSBS
 +       * at EL2.
 +       */
 +      if (!has_vhe() && this_cpu_has_cap(ARM64_SSBS) &&
 +          arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
 +              kvm_call_hyp(__kvm_enable_ssbs);
 +      }
  }
  
  static inline bool kvm_arch_check_sve_has_vhe(void)
@@@ -451,13 -442,7 +453,7 @@@ int kvm_arm_vcpu_arch_get_attr(struct k
  int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
                               struct kvm_device_attr *attr);
  
- static inline void __cpu_init_stage2(void)
- {
-       u32 parange = kvm_call_hyp(__init_stage2_translation);
-       WARN_ONCE(parange < 40,
-                 "PARange is %d bits, unsupported configuration!", parange);
- }
+ static inline void __cpu_init_stage2(void) {}
  
  /* Guest/host FPSIMD coordination helpers */
  int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
@@@ -520,8 -505,12 +516,12 @@@ static inline int kvm_arm_have_ssbd(voi
  void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
  void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
  
+ void kvm_set_ipa_limit(void);
  #define __KVM_HAVE_ARCH_VM_ALLOC
  struct kvm *kvm_arch_alloc_vm(void);
  void kvm_arch_free_vm(struct kvm *kvm);
  
+ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
  #endif /* __ARM64_KVM_HOST_H__ */
index 64337afbf124b44b39c429f34af1aabeeeb37720,77b1af9e64db1436a5d35ee026bed5f554c27fd0..658657367f2ff8d7b9d2349db6a114065da264c3
@@@ -141,8 -141,16 +141,16 @@@ static inline unsigned long __kern_hyp_
   * We currently only support a 40bit IPA.
   */
  #define KVM_PHYS_SHIFT        (40)
- #define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT)
- #define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL)
+ #define kvm_phys_shift(kvm)           VTCR_EL2_IPA(kvm->arch.vtcr)
+ #define kvm_phys_size(kvm)            (_AC(1, ULL) << kvm_phys_shift(kvm))
+ #define kvm_phys_mask(kvm)            (kvm_phys_size(kvm) - _AC(1, ULL))
+ static inline bool kvm_page_empty(void *ptr)
+ {
+       struct page *ptr_page = virt_to_page(ptr);
+       return page_count(ptr_page) == 1;
+ }
  
  #include <asm/stage2_pgtable.h>
  
@@@ -238,12 -246,6 +246,6 @@@ static inline bool kvm_s2pmd_exec(pmd_
        return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
  }
  
- static inline bool kvm_page_empty(void *ptr)
- {
-       struct page *ptr_page = virt_to_page(ptr);
-       return page_count(ptr_page) == 1;
- }
  #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
  
  #ifdef __PAGETABLE_PMD_FOLDED
@@@ -517,10 -519,29 +519,34 @@@ static inline int hyp_map_aux_data(void
  
  #define kvm_phys_to_vttbr(addr)               phys_to_ttbr(addr)
  
+ /*
+  * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
+  * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
+  * 52bit IPS.
+  */
+ static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
+ {
+       int x = ARM64_VTTBR_X(ipa_shift, levels);
+       return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
+ }
+ static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
+ {
+       unsigned int x = arm64_vttbr_x(ipa_shift, levels);
+       return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
+ }
+ static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
+ {
+       return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
+ }
 +static inline bool kvm_cpu_has_cnp(void)
 +{
 +      return system_supports_cnp();
 +}
 +
  #endif /* __ASSEMBLY__ */
  #endif /* __ARM64_KVM_MMU_H__ */
index 6bc43889d11e46cbfb79882da231eb9728e6b7e7,ff35ac1258eb7b495c900e52f7da9a49993a2a1d..fce22c4b2f7300ce2d22721abbfea3c14f3b9dfd
@@@ -25,6 -25,9 +25,9 @@@
  #define CurrentEL_EL1         (1 << 2)
  #define CurrentEL_EL2         (2 << 2)
  
+ /* Additional SPSR bits not exposed in the UABI */
+ #define PSR_IL_BIT            (1 << 20)
  /* AArch32-specific ptrace requests */
  #define COMPAT_PTRACE_GETREGS         12
  #define COMPAT_PTRACE_SETREGS         13
@@@ -50,7 -53,6 +53,7 @@@
  #define PSR_AA32_I_BIT                0x00000080
  #define PSR_AA32_A_BIT                0x00000100
  #define PSR_AA32_E_BIT                0x00000200
 +#define PSR_AA32_SSBS_BIT     0x00800000
  #define PSR_AA32_DIT_BIT      0x01000000
  #define PSR_AA32_Q_BIT                0x08000000
  #define PSR_AA32_V_BIT                0x10000000
diff --combined arch/arm64/kvm/guest.c
index a6c9fbaeaefcdd71d0ea70c8eeb89c55692f8b66,a74f84d09412884380855fec5973ed811661ee21..dd436a50fce7b13fa1c1af79ce15495323ca142d
@@@ -57,45 -57,6 +57,45 @@@ static u64 core_reg_offset_from_id(u64 
        return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE);
  }
  
 +static int validate_core_offset(const struct kvm_one_reg *reg)
 +{
 +      u64 off = core_reg_offset_from_id(reg->id);
 +      int size;
 +
 +      switch (off) {
 +      case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
 +           KVM_REG_ARM_CORE_REG(regs.regs[30]):
 +      case KVM_REG_ARM_CORE_REG(regs.sp):
 +      case KVM_REG_ARM_CORE_REG(regs.pc):
 +      case KVM_REG_ARM_CORE_REG(regs.pstate):
 +      case KVM_REG_ARM_CORE_REG(sp_el1):
 +      case KVM_REG_ARM_CORE_REG(elr_el1):
 +      case KVM_REG_ARM_CORE_REG(spsr[0]) ...
 +           KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]):
 +              size = sizeof(__u64);
 +              break;
 +
 +      case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
 +           KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
 +              size = sizeof(__uint128_t);
 +              break;
 +
 +      case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
 +      case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
 +              size = sizeof(__u32);
 +              break;
 +
 +      default:
 +              return -EINVAL;
 +      }
 +
 +      if (KVM_REG_SIZE(reg->id) == size &&
 +          IS_ALIGNED(off, size / sizeof(__u32)))
 +              return 0;
 +
 +      return -EINVAL;
 +}
 +
  static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
  {
        /*
            (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
                return -ENOENT;
  
 +      if (validate_core_offset(reg))
 +              return -EINVAL;
 +
        if (copy_to_user(uaddr, ((u32 *)regs) + off, KVM_REG_SIZE(reg->id)))
                return -EFAULT;
  
@@@ -140,9 -98,6 +140,9 @@@ static int set_core_reg(struct kvm_vcp
            (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
                return -ENOENT;
  
 +      if (validate_core_offset(reg))
 +              return -EINVAL;
 +
        if (KVM_REG_SIZE(reg->id) > sizeof(tmp))
                return -EINVAL;
  
        }
  
        if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) {
 -              u32 mode = (*(u32 *)valp) & PSR_AA32_MODE_MASK;
 +              u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK;
                switch (mode) {
                case PSR_AA32_MODE_USR:
 +                      if (!system_supports_32bit_el0())
 +                              return -EINVAL;
 +                      break;
                case PSR_AA32_MODE_FIQ:
                case PSR_AA32_MODE_IRQ:
                case PSR_AA32_MODE_SVC:
                case PSR_AA32_MODE_ABT:
                case PSR_AA32_MODE_UND:
 +                      if (!vcpu_el1_is_32bit(vcpu))
 +                              return -EINVAL;
 +                      break;
                case PSR_MODE_EL0t:
                case PSR_MODE_EL1t:
                case PSR_MODE_EL1h:
 +                      if (vcpu_el1_is_32bit(vcpu))
 +                              return -EINVAL;
                        break;
                default:
                        err = -EINVAL;
@@@ -391,15 -338,15 +391,15 @@@ int __attribute_const__ kvm_target_cpu(
                        return KVM_ARM_TARGET_CORTEX_A53;
                case ARM_CPU_PART_CORTEX_A57:
                        return KVM_ARM_TARGET_CORTEX_A57;
-               };
+               }
                break;
        case ARM_CPU_IMP_APM:
                switch (part_number) {
                case APM_CPU_PART_POTENZA:
                        return KVM_ARM_TARGET_XGENE_POTENZA;
-               };
+               }
                break;
-       };
+       }
  
        /* Return a default generic target */
        return KVM_ARM_TARGET_GENERIC_V8;
index 76d016b446b203a721a7f02f23a73bbbb53c1cf5,8dc28531820414061040c657e6066815d248e589..68d6f7c3b237dc1713fa36bc55d4a27fef2e5949
@@@ -152,8 -152,25 +152,25 @@@ static void __hyp_text __sysreg_restore
  static void __hyp_text
  __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
  {
+       u64 pstate = ctxt->gp_regs.regs.pstate;
+       u64 mode = pstate & PSR_AA32_MODE_MASK;
+       /*
+        * Safety check to ensure we're setting the CPU up to enter the guest
+        * in a less privileged mode.
+        *
+        * If we are attempting a return to EL2 or higher in AArch64 state,
+        * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that
+        * we'll take an illegal exception state exception immediately after
+        * the ERET to the guest.  Attempts to return to AArch32 Hyp will
+        * result in an illegal exception return because EL2's execution state
+        * is determined by SCR_EL3.RW.
+        */
+       if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t)
+               pstate = PSR_MODE_EL2h | PSR_IL_BIT;
        write_sysreg_el2(ctxt->gp_regs.regs.pc,         elr);
-       write_sysreg_el2(ctxt->gp_regs.regs.pstate,     spsr);
+       write_sysreg_el2(pstate,                        spsr);
  
        if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
                write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
@@@ -288,14 -305,3 +305,14 @@@ void kvm_vcpu_put_sysregs(struct kvm_vc
  
        vcpu->arch.sysregs_loaded_on_cpu = false;
  }
 +
 +void __hyp_text __kvm_enable_ssbs(void)
 +{
 +      u64 tmp;
 +
 +      asm volatile(
 +      "mrs    %0, sctlr_el2\n"
 +      "orr    %0, %0, %1\n"
 +      "msr    sctlr_el2, %0"
 +      : "=&r" (tmp) : "L" (SCTLR_ELx_DSSBS));
 +}
index 041a115789a14f6f1892cec5d76c6f95ea9abff8,d0abcbbdc70024bb4dc9c43a5ba7e350fba2b376..d68b9ef383286c35d6c8b75b35072615d5486158
@@@ -387,12 -387,12 +387,12 @@@ int main(void
        OFFSET(CFG_SYSCALL_MAP64, vdso_data, syscall_map_64);
        OFFSET(TVAL64_TV_SEC, timeval, tv_sec);
        OFFSET(TVAL64_TV_USEC, timeval, tv_usec);
 -      OFFSET(TVAL32_TV_SEC, compat_timeval, tv_sec);
 -      OFFSET(TVAL32_TV_USEC, compat_timeval, tv_usec);
 +      OFFSET(TVAL32_TV_SEC, old_timeval32, tv_sec);
 +      OFFSET(TVAL32_TV_USEC, old_timeval32, tv_usec);
        OFFSET(TSPC64_TV_SEC, timespec, tv_sec);
        OFFSET(TSPC64_TV_NSEC, timespec, tv_nsec);
 -      OFFSET(TSPC32_TV_SEC, compat_timespec, tv_sec);
 -      OFFSET(TSPC32_TV_NSEC, compat_timespec, tv_nsec);
 +      OFFSET(TSPC32_TV_SEC, old_timespec32, tv_sec);
 +      OFFSET(TSPC32_TV_NSEC, old_timespec32, tv_nsec);
  #else
        OFFSET(TVAL32_TV_SEC, timeval, tv_sec);
        OFFSET(TVAL32_TV_USEC, timeval, tv_usec);
  #ifdef CONFIG_PPC_BOOK3S
        OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
  #endif
-       OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+       OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
        OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
        OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
        OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
        OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
+       OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
        OFFSET(VCPU_CPU, kvm_vcpu, cpu);
        OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
  #endif
  #endif /* CONFIG_PPC_BOOK3S_64 */
  
  #else /* CONFIG_PPC_BOOK3S */
-       OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+       OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
        OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
        OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
        OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
index 998f8d089ac7ea840341f0a741df3989d30542bc,43b21e88c71667b4133ebb88587d1a199d4bb15b..d68162ee159bb8ed328732916ab9ce6cd0ee94b2
@@@ -10,6 -10,9 +10,9 @@@
  #include <linux/string.h>
  #include <linux/kvm.h>
  #include <linux/kvm_host.h>
+ #include <linux/anon_inodes.h>
+ #include <linux/file.h>
+ #include <linux/debugfs.h>
  
  #include <asm/kvm_ppc.h>
  #include <asm/kvm_book3s.h>
   */
  static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
  
- int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                          struct kvmppc_pte *gpte, bool data, bool iswrite)
+ int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+                              struct kvmppc_pte *gpte, u64 root,
+                              u64 *pte_ret_p)
  {
        struct kvm *kvm = vcpu->kvm;
-       u32 pid;
        int ret, level, ps;
-       __be64 prte, rpte;
-       unsigned long ptbl;
-       unsigned long root, pte, index;
-       unsigned long rts, bits, offset;
-       unsigned long gpa;
-       unsigned long proc_tbl_size;
-       /* Work out effective PID */
-       switch (eaddr >> 62) {
-       case 0:
-               pid = vcpu->arch.pid;
-               break;
-       case 3:
-               pid = 0;
-               break;
-       default:
-               return -EINVAL;
-       }
-       proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
-       if (pid * 16 >= proc_tbl_size)
-               return -EINVAL;
-       /* Read partition table to find root of tree for effective PID */
-       ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
-       ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
-       if (ret)
-               return ret;
+       unsigned long rts, bits, offset, index;
+       u64 pte, base, gpa;
+       __be64 rpte;
  
-       root = be64_to_cpu(prte);
        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
                ((root & RTS2_MASK) >> RTS2_SHIFT);
        bits = root & RPDS_MASK;
-       root = root & RPDB_MASK;
+       base = root & RPDB_MASK;
  
        offset = rts + 31;
  
-       /* current implementations only support 52-bit space */
+       /* Current implementations only support 52-bit space */
        if (offset != 52)
                return -EINVAL;
  
+       /* Walk each level of the radix tree */
        for (level = 3; level >= 0; --level) {
+               u64 addr;
+               /* Check for a valid size */
                if (level && bits != p9_supported_radix_bits[level])
                        return -EINVAL;
                if (level == 0 && !(bits == 5 || bits == 9))
                        return -EINVAL;
                offset -= bits;
                index = (eaddr >> offset) & ((1UL << bits) - 1);
-               /* check that low bits of page table base are zero */
-               if (root & ((1UL << (bits + 3)) - 1))
+               /* Check that low bits of page table base are zero */
+               if (base & ((1UL << (bits + 3)) - 1))
                        return -EINVAL;
-               ret = kvm_read_guest(kvm, root + index * 8,
-                                    &rpte, sizeof(rpte));
-               if (ret)
+               /* Read the entry from guest memory */
+               addr = base + (index * sizeof(rpte));
+               ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
+               if (ret) {
+                       if (pte_ret_p)
+                               *pte_ret_p = addr;
                        return ret;
+               }
                pte = __be64_to_cpu(rpte);
                if (!(pte & _PAGE_PRESENT))
                        return -ENOENT;
+               /* Check if a leaf entry */
                if (pte & _PAGE_PTE)
                        break;
-               bits = pte & 0x1f;
-               root = pte & 0x0fffffffffffff00ul;
+               /* Get ready to walk the next level */
+               base = pte & RPDB_MASK;
+               bits = pte & RPDS_MASK;
        }
-       /* need a leaf at lowest level; 512GB pages not supported */
+       /* Need a leaf at lowest level; 512GB pages not supported */
        if (level < 0 || level == 3)
                return -EINVAL;
  
-       /* offset is now log base 2 of the page size */
+       /* We found a valid leaf PTE */
+       /* Offset is now log base 2 of the page size */
        gpa = pte & 0x01fffffffffff000ul;
        if (gpa & ((1ul << offset) - 1))
                return -EINVAL;
-       gpa += eaddr & ((1ul << offset) - 1);
+       gpa |= eaddr & ((1ul << offset) - 1);
        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
                if (offset == mmu_psize_defs[ps].shift)
                        break;
        gpte->page_size = ps;
+       gpte->page_shift = offset;
  
        gpte->eaddr = eaddr;
        gpte->raddr = gpa;
        gpte->may_read = !!(pte & _PAGE_READ);
        gpte->may_write = !!(pte & _PAGE_WRITE);
        gpte->may_execute = !!(pte & _PAGE_EXEC);
+       gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
+       if (pte_ret_p)
+               *pte_ret_p = pte;
+       return 0;
+ }
+ /*
+  * Used to walk a partition or process table radix tree in guest memory
+  * Note: We exploit the fact that a partition table and a process
+  * table have the same layout, a partition-scoped page table and a
+  * process-scoped page table have the same layout, and the 2nd
+  * doubleword of a partition table entry has the same layout as
+  * the PTCR register.
+  */
+ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                    struct kvmppc_pte *gpte, u64 table,
+                                    int table_index, u64 *pte_ret_p)
+ {
+       struct kvm *kvm = vcpu->kvm;
+       int ret;
+       unsigned long size, ptbl, root;
+       struct prtb_entry entry;
+       if ((table & PRTS_MASK) > 24)
+               return -EINVAL;
+       size = 1ul << ((table & PRTS_MASK) + 12);
+       /* Is the table big enough to contain this entry? */
+       if ((table_index * sizeof(entry)) >= size)
+               return -EINVAL;
+       /* Read the table to find the root of the radix tree */
+       ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
+       ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
+       if (ret)
+               return ret;
+       /* Root is stored in the first doubleword */
+       root = be64_to_cpu(entry.prtb0);
+       return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
+ }
+ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+                          struct kvmppc_pte *gpte, bool data, bool iswrite)
+ {
+       u32 pid;
+       u64 pte;
+       int ret;
+       /* Work out effective PID */
+       switch (eaddr >> 62) {
+       case 0:
+               pid = vcpu->arch.pid;
+               break;
+       case 3:
+               pid = 0;
+               break;
+       default:
+               return -EINVAL;
+       }
+       ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
+                               vcpu->kvm->arch.process_table, pid, &pte);
+       if (ret)
+               return ret;
+       /* Check privilege (applies only to process scoped translations) */
        if (kvmppc_get_msr(vcpu) & MSR_PR) {
                if (pte & _PAGE_PRIVILEGED) {
                        gpte->may_read = 0;
  }
  
  static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
-                                   unsigned int pshift)
+                                   unsigned int pshift, unsigned int lpid)
  {
        unsigned long psize = PAGE_SIZE;
+       int psi;
+       long rc;
+       unsigned long rb;
  
        if (pshift)
                psize = 1UL << pshift;
+       else
+               pshift = PAGE_SHIFT;
  
        addr &= ~(psize - 1);
-       radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
+       if (!kvmhv_on_pseries()) {
+               radix__flush_tlb_lpid_page(lpid, addr, psize);
+               return;
+       }
+       psi = shift_to_mmu_psize(pshift);
+       rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
+       rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
+                               lpid, rb);
+       if (rc)
+               pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
  }
  
- static void kvmppc_radix_flush_pwc(struct kvm *kvm)
+ static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
  {
-       radix__flush_pwc_lpid(kvm->arch.lpid);
+       long rc;
+       if (!kvmhv_on_pseries()) {
+               radix__flush_pwc_lpid(lpid);
+               return;
+       }
+       rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
+                               lpid, TLBIEL_INVAL_SET_LPID);
+       if (rc)
+               pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
  }
  
  static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
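
To make the index arithmetic in kvmppc_mmu_walk_radix_tree() above concrete: with the 52-bit space and a 13/9/9/9 split, each level peels its radix bits off the top of the remaining offset. A standalone sketch; the split and the example address are assumed here, not read from a real partition-table entry:

#include <stdio.h>

int main(void)
{
	unsigned long eaddr = 0x00007fffdeadb000UL;	/* arbitrary example */
	int bits[4] = { 9, 9, 9, 13 };	/* per level, walked from 3 down to 0 */
	unsigned long offset = 52;	/* 52-bit address space */
	int level;

	for (level = 3; level >= 0; --level) {
		unsigned long index;

		offset -= bits[level];
		index = (eaddr >> offset) & ((1UL << bits[level]) - 1);
		printf("level %d: offset %lu index 0x%lx\n",
		       level, offset, index);
	}
	/* offset is now 12, i.e. log base 2 of the 4K page size */
	return 0;
}

A leaf found before level 0 stops the walk early with a larger offset (21 for 2M, 30 for 1G), and a 5-bit bottom level yields 64K pages; the real walker then maps that shift to an MMU page-size index via mmu_psize_defs.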
@@@ -195,23 -282,38 +282,38 @@@ static void kvmppc_pmd_free(pmd_t *pmdp
        kmem_cache_free(kvm_pmd_cache, pmdp);
  }
  
- static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
-                            unsigned long gpa, unsigned int shift)
+ /* Called with kvm->mmu_lock held */
+ void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+                     unsigned int shift, struct kvm_memory_slot *memslot,
+                     unsigned int lpid)
  
  {
-       unsigned long page_size = 1ul << shift;
        unsigned long old;
+       unsigned long gfn = gpa >> PAGE_SHIFT;
+       unsigned long page_size = PAGE_SIZE;
+       unsigned long hpa;
  
        old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
-       kvmppc_radix_tlbie_page(kvm, gpa, shift);
-       if (old & _PAGE_DIRTY) {
-               unsigned long gfn = gpa >> PAGE_SHIFT;
-               struct kvm_memory_slot *memslot;
+       kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
+       /* The following only applies to L1 entries */
+       if (lpid != kvm->arch.lpid)
+               return;
  
+       if (!memslot) {
                memslot = gfn_to_memslot(kvm, gfn);
-               if (memslot && memslot->dirty_bitmap)
-                       kvmppc_update_dirty_map(memslot, gfn, page_size);
+               if (!memslot)
+                       return;
        }
+       if (shift)
+               page_size = 1ul << shift;
+       gpa &= ~(page_size - 1);
+       hpa = old & PTE_RPN_MASK;
+       kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
+       if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
+               kvmppc_update_dirty_map(memslot, gfn, page_size);
  }
  
  /*
   * and emit a warning if encountered, but there may already be data
   * corruption due to the unexpected mappings.
   */
- static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
+ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
+                                 unsigned int lpid)
  {
        if (full) {
                memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
                        WARN_ON_ONCE(1);
                        kvmppc_unmap_pte(kvm, p,
                                         pte_pfn(*p) << PAGE_SHIFT,
-                                        PAGE_SHIFT);
+                                        PAGE_SHIFT, NULL, lpid);
                }
        }
  
        kvmppc_pte_free(pte);
  }
  
- static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
+ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
+                                 unsigned int lpid)
  {
        unsigned long im;
        pmd_t *p = pmd;
                                WARN_ON_ONCE(1);
                                kvmppc_unmap_pte(kvm, (pte_t *)p,
                                         pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
-                                        PMD_SHIFT);
+                                        PMD_SHIFT, NULL, lpid);
                        }
                } else {
                        pte_t *pte;
  
                        pte = pte_offset_map(p, 0);
-                       kvmppc_unmap_free_pte(kvm, pte, full);
+                       kvmppc_unmap_free_pte(kvm, pte, full, lpid);
                        pmd_clear(p);
                }
        }
        kvmppc_pmd_free(pmd);
  }
  
- static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
+ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
+                                 unsigned int lpid)
  {
        unsigned long iu;
        pud_t *p = pud;
                        pmd_t *pmd;
  
                        pmd = pmd_offset(p, 0);
-                       kvmppc_unmap_free_pmd(kvm, pmd, true);
+                       kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
                        pud_clear(p);
                }
        }
        pud_free(kvm->mm, pud);
  }
  
- void kvmppc_free_radix(struct kvm *kvm)
+ void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
  {
        unsigned long ig;
-       pgd_t *pgd;
  
-       if (!kvm->arch.pgtable)
-               return;
-       pgd = kvm->arch.pgtable;
        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
                pud_t *pud;
  
                if (!pgd_present(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
-               kvmppc_unmap_free_pud(kvm, pud);
+               kvmppc_unmap_free_pud(kvm, pud, lpid);
                pgd_clear(pgd);
        }
-       pgd_free(kvm->mm, kvm->arch.pgtable);
-       kvm->arch.pgtable = NULL;
+ }
+ void kvmppc_free_radix(struct kvm *kvm)
+ {
+       if (kvm->arch.pgtable) {
+               kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
+                                         kvm->arch.lpid);
+               pgd_free(kvm->mm, kvm->arch.pgtable);
+               kvm->arch.pgtable = NULL;
+       }
  }
  
  static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
-                                             unsigned long gpa)
+                                       unsigned long gpa, unsigned int lpid)
  {
        pte_t *pte = pte_offset_kernel(pmd, 0);
  
         * flushing the PWC again.
         */
        pmd_clear(pmd);
-       kvmppc_radix_flush_pwc(kvm);
+       kvmppc_radix_flush_pwc(kvm, lpid);
  
-       kvmppc_unmap_free_pte(kvm, pte, false);
+       kvmppc_unmap_free_pte(kvm, pte, false, lpid);
  }
  
  static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
-                                       unsigned long gpa)
+                                       unsigned long gpa, unsigned int lpid)
  {
        pmd_t *pmd = pmd_offset(pud, 0);
  
         * so can be freed without flushing the PWC again.
         */
        pud_clear(pud);
-       kvmppc_radix_flush_pwc(kvm);
+       kvmppc_radix_flush_pwc(kvm, lpid);
  
-       kvmppc_unmap_free_pmd(kvm, pmd, false);
+       kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
  }
  
  /*
   */
  #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
  
- static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
-                            unsigned int level, unsigned long mmu_seq)
+ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+                     unsigned long gpa, unsigned int level,
+                     unsigned long mmu_seq, unsigned int lpid,
+                     unsigned long *rmapp, struct rmap_nested **n_rmap)
  {
        pgd_t *pgd;
        pud_t *pud, *new_pud = NULL;
        int ret;
  
        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
-       pgd = kvm->arch.pgtable + pgd_index(gpa);
+       pgd = pgtable + pgd_index(gpa);
        pud = NULL;
        if (pgd_present(*pgd))
                pud = pud_offset(pgd, gpa);
                        goto out_unlock;
                }
                /* Valid 1GB page here already, remove it */
-               kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT);
+               kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
+                                lpid);
        }
        if (level == 2) {
                if (!pud_none(*pud)) {
                         * install a large page, so remove and free the page
                         * table page.
                         */
-                       kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
+                       kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
+               if (rmapp && n_rmap)
+                       kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
                        WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
                                                        PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
-                                             0, pte_val(pte), lgpa, PMD_SHIFT);
+                                       0, pte_val(pte), lgpa, PMD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }
                        goto out_unlock;
                }
                /* Valid 2MB page here already, remove it */
-               kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT);
+               kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
+                                lpid);
        }
        if (level == 1) {
                if (!pmd_none(*pmd)) {
                         * install a large page, so remove and free the page
                         * table page.
                         */
-                       kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
+                       kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+               if (rmapp && n_rmap)
+                       kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
                goto out_unlock;
        }
        kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+       if (rmapp && n_rmap)
+               kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
        ret = 0;
  
   out_unlock:
        return ret;
  }
  
- int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                                  unsigned long ea, unsigned long dsisr)
+ bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
+                            unsigned long gpa, unsigned int lpid)
+ {
+       unsigned long pgflags;
+       unsigned int shift;
+       pte_t *ptep;
+       /*
+        * Need to set an R or C bit in the 2nd-level tables;
+        * since we are just helping out the hardware here,
+        * it is sufficient to do what the hardware does.
+        */
+       pgflags = _PAGE_ACCESSED;
+       if (writing)
+               pgflags |= _PAGE_DIRTY;
+       /*
+        * We are walking the secondary (partition-scoped) page table here.
+        * We can do this without disabling irq because the Linux MM
+        * subsystem doesn't do THP splits and collapses on this tree.
+        */
+       ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
+       if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
+               kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
+               return true;
+       }
+       return false;
+ }
+ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+                                  unsigned long gpa,
+                                  struct kvm_memory_slot *memslot,
+                                  bool writing, bool kvm_ro,
+                                  pte_t *inserted_pte, unsigned int *levelp)
  {
        struct kvm *kvm = vcpu->kvm;
-       unsigned long mmu_seq;
-       unsigned long gpa, gfn, hva;
-       struct kvm_memory_slot *memslot;
        struct page *page = NULL;
-       long ret;
-       bool writing;
+       unsigned long mmu_seq;
+       unsigned long hva, gfn = gpa >> PAGE_SHIFT;
        bool upgrade_write = false;
        bool *upgrade_p = &upgrade_write;
        pte_t pte, *ptep;
-       unsigned long pgflags;
        unsigned int shift, level;
-       /* Check for unusual errors */
-       if (dsisr & DSISR_UNSUPP_MMU) {
-               pr_err("KVM: Got unsupported MMU fault\n");
-               return -EFAULT;
-       }
-       if (dsisr & DSISR_BADACCESS) {
-               /* Reflect to the guest as DSI */
-               pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
-               kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-               return RESUME_GUEST;
-       }
-       /* Translate the logical address and get the page */
-       gpa = vcpu->arch.fault_gpa & ~0xfffUL;
-       gpa &= ~0xF000000000000000ul;
-       gfn = gpa >> PAGE_SHIFT;
-       if (!(dsisr & DSISR_PRTABLE_FAULT))
-               gpa |= ea & 0xfff;
-       memslot = gfn_to_memslot(kvm, gfn);
-       /* No memslot means it's an emulated MMIO region */
-       if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
-               if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
-                            DSISR_SET_RC)) {
-                       /*
-                        * Bad address in guest page table tree, or other
-                        * unusual error - reflect it to the guest as DSI.
-                        */
-                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-                       return RESUME_GUEST;
-               }
-               return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
-                                             dsisr & DSISR_ISSTORE);
-       }
-       writing = (dsisr & DSISR_ISSTORE) != 0;
-       if (memslot->flags & KVM_MEM_READONLY) {
-               if (writing) {
-                       /* give the guest a DSI */
-                       dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
-                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
-                       return RESUME_GUEST;
-               }
-               upgrade_p = NULL;
-       }
-       if (dsisr & DSISR_SET_RC) {
-               /*
-                * Need to set an R or C bit in the 2nd-level tables;
-                * since we are just helping out the hardware here,
-                * it is sufficient to do what the hardware does.
-                */
-               pgflags = _PAGE_ACCESSED;
-               if (writing)
-                       pgflags |= _PAGE_DIRTY;
-               /*
-                * We are walking the secondary page table here. We can do this
-                * without disabling irq.
-                */
-               spin_lock(&kvm->mmu_lock);
-               ptep = __find_linux_pte(kvm->arch.pgtable,
-                                       gpa, NULL, &shift);
-               if (ptep && pte_present(*ptep) &&
-                   (!writing || pte_write(*ptep))) {
-                       kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
-                                               gpa, shift);
-                       dsisr &= ~DSISR_SET_RC;
-               }
-               spin_unlock(&kvm->mmu_lock);
-               if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
-                              DSISR_PROTFAULT | DSISR_SET_RC)))
-                       return RESUME_GUEST;
-       }
+       int ret;
  
        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
         * is that the page is writable.
         */
        hva = gfn_to_hva_memslot(memslot, gfn);
-       if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
+       if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
                upgrade_write = true;
        } else {
                unsigned long pfn;
         */
        local_irq_disable();
        ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
 +      /*
 +       * If the PTE disappeared temporarily due to a THP
 +       * collapse, just return and let the guest try again.
 +       */
 +      if (!ptep) {
 +              local_irq_enable();
 +              if (page)
 +                      put_page(page);
 +              return RESUME_GUEST;
 +      }
        pte = *ptep;
        local_irq_enable();
  
        }
  
        /* Allocate space in the tree and write the PTE */
-       ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+       ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
+                               mmu_seq, kvm->arch.lpid, NULL, NULL);
+       if (inserted_pte)
+               *inserted_pte = pte;
+       if (levelp)
+               *levelp = level;
  
        if (page) {
                if (!ret && (pte_val(pte) & _PAGE_WRITE))
                put_page(page);
        }
  
+       return ret;
+ }
+ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                  unsigned long ea, unsigned long dsisr)
+ {
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long gpa, gfn;
+       struct kvm_memory_slot *memslot;
+       long ret;
+       bool writing = !!(dsisr & DSISR_ISSTORE);
+       bool kvm_ro = false;
+       /* Check for unusual errors */
+       if (dsisr & DSISR_UNSUPP_MMU) {
+               pr_err("KVM: Got unsupported MMU fault\n");
+               return -EFAULT;
+       }
+       if (dsisr & DSISR_BADACCESS) {
+               /* Reflect to the guest as DSI */
+               pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
+               kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+               return RESUME_GUEST;
+       }
+       /* Translate the logical address */
+       gpa = vcpu->arch.fault_gpa & ~0xfffUL;
+       gpa &= ~0xF000000000000000ul;
+       gfn = gpa >> PAGE_SHIFT;
+       if (!(dsisr & DSISR_PRTABLE_FAULT))
+               gpa |= ea & 0xfff;
+       /* Get the corresponding memslot */
+       memslot = gfn_to_memslot(kvm, gfn);
+       /* No memslot means it's an emulated MMIO region */
+       if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+               if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
+                            DSISR_SET_RC)) {
+                       /*
+                        * Bad address in guest page table tree, or other
+                        * unusual error - reflect it to the guest as DSI.
+                        */
+                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+                       return RESUME_GUEST;
+               }
+               return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
+       }
+       if (memslot->flags & KVM_MEM_READONLY) {
+               if (writing) {
+                       /* give the guest a DSI */
+                       kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
+                                                      DSISR_PROTFAULT);
+                       return RESUME_GUEST;
+               }
+               kvm_ro = true;
+       }
+       /* Failed to set the reference/change bits */
+       if (dsisr & DSISR_SET_RC) {
+               spin_lock(&kvm->mmu_lock);
+               if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
+                                           writing, gpa, kvm->arch.lpid))
+                       dsisr &= ~DSISR_SET_RC;
+               spin_unlock(&kvm->mmu_lock);
+               if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+                              DSISR_PROTFAULT | DSISR_SET_RC)))
+                       return RESUME_GUEST;
+       }
+       /* Try to insert a pte */
+       ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
+                                            kvm_ro, NULL, NULL);
        if (ret == 0 || ret == -EAGAIN)
                ret = RESUME_GUEST;
        return ret;
@@@ -710,20 -854,11 +864,11 @@@ int kvm_unmap_radix(struct kvm *kvm, st
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       unsigned long old;
  
        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
-       if (ptep && pte_present(*ptep)) {
-               old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
-                                             gpa, shift);
-               kvmppc_radix_tlbie_page(kvm, gpa, shift);
-               if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
-                       unsigned long psize = PAGE_SIZE;
-                       if (shift)
-                               psize = 1ul << shift;
-                       kvmppc_update_dirty_map(memslot, gfn, psize);
-               }
-       }
+       if (ptep && pte_present(*ptep))
+               kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+                                kvm->arch.lpid);
        return 0;                               
  }
  
@@@ -778,7 -913,7 +923,7 @@@ static int kvm_radix_test_clear_dirty(s
                        ret = 1 << (shift - PAGE_SHIFT);
                kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
                                        gpa, shift);
-               kvmppc_radix_tlbie_page(kvm, gpa, shift);
+               kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
        }
        return ret;
  }
@@@ -863,6 -998,215 +1008,215 @@@ static void pmd_ctor(void *addr
        memset(addr, 0, RADIX_PMD_TABLE_SIZE);
  }
  
+ struct debugfs_radix_state {
+       struct kvm      *kvm;
+       struct mutex    mutex;
+       unsigned long   gpa;
+       int             lpid;
+       int             chars_left;
+       int             buf_index;
+       char            buf[128];
+       u8              hdr;
+ };
+ static int debugfs_radix_open(struct inode *inode, struct file *file)
+ {
+       struct kvm *kvm = inode->i_private;
+       struct debugfs_radix_state *p;
+       p = kzalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+       kvm_get_kvm(kvm);
+       p->kvm = kvm;
+       mutex_init(&p->mutex);
+       file->private_data = p;
+       return nonseekable_open(inode, file);
+ }
+ static int debugfs_radix_release(struct inode *inode, struct file *file)
+ {
+       struct debugfs_radix_state *p = file->private_data;
+       kvm_put_kvm(p->kvm);
+       kfree(p);
+       return 0;
+ }
+ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
+                                size_t len, loff_t *ppos)
+ {
+       struct debugfs_radix_state *p = file->private_data;
+       ssize_t ret, r;
+       unsigned long n;
+       struct kvm *kvm;
+       unsigned long gpa;
+       pgd_t *pgt;
+       struct kvm_nested_guest *nested;
+       pgd_t pgd, *pgdp;
+       pud_t pud, *pudp;
+       pmd_t pmd, *pmdp;
+       pte_t *ptep;
+       int shift;
+       unsigned long pte;
+       kvm = p->kvm;
+       if (!kvm_is_radix(kvm))
+               return 0;
+       ret = mutex_lock_interruptible(&p->mutex);
+       if (ret)
+               return ret;
+       if (p->chars_left) {
+               n = p->chars_left;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf + p->buf_index, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index += n;
+               buf += n;
+               len -= n;
+               ret = n;
+               if (r) {
+                       if (!n)
+                               ret = -EFAULT;
+                       goto out;
+               }
+       }
+       gpa = p->gpa;
+       nested = NULL;
+       pgt = NULL;
+       while (len != 0 && p->lpid >= 0) {
+               if (gpa >= RADIX_PGTABLE_RANGE) {
+                       gpa = 0;
+                       pgt = NULL;
+                       if (nested) {
+                               kvmhv_put_nested(nested);
+                               nested = NULL;
+                       }
+                       p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
+                       p->hdr = 0;
+                       if (p->lpid < 0)
+                               break;
+               }
+               if (!pgt) {
+                       if (p->lpid == 0) {
+                               pgt = kvm->arch.pgtable;
+                       } else {
+                               nested = kvmhv_get_nested(kvm, p->lpid, false);
+                               if (!nested) {
+                                       gpa = RADIX_PGTABLE_RANGE;
+                                       continue;
+                               }
+                               pgt = nested->shadow_pgtable;
+                       }
+               }
+               n = 0;
+               if (!p->hdr) {
+                       if (p->lpid > 0)
+                               n = scnprintf(p->buf, sizeof(p->buf),
+                                             "\nNested LPID %d: ", p->lpid);
+                       n += scnprintf(p->buf + n, sizeof(p->buf) - n,
+                                     "pgdir: %lx\n", (unsigned long)pgt);
+                       p->hdr = 1;
+                       goto copy;
+               }
+               pgdp = pgt + pgd_index(gpa);
+               pgd = READ_ONCE(*pgdp);
+               if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
+                       continue;
+               }
+               pudp = pud_offset(&pgd, gpa);
+               pud = READ_ONCE(*pudp);
+               if (!(pud_val(pud) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PUD_MASK) + PUD_SIZE;
+                       continue;
+               }
+               if (pud_val(pud) & _PAGE_PTE) {
+                       pte = pud_val(pud);
+                       shift = PUD_SHIFT;
+                       goto leaf;
+               }
+               pmdp = pmd_offset(&pud, gpa);
+               pmd = READ_ONCE(*pmdp);
+               if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PMD_MASK) + PMD_SIZE;
+                       continue;
+               }
+               if (pmd_val(pmd) & _PAGE_PTE) {
+                       pte = pmd_val(pmd);
+                       shift = PMD_SHIFT;
+                       goto leaf;
+               }
+               ptep = pte_offset_kernel(&pmd, gpa);
+               pte = pte_val(READ_ONCE(*ptep));
+               if (!(pte & _PAGE_PRESENT)) {
+                       gpa += PAGE_SIZE;
+                       continue;
+               }
+               shift = PAGE_SHIFT;
+       leaf:
+               n = scnprintf(p->buf, sizeof(p->buf),
+                             " %lx: %lx %d\n", gpa, pte, shift);
+               gpa += 1ul << shift;
+       copy:
+               p->chars_left = n;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index = n;
+               buf += n;
+               len -= n;
+               ret += n;
+               if (r) {
+                       if (!ret)
+                               ret = -EFAULT;
+                       break;
+               }
+       }
+       p->gpa = gpa;
+       if (nested)
+               kvmhv_put_nested(nested);
+  out:
+       mutex_unlock(&p->mutex);
+       return ret;
+ }
+ static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
+                          size_t len, loff_t *ppos)
+ {
+       return -EACCES;
+ }
+ static const struct file_operations debugfs_radix_fops = {
+       .owner   = THIS_MODULE,
+       .open    = debugfs_radix_open,
+       .release = debugfs_radix_release,
+       .read    = debugfs_radix_read,
+       .write   = debugfs_radix_write,
+       .llseek  = generic_file_llseek,
+ };
+ void kvmhv_radix_debugfs_init(struct kvm *kvm)
+ {
+       kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
+                                                    kvm->arch.debugfs_dir, kvm,
+                                                    &debugfs_radix_fops);
+ }
  int kvmppc_radix_init(void)
  {
        unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
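
The new debugfs dump above emits one line per valid leaf in the format " <gpa>: <pte> <shift>" (hex, hex, decimal), after a "pgdir:" header and, for nested guests, a "Nested LPID <n>:" banner. A rough user-space sketch that parses those lines; the path shown is illustrative, the file actually lives in the per-VM KVM debugfs directory:

#include <stdio.h>

int main(void)
{
	/* Illustrative path; substitute the real per-VM kvm debugfs dir. */
	FILE *f = fopen("/sys/kernel/debug/kvm/12345-10/radix", "r");
	unsigned long gpa, pte;
	int shift;
	char line[256];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, " %lx: %lx %d", &gpa, &pte, &shift) == 3)
			printf("gpa=0x%lx shift=%d pte=0x%lx\n", gpa, shift, pte);
	}
	fclose(f);
	return 0;
}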
diff --combined arch/s390/Kconfig
index 039a3417dfc454736befb81304ee3c412294d039,8cc8f25d95761bca3edea7566833ed1c6e952821..8b25e1f45b2749c1472e6b1247082b3c24d023b6
@@@ -56,12 -56,6 +56,12 @@@ config PCI_QUIRK
  config ARCH_SUPPORTS_UPROBES
        def_bool y
  
 +config KASAN_SHADOW_OFFSET
 +      hex
 +      depends on KASAN
 +      default 0x18000000000000 if KASAN_S390_4_LEVEL_PAGING
 +      default 0x30000000000
 +
  config S390
        def_bool y
        select ARCH_BINFMT_ELF_STATE
        select HAVE_ALIGNED_STRUCT_PAGE if SLUB
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_JUMP_LABEL
 +      select HAVE_ARCH_JUMP_LABEL_RELATIVE
 +      select HAVE_ARCH_KASAN
        select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
        select HAVE_ARCH_SECCOMP_FILTER
        select HAVE_ARCH_SOFT_DIRTY
        select HAVE_ARCH_TRACEHOOK
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 +      select HAVE_ARCH_VMAP_STACK
        select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
@@@ -658,7 -649,6 +658,7 @@@ config PACK_STAC
  
  config CHECK_STACK
        def_bool y
 +      depends on !VMAP_STACK
        prompt "Detect kernel stack overflow"
        help
          This option enables the compiler option -mstack-guard and
@@@ -783,6 -773,17 +783,17 @@@ config VFIO_CC
          To compile this driver as a module, choose M here: the
          module will be called vfio_ccw.
  
+ config VFIO_AP
+       def_tristate n
+       prompt "VFIO support for AP devices"
+       depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM
+       help
+               This driver grants access to Adjunct Processor (AP) devices
+               via the VFIO mediated device interface.
+               To compile this driver as a module, choose M here: the module
+               will be called vfio_ap.
  endmenu
  
  menu "Dump support"
index e05e0d3092445736ffd6a25ecbfddaa06168e7c6,449c92da2c91ef24a5cad9ad187f021fd36261e1..1fc7a0d1e8775227edaf307894fce11bdee3b28c
@@@ -40,7 -40,7 +40,7 @@@ static inline int cpu_has_vmx(void
   */
  static inline void cpu_vmxoff(void)
  {
-       asm volatile (ASM_VMX_VMXOFF : : : "cc");
+       asm volatile ("vmxoff");
        cr4_clear_bits(X86_CR4_VMXE);
  }
  
@@@ -83,10 -83,9 +83,10 @@@ static inline void cpu_emergency_vmxoff
   */
  static inline int cpu_has_svm(const char **msg)
  {
 -      if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
 +      if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
 +          boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) {
                if (msg)
 -                      *msg = "not amd";
 +                      *msg = "not amd or hygon";
                return 0;
        }
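
The Hygon change above only widens the vendor check; the CPUID feature test later in cpu_has_svm() is unchanged. For context, a rough user-space approximation of the whole check (not the kernel helper itself) looks like:

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char vendor[13] = { 0 };

	/* Vendor string is returned in EBX, EDX, ECX order. */
	__get_cpuid(0, &eax, &ebx, &ecx, &edx);
	memcpy(vendor, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);

	if (strcmp(vendor, "AuthenticAMD") && strcmp(vendor, "HygonGenuine")) {
		puts("not amd or hygon");
		return 1;
	}
	/* SVM is CPUID.80000001H:ECX bit 2. */
	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx) || !(ecx & (1u << 2))) {
		puts("svm not available");
		return 1;
	}
	puts("svm available");
	return 0;
}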
  
diff --combined arch/x86/kvm/mmu.c
index e843ec46609d304a92a4e554170c011305f5edbf,4cf43ce4295964dc8b509dfacca4d6cfa2bf655b..cf5f572f230520b4f00694b0afe8ec6738e89868
@@@ -932,7 -932,7 +932,7 @@@ static int mmu_topup_memory_cache(struc
        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
                obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
                if (!obj)
-                       return -ENOMEM;
+                       return cache->nobjs >= min ? 0 : -ENOMEM;
                cache->objects[cache->nobjs++] = obj;
        }
        return 0;
@@@ -960,7 -960,7 +960,7 @@@ static int mmu_topup_memory_cache_page(
        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
                page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
                if (!page)
-                       return -ENOMEM;
+                       return cache->nobjs >= min ? 0 : -ENOMEM;
                cache->objects[cache->nobjs++] = page;
        }
        return 0;
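
The two hunks above change the topup helpers from "fail on any allocation error" to "fail only if the cache still holds fewer than min objects". A generic sketch of that contract; the structure and names here are illustrative, not KVM's:

#include <stdlib.h>

#define CACHE_CAPACITY 40	/* stands in for ARRAY_SIZE(cache->objects) */

struct obj_cache {
	int nobjs;
	void *objects[CACHE_CAPACITY];
};

/*
 * Best-effort topup: keep filling to capacity, but only report failure
 * if the cache still holds fewer than the caller's minimum.
 */
int cache_topup(struct obj_cache *cache, int min, size_t objsize)
{
	while (cache->nobjs < CACHE_CAPACITY) {
		void *obj = calloc(1, objsize);

		if (!obj)
			return cache->nobjs >= min ? 0 : -1;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}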
@@@ -1265,24 -1265,24 +1265,24 @@@ pte_list_desc_remove_entry(struct kvm_r
        mmu_free_pte_list_desc(desc);
  }
  
- static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
  {
        struct pte_list_desc *desc;
        struct pte_list_desc *prev_desc;
        int i;
  
        if (!rmap_head->val) {
-               printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
+               pr_err("%s: %p 0->BUG\n", __func__, spte);
                BUG();
        } else if (!(rmap_head->val & 1)) {
-               rmap_printk("pte_list_remove:  %p 1->0\n", spte);
+               rmap_printk("%s:  %p 1->0\n", __func__, spte);
                if ((u64 *)rmap_head->val != spte) {
-                       printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
+                       pr_err("%s:  %p 1->BUG\n", __func__, spte);
                        BUG();
                }
                rmap_head->val = 0;
        } else {
-               rmap_printk("pte_list_remove:  %p many->many\n", spte);
+               rmap_printk("%s:  %p many->many\n", __func__, spte);
                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                prev_desc = NULL;
                while (desc) {
                        prev_desc = desc;
                        desc = desc->more;
                }
-               pr_err("pte_list_remove: %p many->many\n", spte);
+               pr_err("%s: %p many->many\n", __func__, spte);
                BUG();
        }
  }
  
+ static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
+ {
+       mmu_spte_clear_track_bits(sptep);
+       __pte_list_remove(sptep, rmap_head);
+ }
  static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
                                           struct kvm_memory_slot *slot)
  {
@@@ -1349,7 -1355,7 +1355,7 @@@ static void rmap_remove(struct kvm *kvm
        sp = page_header(__pa(spte));
        gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
        rmap_head = gfn_to_rmap(kvm, gfn, sp);
-       pte_list_remove(spte, rmap_head);
+       __pte_list_remove(spte, rmap_head);
  }
  
  /*
@@@ -1685,7 -1691,7 +1691,7 @@@ static bool kvm_zap_rmapp(struct kvm *k
        while ((sptep = rmap_get_first(rmap_head, &iter))) {
                rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
  
-               drop_spte(kvm, sptep);
+               pte_list_remove(rmap_head, sptep);
                flush = true;
        }
  
@@@ -1721,7 -1727,7 +1727,7 @@@ restart
                need_flush = 1;
  
                if (pte_write(*ptep)) {
-                       drop_spte(kvm, sptep);
+                       pte_list_remove(rmap_head, sptep);
                        goto restart;
                } else {
                        new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
@@@ -1988,7 -1994,7 +1994,7 @@@ static void mmu_page_add_parent_pte(str
  static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
                                       u64 *parent_pte)
  {
-       pte_list_remove(parent_pte, &sp->parent_ptes);
+       __pte_list_remove(parent_pte, &sp->parent_ptes);
  }
  
  static void drop_parent_pte(struct kvm_mmu_page *sp,
@@@ -2181,7 -2187,7 +2187,7 @@@ static bool __kvm_sync_page(struct kvm_
                            struct list_head *invalid_list)
  {
        if (sp->role.cr4_pae != !!is_pae(vcpu)
-           || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+           || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
                return false;
        }
@@@ -2375,14 -2381,14 +2381,14 @@@ static struct kvm_mmu_page *kvm_mmu_get
        int collisions = 0;
        LIST_HEAD(invalid_list);
  
-       role = vcpu->arch.mmu.base_role;
+       role = vcpu->arch.mmu->mmu_role.base;
        role.level = level;
        role.direct = direct;
        if (role.direct)
                role.cr4_pae = 0;
        role.access = access;
-       if (!vcpu->arch.mmu.direct_map
-           && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+       if (!vcpu->arch.mmu->direct_map
+           && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
@@@ -2457,11 -2463,11 +2463,11 @@@ static void shadow_walk_init_using_root
  {
        iterator->addr = addr;
        iterator->shadow_addr = root;
-       iterator->level = vcpu->arch.mmu.shadow_root_level;
+       iterator->level = vcpu->arch.mmu->shadow_root_level;
  
        if (iterator->level == PT64_ROOT_4LEVEL &&
-           vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
-           !vcpu->arch.mmu.direct_map)
+           vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
+           !vcpu->arch.mmu->direct_map)
                --iterator->level;
  
        if (iterator->level == PT32E_ROOT_LEVEL) {
                 * prev_root is currently only used for 64-bit hosts. So only
                 * the active root_hpa is valid here.
                 */
-               BUG_ON(root != vcpu->arch.mmu.root_hpa);
+               BUG_ON(root != vcpu->arch.mmu->root_hpa);
  
                iterator->shadow_addr
-                       = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+                       = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
                iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
                --iterator->level;
                if (!iterator->shadow_addr)
  static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
                             struct kvm_vcpu *vcpu, u64 addr)
  {
-       shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+       shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
                                    addr);
  }
  
@@@ -3095,7 -3101,7 +3101,7 @@@ static int __direct_map(struct kvm_vcp
        int emulate = 0;
        gfn_t pseudo_gfn;
  
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return 0;
  
        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
  
  static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
  {
 -      siginfo_t info;
 -
 -      clear_siginfo(&info);
 -      info.si_signo   = SIGBUS;
 -      info.si_errno   = 0;
 -      info.si_code    = BUS_MCEERR_AR;
 -      info.si_addr    = (void __user *)address;
 -      info.si_addr_lsb = PAGE_SHIFT;
 -
 -      send_sig_info(SIGBUS, &info, tsk);
 +      send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
  }
  
  static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
@@@ -3301,7 -3316,7 +3307,7 @@@ static bool fast_page_fault(struct kvm_
        u64 spte = 0ull;
        uint retry_count = 0;
  
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return false;
  
        if (!page_fault_can_be_fast(error_code))
@@@ -3471,11 -3486,11 +3477,11 @@@ static void mmu_free_root_page(struct k
  }
  
  /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
- void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
+ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                       ulong roots_to_free)
  {
        int i;
        LIST_HEAD(invalid_list);
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
        bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
  
        BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
@@@ -3535,20 -3550,20 +3541,20 @@@ static int mmu_alloc_direct_roots(struc
        struct kvm_mmu_page *sp;
        unsigned i;
  
-       if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
+       if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
                spin_lock(&vcpu->kvm->mmu_lock);
                if(make_mmu_pages_available(vcpu) < 0) {
                        spin_unlock(&vcpu->kvm->mmu_lock);
                        return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, 0, 0,
-                               vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
+                               vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
                ++sp->root_count;
                spin_unlock(&vcpu->kvm->mmu_lock);
-               vcpu->arch.mmu.root_hpa = __pa(sp->spt);
-       } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+               vcpu->arch.mmu->root_hpa = __pa(sp->spt);
+       } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
                for (i = 0; i < 4; ++i) {
-                       hpa_t root = vcpu->arch.mmu.pae_root[i];
+                       hpa_t root = vcpu->arch.mmu->pae_root[i];
  
                        MMU_WARN_ON(VALID_PAGE(root));
                        spin_lock(&vcpu->kvm->mmu_lock);
                        root = __pa(sp->spt);
                        ++sp->root_count;
                        spin_unlock(&vcpu->kvm->mmu_lock);
-                       vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+                       vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
                }
-               vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
        } else
                BUG();
  
@@@ -3577,7 -3592,7 +3583,7 @@@ static int mmu_alloc_shadow_roots(struc
        gfn_t root_gfn;
        int i;
  
-       root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+       root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT;
  
        if (mmu_check_root(vcpu, root_gfn))
                return 1;
         * Do we shadow a long mode page table? If so we need to
         * write-protect the guests page table root.
         */
-       if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
+       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+               hpa_t root = vcpu->arch.mmu->root_hpa;
  
                MMU_WARN_ON(VALID_PAGE(root));
  
                        return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-                               vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
+                               vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
                root = __pa(sp->spt);
                ++sp->root_count;
                spin_unlock(&vcpu->kvm->mmu_lock);
-               vcpu->arch.mmu.root_hpa = root;
+               vcpu->arch.mmu->root_hpa = root;
                return 0;
        }
  
         * the shadow page table may be a PAE or a long mode page table.
         */
        pm_mask = PT_PRESENT_MASK;
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
+       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
  
        for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
+               hpa_t root = vcpu->arch.mmu->pae_root[i];
  
                MMU_WARN_ON(VALID_PAGE(root));
-               if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-                       pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
+               if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
+                       pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
                        if (!(pdptr & PT_PRESENT_MASK)) {
-                               vcpu->arch.mmu.pae_root[i] = 0;
+                               vcpu->arch.mmu->pae_root[i] = 0;
                                continue;
                        }
                        root_gfn = pdptr >> PAGE_SHIFT;
                ++sp->root_count;
                spin_unlock(&vcpu->kvm->mmu_lock);
  
-               vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+               vcpu->arch.mmu->pae_root[i] = root | pm_mask;
        }
-       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+       vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
  
        /*
         * If we shadow a 32 bit page table with a long mode page
         * table we enter this path.
         */
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
-               if (vcpu->arch.mmu.lm_root == NULL) {
+       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+               if (vcpu->arch.mmu->lm_root == NULL) {
                        /*
                         * The additional page necessary for this is only
                         * allocated on demand.
                        if (lm_root == NULL)
                                return 1;
  
-                       lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+                       lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
  
-                       vcpu->arch.mmu.lm_root = lm_root;
+                       vcpu->arch.mmu->lm_root = lm_root;
                }
  
-               vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
        }
  
        return 0;
  
  static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
  {
-       if (vcpu->arch.mmu.direct_map)
+       if (vcpu->arch.mmu->direct_map)
                return mmu_alloc_direct_roots(vcpu);
        else
                return mmu_alloc_shadow_roots(vcpu);
@@@ -3684,17 -3699,16 +3690,16 @@@ void kvm_mmu_sync_roots(struct kvm_vcp
        int i;
        struct kvm_mmu_page *sp;
  
-       if (vcpu->arch.mmu.direct_map)
+       if (vcpu->arch.mmu->direct_map)
                return;
  
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return;
  
        vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
  
-       if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
+       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+               hpa_t root = vcpu->arch.mmu->root_hpa;
                sp = page_header(root);
  
                /*
        kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
  
        for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
+               hpa_t root = vcpu->arch.mmu->pae_root[i];
  
                if (root && VALID_PAGE(root)) {
                        root &= PT64_BASE_ADDR_MASK;
@@@ -3799,7 -3813,7 +3804,7 @@@ walk_shadow_page_get_mmio_spte(struct k
        int root, leaf;
        bool reserved = false;
  
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                goto exit;
  
        walk_shadow_page_lockless_begin(vcpu);
                if (!is_shadow_present_pte(spte))
                        break;
  
-               reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
+               reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
                                                    iterator.level);
        }
  
@@@ -3895,7 -3909,7 +3900,7 @@@ static void shadow_page_table_clear_flo
        struct kvm_shadow_walk_iterator iterator;
        u64 spte;
  
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return;
  
        walk_shadow_page_lockless_begin(vcpu);
@@@ -3922,7 -3936,7 +3927,7 @@@ static int nonpaging_page_fault(struct 
        if (r)
                return r;
  
-       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
  
  
        return nonpaging_map(vcpu, gva & PAGE_MASK,
@@@ -3935,8 -3949,8 +3940,8 @@@ static int kvm_arch_setup_async_pf(stru
  
        arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
        arch.gfn = gfn;
-       arch.direct_map = vcpu->arch.mmu.direct_map;
-       arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+       arch.direct_map = vcpu->arch.mmu->direct_map;
+       arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
  
        return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
  }
@@@ -4042,7 -4056,7 +4047,7 @@@ static int tdp_page_fault(struct kvm_vc
        int write = error_code & PFERR_WRITE_MASK;
        bool map_writable;
  
-       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
  
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
                return RET_PF_EMULATE;
@@@ -4118,7 -4132,7 +4123,7 @@@ static bool cached_root_available(struc
  {
        uint i;
        struct kvm_mmu_root_info root;
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
  
        root.cr3 = mmu->get_cr3(vcpu);
        root.hpa = mmu->root_hpa;
@@@ -4141,7 -4155,7 +4146,7 @@@ static bool fast_cr3_switch(struct kvm_
                            union kvm_mmu_page_role new_role,
                            bool skip_tlb_flush)
  {
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
  
        /*
         * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
@@@ -4192,7 -4206,8 +4197,8 @@@ static void __kvm_mmu_new_cr3(struct kv
                              bool skip_tlb_flush)
  {
        if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
-               kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
+                                  KVM_MMU_ROOT_CURRENT);
  }
  
  void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
@@@ -4210,7 -4225,7 +4216,7 @@@ static unsigned long get_cr3(struct kvm
  static void inject_page_fault(struct kvm_vcpu *vcpu,
                              struct x86_exception *fault)
  {
-       vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+       vcpu->arch.mmu->inject_page_fault(vcpu, fault);
  }
  
  static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
@@@ -4414,7 -4429,8 +4420,8 @@@ static void reset_rsvds_bits_mask_ept(s
  void
  reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
  {
-       bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+       bool uses_nx = context->nx ||
+               context->mmu_role.base.smep_andnot_wp;
        struct rsvd_bits_validate *shadow_zero_check;
        int i;
  
@@@ -4553,7 -4569,7 +4560,7 @@@ static void update_permission_bitmask(s
                         * SMAP:kernel-mode data accesses from user-mode
                         * mappings should fault. A fault is considered
                         * as a SMAP violation if all of the following
-                        * conditions are ture:
+                        * conditions are true:
                         *   - X86_CR4_SMAP is set in CR4
                         *   - A user page is accessed
                         *   - The access is not a fetch
@@@ -4714,27 -4730,65 +4721,65 @@@ static void paging32E_init_context(stru
        paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
  }
  
- static union kvm_mmu_page_role
- kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
+ {
+       union kvm_mmu_extended_role ext = {0};
+       ext.cr0_pg = !!is_paging(vcpu);
+       ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+       ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+       ext.cr4_pse = !!is_pse(vcpu);
+       ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
+       ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+       ext.valid = 1;
+       return ext;
+ }
+ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
+                                                  bool base_only)
+ {
+       union kvm_mmu_role role = {0};
+       role.base.access = ACC_ALL;
+       role.base.nxe = !!is_nx(vcpu);
+       role.base.cr4_pae = !!is_pae(vcpu);
+       role.base.cr0_wp = is_write_protection(vcpu);
+       role.base.smm = is_smm(vcpu);
+       role.base.guest_mode = is_guest_mode(vcpu);
+       if (base_only)
+               return role;
+       role.ext = kvm_calc_mmu_role_ext(vcpu);
+       return role;
+ }
+ static union kvm_mmu_role
+ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
  {
-       union kvm_mmu_page_role role = {0};
+       union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
  
-       role.guest_mode = is_guest_mode(vcpu);
-       role.smm = is_smm(vcpu);
-       role.ad_disabled = (shadow_accessed_mask == 0);
-       role.level = kvm_x86_ops->get_tdp_level(vcpu);
-       role.direct = true;
-       role.access = ACC_ALL;
+       role.base.ad_disabled = (shadow_accessed_mask == 0);
+       role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
+       role.base.direct = true;
  
        return role;
  }
  
  static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
  {
-       struct kvm_mmu *context = &vcpu->arch.mmu;
+       struct kvm_mmu *context = vcpu->arch.mmu;
+       union kvm_mmu_role new_role =
+               kvm_calc_tdp_mmu_root_page_role(vcpu, false);
  
-       context->base_role.word = mmu_base_role_mask.word &
-                                 kvm_calc_tdp_mmu_root_page_role(vcpu).word;
+       new_role.base.word &= mmu_base_role_mask.word;
+       if (new_role.as_u64 == context->mmu_role.as_u64)
+               return;
+       context->mmu_role.as_u64 = new_role.as_u64;
        context->page_fault = tdp_page_fault;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
        reset_tdp_shadow_zero_bits_mask(vcpu, context);
  }
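
The TDP path above now computes a combined kvm_mmu_role (a 32-bit base page role plus 32-bit extended CR/CPU-state bits), compares it as one 64-bit value against the cached context->mmu_role and returns early when nothing changed. Below is a minimal, standalone sketch of that caching idea; the union layout and field names are simplified stand-ins rather than the kernel's definitions, and the static asserts mirror the BUILD_BUG_ON guards added to kvm_mmu_module_init() later in this file.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for kvm_mmu_page_role / kvm_mmu_extended_role. */
union base_role {
        struct {
                uint32_t level  : 4;
                uint32_t direct : 1;
                uint32_t nxe    : 1;
                uint32_t cr0_wp : 1;
        };
        uint32_t word;
};

union ext_role {
        struct {
                uint32_t valid    : 1;
                uint32_t cr4_smep : 1;
                uint32_t cr4_smap : 1;
        };
        uint32_t word;
};

union mmu_role {
        struct {
                union base_role base;
                union ext_role ext;
        };
        uint64_t as_u64;
};

/* Union aliasing only works if the halves pack exactly; these mirror the
 * BUILD_BUG_ON guards added to kvm_mmu_module_init() below. */
static_assert(sizeof(union base_role) == sizeof(uint32_t), "base role != 32 bit");
static_assert(sizeof(union ext_role)  == sizeof(uint32_t), "ext role != 32 bit");
static_assert(sizeof(union mmu_role)  == sizeof(uint64_t), "mmu role != 64 bit");

struct mmu_ctx {
        union mmu_role cached;
        int reinit_count;
};

/* Rebuild the context only when the freshly computed role differs. */
static void maybe_reinit(struct mmu_ctx *ctx, union mmu_role new_role)
{
        if (new_role.as_u64 == ctx->cached.as_u64)
                return;                         /* unchanged: skip the rebuild */
        ctx->cached.as_u64 = new_role.as_u64;
        ctx->reinit_count++;                    /* expensive re-init would go here */
}

int main(void)
{
        struct mmu_ctx ctx = { 0 };
        union mmu_role r = { .as_u64 = 0 };

        r.base.level = 4;
        r.ext.valid = 1;
        maybe_reinit(&ctx, r);                  /* first use: rebuilds */
        maybe_reinit(&ctx, r);                  /* identical role: skipped */

        r.ext.cr4_smep = 1;
        maybe_reinit(&ctx, r);                  /* extended bits changed: rebuilds */

        printf("re-inits: %d\n", ctx.reinit_count);     /* prints 2 */
        return 0;
}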
  
- static union kvm_mmu_page_role
- kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
- {
-       union kvm_mmu_page_role role = {0};
-       bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
-       bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
-       role.nxe = is_nx(vcpu);
-       role.cr4_pae = !!is_pae(vcpu);
-       role.cr0_wp  = is_write_protection(vcpu);
-       role.smep_andnot_wp = smep && !is_write_protection(vcpu);
-       role.smap_andnot_wp = smap && !is_write_protection(vcpu);
-       role.guest_mode = is_guest_mode(vcpu);
-       role.smm = is_smm(vcpu);
-       role.direct = !is_paging(vcpu);
-       role.access = ACC_ALL;
+ static union kvm_mmu_role
+ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+ {
+       union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+       role.base.smep_andnot_wp = role.ext.cr4_smep &&
+               !is_write_protection(vcpu);
+       role.base.smap_andnot_wp = role.ext.cr4_smap &&
+               !is_write_protection(vcpu);
+       role.base.direct = !is_paging(vcpu);
  
        if (!is_long_mode(vcpu))
-               role.level = PT32E_ROOT_LEVEL;
+               role.base.level = PT32E_ROOT_LEVEL;
        else if (is_la57_mode(vcpu))
-               role.level = PT64_ROOT_5LEVEL;
+               role.base.level = PT64_ROOT_5LEVEL;
        else
-               role.level = PT64_ROOT_4LEVEL;
+               role.base.level = PT64_ROOT_4LEVEL;
  
        return role;
  }
  
  void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
  {
-       struct kvm_mmu *context = &vcpu->arch.mmu;
+       struct kvm_mmu *context = vcpu->arch.mmu;
+       union kvm_mmu_role new_role =
+               kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+       new_role.base.word &= mmu_base_role_mask.word;
+       if (new_role.as_u64 == context->mmu_role.as_u64)
+               return;
  
        if (!is_paging(vcpu))
                nonpaging_init_context(vcpu, context);
        else
                paging32_init_context(vcpu, context);
  
-       context->base_role.word = mmu_base_role_mask.word &
-                                 kvm_calc_shadow_mmu_root_page_role(vcpu).word;
+       context->mmu_role.as_u64 = new_role.as_u64;
        reset_shadow_zero_bits_mask(vcpu, context);
  }
  EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
  
- static union kvm_mmu_page_role
- kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+ static union kvm_mmu_role
+ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
+                                  bool execonly)
  {
-       union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+       union kvm_mmu_role role;
+       /* Base role is inherited from root_mmu */
+       role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
+       role.ext = kvm_calc_mmu_role_ext(vcpu);
+       role.base.level = PT64_ROOT_4LEVEL;
+       role.base.direct = false;
+       role.base.ad_disabled = !accessed_dirty;
+       role.base.guest_mode = true;
+       role.base.access = ACC_ALL;
  
-       role.level = PT64_ROOT_4LEVEL;
-       role.direct = false;
-       role.ad_disabled = !accessed_dirty;
-       role.guest_mode = true;
-       role.access = ACC_ALL;
+       role.ext.execonly = execonly;
  
        return role;
  }
  void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
                             bool accessed_dirty, gpa_t new_eptp)
  {
-       struct kvm_mmu *context = &vcpu->arch.mmu;
-       union kvm_mmu_page_role root_page_role =
-               kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
+       struct kvm_mmu *context = vcpu->arch.mmu;
+       union kvm_mmu_role new_role =
+               kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
+                                                  execonly);
+       __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
+       new_role.base.word &= mmu_base_role_mask.word;
+       if (new_role.as_u64 == context->mmu_role.as_u64)
+               return;
  
-       __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
        context->shadow_root_level = PT64_ROOT_4LEVEL;
  
        context->nx = true;
        context->update_pte = ept_update_pte;
        context->root_level = PT64_ROOT_4LEVEL;
        context->direct_map = false;
-       context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
+       context->mmu_role.as_u64 = new_role.as_u64;
        update_permission_bitmask(vcpu, context, true);
        update_pkru_bitmask(vcpu, context, true);
        update_last_nonleaf_level(vcpu, context);
@@@ -4864,7 -4931,7 +4922,7 @@@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_m
  
  static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
  {
-       struct kvm_mmu *context = &vcpu->arch.mmu;
+       struct kvm_mmu *context = vcpu->arch.mmu;
  
        kvm_init_shadow_mmu(vcpu);
        context->set_cr3           = kvm_x86_ops->set_cr3;
  
  static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
  {
+       union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
        struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
  
+       new_role.base.word &= mmu_base_role_mask.word;
+       if (new_role.as_u64 == g_context->mmu_role.as_u64)
+               return;
+       g_context->mmu_role.as_u64 = new_role.as_u64;
        g_context->get_cr3           = get_cr3;
        g_context->get_pdptr         = kvm_pdptr_read;
        g_context->inject_page_fault = kvm_inject_page_fault;
  
        /*
-        * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+        * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
         * L1's nested page tables (e.g. EPT12). The nested translation
         * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
         * L2's page tables as the first level of translation and L1's
@@@ -4921,10 -4994,10 +4985,10 @@@ void kvm_init_mmu(struct kvm_vcpu *vcpu
        if (reset_roots) {
                uint i;
  
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+               vcpu->arch.mmu->root_hpa = INVALID_PAGE;
  
                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+                       vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
        }
  
        if (mmu_is_nested(vcpu))
@@@ -4939,10 -5012,14 +5003,14 @@@ EXPORT_SYMBOL_GPL(kvm_init_mmu)
  static union kvm_mmu_page_role
  kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
  {
+       union kvm_mmu_role role;
        if (tdp_enabled)
-               return kvm_calc_tdp_mmu_root_page_role(vcpu);
+               role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
        else
-               return kvm_calc_shadow_mmu_root_page_role(vcpu);
+               role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
+       return role.base;
  }
  
  void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@@ -4972,8 -5049,10 +5040,10 @@@ EXPORT_SYMBOL_GPL(kvm_mmu_load)
  
  void kvm_mmu_unload(struct kvm_vcpu *vcpu)
  {
-       kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
-       WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+       WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+       WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_unload);
  
@@@ -4987,7 -5066,7 +5057,7 @@@ static void mmu_pte_write_new_pte(struc
          }
  
        ++vcpu->kvm->stat.mmu_pte_updated;
-       vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
+       vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
  }
  
  static bool need_remote_flush(u64 old, u64 new)
@@@ -5164,10 -5243,12 +5234,12 @@@ static void kvm_mmu_pte_write(struct kv
  
                local_flush = true;
                while (npte--) {
+                       u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
                        entry = *spte;
                        mmu_page_zap_pte(vcpu->kvm, sp, spte);
                        if (gentry &&
-                             !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+                             !((sp->role.word ^ base_role)
                              & mmu_base_role_mask.word) && rmap_can_add(vcpu))
                                mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
                        if (need_remote_flush(entry, *spte))
@@@ -5185,7 -5266,7 +5257,7 @@@ int kvm_mmu_unprotect_page_virt(struct 
        gpa_t gpa;
        int r;
  
-       if (vcpu->arch.mmu.direct_map)
+       if (vcpu->arch.mmu->direct_map)
                return 0;
  
        gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@@ -5221,10 -5302,10 +5293,10 @@@ int kvm_mmu_page_fault(struct kvm_vcpu 
  {
        int r, emulation_type = 0;
        enum emulation_result er;
-       bool direct = vcpu->arch.mmu.direct_map;
+       bool direct = vcpu->arch.mmu->direct_map;
  
        /* With shadow page tables, fault_address contains a GVA or nGPA.  */
-       if (vcpu->arch.mmu.direct_map) {
+       if (vcpu->arch.mmu->direct_map) {
                vcpu->arch.gpa_available = true;
                vcpu->arch.gpa_val = cr2;
        }
        }
  
        if (r == RET_PF_INVALID) {
-               r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
-                                             false);
+               r = vcpu->arch.mmu->page_fault(vcpu, cr2,
+                                              lower_32_bits(error_code),
+                                              false);
                WARN_ON(r == RET_PF_INVALID);
        }
  
         * paging in both guests. If true, we simply unprotect the page
         * and resume the guest.
         */
-       if (vcpu->arch.mmu.direct_map &&
+       if (vcpu->arch.mmu->direct_map &&
            (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
                kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
                return 1;
@@@ -5302,7 -5384,7 +5375,7 @@@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault)
  
  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
  {
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
        int i;
  
        /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
@@@ -5333,7 -5415,7 +5406,7 @@@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg)
  
  void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
  {
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
        bool tlb_flush = false;
        uint i;
  
@@@ -5377,8 -5459,8 +5450,8 @@@ EXPORT_SYMBOL_GPL(kvm_disable_tdp)
  
  static void free_mmu_pages(struct kvm_vcpu *vcpu)
  {
-       free_page((unsigned long)vcpu->arch.mmu.pae_root);
-       free_page((unsigned long)vcpu->arch.mmu.lm_root);
+       free_page((unsigned long)vcpu->arch.mmu->pae_root);
+       free_page((unsigned long)vcpu->arch.mmu->lm_root);
  }
  
  static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
        if (!page)
                return -ENOMEM;
  
-       vcpu->arch.mmu.pae_root = page_address(page);
+       vcpu->arch.mmu->pae_root = page_address(page);
        for (i = 0; i < 4; ++i)
-               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+               vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
  
        return 0;
  }
@@@ -5409,27 -5491,21 +5482,21 @@@ int kvm_mmu_create(struct kvm_vcpu *vcp
  {
        uint i;
  
-       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
-       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       vcpu->arch.mmu.translate_gpa = translate_gpa;
-       vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+       vcpu->arch.mmu = &vcpu->arch.root_mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
  
+       vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
+       vcpu->arch.root_mmu.translate_gpa = translate_gpa;
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-               vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-       return alloc_mmu_pages(vcpu);
- }
+               vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
  
- void kvm_mmu_setup(struct kvm_vcpu *vcpu)
- {
-       MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+       vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
+       vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
+       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+               vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
  
-       /*
-        * kvm_mmu_setup() is called only on vCPU initialization.  
-        * Therefore, no need to reset mmu roots as they are not yet
-        * initialized.
-        */
-       kvm_init_mmu(vcpu, false);
+       vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+       return alloc_mmu_pages(vcpu);
  }
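
kvm_mmu_create() now initializes two complete contexts, root_mmu and guest_mmu, and vcpu->arch.mmu becomes a pointer selecting the active one (kvm_mmu_unload() earlier in this series frees the roots of both). A rough userspace sketch of that "active pointer over two backing contexts" shape, with invented names and none of the real bookkeeping:

#include <stdint.h>
#include <stdio.h>

#define INVALID_PAGE (~0ULL)

/* Toy stand-in for struct kvm_mmu: just a root and a label. */
struct mmu_ctx {
        uint64_t root_hpa;
        const char *name;
};

struct demo_vcpu {
        struct mmu_ctx *mmu;            /* active context (used to be an embedded struct) */
        struct mmu_ctx root_mmu;        /* ordinary / L1 view */
        struct mmu_ctx guest_mmu;       /* nested (L2) view */
};

static void mmu_create(struct demo_vcpu *v)
{
        v->root_mmu  = (struct mmu_ctx){ .root_hpa = INVALID_PAGE, .name = "root_mmu" };
        v->guest_mmu = (struct mmu_ctx){ .root_hpa = INVALID_PAGE, .name = "guest_mmu" };
        v->mmu = &v->root_mmu;          /* start out on the ordinary context */
}

static void mmu_unload(struct demo_vcpu *v)
{
        /* Tear down both contexts, not just whichever one is active. */
        v->root_mmu.root_hpa = INVALID_PAGE;
        v->guest_mmu.root_hpa = INVALID_PAGE;
}

int main(void)
{
        struct demo_vcpu v;

        mmu_create(&v);
        printf("active: %s\n", v.mmu->name);

        v.mmu = &v.guest_mmu;           /* e.g. when switching to a nested context */
        printf("active: %s\n", v.mmu->name);

        v.mmu = &v.root_mmu;
        mmu_unload(&v);
        return 0;
}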
  
  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
@@@ -5612,7 -5688,7 +5679,7 @@@ restart
                if (sp->role.direct &&
                        !kvm_is_reserved_pfn(pfn) &&
                        PageTransCompoundMap(pfn_to_page(pfn))) {
-                       drop_spte(kvm, sptep);
+                       pte_list_remove(rmap_head, sptep);
                        need_tlb_flush = 1;
                        goto restart;
                }
@@@ -5869,6 -5945,16 +5936,16 @@@ int kvm_mmu_module_init(void
  {
        int ret = -ENOMEM;
  
+       /*
+        * MMU roles use union aliasing which is, generally speaking, an
+        * undefined behavior. However, we supposedly know how compilers behave
+        * and the current status quo is unlikely to change. Guardians below are
+        * supposed to let us know if the assumption becomes false.
+        */
+       BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
+       BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
+       BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
        kvm_mmu_reset_all_pte_masks();
  
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
@@@ -5898,7 -5984,7 +5975,7 @@@ out
  }
  
  /*
-  * Caculate mmu pages needed for kvm.
+  * Calculate mmu pages needed for kvm.
   */
  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
  {
diff --combined arch/x86/kvm/svm.c
index 61ccfb13899ed702d8ab7dc88bdb5489b34bcfeb,f416f5c7f2aee11816f82f79f64da25919047b95..0e21ccc46792f6bcc6665ff63979f23979aa1829
@@@ -436,18 -436,14 +436,18 @@@ static inline struct kvm_svm *to_kvm_sv
  
  static inline bool svm_sev_enabled(void)
  {
 -      return max_sev_asid;
 +      return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
  }
  
  static inline bool sev_guest(struct kvm *kvm)
  {
 +#ifdef CONFIG_KVM_AMD_SEV
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
  
        return sev->active;
 +#else
 +      return false;
 +#endif
  }
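
The two SEV helpers above show the two usual ways of compiling a feature out: svm_sev_enabled() folds the CONFIG_KVM_AMD_SEV check into an IS_ENABLED() expression the compiler can constant-fold, while sev_guest() needs a real #ifdef because the field it touches only exists in that configuration. Below is a crude userspace approximation of the constant-folding half; the real IS_ENABLED() is a kconfig macro trick, and the CONFIG_DEMO_SEV name here is made up.

#include <stdbool.h>
#include <stdio.h>

/* Pretend kconfig output; comment this out to "compile the feature out". */
#define CONFIG_DEMO_SEV 1

/* Crude stand-in for the kernel's IS_ENABLED(): just test for the macro. */
#ifdef CONFIG_DEMO_SEV
#define DEMO_SEV_BUILT_IN 1
#else
#define DEMO_SEV_BUILT_IN 0
#endif

static unsigned int max_asid = 16;      /* would be probed from the hardware */

static bool sev_supported(void)
{
        /* Constant-folds to false when support is compiled out, so the
         * compiler can drop the dependent code paths entirely. */
        return DEMO_SEV_BUILT_IN ? max_asid != 0 : false;
}

int main(void)
{
        printf("SEV supported: %s\n", sev_supported() ? "yes" : "no");
        return 0;
}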
  
  static inline int sev_get_asid(struct kvm *kvm)
@@@ -809,6 -805,8 +809,8 @@@ static void svm_queue_exception(struct 
            nested_svm_check_exception(svm, nr, has_error_code, error_code))
                return;
  
+       kvm_deliver_exception_payload(&svm->vcpu);
        if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
  
@@@ -2922,18 -2920,18 +2924,18 @@@ static void nested_svm_init_mmu_context
  {
        WARN_ON(mmu_is_nested(vcpu));
        kvm_init_shadow_mmu(vcpu);
-       vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
-       vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
-       vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
-       vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
-       vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
-       reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
+       vcpu->arch.mmu->set_cr3           = nested_svm_set_tdp_cr3;
+       vcpu->arch.mmu->get_cr3           = nested_svm_get_tdp_cr3;
+       vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
+       vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+       vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
+       reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
  }
  
  static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
  {
-       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
  }
  
  static int nested_svm_check_permissions(struct vcpu_svm *svm)
@@@ -2969,16 -2967,13 +2971,13 @@@ static int nested_svm_check_exception(s
        svm->vmcb->control.exit_info_1 = error_code;
  
        /*
-        * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
-        * The fix is to add the ancillary datum (CR2 or DR6) to structs
-        * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
-        * written only when inject_pending_event runs (DR6 would written here
-        * too).  This should be conditional on a new capability---if the
-        * capability is disabled, kvm_multiple_exception would write the
-        * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+        * EXITINFO2 is undefined for all exception intercepts other
+        * than #PF.
         */
        if (svm->vcpu.arch.exception.nested_apf)
                svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+       else if (svm->vcpu.arch.exception.has_payload)
+               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
        else
                svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
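
The rewritten branch above reflects the deferred-payload scheme: a queued #PF carries its faulting address in the exception itself, and the value is only committed once it is known who consumes the fault, either forwarded to L1 via EXITINFO2 here or written to the real CR2 when the exception is actually delivered. A compressed sketch of that idea, with invented structure and function names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for kvm_queued_exception: the fault's extra datum
 * travels with the exception instead of being written to CR2 up front. */
struct queued_exc {
        uint8_t  vector;
        bool     has_payload;
        uint64_t payload;       /* faulting address for #PF */
};

struct demo_vcpu {
        uint64_t cr2;
        struct queued_exc exc;
};

#define PF_VECTOR 14

static void queue_page_fault(struct demo_vcpu *v, uint64_t fault_addr)
{
        /* Note: CR2 is *not* touched here. */
        v->exc = (struct queued_exc){
                .vector = PF_VECTOR,
                .has_payload = true,
                .payload = fault_addr,
        };
}

/* L1 intercepts the fault: forward the payload, leave L2's CR2 alone. */
static uint64_t reflect_to_l1(const struct demo_vcpu *v)
{
        return v->exc.has_payload ? v->exc.payload : v->cr2;
}

/* Fault is delivered into the guest: now it is safe to commit CR2. */
static void deliver_to_guest(struct demo_vcpu *v)
{
        if (v->exc.vector == PF_VECTOR && v->exc.has_payload)
                v->cr2 = v->exc.payload;
}

int main(void)
{
        struct demo_vcpu v = { .cr2 = 0 };

        queue_page_fault(&v, 0xdeadbeef000ULL);
        printf("exit_info_2 for L1: %#llx (CR2 still %#llx)\n",
               (unsigned long long)reflect_to_l1(&v),
               (unsigned long long)v.cr2);

        deliver_to_guest(&v);
        printf("CR2 after delivery: %#llx\n", (unsigned long long)v.cr2);
        return 0;
}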
  
@@@ -5642,26 -5637,24 +5641,24 @@@ static void svm_vcpu_run(struct kvm_vcp
                "mov %%r13, %c[r13](%[svm]) \n\t"
                "mov %%r14, %c[r14](%[svm]) \n\t"
                "mov %%r15, %c[r15](%[svm]) \n\t"
- #endif
                /*
                * Clear host registers marked as clobbered to prevent
                * speculative use.
                */
-               "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
-               "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
-               "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
-               "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
-               "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
- #ifdef CONFIG_X86_64
-               "xor %%r8, %%r8 \n\t"
-               "xor %%r9, %%r9 \n\t"
-               "xor %%r10, %%r10 \n\t"
-               "xor %%r11, %%r11 \n\t"
-               "xor %%r12, %%r12 \n\t"
-               "xor %%r13, %%r13 \n\t"
-               "xor %%r14, %%r14 \n\t"
-               "xor %%r15, %%r15 \n\t"
+               "xor %%r8d, %%r8d \n\t"
+               "xor %%r9d, %%r9d \n\t"
+               "xor %%r10d, %%r10d \n\t"
+               "xor %%r11d, %%r11d \n\t"
+               "xor %%r12d, %%r12d \n\t"
+               "xor %%r13d, %%r13d \n\t"
+               "xor %%r14d, %%r14d \n\t"
+               "xor %%r15d, %%r15d \n\t"
  #endif
+               "xor %%ebx, %%ebx \n\t"
+               "xor %%ecx, %%ecx \n\t"
+               "xor %%edx, %%edx \n\t"
+               "xor %%esi, %%esi \n\t"
+               "xor %%edi, %%edi \n\t"
                "pop %%" _ASM_BP
                :
                : [svm]"a"(svm),
@@@ -7040,6 -7033,13 +7037,13 @@@ failed
        return ret;
  }
  
+ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+                                  uint16_t *vmcs_version)
+ {
+       /* Intel-only feature */
+       return -ENODEV;
+ }
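
SVM gains a nested_enable_evmcs() stub so the new kvm_x86_ops callback can be invoked unconditionally from common code, with AMD simply reporting -ENODEV for the Intel-only feature. A sketch of that per-vendor ops-table pattern, with invented names and a made-up version payload:

#include <errno.h>
#include <stdio.h>

/* Both backends provide the callback, so callers never need a NULL check;
 * the backend without the feature just reports -ENODEV. */
struct demo_ops {
        const char *name;
        int (*enable_evmcs)(unsigned short *version);
};

static int amd_enable_evmcs(unsigned short *version)
{
        (void)version;
        return -ENODEV;         /* Intel-only feature */
}

static int intel_enable_evmcs(unsigned short *version)
{
        if (version)
                *version = (1 << 8) | 1;        /* supported range 1..1 */
        return 0;
}

static const struct demo_ops amd_ops   = { "amd",   amd_enable_evmcs };
static const struct demo_ops intel_ops = { "intel", intel_enable_evmcs };

static void try_enable(const struct demo_ops *ops)
{
        unsigned short ver = 0;
        int ret = ops->enable_evmcs(&ver);

        printf("%s: ret=%d version=%#x\n", ops->name, ret, (unsigned)ver);
}

int main(void)
{
        try_enable(&amd_ops);
        try_enable(&intel_ops);
        return 0;
}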
  static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
        .mem_enc_op = svm_mem_enc_op,
        .mem_enc_reg_region = svm_register_enc_region,
        .mem_enc_unreg_region = svm_unregister_enc_region,
+       .nested_enable_evmcs = nested_enable_evmcs,
  };
  
  static int __init svm_init(void)
diff --combined arch/x86/kvm/vmx.c
index e665aa7167cf9729aac82a075c358236d9f03aec,ccc6a01eb4f401563b916efe33fec8f219e57a46..4555077d69ce204148facb207f46ddf882483231
@@@ -20,6 -20,7 +20,7 @@@
  #include "mmu.h"
  #include "cpuid.h"
  #include "lapic.h"
+ #include "hyperv.h"
  
  #include <linux/kvm_host.h>
  #include <linux/module.h>
@@@ -61,7 -62,7 +62,7 @@@
  
  #define __ex(x) __kvm_handle_fault_on_reboot(x)
  #define __ex_clear(x, reg) \
-       ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
+       ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)
  
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
@@@ -107,9 -108,12 +108,12 @@@ module_param_named(enable_shadow_vmcs, 
   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
   * use VMX instructions.
   */
- static bool __read_mostly nested = 0;
+ static bool __read_mostly nested = 1;
  module_param(nested, bool, S_IRUGO);
  
+ static bool __read_mostly nested_early_check = 0;
+ module_param(nested_early_check, bool, S_IRUGO);
  static u64 __read_mostly host_xss;
  
  static bool __read_mostly enable_pml = 1;
@@@ -131,7 -135,7 +135,7 @@@ static bool __read_mostly enable_preemp
  module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
  #endif
  
- #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+ #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
  #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
  #define KVM_VM_CR0_ALWAYS_ON                          \
        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |      \
@@@ -187,6 -191,7 +191,7 @@@ static unsigned int ple_window_ma
  module_param(ple_window_max, uint, 0444);
  
  extern const ulong vmx_return;
+ extern const ulong vmx_early_consistency_check_return;
  
  static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
  static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
@@@ -827,14 -832,28 +832,28 @@@ struct nested_vmx 
         */
        struct vmcs12 *cached_shadow_vmcs12;
        /*
-        * Indicates if the shadow vmcs must be updated with the
-        * data hold by vmcs12
+        * Indicates if the shadow vmcs or enlightened vmcs must be updated
+        * with the data held by struct vmcs12.
         */
-       bool sync_shadow_vmcs;
+       bool need_vmcs12_sync;
        bool dirty_vmcs12;
  
+       /*
+        * vmcs02 has been initialized, i.e. state that is constant for
+        * vmcs02 has been written to the backing VMCS.  Initialization
+        * is delayed until L1 actually attempts to run a nested VM.
+        */
+       bool vmcs02_initialized;
        bool change_vmcs01_virtual_apic_mode;
  
+       /*
+        * Enlightened VMCS has been enabled. It does not mean that L1 has to
+        * use it. However, VMX features available to L1 will be limited based
+        * on what the enlightened VMCS supports.
+        */
+       bool enlightened_vmcs_enabled;
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
  
                /* in guest mode on SMM entry? */
                bool guest_mode;
        } smm;
+       gpa_t hv_evmcs_vmptr;
+       struct page *hv_evmcs_page;
+       struct hv_enlightened_vmcs *hv_evmcs;
  };
  
  #define POSTED_INTR_ON  0
@@@ -1381,6 -1404,49 +1404,49 @@@ DEFINE_STATIC_KEY_FALSE(enable_evmcs)
  
  #define KVM_EVMCS_VERSION 1
  
+ /*
+  * Enlightened VMCSv1 doesn't support these:
+  *
+  *    POSTED_INTR_NV                  = 0x00000002,
+  *    GUEST_INTR_STATUS               = 0x00000810,
+  *    APIC_ACCESS_ADDR                = 0x00002014,
+  *    POSTED_INTR_DESC_ADDR           = 0x00002016,
+  *    EOI_EXIT_BITMAP0                = 0x0000201c,
+  *    EOI_EXIT_BITMAP1                = 0x0000201e,
+  *    EOI_EXIT_BITMAP2                = 0x00002020,
+  *    EOI_EXIT_BITMAP3                = 0x00002022,
+  *    GUEST_PML_INDEX                 = 0x00000812,
+  *    PML_ADDRESS                     = 0x0000200e,
+  *    VM_FUNCTION_CONTROL             = 0x00002018,
+  *    EPTP_LIST_ADDRESS               = 0x00002024,
+  *    VMREAD_BITMAP                   = 0x00002026,
+  *    VMWRITE_BITMAP                  = 0x00002028,
+  *
+  *    TSC_MULTIPLIER                  = 0x00002032,
+  *    PLE_GAP                         = 0x00004020,
+  *    PLE_WINDOW                      = 0x00004022,
+  *    VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
+  *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
+  *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
+  *
+  * Currently unsupported in KVM:
+  *    GUEST_IA32_RTIT_CTL             = 0x00002814,
+  */
+ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
+                                   PIN_BASED_VMX_PREEMPTION_TIMER)
+ #define EVMCS1_UNSUPPORTED_2NDEXEC                                    \
+       (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |                         \
+        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |                      \
+        SECONDARY_EXEC_APIC_REGISTER_VIRT |                            \
+        SECONDARY_EXEC_ENABLE_PML |                                    \
+        SECONDARY_EXEC_ENABLE_VMFUNC |                                 \
+        SECONDARY_EXEC_SHADOW_VMCS |                                   \
+        SECONDARY_EXEC_TSC_SCALING |                                   \
+        SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+ #define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
  #if IS_ENABLED(CONFIG_HYPERV)
  static bool __read_mostly enlightened_vmcs = true;
  module_param(enlightened_vmcs, bool, 0444);
@@@ -1473,69 -1539,12 +1539,12 @@@ static void evmcs_load(u64 phys_addr
  
  static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
  {
-       /*
-        * Enlightened VMCSv1 doesn't support these:
-        *
-        *      POSTED_INTR_NV                  = 0x00000002,
-        *      GUEST_INTR_STATUS               = 0x00000810,
-        *      APIC_ACCESS_ADDR                = 0x00002014,
-        *      POSTED_INTR_DESC_ADDR           = 0x00002016,
-        *      EOI_EXIT_BITMAP0                = 0x0000201c,
-        *      EOI_EXIT_BITMAP1                = 0x0000201e,
-        *      EOI_EXIT_BITMAP2                = 0x00002020,
-        *      EOI_EXIT_BITMAP3                = 0x00002022,
-        */
-       vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
-       vmcs_conf->cpu_based_2nd_exec_ctrl &=
-               ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
-       vmcs_conf->cpu_based_2nd_exec_ctrl &=
-               ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-       vmcs_conf->cpu_based_2nd_exec_ctrl &=
-               ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
-       /*
-        *      GUEST_PML_INDEX                 = 0x00000812,
-        *      PML_ADDRESS                     = 0x0000200e,
-        */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
-       /*      VM_FUNCTION_CONTROL             = 0x00002018, */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
-       /*
-        *      EPTP_LIST_ADDRESS               = 0x00002024,
-        *      VMREAD_BITMAP                   = 0x00002026,
-        *      VMWRITE_BITMAP                  = 0x00002028,
-        */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
-       /*
-        *      TSC_MULTIPLIER                  = 0x00002032,
-        */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
-       /*
-        *      PLE_GAP                         = 0x00004020,
-        *      PLE_WINDOW                      = 0x00004022,
-        */
-       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-       /*
-        *      VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
-        */
-       vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+       vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
  
-       /*
-        *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
-        *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
-        */
-       vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
-       vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+       vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+       vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
  
-       /*
-        * Currently unsupported in KVM:
-        *      GUEST_IA32_RTIT_CTL             = 0x00002814,
-        */
  }
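
With the unsupported-field lists hoisted into the EVMCS1_UNSUPPORTED_* masks defined earlier, sanitizing the host vmcs_config collapses to a few mask-clear statements, and the same masks are reused when trimming what is advertised to L1 in nested_enable_evmcs() below. A tiny illustration of the shared-mask pattern; the bit values are invented:

#include <stdint.h>
#include <stdio.h>

/* One authoritative list of bits the (hypothetical) enlightened format
 * cannot express, shared by every consumer. */
#define DEMO_UNSUPPORTED_2NDEXEC  ((1u << 3) | (1u << 7) | (1u << 17))

struct caps { uint32_t secondary_exec; };

static void sanitize_host_caps(struct caps *host)
{
        host->secondary_exec &= ~DEMO_UNSUPPORTED_2NDEXEC;
}

static uint32_t caps_for_l1(const struct caps *host)
{
        /* Same mask, second consumer: what the nested hypervisor gets to see. */
        return host->secondary_exec & ~DEMO_UNSUPPORTED_2NDEXEC;
}

int main(void)
{
        struct caps host = { .secondary_exec = 0xffffffffu };

        sanitize_host_caps(&host);
        printf("host: %#x, advertised to L1: %#x\n",
               host.secondary_exec, caps_for_l1(&host));
        return 0;
}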
  
  /* check_ept_pointer() should be under protection of ept_pointer_lock. */
@@@ -1560,26 -1569,23 +1569,27 @@@ static void check_ept_pointer_match(str
  
  static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
  {
-       int ret;
+       struct kvm_vcpu *vcpu;
+       int ret = -ENOTSUPP, i;
  
        spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
  
        if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
                check_ept_pointer_match(kvm);
  
-       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
-               ret = -ENOTSUPP;
-               goto out;
-       }
 +      /*
 +       * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
 +       * base of EPT PML4 table, strip off EPT configuration information.
 +       */
-       ret = hyperv_flush_guest_mapping(
-                       to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
+       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+               kvm_for_each_vcpu(i, vcpu, kvm)
+                       ret |= hyperv_flush_guest_mapping(
 -                              to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer);
++                              to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK);
+       } else {
+               ret = hyperv_flush_guest_mapping(
 -                              to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
++                              to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
+       }
  
- out:
        spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
        return ret;
  }
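
The merge resolution above keeps the "& PAGE_MASK" so the hypercall only ever sees the 4K-aligned base of the EPT PML4 table; the low 12 bits of an EPT pointer carry configuration (memory type, page-walk length, A/D enable) rather than address. A small demonstration of stripping those bits; the packed value below is illustrative, not a field-accurate EPTP encoding:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_MASK  (~((1ULL << DEMO_PAGE_SHIFT) - 1))

/* Pack a page-aligned table base together with low configuration bits. */
static uint64_t demo_build_eptp(uint64_t pml4_pa, uint64_t config_bits)
{
        return (pml4_pa & DEMO_PAGE_MASK) | (config_bits & ~DEMO_PAGE_MASK);
}

int main(void)
{
        uint64_t eptp = demo_build_eptp(0x123456000ULL, 0x5e);

        /* A "flush this address space" style hypercall wants the aligned
         * table base, with the configuration stripped off. */
        printf("eptp: %#llx\n", (unsigned long long)eptp);
        printf("base: %#llx\n", (unsigned long long)(eptp & DEMO_PAGE_MASK));
        printf("cfg : %#llx\n", (unsigned long long)(eptp & ~DEMO_PAGE_MASK));
        return 0;
}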
@@@ -1595,6 -1601,35 +1605,35 @@@ static inline void evmcs_sanitize_exec_
  static inline void evmcs_touch_msr_bitmap(void) {}
  #endif /* IS_ENABLED(CONFIG_HYPERV) */
  
+ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+                              uint16_t *vmcs_version)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       /* We don't support disabling the feature for simplicity. */
+       if (vmx->nested.enlightened_vmcs_enabled)
+               return 0;
+       vmx->nested.enlightened_vmcs_enabled = true;
+       /*
+        * vmcs_version represents the range of supported Enlightened VMCS
+        * versions: lower 8 bits is the minimal version, higher 8 bits is the
+        * maximum supported version. KVM supports versions from 1 to
+        * KVM_EVMCS_VERSION.
+        */
+       if (vmcs_version)
+               *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
+       vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+       vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+       vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+       vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+       vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
+       return 0;
+ }
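
nested_enable_evmcs() reports the supported Enlightened VMCS versions as a packed 16-bit range, exactly as the comment above describes: minimum version in the low byte, maximum in the high byte. A trivial userspace encode/decode pair for that scheme:

#include <stdint.h>
#include <stdio.h>

/* Pack a supported-version range as described above:
 * low byte = minimum version, high byte = maximum version. */
static uint16_t encode_version_range(uint8_t min, uint8_t max)
{
        return (uint16_t)(((uint16_t)max << 8) | min);
}

static void decode_version_range(uint16_t range, uint8_t *min, uint8_t *max)
{
        *min = range & 0xff;
        *max = range >> 8;
}

int main(void)
{
        uint8_t lo, hi;
        uint16_t range = encode_version_range(1, 1);    /* versions 1..1 */

        decode_version_range(range, &lo, &hi);
        printf("range %#x -> min %u, max %u\n", (unsigned)range, lo, hi);
        return 0;
}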
  static inline bool is_exception_n(u32 intr_info, u8 vector)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@@ -1617,11 -1652,6 +1656,6 @@@ static inline bool is_page_fault(u32 in
        return is_exception_n(intr_info, PF_VECTOR);
  }
  
- static inline bool is_no_device(u32 intr_info)
- {
-       return is_exception_n(intr_info, NM_VECTOR);
- }
  static inline bool is_invalid_opcode(u32 intr_info)
  {
        return is_exception_n(intr_info, UD_VECTOR);
@@@ -1632,12 -1662,6 +1666,6 @@@ static inline bool is_gp_fault(u32 intr
        return is_exception_n(intr_info, GP_VECTOR);
  }
  
- static inline bool is_external_interrupt(u32 intr_info)
- {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-               == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
- }
  static inline bool is_machine_check(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@@ -2063,9 -2087,6 +2091,6 @@@ static inline bool is_nmi(u32 intr_info
  static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                              u32 exit_intr_info,
                              unsigned long exit_qualification);
- static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
-                       struct vmcs12 *vmcs12,
-                       u32 reason, unsigned long qualification);
  
  static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
  {
        return -1;
  }
  
- static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
  {
      struct {
        u64 vpid : 16;
      } operand = { vpid, 0, gva };
      bool error;
  
-     asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
-                 : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
-                 : "memory");
+     asm volatile (__ex("invvpid %2, %1") CC_SET(na)
+                 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
      BUG_ON(error);
  }
  
- static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
  {
        struct {
                u64 eptp, gpa;
        } operand = {eptp, gpa};
        bool error;
  
-       asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
-                     : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
-                     : "memory");
+       asm volatile (__ex("invept %2, %1") CC_SET(na)
+                     : CC_OUT(na) (error) : "r"(ext), "m"(operand));
        BUG_ON(error);
  }
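
The rewritten __invvpid()/__invept() wrappers pass the descriptor as a plain memory operand and capture the failure condition straight from the flags via CC_SET()/CC_OUT(), instead of the old ASM_VMX_* opcode-byte strings with hardcoded registers. The standalone snippet below only illustrates the flag-output constraint mechanism those macros use when the compiler supports it; it assumes x86 with GCC 6+ or a recent Clang, and the addition itself is arbitrary, the "=@cc..." output is the point.

#include <stdbool.h>
#include <stdio.h>

/*
 * GCC/Clang flag-output constraints ("=@cc<cond>") let an asm statement
 * return a condition-code test directly; this is what CC_SET()/CC_OUT()
 * expand to in the wrappers above when the compiler supports it.
 */
static bool add_carries(unsigned long a, unsigned long b, unsigned long *sum)
{
        bool carry;

        asm("add %[b], %[a]"
            : [a] "+r" (a), "=@ccc" (carry)     /* carry flag after the add */
            : [b] "r" (b));
        *sum = a;
        return carry;
}

int main(void)
{
        unsigned long s;
        bool c;

        c = add_carries(~0UL, 1, &s);
        printf("carry: %d, sum: %lu\n", c, s);
        c = add_carries(2, 3, &s);
        printf("carry: %d, sum: %lu\n", c, s);
        return 0;
}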
  
@@@ -2120,9 -2139,8 +2143,8 @@@ static void vmcs_clear(struct vmcs *vmc
        u64 phys_addr = __pa(vmcs);
        bool error;
  
-       asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
-                     : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
-                     : "memory");
+       asm volatile (__ex("vmclear %1") CC_SET(na)
+                     : CC_OUT(na) (error) : "m"(phys_addr));
        if (unlikely(error))
                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
                       vmcs, phys_addr);
@@@ -2145,9 -2163,8 +2167,8 @@@ static void vmcs_load(struct vmcs *vmcs
        if (static_branch_unlikely(&enable_evmcs))
                return evmcs_load(phys_addr);
  
-       asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
-                     : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
-                     : "memory");
+       asm volatile (__ex("vmptrld %1") CC_SET(na)
+                     : CC_OUT(na) (error) : "m"(phys_addr));
        if (unlikely(error))
                printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
                       vmcs, phys_addr);
@@@ -2323,8 -2340,8 +2344,8 @@@ static __always_inline unsigned long __
  {
        unsigned long value;
  
-       asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
-                     : "=a"(value) : "d"(field) : "cc");
+       asm volatile (__ex_clear("vmread %1, %0", "%k0")
+                     : "=r"(value) : "r"(field));
        return value;
  }
  
@@@ -2375,8 -2392,8 +2396,8 @@@ static __always_inline void __vmcs_writ
  {
        bool error;
  
-       asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
-                     : CC_OUT(na) (error) : "a"(value), "d"(field));
+       asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
+                     : CC_OUT(na) (error) : "r"(field), "rm"(value));
        if (unlikely(error))
                vmwrite_error(field, value);
  }
@@@ -2707,7 -2724,8 +2728,8 @@@ static void add_atomic_switch_msr_speci
                u64 guest_val, u64 host_val)
  {
        vmcs_write64(guest_val_vmcs, guest_val);
-       vmcs_write64(host_val_vmcs, host_val);
+       if (host_val_vmcs != HOST_IA32_EFER)
+               vmcs_write64(host_val_vmcs, host_val);
        vm_entry_controls_setbit(vmx, entry);
        vm_exit_controls_setbit(vmx, exit);
  }
@@@ -2805,8 -2823,6 +2827,6 @@@ static bool update_transition_efer(stru
                ignore_bits &= ~(u64)EFER_SCE;
  #endif
  
-       clear_atomic_switch_msr(vmx, MSR_EFER);
        /*
         * On EPT, we can't emulate NX, so we must switch EFER atomically.
         * On CPUs that support "load IA32_EFER", always switch EFER
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
                                              guest_efer, host_efer, false);
+               else
+                       clear_atomic_switch_msr(vmx, MSR_EFER);
                return false;
        } else {
+               clear_atomic_switch_msr(vmx, MSR_EFER);
                guest_efer &= ~ignore_bits;
                guest_efer |= host_efer & ignore_bits;
  
@@@ -3272,34 -3292,30 +3296,30 @@@ static int nested_vmx_check_exception(s
  {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        unsigned int nr = vcpu->arch.exception.nr;
+       bool has_payload = vcpu->arch.exception.has_payload;
+       unsigned long payload = vcpu->arch.exception.payload;
  
        if (nr == PF_VECTOR) {
                if (vcpu->arch.exception.nested_apf) {
                        *exit_qual = vcpu->arch.apf.nested_apf_token;
                        return 1;
                }
-               /*
-                * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
-                * The fix is to add the ancillary datum (CR2 or DR6) to structs
-                * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
-                * can be written only when inject_pending_event runs.  This should be
-                * conditional on a new capability---if the capability is disabled,
-                * kvm_multiple_exception would write the ancillary information to
-                * CR2 or DR6, for backwards ABI-compatibility.
-                */
                if (nested_vmx_is_page_fault_vmexit(vmcs12,
                                                    vcpu->arch.exception.error_code)) {
-                       *exit_qual = vcpu->arch.cr2;
-                       return 1;
-               }
-       } else {
-               if (vmcs12->exception_bitmap & (1u << nr)) {
-                       if (nr == DB_VECTOR)
-                               *exit_qual = vcpu->arch.dr6;
-                       else
-                               *exit_qual = 0;
+                       *exit_qual = has_payload ? payload : vcpu->arch.cr2;
                        return 1;
                }
+       } else if (vmcs12->exception_bitmap & (1u << nr)) {
+               if (nr == DB_VECTOR) {
+                       if (!has_payload) {
+                               payload = vcpu->arch.dr6;
+                               payload &= ~(DR6_FIXED_1 | DR6_BT);
+                               payload ^= DR6_RTM;
+                       }
+                       *exit_qual = payload;
+               } else
+                       *exit_qual = 0;
+               return 1;
        }
  
        return 0;
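
The #DB fallback above converts a raw DR6 value into the form the exit qualification expects: drop the always-one bits and BT, then flip RTM, which is active-low in DR6 but active-high in the exit qualification. A standalone illustration of that transformation; the constants follow the architectural DR6 layout and are reproduced here only for the demo:

#include <stdint.h>
#include <stdio.h>

/* Architectural DR6 layout, reproduced here only for the demo. */
#define DR6_FIXED_1     0xfffe0ff0ULL   /* bits that always read as 1 */
#define DR6_BS          (1ULL << 14)    /* single-step */
#define DR6_BT          (1ULL << 15)    /* task switch, not reported */
#define DR6_RTM         (1ULL << 16)    /* active-LOW in DR6 */

/*
 * Turn a raw DR6 value into exit-qualification form, mirroring the
 * fallback path above: strip the always-one bits and BT, then flip RTM
 * so it becomes active-high.
 */
static uint64_t dr6_to_exit_qual(uint64_t dr6)
{
        uint64_t payload = dr6;

        payload &= ~(DR6_FIXED_1 | DR6_BT);
        payload ^= DR6_RTM;
        return payload;
}

int main(void)
{
        /* A single-step #DB: BS set, RTM reads as 1 (no RTM event). */
        uint64_t dr6 = DR6_FIXED_1 | DR6_BS | DR6_RTM;

        printf("dr6       = %#llx\n", (unsigned long long)dr6);
        printf("exit qual = %#llx\n",
               (unsigned long long)dr6_to_exit_qual(dr6));      /* 0x4000 */
        return 0;
}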
@@@ -3326,6 -3342,8 +3346,8 @@@ static void vmx_queue_exception(struct 
        u32 error_code = vcpu->arch.exception.error_code;
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
  
+       kvm_deliver_exception_payload(vcpu);
        if (has_error_code) {
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@@ -4397,9 -4415,7 +4419,7 @@@ static void kvm_cpu_vmxon(u64 addr
        cr4_set_bits(X86_CR4_VMXE);
        intel_pt_handle_vmx(1);
  
-       asm volatile (ASM_VMX_VMXON_RAX
-                       : : "a"(&addr), "m"(addr)
-                       : "memory", "cc");
+       asm volatile ("vmxon %0" : : "m"(addr));
  }
  
  static int hardware_enable(void)
@@@ -4468,7 -4484,7 +4488,7 @@@ static void vmclear_local_loaded_vmcss(
   */
  static void kvm_cpu_vmxoff(void)
  {
-       asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+       asm volatile (__ex("vmxoff"));
  
        intel_pt_handle_vmx(0);
        cr4_clear_bits(X86_CR4_VMXE);
@@@ -5112,9 -5128,10 +5132,10 @@@ static inline void __vmx_flush_tlb(stru
                                bool invalidate_gpa)
  {
        if (enable_ept && (invalidate_gpa || !enable_vpid)) {
-               if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                        return;
-               ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+               ept_sync_context(construct_eptp(vcpu,
+                                               vcpu->arch.mmu->root_hpa));
        } else {
                vpid_sync_context(vpid);
        }
@@@ -5264,7 -5281,7 +5285,7 @@@ static void vmx_set_cr0(struct kvm_vcp
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long hw_cr0;
  
-       hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
+       hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
        if (enable_unrestricted_guest)
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
        else {
@@@ -6339,6 -6356,9 +6360,9 @@@ static void vmx_set_constant_host_state
                rdmsr(MSR_IA32_CR_PAT, low32, high32);
                vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
        }
+       if (cpu_has_load_ia32_efer)
+               vmcs_write64(HOST_IA32_EFER, host_efer);
  }
  
  static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@@ -6666,7 -6686,6 +6690,6 @@@ static void vmx_vcpu_setup(struct vcpu_
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
  
        if (enable_pml) {
-               ASSERT(vmx->pml_pg);
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
@@@ -8067,35 -8086,39 +8090,39 @@@ static int handle_monitor(struct kvm_vc
  
  /*
   * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
-  * set the success or error code of an emulated VMX instruction, as specified
-  * by Vol 2B, VMX Instruction Reference, "Conventions".
+  * set the success or error code of an emulated VMX instruction (as specified
+  * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
+  * instruction.
   */
- static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+ static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
  {
        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+       return kvm_skip_emulated_instruction(vcpu);
  }
  
- static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
  {
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
                        | X86_EFLAGS_CF);
+       return kvm_skip_emulated_instruction(vcpu);
  }
  
- static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-                                       u32 vm_instruction_error)
+ static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
+                               u32 vm_instruction_error)
  {
-       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-               /*
-                * failValid writes the error number to the current VMCS, which
-                * can't be done there isn't a current VMCS.
-                */
-               nested_vmx_failInvalid(vcpu);
-               return;
-       }
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       /*
+        * failValid writes the error number to the current VMCS, which
+        * can't be done if there isn't a current VMCS.
+        */
+       if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+               return nested_vmx_failInvalid(vcpu);
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
         * We don't need to force a shadow sync because
         * VM_INSTRUCTION_ERROR is not shadowed
         */
+       return kvm_skip_emulated_instruction(vcpu);
  }
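
nested_vmx_succeed()/failInvalid()/failValid() now return the result of kvm_skip_emulated_instruction(), so call sites collapse from a set-status-then-skip pair into a single return statement, as the VMXON/VMCLEAR hunks below show. A generic sketch of that refactor pattern, names invented:

#include <stdio.h>

/* Before: helpers only set status and every caller had to remember the
 * follow-up step. After: the helper performs the follow-up and returns
 * its result, so callers become one-liners. */

static int skip_instruction(void)
{
        /* stand-in for kvm_skip_emulated_instruction() */
        puts("  (instruction skipped)");
        return 1;
}

static int op_fail(const char *why)
{
        printf("  status: fail (%s)\n", why);
        return skip_instruction();
}

static int op_succeed(void)
{
        puts("  status: success");
        return skip_instruction();
}

static int handle_op(int arg_is_valid)
{
        if (!arg_is_valid)
                return op_fail("invalid operand");      /* no pair to keep in sync */
        return op_succeed();
}

int main(void)
{
        puts("valid operand:");
        handle_op(1);
        puts("invalid operand:");
        handle_op(0);
        return 0;
}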
  
  static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
@@@ -8292,6 -8316,7 +8320,7 @@@ static int enter_vmx_operation(struct k
  
        vmx->nested.vpid02 = allocate_vpid();
  
+       vmx->nested.vmcs02_initialized = false;
        vmx->nested.vmxon = true;
        return 0;
  
@@@ -8345,10 -8370,9 +8374,9 @@@ static int handle_vmon(struct kvm_vcpu 
                return 1;
        }
  
-       if (vmx->nested.vmxon) {
-               nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmx->nested.vmxon)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
  
        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
         * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
         * which replaces physical address width with 32
         */
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failInvalid(vcpu);
  
        page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-       if (is_error_page(page)) {
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (is_error_page(page))
+               return nested_vmx_failInvalid(vcpu);
        if (*(u32 *)kmap(page) != VMCS12_REVISION) {
                kunmap(page);
                kvm_release_page_clean(page);
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
+               return nested_vmx_failInvalid(vcpu);
        }
        kunmap(page);
        kvm_release_page_clean(page);
        if (ret)
                return ret;
  
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
  }
  
  /*
@@@ -8423,8 -8442,24 +8446,24 @@@ static void vmx_disable_shadow_vmcs(str
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
  }
  
- static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       if (!vmx->nested.hv_evmcs)
+               return;
+       kunmap(vmx->nested.hv_evmcs_page);
+       kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+       vmx->nested.hv_evmcs_vmptr = -1ull;
+       vmx->nested.hv_evmcs_page = NULL;
+       vmx->nested.hv_evmcs = NULL;
+ }
+ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
  {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        if (vmx->nested.current_vmptr == -1ull)
                return;
  
                /* copy to memory all shadowed fields in case
                   they were modified */
                copy_shadow_to_vmcs12(vmx);
-               vmx->nested.sync_shadow_vmcs = false;
+               vmx->nested.need_vmcs12_sync = false;
                vmx_disable_shadow_vmcs(vmx);
        }
        vmx->nested.posted_intr_nv = -1;
  
        /* Flush VMCS12 to guest memory */
-       kvm_vcpu_write_guest_page(&vmx->vcpu,
+       kvm_vcpu_write_guest_page(vcpu,
                                  vmx->nested.current_vmptr >> PAGE_SHIFT,
                                  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
  
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
        vmx->nested.current_vmptr = -1ull;
  }
  
   * Free whatever needs to be freed from vmx->nested when L1 goes down, or
   * just stops using VMX.
   */
- static void free_nested(struct vcpu_vmx *vmx)
+ static void free_nested(struct kvm_vcpu *vcpu)
  {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;
  
                vmx->nested.pi_desc = NULL;
        }
  
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+       nested_release_evmcs(vcpu);
        free_loaded_vmcs(&vmx->nested.vmcs02);
  }
  
@@@ -8491,9 -8534,8 +8538,8 @@@ static int handle_vmoff(struct kvm_vcp
  {
        if (!nested_vmx_check_permission(vcpu))
                return 1;
-       free_nested(to_vmx(vcpu));
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       free_nested(vcpu);
+       return nested_vmx_succeed(vcpu);
  }
  
  /* Emulate the VMCLEAR instruction */
@@@ -8509,25 -8551,28 +8555,28 @@@ static int handle_vmclear(struct kvm_vc
        if (nested_vmx_get_vmptr(vcpu, &vmptr))
                return 1;
  
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMCLEAR_INVALID_ADDRESS);
  
-       if (vmptr == vmx->nested.vmxon_ptr) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmptr == vmx->nested.vmxon_ptr)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMCLEAR_VMXON_POINTER);
  
-       if (vmptr == vmx->nested.current_vmptr)
-               nested_release_vmcs12(vmx);
+       if (vmx->nested.hv_evmcs_page) {
+               if (vmptr == vmx->nested.hv_evmcs_vmptr)
+                       nested_release_evmcs(vcpu);
+       } else {
+               if (vmptr == vmx->nested.current_vmptr)
+                       nested_release_vmcs12(vcpu);
  
-       kvm_vcpu_write_guest(vcpu,
-                       vmptr + offsetof(struct vmcs12, launch_state),
-                       &zero, sizeof(zero));
+               kvm_vcpu_write_guest(vcpu,
+                                    vmptr + offsetof(struct vmcs12,
+                                                     launch_state),
+                                    &zero, sizeof(zero));
+       }
  
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
  }
  
  static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@@ -8610,6 -8655,395 +8659,395 @@@ static inline int vmcs12_write_any(stru
  
  }
  
+ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
+ {
+       struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+       struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+       vmcs12->hdr.revision_id = evmcs->revision_id;
+       /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
+       vmcs12->tpr_threshold = evmcs->tpr_threshold;
+       vmcs12->guest_rip = evmcs->guest_rip;
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
+               vmcs12->guest_rsp = evmcs->guest_rsp;
+               vmcs12->guest_rflags = evmcs->guest_rflags;
+               vmcs12->guest_interruptibility_info =
+                       evmcs->guest_interruptibility_info;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+               vmcs12->cpu_based_vm_exec_control =
+                       evmcs->cpu_based_vm_exec_control;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
+               vmcs12->exception_bitmap = evmcs->exception_bitmap;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
+               vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
+               vmcs12->vm_entry_intr_info_field =
+                       evmcs->vm_entry_intr_info_field;
+               vmcs12->vm_entry_exception_error_code =
+                       evmcs->vm_entry_exception_error_code;
+               vmcs12->vm_entry_instruction_len =
+                       evmcs->vm_entry_instruction_len;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+               vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
+               vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
+               vmcs12->host_cr0 = evmcs->host_cr0;
+               vmcs12->host_cr3 = evmcs->host_cr3;
+               vmcs12->host_cr4 = evmcs->host_cr4;
+               vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
+               vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
+               vmcs12->host_rip = evmcs->host_rip;
+               vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
+               vmcs12->host_es_selector = evmcs->host_es_selector;
+               vmcs12->host_cs_selector = evmcs->host_cs_selector;
+               vmcs12->host_ss_selector = evmcs->host_ss_selector;
+               vmcs12->host_ds_selector = evmcs->host_ds_selector;
+               vmcs12->host_fs_selector = evmcs->host_fs_selector;
+               vmcs12->host_gs_selector = evmcs->host_gs_selector;
+               vmcs12->host_tr_selector = evmcs->host_tr_selector;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
+               vmcs12->pin_based_vm_exec_control =
+                       evmcs->pin_based_vm_exec_control;
+               vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
+               vmcs12->secondary_vm_exec_control =
+                       evmcs->secondary_vm_exec_control;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
+               vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
+               vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
+               vmcs12->msr_bitmap = evmcs->msr_bitmap;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
+               vmcs12->guest_es_base = evmcs->guest_es_base;
+               vmcs12->guest_cs_base = evmcs->guest_cs_base;
+               vmcs12->guest_ss_base = evmcs->guest_ss_base;
+               vmcs12->guest_ds_base = evmcs->guest_ds_base;
+               vmcs12->guest_fs_base = evmcs->guest_fs_base;
+               vmcs12->guest_gs_base = evmcs->guest_gs_base;
+               vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
+               vmcs12->guest_tr_base = evmcs->guest_tr_base;
+               vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
+               vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
+               vmcs12->guest_es_limit = evmcs->guest_es_limit;
+               vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
+               vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
+               vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
+               vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
+               vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
+               vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
+               vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
+               vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
+               vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
+               vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
+               vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
+               vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
+               vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
+               vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
+               vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
+               vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
+               vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
+               vmcs12->guest_es_selector = evmcs->guest_es_selector;
+               vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
+               vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
+               vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
+               vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
+               vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
+               vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
+               vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
+               vmcs12->tsc_offset = evmcs->tsc_offset;
+               vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
+               vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
+               vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
+               vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
+               vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
+               vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
+               vmcs12->guest_cr0 = evmcs->guest_cr0;
+               vmcs12->guest_cr3 = evmcs->guest_cr3;
+               vmcs12->guest_cr4 = evmcs->guest_cr4;
+               vmcs12->guest_dr7 = evmcs->guest_dr7;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
+               vmcs12->host_fs_base = evmcs->host_fs_base;
+               vmcs12->host_gs_base = evmcs->host_gs_base;
+               vmcs12->host_tr_base = evmcs->host_tr_base;
+               vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
+               vmcs12->host_idtr_base = evmcs->host_idtr_base;
+               vmcs12->host_rsp = evmcs->host_rsp;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
+               vmcs12->ept_pointer = evmcs->ept_pointer;
+               vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
+       }
+       if (unlikely(!(evmcs->hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
+               vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
+               vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
+               vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
+               vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
+               vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
+               vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
+               vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
+               vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
+               vmcs12->guest_pending_dbg_exceptions =
+                       evmcs->guest_pending_dbg_exceptions;
+               vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
+               vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
+               vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
+               vmcs12->guest_activity_state = evmcs->guest_activity_state;
+               vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+       }
+       /*
+        * Not used?
+        * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
+        * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
+        * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
+        * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
+        * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
+        * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
+        * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
+        * vmcs12->page_fault_error_code_mask =
+        *              evmcs->page_fault_error_code_mask;
+        * vmcs12->page_fault_error_code_match =
+        *              evmcs->page_fault_error_code_match;
+        * vmcs12->cr3_target_count = evmcs->cr3_target_count;
+        * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
+        * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
+        * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
+        */
+       /*
+        * Read only fields:
+        * vmcs12->guest_physical_address = evmcs->guest_physical_address;
+        * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
+        * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
+        * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
+        * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
+        * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
+        * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
+        * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
+        * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
+        * vmcs12->exit_qualification = evmcs->exit_qualification;
+        * vmcs12->guest_linear_address = evmcs->guest_linear_address;
+        *
+        * Not present in struct vmcs12:
+        * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
+        * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
+        * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
+        * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
+        */
+       return 0;
+ }
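
copy_enlightened_to_vmcs12() above only re-reads a field group when its clean bit is clear. A minimal, self-contained sketch of that gating pattern follows; every struct, macro and field name here is a hypothetical stand-in, not the kernel's or the Hyper-V TLFS's definition.

#include <stdint.h>
#include <stdio.h>

#define CLEAN_FIELD_GUEST_BASIC  (1u << 0)
#define CLEAN_FIELD_CONTROL_PROC (1u << 1)

struct demo_evmcs {
	uint32_t hv_clean_fields;
	uint64_t guest_rsp;
	uint32_t cpu_based_vm_exec_control;
};

struct demo_vmcs12 {
	uint64_t guest_rsp;
	uint32_t cpu_based_vm_exec_control;
};

static void demo_copy(struct demo_vmcs12 *dst, const struct demo_evmcs *src)
{
	/* Copy a group only if the guest marked it dirty (clean bit clear). */
	if (!(src->hv_clean_fields & CLEAN_FIELD_GUEST_BASIC))
		dst->guest_rsp = src->guest_rsp;
	if (!(src->hv_clean_fields & CLEAN_FIELD_CONTROL_PROC))
		dst->cpu_based_vm_exec_control = src->cpu_based_vm_exec_control;
}

int main(void)
{
	struct demo_evmcs e = {
		/* GUEST_BASIC is clean, CONTROL_PROC is dirty. */
		.hv_clean_fields = CLEAN_FIELD_GUEST_BASIC,
		.guest_rsp = 0x1000,
		.cpu_based_vm_exec_control = 0xabcd,
	};
	struct demo_vmcs12 v = { .guest_rsp = 0x2000, .cpu_based_vm_exec_control = 0 };

	demo_copy(&v, &e);
	/* guest_rsp keeps its cached value; the exec control is refreshed. */
	printf("rsp=%#llx exec=%#x\n",
	       (unsigned long long)v.guest_rsp, v.cpu_based_vm_exec_control);
	return 0;
}
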
+ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
+ {
+       struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+       struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+       /*
+        * Should not be changed by KVM:
+        *
+        * evmcs->host_es_selector = vmcs12->host_es_selector;
+        * evmcs->host_cs_selector = vmcs12->host_cs_selector;
+        * evmcs->host_ss_selector = vmcs12->host_ss_selector;
+        * evmcs->host_ds_selector = vmcs12->host_ds_selector;
+        * evmcs->host_fs_selector = vmcs12->host_fs_selector;
+        * evmcs->host_gs_selector = vmcs12->host_gs_selector;
+        * evmcs->host_tr_selector = vmcs12->host_tr_selector;
+        * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
+        * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
+        * evmcs->host_cr0 = vmcs12->host_cr0;
+        * evmcs->host_cr3 = vmcs12->host_cr3;
+        * evmcs->host_cr4 = vmcs12->host_cr4;
+        * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
+        * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
+        * evmcs->host_rip = vmcs12->host_rip;
+        * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
+        * evmcs->host_fs_base = vmcs12->host_fs_base;
+        * evmcs->host_gs_base = vmcs12->host_gs_base;
+        * evmcs->host_tr_base = vmcs12->host_tr_base;
+        * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
+        * evmcs->host_idtr_base = vmcs12->host_idtr_base;
+        * evmcs->host_rsp = vmcs12->host_rsp;
+        * sync_vmcs12() doesn't read these:
+        * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
+        * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
+        * evmcs->msr_bitmap = vmcs12->msr_bitmap;
+        * evmcs->ept_pointer = vmcs12->ept_pointer;
+        * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
+        * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
+        * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
+        * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
+        * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
+        * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
+        * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
+        * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
+        * evmcs->tpr_threshold = vmcs12->tpr_threshold;
+        * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
+        * evmcs->exception_bitmap = vmcs12->exception_bitmap;
+        * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
+        * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
+        * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
+        * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
+        * evmcs->page_fault_error_code_mask =
+        *              vmcs12->page_fault_error_code_mask;
+        * evmcs->page_fault_error_code_match =
+        *              vmcs12->page_fault_error_code_match;
+        * evmcs->cr3_target_count = vmcs12->cr3_target_count;
+        * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
+        * evmcs->tsc_offset = vmcs12->tsc_offset;
+        * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
+        * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
+        * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
+        * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
+        * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
+        * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
+        * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
+        * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+        *
+        * Not present in struct vmcs12:
+        * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
+        * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
+        * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
+        * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+        */
+       evmcs->guest_es_selector = vmcs12->guest_es_selector;
+       evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
+       evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
+       evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
+       evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
+       evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
+       evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
+       evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
+       evmcs->guest_es_limit = vmcs12->guest_es_limit;
+       evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
+       evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
+       evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
+       evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
+       evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
+       evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
+       evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
+       evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
+       evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
+       evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
+       evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
+       evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
+       evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
+       evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
+       evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
+       evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
+       evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
+       evmcs->guest_es_base = vmcs12->guest_es_base;
+       evmcs->guest_cs_base = vmcs12->guest_cs_base;
+       evmcs->guest_ss_base = vmcs12->guest_ss_base;
+       evmcs->guest_ds_base = vmcs12->guest_ds_base;
+       evmcs->guest_fs_base = vmcs12->guest_fs_base;
+       evmcs->guest_gs_base = vmcs12->guest_gs_base;
+       evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
+       evmcs->guest_tr_base = vmcs12->guest_tr_base;
+       evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
+       evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
+       evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
+       evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
+       evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
+       evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
+       evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
+       evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
+       evmcs->guest_pending_dbg_exceptions =
+               vmcs12->guest_pending_dbg_exceptions;
+       evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
+       evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
+       evmcs->guest_activity_state = vmcs12->guest_activity_state;
+       evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
+       evmcs->guest_cr0 = vmcs12->guest_cr0;
+       evmcs->guest_cr3 = vmcs12->guest_cr3;
+       evmcs->guest_cr4 = vmcs12->guest_cr4;
+       evmcs->guest_dr7 = vmcs12->guest_dr7;
+       evmcs->guest_physical_address = vmcs12->guest_physical_address;
+       evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
+       evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
+       evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
+       evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
+       evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
+       evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
+       evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
+       evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
+       evmcs->exit_qualification = vmcs12->exit_qualification;
+       evmcs->guest_linear_address = vmcs12->guest_linear_address;
+       evmcs->guest_rsp = vmcs12->guest_rsp;
+       evmcs->guest_rflags = vmcs12->guest_rflags;
+       evmcs->guest_interruptibility_info =
+               vmcs12->guest_interruptibility_info;
+       evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
+       evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
+       evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
+       evmcs->vm_entry_exception_error_code =
+               vmcs12->vm_entry_exception_error_code;
+       evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
+       evmcs->guest_rip = vmcs12->guest_rip;
+       evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
+       return 0;
+ }
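
copy_vmcs12_to_enlightened() is the write-back half of the clean-fields protocol: once the exit state lands in the eVMCS, every group is marked clean again (see the vmx_vcpu_run() hunk further down), and the guest clears only the bits for the groups it touches before the next entry. A tiny hypothetical sketch of that handshake, with illustrative names only:

#include <stdint.h>
#include <stdio.h>

#define DEMO_CLEAN_ALL   0xFFFFu
#define DEMO_CLEAN_CRDR  (1u << 3)

struct demo_evmcs { uint32_t hv_clean_fields; uint64_t guest_cr3; };

/* Hypervisor side: publish exit state, then declare everything clean. */
static void demo_sync_back(struct demo_evmcs *e, uint64_t new_cr3)
{
	e->guest_cr3 = new_cr3;
	e->hv_clean_fields |= DEMO_CLEAN_ALL;
}

/* Guest side: modify a field and clear only that group's clean bit. */
static void demo_guest_set_cr3(struct demo_evmcs *e, uint64_t cr3)
{
	e->guest_cr3 = cr3;
	e->hv_clean_fields &= ~DEMO_CLEAN_CRDR;
}

int main(void)
{
	struct demo_evmcs e = { 0 };

	demo_sync_back(&e, 0x1000);
	demo_guest_set_cr3(&e, 0x2000);
	printf("clean=%#x cr3=%#llx\n", e.hv_clean_fields,
	       (unsigned long long)e.guest_cr3);
	return 0;
}
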
  /*
   * Copy the writable VMCS shadow fields back to the VMCS12, in case
   * they have been modified by the L1 guest. Note that the "read-only"
@@@ -8683,20 -9117,6 +9121,6 @@@ static void copy_vmcs12_to_shadow(struc
        vmcs_load(vmx->loaded_vmcs->vmcs);
  }
  
- /*
-  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
-  * used before) all generate the same failure when it is missing.
-  */
- static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
- {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       if (vmx->nested.current_vmptr == -1ull) {
-               nested_vmx_failInvalid(vcpu);
-               return 0;
-       }
-       return 1;
- }
  static int handle_vmread(struct kvm_vcpu *vcpu)
  {
        unsigned long field;
        if (!nested_vmx_check_permission(vcpu))
                return 1;
  
-       if (!nested_vmx_check_vmcs12(vcpu))
-               return kvm_skip_emulated_instruction(vcpu);
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
  
        if (!is_guest_mode(vcpu))
                vmcs12 = get_vmcs12(vcpu);
                 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
                 */
-               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+                       return nested_vmx_failInvalid(vcpu);
                vmcs12 = get_shadow_vmcs12(vcpu);
        }
  
        /* Decode instruction info and find the field to read */
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
        /* Read the field, zero-extended to a u64 field_value */
-       if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
        /*
         * Now copy part of this value to register or memory, as requested.
         * Note that the number of bits actually copied is 32 or 64 depending
                                            (is_long_mode(vcpu) ? 8 : 4), NULL);
        }
  
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
  }
  
  
@@@ -8776,8 -9193,8 +9197,8 @@@ static int handle_vmwrite(struct kvm_vc
        if (!nested_vmx_check_permission(vcpu))
                return 1;
  
-       if (!nested_vmx_check_vmcs12(vcpu))
-               return kvm_skip_emulated_instruction(vcpu);
+       if (vmx->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
  
        if (vmx_instruction_info & (1u << 10))
                field_value = kvm_register_readl(vcpu,
         * VMCS," then the "read-only" fields are actually read/write.
         */
        if (vmcs_field_readonly(field) &&
-           !nested_cpu_has_vmwrite_any_field(vcpu)) {
-               nested_vmx_failValid(vcpu,
+           !nested_cpu_has_vmwrite_any_field(vcpu))
+               return nested_vmx_failValid(vcpu,
                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
  
        if (!is_guest_mode(vcpu))
                vmcs12 = get_vmcs12(vcpu);
                 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
                 */
-               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+                       return nested_vmx_failInvalid(vcpu);
                vmcs12 = get_shadow_vmcs12(vcpu);
        }
  
-       if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
  
        /*
         * Do not track vmcs12 dirty-state if in guest-mode
                }
        }
  
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
  }
  
  static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
                              SECONDARY_EXEC_SHADOW_VMCS);
                vmcs_write64(VMCS_LINK_POINTER,
                             __pa(vmx->vmcs01.shadow_vmcs));
-               vmx->nested.sync_shadow_vmcs = true;
+               vmx->nested.need_vmcs12_sync = true;
        }
        vmx->nested.dirty_vmcs12 = true;
  }
@@@ -8875,36 -9285,37 +9289,37 @@@ static int handle_vmptrld(struct kvm_vc
        if (nested_vmx_get_vmptr(vcpu, &vmptr))
                return 1;
  
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_INVALID_ADDRESS);
  
-       if (vmptr == vmx->nested.vmxon_ptr) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmptr == vmx->nested.vmxon_ptr)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_VMXON_POINTER);
+       /* Forbid normal VMPTRLD if Enlightened version was used */
+       if (vmx->nested.hv_evmcs)
+               return 1;
  
        if (vmx->nested.current_vmptr != vmptr) {
                struct vmcs12 *new_vmcs12;
                struct page *page;
                page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-               if (is_error_page(page)) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (is_error_page(page))
+                       return nested_vmx_failInvalid(vcpu);
                new_vmcs12 = kmap(page);
                if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
                    (new_vmcs12->hdr.shadow_vmcs &&
                     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
                        kunmap(page);
                        kvm_release_page_clean(page);
-                       nested_vmx_failValid(vcpu,
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
-                       return kvm_skip_emulated_instruction(vcpu);
                }
  
-               nested_release_vmcs12(vmx);
+               nested_release_vmcs12(vcpu);
                /*
                 * Load VMCS12 from guest memory since it is not already
                 * cached.
                set_current_vmptr(vmx, vmptr);
        }
  
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
+ }
+ /*
+  * This is an equivalent of the nested hypervisor executing the vmptrld
+  * instruction.
+  */
+ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
+                                                bool from_launch)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct hv_vp_assist_page assist_page;
+       if (likely(!vmx->nested.enlightened_vmcs_enabled))
+               return 1;
+       if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
+               return 1;
+       if (unlikely(!assist_page.enlighten_vmentry))
+               return 1;
+       if (unlikely(assist_page.current_nested_vmcs !=
+                    vmx->nested.hv_evmcs_vmptr)) {
+               if (!vmx->nested.hv_evmcs)
+                       vmx->nested.current_vmptr = -1ull;
+               nested_release_evmcs(vcpu);
+               vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
+                       vcpu, assist_page.current_nested_vmcs);
+               if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+                       return 0;
+               vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+               if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
+                       nested_release_evmcs(vcpu);
+                       return 0;
+               }
+               vmx->nested.dirty_vmcs12 = true;
+               /*
+                * As we keep L2 state for one guest only 'hv_clean_fields' mask
+                * can't be used when we switch between them. Reset it here for
+                * simplicity.
+                */
+               vmx->nested.hv_evmcs->hv_clean_fields &=
+                       ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+               vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
+               /*
+                * Unlike normal vmcs12, enlightened vmcs12 is not fully
+                * reloaded from guest's memory (read only fields, fields not
+                * present in struct hv_enlightened_vmcs, ...). Make sure there
+                * are no leftovers.
+                */
+               if (from_launch)
+                       memset(vmx->nested.cached_vmcs12, 0,
+                              sizeof(*vmx->nested.cached_vmcs12));
+       }
+       return 1;
  }
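
nested_vmx_handle_enlightened_vmptrld() above remaps the eVMCS only when the pointer in the VP assist page changes, rejects an unknown revision, and distrusts any prior clean-field bookkeeping for a newly mapped page. A compact sketch of that remap-on-change pattern, using purely hypothetical types (not KVM or Hyper-V API):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins: a "guest page" is just a host buffer here. */
struct demo_page { uint32_t revision_id; uint32_t clean_fields; };

struct demo_nested {
	uint64_t cached_vmptr;       /* last eVMCS GPA we mapped */
	struct demo_page *mapping;   /* currently "mapped" page   */
};

#define DEMO_REVISION  1u
#define DEMO_CLEAN_ALL 0xFFFFu

/* Returns true on success, false if the new page is unusable. */
static bool demo_handle_vmptrld(struct demo_nested *n,
				uint64_t new_vmptr, struct demo_page *new_page)
{
	if (new_vmptr == n->cached_vmptr)
		return true;            /* same page: nothing to remap */

	n->mapping = NULL;              /* drop the stale mapping first */
	if (!new_page || new_page->revision_id != DEMO_REVISION)
		return false;           /* reject unknown layouts */

	n->mapping = new_page;
	n->cached_vmptr = new_vmptr;
	/* New page: distrust any previous clean-field bookkeeping. */
	new_page->clean_fields &= ~DEMO_CLEAN_ALL;
	return true;
}

int main(void)
{
	struct demo_page p = { .revision_id = DEMO_REVISION,
			       .clean_fields = DEMO_CLEAN_ALL };
	struct demo_nested n = { .cached_vmptr = ~0ull, .mapping = NULL };

	printf("load ok: %d, clean=%#x\n",
	       demo_handle_vmptrld(&n, 0x5000, &p), p.clean_fields);
	return 0;
}
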
  
  /* Emulate the VMPTRST instruction */
@@@ -8932,6 -9406,9 +9410,9 @@@ static int handle_vmptrst(struct kvm_vc
        if (!nested_vmx_check_permission(vcpu))
                return 1;
  
+       if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
+               return 1;
        if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
                return 1;
        /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
  }
  
  /* Emulate the INVEPT instruction */
@@@ -8971,11 -9447,9 +9451,9 @@@ static int handle_invept(struct kvm_vcp
  
        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
  
-       if (type >= 32 || !(types & (1 << type))) {
-               nested_vmx_failValid(vcpu,
+       if (type >= 32 || !(types & (1 << type)))
+               return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
  
        /* According to the Intel VMX instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
        case VMX_EPT_EXTENT_CONTEXT:
                kvm_mmu_sync_roots(vcpu);
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-               nested_vmx_succeed(vcpu);
                break;
        default:
                BUG_ON(1);
                break;
        }
  
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
+ }
+ static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
  }
  
  static int handle_invvpid(struct kvm_vcpu *vcpu)
                u64 vpid;
                u64 gla;
        } operand;
+       u16 vpid02;
  
        if (!(vmx->nested.msrs.secondary_ctls_high &
              SECONDARY_EXEC_ENABLE_VPID) ||
        types = (vmx->nested.msrs.vpid_caps &
                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
  
-       if (type >= 32 || !(types & (1 << type))) {
-               nested_vmx_failValid(vcpu,
+       if (type >= 32 || !(types & (1 << type)))
+               return nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
  
        /* according to the intel vmx instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
-       if (operand.vpid >> 16) {
-               nested_vmx_failValid(vcpu,
+       if (operand.vpid >> 16)
+               return nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
  
+       vpid02 = nested_get_vpid02(vcpu);
        switch (type) {
        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
                if (!operand.vpid ||
-                   is_noncanonical_address(operand.gla, vcpu)) {
-                       nested_vmx_failValid(vcpu,
+                   is_noncanonical_address(operand.gla, vcpu))
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
-               if (cpu_has_vmx_invvpid_individual_addr() &&
-                   vmx->nested.vpid02) {
+               if (cpu_has_vmx_invvpid_individual_addr()) {
                        __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
-                               vmx->nested.vpid02, operand.gla);
+                               vpid02, operand.gla);
                } else
-                       __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+                       __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
-               if (!operand.vpid) {
-                       nested_vmx_failValid(vcpu,
+               if (!operand.vpid)
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
-               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+               __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        case VMX_VPID_EXTENT_ALL_CONTEXT:
-               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+               __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        default:
                WARN_ON_ONCE(1);
                return kvm_skip_emulated_instruction(vcpu);
        }
  
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
  }
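
handle_invept() and handle_invvpid() above both validate the requested operand type against a capability bitmask before doing any work; anything out of range or unadvertised draws VMfail. The check condenses to the predicate below; the capability values are illustrative only.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool demo_type_supported(uint32_t type, uint32_t types)
{
	/* Range check first so the shift below is always defined. */
	return type < 32 && (types & (1u << type));
}

int main(void)
{
	uint32_t types = (1u << 1) | (1u << 2);	/* e.g. single + all context */

	printf("type 0: %d\n", demo_type_supported(0, types));   /* 0 */
	printf("type 2: %d\n", demo_type_supported(2, types));   /* 1 */
	printf("type 40: %d\n", demo_type_supported(40, types)); /* 0 */
	return 0;
}
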
  
  static int handle_invpcid(struct kvm_vcpu *vcpu)
                }
  
                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
                            == operand.pcid)
                                roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
  
-               kvm_mmu_free_roots(vcpu, roots_to_free);
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
                /*
                 * If neither the current cr3 nor any of the prev_roots use the
                 * given PCID, then nothing needs to be done here because a
@@@ -9293,7 -9764,7 +9768,7 @@@ static int nested_vmx_eptp_switching(st
  
                kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
-               mmu->base_role.ad_disabled = !accessed_dirty;
+               mmu->mmu_role.base.ad_disabled = !accessed_dirty;
                vmcs12->ept_pointer = address;
                /*
                 * TODO: Check what's the correct approach in case
@@@ -9652,9 -10123,6 +10127,6 @@@ static bool nested_vmx_exit_reflected(s
                        return false;
                else if (is_page_fault(intr_info))
                        return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
-               else if (is_no_device(intr_info) &&
-                        !(vmcs12->guest_cr0 & X86_CR0_TS))
-                       return false;
                else if (is_debug(intr_info) &&
                         vcpu->guest_debug &
                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@@ -10676,9 -11144,25 +11148,25 @@@ static void __noclone vmx_vcpu_run(stru
                vmcs_write32(PLE_WINDOW, vmx->ple_window);
        }
  
-       if (vmx->nested.sync_shadow_vmcs) {
-               copy_vmcs12_to_shadow(vmx);
-               vmx->nested.sync_shadow_vmcs = false;
+       if (vmx->nested.need_vmcs12_sync) {
+               /*
+                * hv_evmcs may end up being not mapped after migration (when
+                * L2 was running), map it here to make sure vmcs12 changes are
+                * properly reflected.
+                */
+               if (vmx->nested.enlightened_vmcs_enabled &&
+                   !vmx->nested.hv_evmcs)
+                       nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+               if (vmx->nested.hv_evmcs) {
+                       copy_vmcs12_to_enlightened(vmx);
+                       /* All fields are clean */
+                       vmx->nested.hv_evmcs->hv_clean_fields |=
+                               HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+               } else {
+                       copy_vmcs12_to_shadow(vmx);
+               }
+               vmx->nested.need_vmcs12_sync = false;
        }
  
        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
                "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
                "jmp 1f \n\t"
                "2: \n\t"
-               __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+               __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
                "1: \n\t"
                /* Reload cr2 if changed */
                "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
  
                /* Enter guest mode */
                "jne 1f \n\t"
-               __ex(ASM_VMX_VMLAUNCH) "\n\t"
+               __ex("vmlaunch") "\n\t"
                "jmp 2f \n\t"
-               "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+               "1: " __ex("vmresume") "\n\t"
                "2: "
                /* Save guest registers, load host registers, keep flags */
                "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
                "mov %%r13, %c[r13](%0) \n\t"
                "mov %%r14, %c[r14](%0) \n\t"
                "mov %%r15, %c[r15](%0) \n\t"
+               /*
+               * Clear host registers marked as clobbered to prevent
+               * speculative use.
+               */
                "xor %%r8d,  %%r8d \n\t"
                "xor %%r9d,  %%r9d \n\t"
                "xor %%r10d, %%r10d \n\t"
@@@ -10958,6 -11446,10 +11450,10 @@@ static void vmx_switch_vmcs(struct kvm_
        vmx->loaded_vmcs = vmcs;
        vmx_vcpu_load(vcpu, cpu);
        put_cpu();
+       vm_entry_controls_reset_shadow(vmx);
+       vm_exit_controls_reset_shadow(vmx);
+       vmx_segment_cache_clear(vmx);
  }
  
  /*
   * Ensure that the current vmcs of the logical processor is the
   * vmcs01 of the vcpu before calling free_nested().
   */
  static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
  {
-        struct vcpu_vmx *vmx = to_vmx(vcpu);
-        vcpu_load(vcpu);
-        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-        free_nested(vmx);
-        vcpu_put(vcpu);
+       vcpu_load(vcpu);
+       vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
+       free_nested(vcpu);
+       vcpu_put(vcpu);
  }
  
  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@@ -11334,28 -11824,28 +11828,28 @@@ static unsigned long nested_ept_get_cr3
        return get_vmcs12(vcpu)->ept_pointer;
  }
  
- static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
  {
        WARN_ON(mmu_is_nested(vcpu));
-       if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
-               return 1;
  
+       vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        kvm_init_shadow_ept_mmu(vcpu,
                        to_vmx(vcpu)->nested.msrs.ept_caps &
                        VMX_EPT_EXECUTE_ONLY_BIT,
                        nested_ept_ad_enabled(vcpu),
                        nested_ept_get_cr3(vcpu));
-       vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
-       vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
-       vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
+       vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
+       vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
  
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
-       return 0;
  }
  
  static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
  {
-       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+       vcpu->arch.mmu = &vcpu->arch.root_mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
  }
  
  static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@@ -11716,7 -12206,7 +12210,7 @@@ static int nested_vmx_check_apicv_contr
            !nested_exit_intr_ack_set(vcpu) ||
            (vmcs12->posted_intr_nv & 0xff00) ||
            (vmcs12->posted_intr_desc_addr & 0x3f) ||
-           (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
+           (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
                return -EINVAL;
  
        /* tpr shadow is needed by all apicv features. */
@@@ -11772,15 -12262,12 +12266,12 @@@ static int nested_vmx_check_msr_switch_
  static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
                                         struct vmcs12 *vmcs12)
  {
-       u64 address = vmcs12->pml_address;
-       int maxphyaddr = cpuid_maxphyaddr(vcpu);
+       if (!nested_cpu_has_pml(vmcs12))
+               return 0;
  
-       if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
-               if (!nested_cpu_has_ept(vmcs12) ||
-                   !IS_ALIGNED(address, 4096)  ||
-                   address >> maxphyaddr)
-                       return -EINVAL;
-       }
+       if (!nested_cpu_has_ept(vmcs12) ||
+           !page_address_valid(vcpu, vmcs12->pml_address))
+               return -EINVAL;
  
        return 0;
  }
@@@ -11960,112 -12447,87 +12451,87 @@@ static int nested_vmx_load_cr3(struct k
        return 0;
  }
  
- static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
- {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+  * Returns if KVM is able to config CPU to tag TLB entries
+  * populated by L2 differently than TLB entries populated
+  * by L1.
+  *
+  * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+  *
+  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
+  * with different VPID (L1 entries are tagged with vmx->vpid
+  * while L2 entries are tagged with vmx->nested.vpid02).
+  */
+ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
+ {
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
-       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
-       vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
-       vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
-       vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
-       vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
-       vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
-       vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
-       vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
-       vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
-       vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
-       vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
-       vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
-       vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
-       vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
-       vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
-       vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
-       vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
-       vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
-       vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
-       vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
-       vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
-       vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
-       vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
-       vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
-       vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
-       vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
-       vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
-       vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
-       vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
-       vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
-       vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
-       vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
-       vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-               vmcs12->guest_pending_dbg_exceptions);
-       vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
-       vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+       return nested_cpu_has_ept(vmcs12) ||
+              (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
+ }
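
The predicate above feeds the flush decision on L1/L2 switches: if L2's TLB entries carry their own tag (a separate EPTP, or a dedicated vpid02), no flush is needed when switching; otherwise it is. A standalone sketch of that decision, with a hypothetical config struct rather than vmcs12 state:

#include <stdbool.h>
#include <stdio.h>

struct demo_l2_config {
	bool uses_ept;     /* L1 runs L2 with EPT (separate EPTP tag) */
	bool uses_vpid;    /* L1 enabled VPID for L2                  */
	bool has_vpid02;   /* a distinct vpid was allocated for L2    */
};

static bool demo_needs_flush_on_switch(const struct demo_l2_config *c)
{
	bool tagged = c->uses_ept || (c->uses_vpid && c->has_vpid02);

	return !tagged;
}

int main(void)
{
	struct demo_l2_config a = { .uses_ept = true };
	struct demo_l2_config b = { .uses_vpid = true, .has_vpid02 = false };

	printf("EPT guest flush: %d\n", demo_needs_flush_on_switch(&a));        /* 0 */
	printf("VPID w/o vpid02 flush: %d\n", demo_needs_flush_on_switch(&b));  /* 1 */
	return 0;
}
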
  
-       if (nested_cpu_has_xsaves(vmcs12))
-               vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
-       vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+ {
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
+               return vmcs12->guest_ia32_efer;
+       else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+               return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
+       else
+               return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
+ }
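
nested_vmx_calc_efer() above derives the EFER that vmcs02 will run with. The same three-way selection, spelled out as a standalone sketch with the architectural bit values written locally (the DEMO_* names are illustrative, not kernel definitions):

#include <stdint.h>
#include <stdio.h>

#define DEMO_EFER_LME                 (1ull << 8)
#define DEMO_EFER_LMA                 (1ull << 10)
#define DEMO_VM_ENTRY_IA32E_MODE      (1u << 9)
#define DEMO_VM_ENTRY_LOAD_IA32_EFER  (1u << 15)

static uint64_t demo_calc_efer(int run_pending, uint32_t entry_ctls,
			       uint64_t guest_ia32_efer, uint64_t vcpu_efer)
{
	if (run_pending && (entry_ctls & DEMO_VM_ENTRY_LOAD_IA32_EFER))
		return guest_ia32_efer;          /* take it from vmcs12 */
	else if (entry_ctls & DEMO_VM_ENTRY_IA32E_MODE)
		return vcpu_efer | (DEMO_EFER_LMA | DEMO_EFER_LME);
	else
		return vcpu_efer & ~(DEMO_EFER_LMA | DEMO_EFER_LME);
}

int main(void)
{
	/* 64-bit L2 without LOAD_IA32_EFER: LMA/LME forced on top of SCE. */
	printf("%#llx\n", (unsigned long long)
	       demo_calc_efer(1, DEMO_VM_ENTRY_IA32E_MODE, 0, 0x1));
	return 0;
}
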
  
-       if (cpu_has_vmx_posted_intr())
-               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
+ {
+       /*
+        * If vmcs02 hasn't been initialized, set the constant vmcs02 state
+        * according to L0's settings (vmcs12 is irrelevant here).  Host
+        * fields that come from L0 and are not constant, e.g. HOST_CR3,
+        * will be set as needed prior to VMLAUNCH/VMRESUME.
+        */
+       if (vmx->nested.vmcs02_initialized)
+               return;
+       vmx->nested.vmcs02_initialized = true;
  
        /*
-        * Whether page-faults are trapped is determined by a combination of
-        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-        * If enable_ept, L0 doesn't care about page faults and we should
-        * set all of these to L1's desires. However, if !enable_ept, L0 does
-        * care about (at least some) page faults, and because it is not easy
-        * (if at all possible?) to merge L0 and L1's desires, we simply ask
-        * to exit on each and every L2 page fault. This is done by setting
-        * MASK=MATCH=0 and (see below) EB.PF=1.
-        * Note that below we don't need special code to set EB.PF beyond the
-        * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
-        * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
-        * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+        * We don't care what the EPTP value is we just need to guarantee
+        * it's valid so we don't get a false positive when doing early
+        * consistency checks.
         */
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
-               enable_ept ? vmcs12->page_fault_error_code_mask : 0);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
-               enable_ept ? vmcs12->page_fault_error_code_match : 0);
+       if (enable_ept && nested_early_check)
+               vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
  
        /* All VMFUNCs are currently emulated through L0 vmexits.  */
        if (cpu_has_vmx_vmfunc())
                vmcs_write64(VM_FUNCTION_CONTROL, 0);
  
-       if (cpu_has_vmx_apicv()) {
-               vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
-               vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
-               vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
-               vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
-       }
+       if (cpu_has_vmx_posted_intr())
+               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
  
-       /*
-        * Set host-state according to L0's settings (vmcs12 is irrelevant here)
-        * Some constant fields are set here by vmx_set_constant_host_state().
-        * Other fields are different per CPU, and will be set later when
-        * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
-        * is called.
-        */
-       vmx_set_constant_host_state(vmx);
+       if (cpu_has_vmx_msr_bitmap())
+               vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+       if (enable_pml)
+               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
  
        /*
-        * Set the MSR load/store lists to match L0's settings.
+        * Set the MSR load/store lists to match L0's settings.  Only the
+        * addresses are constant (for vmcs02), the counts can change based
+        * on L2's behavior, e.g. switching to/from long mode.
         */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
  
-       set_cr4_guest_host_mask(vmx);
+       vmx_set_constant_host_state(vmx);
+ }
  
-       if (kvm_mpx_supported()) {
-               if (vmx->nested.nested_run_pending &&
-                       (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
-                       vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
-               else
-                       vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
-       }
+ static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
+                                     struct vmcs12 *vmcs12)
+ {
+       prepare_vmcs02_constant_state(vmx);
+       vmcs_write64(VMCS_LINK_POINTER, -1ull);
  
        if (enable_vpid) {
                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
                else
                        vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
        }
-       /*
-        * L1 may access the L2's PDPTR, so save them to construct vmcs12
-        */
-       if (enable_ept) {
-               vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
-               vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
-               vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
-               vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
-       }
-       if (cpu_has_vmx_msr_bitmap())
-               vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
  }
  
- /*
-  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
-  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
-  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
-  * guest in a way that will both be appropriate to L1's requests, and our
-  * needs. In addition to modifying the active vmcs (which is vmcs02), this
-  * function also has additional necessary side-effects, like setting various
-  * vcpu->arch fields.
-  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
-  * is assigned to entry_failure_code on failure.
-  */
- static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                         u32 *entry_failure_code)
+ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
  {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control, vmcs12_exec_ctrl;
+       u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
  
-       if (vmx->nested.dirty_vmcs12) {
-               prepare_vmcs02_full(vcpu, vmcs12);
-               vmx->nested.dirty_vmcs12 = false;
-       }
+       if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
+               prepare_vmcs02_early_full(vmx, vmcs12);
  
        /*
-        * First, the fields that are shadowed.  This must be kept in sync
-        * with vmx_shadow_fields.h.
+        * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+        * entry, but only if the current (host) sp changed from the value
+        * we wrote last (vmx->host_rsp).  This cache is no longer relevant
+        * if we switch vmcs, and rather than hold a separate cache per vmcs,
+        * here we just force the write to happen on entry.  host_rsp will
+        * also be written unconditionally by nested_vmx_check_vmentry_hw()
+        * if we are doing early consistency checks via hardware.
         */
+       vmx->host_rsp = 0;
  
-       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
-       vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
-       vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
-       vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
-       vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
-       if (vmx->nested.nested_run_pending &&
-           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
-               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-               vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
-       } else {
-               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
-               vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
-       }
-       if (vmx->nested.nested_run_pending) {
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                            vmcs12->vm_entry_intr_info_field);
-               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-                            vmcs12->vm_entry_exception_error_code);
-               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-                            vmcs12->vm_entry_instruction_len);
-               vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-                            vmcs12->guest_interruptibility_info);
-               vmx->loaded_vmcs->nmi_known_unmasked =
-                       !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
-       } else {
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
-       }
-       vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+       /*
+        * PIN CONTROLS
+        */
        exec_control = vmcs12->pin_based_vm_exec_control;
  
        /* Preemption timer setting is computed directly in vmx_vcpu_run.  */
        } else {
                exec_control &= ~PIN_BASED_POSTED_INTR;
        }
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
  
-       vmx->nested.preemption_timer_expired = false;
-       if (nested_cpu_has_preemption_timer(vmcs12))
-               vmx_start_preemption_timer(vcpu);
+       /*
+        * EXEC CONTROLS
+        */
+       exec_control = vmx_exec_control(vmx); /* L0's desires */
+       exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+       exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+       exec_control &= ~CPU_BASED_TPR_SHADOW;
+       exec_control |= vmcs12->cpu_based_vm_exec_control;
+       /*
+        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+        * nested_get_vmcs12_pages can't fix it up, the illegal value
+        * will result in a VM entry failure.
+        */
+       if (exec_control & CPU_BASED_TPR_SHADOW) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+               vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+       } else {
+ #ifdef CONFIG_X86_64
+               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+                               CPU_BASED_CR8_STORE_EXITING;
+ #endif
+       }
+       /*
+        * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+        * for I/O port accesses.
+        */
+       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+       exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
  
+       /*
+        * SECONDARY EXEC CONTROLS
+        */
        if (cpu_has_secondary_exec_ctrls()) {
                exec_control = vmx->secondary_exec_control;
  
        }
  
        /*
-        * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
-        * entry, but only if the current (host) sp changed from the value
-        * we wrote last (vmx->host_rsp). This cache is no longer relevant
-        * if we switch vmcs, and rather than hold a separate cache per vmcs,
-        * here we just force the write to happen on entry.
+        * ENTRY CONTROLS
+        *
+        * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
+        * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
+        * on the related bits (if supported by the CPU) in the hope that
+        * we can avoid VMWrites during vmx_set_efer().
+        */
+       exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
+                       ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+       if (cpu_has_load_ia32_efer) {
+               if (guest_efer & EFER_LMA)
+                       exec_control |= VM_ENTRY_IA32E_MODE;
+               if (guest_efer != host_efer)
+                       exec_control |= VM_ENTRY_LOAD_IA32_EFER;
+       }
+       vm_entry_controls_init(vmx, exec_control);
+       /*
+        * EXIT CONTROLS
+        *
+        * L2->L1 exit controls are emulated - the hardware exit is to L0 so
+        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+        * bits may be modified by vmx_set_efer() in prepare_vmcs02().
         */
-       vmx->host_rsp = 0;
+       exec_control = vmcs_config.vmexit_ctrl;
+       if (cpu_has_load_ia32_efer && guest_efer != host_efer)
+               exec_control |= VM_EXIT_LOAD_IA32_EFER;
+       vm_exit_controls_init(vmx, exec_control);
  
-       exec_control = vmx_exec_control(vmx); /* L0's desires */
-       exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-       exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
-       exec_control &= ~CPU_BASED_TPR_SHADOW;
-       exec_control |= vmcs12->cpu_based_vm_exec_control;
+       /*
+        * Conceptually we want to copy the PML address and index from
+        * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+        * since we always flush the log on each vmexit and never change
+        * the PML address (once set), this happens to be equivalent to
+        * simply resetting the index in vmcs02.
+        */
+       if (enable_pml)
+               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
  
        /*
-        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
-        * nested_get_vmcs12_pages can't fix it up, the illegal value
-        * will result in a VM entry failure.
+        * Interrupt/Exception Fields
         */
-       if (exec_control & CPU_BASED_TPR_SHADOW) {
-               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
-               vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+       if (vmx->nested.nested_run_pending) {
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                            vmcs12->vm_entry_intr_info_field);
+               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+                            vmcs12->vm_entry_exception_error_code);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+                            vmcs12->vm_entry_instruction_len);
+               vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+                            vmcs12->guest_interruptibility_info);
+               vmx->loaded_vmcs->nmi_known_unmasked =
+                       !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
        } else {
- #ifdef CONFIG_X86_64
-               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
-                               CPU_BASED_CR8_STORE_EXITING;
- #endif
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+       }
+ }
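As an aside, the EXEC CONTROLS merge above follows a simple policy: start from the controls L0 itself wants, clear the interrupt/NMI-window and TPR-shadow bits so their value comes from vmcs12, OR in whatever vmcs12 asks for, then force unconditional I/O exiting because L0 always wants to see port accesses. A minimal user-space sketch of that policy (illustrative only, not part of this diff; the helper name is made up, the constants simply mirror the standard VMX control bit values):

/* Sketch only: models the exec-controls merge policy, not the VMCS writes. */
#include <stdint.h>
#include <stdio.h>

#define CPU_BASED_VIRTUAL_INTR_PENDING  0x00000004
#define CPU_BASED_TPR_SHADOW            0x00200000
#define CPU_BASED_VIRTUAL_NMI_PENDING   0x00400000
#define CPU_BASED_UNCOND_IO_EXITING     0x01000000
#define CPU_BASED_USE_IO_BITMAPS        0x02000000

/* Hypothetical helper: l0_wants stands in for what vmx_exec_control() returns. */
static uint32_t merge_exec_controls(uint32_t l0_wants, uint32_t vmcs12_ctls)
{
	uint32_t ctls = l0_wants;

	ctls &= ~CPU_BASED_VIRTUAL_INTR_PENDING;	/* let vmcs12 decide */
	ctls &= ~CPU_BASED_VIRTUAL_NMI_PENDING;		/* let vmcs12 decide */
	ctls &= ~CPU_BASED_TPR_SHADOW;			/* let vmcs12 decide */
	ctls |= vmcs12_ctls;				/* L1's requests add bits */

	/* L0 always needs to intercept I/O port accesses itself. */
	ctls &= ~CPU_BASED_USE_IO_BITMAPS;
	ctls |= CPU_BASED_UNCOND_IO_EXITING;
	return ctls;
}

int main(void)
{
	printf("merged: %#x\n",
	       merge_exec_controls(0x0401e172u, CPU_BASED_TPR_SHADOW));
	return 0;
}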
+ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+ {
+       struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+       if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+               vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+               vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+               vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+               vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+               vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+               vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+               vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+               vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+               vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+               vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+               vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+               vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+               vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+               vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+               vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+               vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+               vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+               vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+               vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+               vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+               vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+               vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+               vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+               vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+               vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+               vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+               vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+               vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+               vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+               vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+               vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+               vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+               vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+               vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+       }
+       if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
+               vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+               vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+                           vmcs12->guest_pending_dbg_exceptions);
+               vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+               vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+               /*
+                * L1 may access the L2's PDPTR, so save them to construct
+                * vmcs12
+                */
+               if (enable_ept) {
+                       vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+                       vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+                       vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+                       vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+               }
        }
  
+       if (nested_cpu_has_xsaves(vmcs12))
+               vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
        /*
-        * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
-        * for I/O port accesses.
+        * Whether page-faults are trapped is determined by a combination of
+        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+        * If enable_ept, L0 doesn't care about page faults and we should
+        * set all of these to L1's desires. However, if !enable_ept, L0 does
+        * care about (at least some) page faults, and because it is not easy
+        * (if at all possible?) to merge L0 and L1's desires, we simply ask
+        * to exit on each and every L2 page fault. This is done by setting
+        * MASK=MATCH=0 and (see below) EB.PF=1.
+        * Note that below we don't need special code to set EB.PF beyond the
+        * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+        * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+        * !enable_ept, EB.PF is 1, so the "or" will always be 1.
         */
-       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
-       exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+               enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+               enable_ept ? vmcs12->page_fault_error_code_match : 0);
  
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+       if (cpu_has_vmx_apicv()) {
+               vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+               vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+               vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+               vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+       }
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+       set_cr4_guest_host_mask(vmx);
+       if (kvm_mpx_supported()) {
+               if (vmx->nested.nested_run_pending &&
+                       (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+                       vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+               else
+                       vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+       }
+ }
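The #PF filtering comment in prepare_vmcs02_full() above can be seen in isolation: with PFEC_MASK = PFEC_MATCH = 0 every page fault "matches", so once EB.PF is also set each L2 #PF exits; with EPT enabled L0 simply forwards L1's settings. A small stand-alone sketch of just that decision (the struct and helper are hypothetical and only model the choice, not the VMCS writes):

/* Sketch only: models the #PF error-code filter choice described above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pf_filter {
	uint32_t pfec_mask;
	uint32_t pfec_match;
};

/*
 * When EB.PF = 1, a #PF causes a VM exit iff
 * (error_code & pfec_mask) == pfec_match; with MASK = MATCH = 0 the
 * comparison is 0 == 0, i.e. every page fault "matches".
 */
static struct pf_filter choose_pf_filter(bool enable_ept,
					 uint32_t l1_mask, uint32_t l1_match)
{
	struct pf_filter f = { 0, 0 };

	if (enable_ept) {		/* L0 does not care about guest #PFs */
		f.pfec_mask  = l1_mask;
		f.pfec_match = l1_match;
	}
	return f;
}

int main(void)
{
	struct pf_filter f = choose_pf_filter(false, 0x3, 0x1);

	printf("mask=%#x match=%#x\n", f.pfec_mask, f.pfec_match);
	return 0;
}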
+ /*
+  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+  * guest in a way that will both be appropriate to L1's requests, and our
+  * needs. In addition to modifying the active vmcs (which is vmcs02), this
+  * function also has additional necessary side-effects, like setting various
+  * vcpu->arch fields.
+  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+  * is assigned to entry_failure_code on failure.
+  */
+ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+                         u32 *entry_failure_code)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+       if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
+               prepare_vmcs02_full(vmx, vmcs12);
+               vmx->nested.dirty_vmcs12 = false;
+       }
+       /*
+        * First, the fields that are shadowed.  This must be kept in sync
+        * with vmx_shadow_fields.h.
+        */
+       if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+               vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+               vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+       }
+       if (vmx->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+               kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+               vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+       } else {
+               kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+               vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+       }
+       vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+       vmx->nested.preemption_timer_expired = false;
+       if (nested_cpu_has_preemption_timer(vmcs12))
+               vmx_start_preemption_timer(vcpu);
  
        /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
         * bitwise-or of what L1 wants to trap for L2, and what we want to
        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
  
-       /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
-        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
-        * bits are further modified by vmx_set_efer() below.
-        */
-       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
-        * emulated by vmx_set_efer(), below.
-        */
-       vm_entry_controls_init(vmx, 
-               (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
-                       ~VM_ENTRY_IA32E_MODE) |
-               (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
        if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
                 * influence global bitmap(for vpid01 and vpid02 allocation)
                 * even if spawn a lot of nested vCPUs.
                 */
-               if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+               if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-                               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+                               __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
                        }
                } else {
-                       vmx_flush_tlb(vcpu, true);
+                       /*
+                        * If L1 uses EPT, then L0 needs to execute INVEPT on
+                        * EPTP02 instead of EPTP01. Therefore, delay TLB
+                        * flush until vmcs02->eptp is fully updated by
+                        * KVM_REQ_LOAD_CR3. Note that this assumes
+                        * KVM_REQ_TLB_FLUSH is evaluated after
+                        * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
+                        */
+                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                }
        }
  
-       if (enable_pml) {
-               /*
-                * Conceptually we want to copy the PML address and index from
-                * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
-                * since we always flush the log on each vmexit, this happens
-                * to be equivalent to simply resetting the fields in vmcs02.
-                */
-               ASSERT(vmx->pml_pg);
-               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
-               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
-       }
-       if (nested_cpu_has_ept(vmcs12)) {
-               if (nested_ept_init_mmu_context(vcpu)) {
-                       *entry_failure_code = ENTRY_FAIL_DEFAULT;
-                       return 1;
-               }
-       } else if (nested_cpu_has2(vmcs12,
-                                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+       if (nested_cpu_has_ept(vmcs12))
+               nested_ept_init_mmu_context(vcpu);
+       else if (nested_cpu_has2(vmcs12,
+                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                vmx_flush_tlb(vcpu, true);
-       }
  
        /*
         * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
  
-       if (vmx->nested.nested_run_pending &&
-           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
-               vcpu->arch.efer = vmcs12->guest_ia32_efer;
-       else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
-               vcpu->arch.efer |= (EFER_LMA | EFER_LME);
-       else
-               vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
-       /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+       vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
+       /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
        vmx_set_efer(vcpu, vcpu->arch.efer);
  
        /*
@@@ -12383,6 -12970,7 +12974,7 @@@ static int nested_vmx_check_nmi_control
  static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       bool ia32e;
  
        if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
            vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
            !nested_cr3_valid(vcpu, vmcs12->host_cr3))
                return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
  
+       /*
+        * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+        * IA32_EFER MSR must be 0 in the field for that register. In addition,
+        * the values of the LMA and LME bits in the field must each be that of
+        * the host address-space size VM-exit control.
+        */
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+               ia32e = (vmcs12->vm_exit_controls &
+                        VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+               if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+                       return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+       }
        /*
         * From the Intel SDM, volume 3:
         * Fields relevant to VM-entry event injection must be set properly.
                }
        }
  
+       if (nested_cpu_has_ept(vmcs12) &&
+           !valid_ept_address(vcpu, vmcs12->ept_pointer))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
        return 0;
  }
  
@@@ -12532,94 -13139,192 +13143,192 @@@ static int nested_vmx_check_vmcs_link_p
        if (is_error_page(page))
                return -EINVAL;
  
-       r = 0;
-       shadow = kmap(page);
-       if (shadow->hdr.revision_id != VMCS12_REVISION ||
-           shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
-               r = -EINVAL;
-       kunmap(page);
-       kvm_release_page_clean(page);
-       return r;
- }
+       r = 0;
+       shadow = kmap(page);
+       if (shadow->hdr.revision_id != VMCS12_REVISION ||
+           shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
+               r = -EINVAL;
+       kunmap(page);
+       kvm_release_page_clean(page);
+       return r;
+ }
+
+ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+                                 u32 *exit_qual)
+ {
+       bool ia32e;
+       *exit_qual = ENTRY_FAIL_DEFAULT;
+       if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+           !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
+               return 1;
+       if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
+               *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+               return 1;
+       }
+       /*
+        * If the load IA32_EFER VM-entry control is 1, the following checks
+        * are performed on the field for the IA32_EFER MSR:
+        * - Bits reserved in the IA32_EFER MSR must be 0.
+        * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+        *   the IA-32e mode guest VM-exit control. It must also be identical
+        *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+        *   CR0.PG) is 1.
+        */
+       if (to_vmx(vcpu)->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
+               ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+               if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
+                   ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
+                   ((vmcs12->guest_cr0 & X86_CR0_PG) &&
+                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
+                       return 1;
+       }
+       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
+               (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
+               (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+                       return 1;
+       return 0;
+ }
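The IA32_EFER checks added above, for the host fields in check_vmentry_prereqs() and for the guest fields here, apply the same SDM rule: the relevant address-space-size control must agree with EFER.LMA, and also with EFER.LME once paging is enabled (for the host fields, unconditionally). A minimal sketch of the guest-side predicate, with hypothetical names (not part of this diff):

/* Sketch only: the LMA/LME vs. IA-32e-mode consistency rule used above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFER_LME	(1ULL << 8)
#define EFER_LMA	(1ULL << 10)
#define X86_CR0_PG	(1ULL << 31)

static bool guest_efer_consistent(uint64_t efer, uint64_t cr0, bool ia32e_mode)
{
	if (ia32e_mode != !!(efer & EFER_LMA))
		return false;
	if ((cr0 & X86_CR0_PG) && ia32e_mode != !!(efer & EFER_LME))
		return false;
	return true;
}

int main(void)
{
	/* 64-bit guest: LMA+LME set, CR0.PG set, IA-32e mode control set. */
	printf("%d\n", guest_efer_consistent(EFER_LMA | EFER_LME, X86_CR0_PG, true));
	/* Inconsistent: IA-32e mode control set but EFER.LMA clear. */
	printf("%d\n", guest_efer_consistent(0, X86_CR0_PG, true));
	return 0;
}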
+ static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+       if (!nested_early_check)
+               return 0;
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+       if (vmx->msr_autoload.guest.nr)
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+       preempt_disable();
+       vmx_prepare_switch_to_guest(vcpu);
+       /*
+        * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
+        * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
+        * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
+        * there is no need to preserve other bits or save/restore the field.
+        */
+       vmcs_writel(GUEST_RFLAGS, 0);
+       vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
+       cr3 = __get_current_cr3_fast();
+       if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+               vmcs_writel(HOST_CR3, cr3);
+               vmx->loaded_vmcs->host_state.cr3 = cr3;
+       }
+       cr4 = cr4_read_shadow();
+       if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+               vmx->loaded_vmcs->host_state.cr4 = cr4;
+       }
+       vmx->__launched = vmx->loaded_vmcs->launched;
+       asm(
+               /* Set HOST_RSP */
+               __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
+               "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
+               /* Check if vmlaunch or vmresume is needed */
+               "cmpl $0, %c[launched](%0)\n\t"
+               "je 1f\n\t"
+               __ex("vmresume") "\n\t"
+               "jmp 2f\n\t"
+               "1: " __ex("vmlaunch") "\n\t"
+               "jmp 2f\n\t"
+               "2: "
+               /* Set vmx->fail accordingly */
+               "setbe %c[fail](%0)\n\t"
  
- static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                                 u32 *exit_qual)
- {
-       bool ia32e;
+               ".pushsection .rodata\n\t"
+               ".global vmx_early_consistency_check_return\n\t"
+               "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
+               ".popsection"
+             :
+             : "c"(vmx), "d"((unsigned long)HOST_RSP),
+               [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+               [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+               [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
+             : "rax", "cc", "memory"
+       );
  
-       *exit_qual = ENTRY_FAIL_DEFAULT;
+       vmcs_writel(HOST_RIP, vmx_return);
  
-       if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
-           !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
-               return 1;
+       preempt_enable();
  
-       if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
-               *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       if (vmx->msr_autoload.guest.nr)
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+       if (vmx->fail) {
+               WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                            VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               vmx->fail = 0;
                return 1;
        }
  
        /*
-        * If the load IA32_EFER VM-entry control is 1, the following checks
-        * are performed on the field for the IA32_EFER MSR:
-        * - Bits reserved in the IA32_EFER MSR must be 0.
-        * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
-        *   the IA-32e mode guest VM-exit control. It must also be identical
-        *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
-        *   CR0.PG) is 1.
+        * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
         */
-       if (to_vmx(vcpu)->nested.nested_run_pending &&
-           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
-               ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
-               if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
-                   ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
-                   ((vmcs12->guest_cr0 & X86_CR0_PG) &&
-                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
-                       return 1;
-       }
+       local_irq_enable();
+       if (hw_breakpoint_active())
+               set_debugreg(__this_cpu_read(cpu_dr7), 7);
  
        /*
-        * If the load IA32_EFER VM-exit control is 1, bits reserved in the
-        * IA32_EFER MSR must be 0 in the field for that register. In addition,
-        * the values of the LMA and LME bits in the field must each be that of
-        * the host address-space size VM-exit control.
+        * A non-failing VMEntry means we somehow entered guest mode with
+        * an illegal RIP, and that's just the tip of the iceberg.  There
+        * is no telling what memory has been modified or what state has
+        * been exposed to unknown code.  Hitting this all but guarantees
+        * a (very critical) hardware issue.
         */
-       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
-               ia32e = (vmcs12->vm_exit_controls &
-                        VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
-               if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
-                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
-                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
-                       return 1;
-       }
-       if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
-               (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
-               (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
-                       return 1;
+       WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
+               VMX_EXIT_REASONS_FAILED_VMENTRY));
  
        return 0;
  }
+ STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
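nested_vmx_check_vmentry_hw() above ties into the HOST_RSP caching behaviour described in prepare_vmcs02_early(): vmx->host_rsp is zeroed there so the next real entry is forced to rewrite the field, while this function writes HOST_RSP unconditionally from its asm block. A small user-space sketch of that cache-invalidation pattern (names are made up; the real write is the VMWRITE issued from vmx_vcpu_run()):

/* Sketch only: the "zero the cache to force a rewrite" pattern for HOST_RSP. */
#include <stdint.h>
#include <stdio.h>

struct rsp_cache {
	uint64_t host_rsp;	/* last value written to the HOST_RSP field */
};

/* Hypothetical stand-in for the VMWRITE done just before VM entry. */
static void vmwrite_host_rsp(uint64_t val)
{
	printf("HOST_RSP <- %#llx\n", (unsigned long long)val);
}

static void maybe_write_host_rsp(struct rsp_cache *c, uint64_t current_rsp)
{
	if (current_rsp != c->host_rsp) {
		c->host_rsp = current_rsp;
		vmwrite_host_rsp(current_rsp);
	}
}

int main(void)
{
	struct rsp_cache c = { .host_rsp = 0x7ffd0000beefULL };

	maybe_write_host_rsp(&c, 0x7ffd0000beefULL);	/* cached: no write */
	c.host_rsp = 0;					/* invalidate, as done on vmcs switch */
	maybe_write_host_rsp(&c, 0x7ffd0000beefULL);	/* forced rewrite */
	return 0;
}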
+ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+                                  struct vmcs12 *vmcs12);
  
  /*
-  * If exit_qual is NULL, this is being called from state restore (either RSM
+  * If from_vmentry is false, this is being called from state restore (either RSM
   * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
+ + *
+ + * Returns:
+ + *   0 - success, i.e. proceed with actual VMEnter
+ + *   1 - consistency check VMExit
+ + *  -1 - consistency check VMFail
   */
- static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
+ static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+                                         bool from_vmentry)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);