Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 13 Apr 2015 16:47:01 +0000 (09:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 13 Apr 2015 16:47:01 +0000 (09:47 -0700)
Pull KVM updates from Paolo Bonzini:
 "First batch of KVM changes for 4.1

  The most interesting bit here is irqfd/ioeventfd support for ARM and
  ARM64.

  Summary:

  ARM/ARM64:
     fixes for live migration, irqfd and ioeventfd support (enabling
     vhost, too), page aging

  s390:
     interrupt handling rework, allowing all local interrupts to be
     injected via a new ioctl, and the full local irq state to be
     saved and restored for migration and introspection.  New ioctls
     to access memory by virtual address, and to get/set the guest
     storage keys.  SIMD support.

  MIPS:
     FPU and MIPS SIMD Architecture (MSA) support.  Includes some
     patches from Ralf Baechle's MIPS tree.

  x86:
     bugfixes (notably for pvclock, the others are small) and cleanups.
     Another small latency improvement for the TSC deadline timer"
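
The headline s390 addition, KVM_S390_MEM_OP, reads or writes guest memory
through the virtual-address translation of a given vcpu. A minimal userspace
sketch of a logical read follows, assuming a vcpu_fd already obtained via
KVM_CREATE_VCPU and uapi headers from this kernel; the helper name is
illustrative, not part of the API:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Sketch: read 'len' bytes at guest logical address 'gaddr' into 'dst'.
 * Field names match what kvm_s390_guest_mem_op() consumes in the
 * kvm-s390.c diff below; the kernel caps 'size' at 64k (MEM_OP_MAX_SIZE). */
static int read_guest_logical(int vcpu_fd, unsigned long long gaddr,
                              void *dst, unsigned int len)
{
        struct kvm_s390_mem_op mop;

        memset(&mop, 0, sizeof(mop));
        mop.gaddr = gaddr;                      /* guest logical address */
        mop.size  = len;
        mop.op    = KVM_S390_MEMOP_LOGICAL_READ;
        mop.buf   = (unsigned long)dst;         /* userspace buffer */
        mop.ar    = 0;                          /* access register number */

        return ioctl(vcpu_fd, KVM_S390_MEM_OP, &mop);
}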

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (146 commits)
  KVM: use slowpath for cross page cached accesses
  kvm: mmu: lazy collapse small sptes into large sptes
  KVM: x86: Clear CR2 on VCPU reset
  KVM: x86: DR0-DR3 are not clear on reset
  KVM: x86: BSP in MSR_IA32_APICBASE is writable
  KVM: x86: simplify kvm_apic_map
  KVM: x86: avoid logical_map when it is invalid
  KVM: x86: fix mixed APIC mode broadcast
  KVM: x86: use MDA for interrupt matching
  kvm/ppc/mpic: drop unused IRQ_testbit
  KVM: nVMX: remove unnecessary double caching of MAXPHYADDR
  KVM: nVMX: checks for address bits beyond MAXPHYADDR on VM-entry
  KVM: x86: cache maxphyaddr CPUID leaf in struct kvm_vcpu
  KVM: vmx: pass error code with internal error #2
  x86: vdso: fix pvclock races with task migration
  KVM: remove kvm_read_hva and kvm_read_hva_atomic
  KVM: x86: optimize delivery of TSC deadline timer interrupt
  KVM: x86: extract blocking logic from __vcpu_run
  kvm: x86: fix x86 eflags fixed bit
  KVM: s390: migrate vcpu interrupt state
  ...

MAINTAINERS
arch/s390/kvm/kvm-s390.c
arch/x86/kvm/ioapic.c
arch/x86/kvm/lapic.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/linux/sched.h
kernel/sched/core.c
virt/kvm/kvm_main.c

diff --combined MAINTAINERS
index e8bdf1b17cdb09be255874c81eacc40d0f554c62,15e4015c5a2f8b9315d6e1fececbc4e08d2f45e9..b84686826b23cca2bb9699d6fae5d9b8cae35afa
@@@ -637,7 -637,8 +637,7 @@@ F:      drivers/gpu/drm/radeon/radeon_k
  F:      include/uapi/linux/kfd_ioctl.h
  
  AMD MICROCODE UPDATE SUPPORT
 -M:    Andreas Herrmann <herrmann.der.user@googlemail.com>
 -L:    amd64-microcode@amd64.org
 +M:    Borislav Petkov <bp@alien8.de>
  S:    Maintained
  F:    arch/x86/kernel/cpu/microcode/amd*
  
@@@ -1029,16 -1030,6 +1029,16 @@@ F:    arch/arm/mach-mxs
  F:    arch/arm/boot/dts/imx*
  F:    arch/arm/configs/imx*_defconfig
  
 +ARM/FREESCALE VYBRID ARM ARCHITECTURE
 +M:    Shawn Guo <shawn.guo@linaro.org>
 +M:    Sascha Hauer <kernel@pengutronix.de>
 +R:    Stefan Agner <stefan@agner.ch>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +S:    Maintained
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git
 +F:    arch/arm/mach-imx/*vf610*
 +F:    arch/arm/boot/dts/vf*
 +
  ARM/GLOMATION GESBC9312SX MACHINE SUPPORT
  M:    Lennert Buytenhek <kernel@wantstofly.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1185,7 -1176,7 +1185,7 @@@ M:      Sebastian Hesselbarth <sebastian.hes
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    arch/arm/mach-mvebu/
 -F:    drivers/rtc/armada38x-rtc
 +F:    drivers/rtc/rtc-armada38x.c
  
  ARM/Marvell Berlin SoC support
  M:    Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
@@@ -1197,7 -1188,6 +1197,7 @@@ ARM/Marvell Dove/MV78xx0/Orion SOC supp
  M:    Jason Cooper <jason@lakedaemon.net>
  M:    Andrew Lunn <andrew@lunn.ch>
  M:    Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
 +M:    Gregory Clement <gregory.clement@free-electrons.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    arch/arm/mach-dove/
@@@ -1361,7 -1351,6 +1361,7 @@@ F:      drivers/i2c/busses/i2c-rk3x.
  F:    drivers/*/*rockchip*
  F:    drivers/*/*/*rockchip*
  F:    sound/soc/rockchip/
 +N:    rockchip
  
  ARM/SAMSUNG EXYNOS ARM ARCHITECTURES
  M:    Kukjin Kim <kgene@kernel.org>
@@@ -1675,8 -1664,8 +1675,8 @@@ F:      drivers/misc/eeprom/at24.
  F:    include/linux/platform_data/at24.h
  
  ATA OVER ETHERNET (AOE) DRIVER
 -M:    "Ed L. Cashin" <ecashin@coraid.com>
 -W:    http://support.coraid.com/support/linux
 +M:    "Ed L. Cashin" <ed.cashin@acm.org>
 +W:    http://www.openaoe.org/
  S:    Supported
  F:    Documentation/aoe/
  F:    drivers/block/aoe/
@@@ -1741,7 -1730,7 +1741,7 @@@ S:      Maintaine
  F:    drivers/net/ethernet/atheros/
  
  ATM
 -M:    Chas Williams <chas@cmf.nrl.navy.mil>
 +M:    Chas Williams <3chas3@gmail.com>
  L:    linux-atm-general@lists.sourceforge.net (moderated for non-subscribers)
  L:    netdev@vger.kernel.org
  W:    http://linux-atm.sourceforge.net
@@@ -2118,6 -2107,7 +2118,6 @@@ F:      drivers/net/ethernet/broadcom/bnx2x
  
  BROADCOM BCM281XX/BCM11XXX/BCM216XX ARM ARCHITECTURE
  M:    Christian Daudt <bcm@fixthebug.org>
 -M:    Matt Porter <mporter@linaro.org>
  M:    Florian Fainelli <f.fainelli@gmail.com>
  L:    bcm-kernel-feedback-list@broadcom.com
  T:    git git://github.com/broadcom/mach-bcm
@@@ -3252,13 -3242,6 +3252,13 @@@ S:    Maintaine
  F:    Documentation/hwmon/dme1737
  F:    drivers/hwmon/dme1737.c
  
 +DMI/SMBIOS SUPPORT
 +M:    Jean Delvare <jdelvare@suse.de>
 +S:    Maintained
 +F:    drivers/firmware/dmi-id.c
 +F:    drivers/firmware/dmi_scan.c
 +F:    include/linux/dmi.h
 +
  DOCKING STATION DRIVER
  M:    Shaohua Li <shaohua.li@intel.com>
  L:    linux-acpi@vger.kernel.org
@@@ -5094,7 -5077,7 +5094,7 @@@ S:      Supporte
  F:    drivers/platform/x86/intel_menlow.c
  
  INTEL IA32 MICROCODE UPDATE SUPPORT
 -M:    Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
 +M:    Borislav Petkov <bp@alien8.de>
  S:    Maintained
  F:    arch/x86/kernel/cpu/microcode/core*
  F:    arch/x86/kernel/cpu/microcode/intel*
@@@ -5135,21 -5118,22 +5135,21 @@@ M:   Deepak Saxena <dsaxena@plexity.net
  S:    Maintained
  F:    drivers/char/hw_random/ixp4xx-rng.c
  
 -INTEL ETHERNET DRIVERS (e100/e1000/e1000e/fm10k/igb/igbvf/ixgb/ixgbe/ixgbevf/i40e/i40evf)
 +INTEL ETHERNET DRIVERS
  M:    Jeff Kirsher <jeffrey.t.kirsher@intel.com>
 -M:    Jesse Brandeburg <jesse.brandeburg@intel.com>
 -M:    Bruce Allan <bruce.w.allan@intel.com>
 -M:    Carolyn Wyborny <carolyn.wyborny@intel.com>
 -M:    Don Skidmore <donald.c.skidmore@intel.com>
 -M:    Greg Rose <gregory.v.rose@intel.com>
 -M:    Matthew Vick <matthew.vick@intel.com>
 -M:    John Ronciak <john.ronciak@intel.com>
 -M:    Mitch Williams <mitch.a.williams@intel.com>
 -M:    Linux NICS <linux.nics@intel.com>
 -L:    e1000-devel@lists.sourceforge.net
 +R:    Jesse Brandeburg <jesse.brandeburg@intel.com>
 +R:    Shannon Nelson <shannon.nelson@intel.com>
 +R:    Carolyn Wyborny <carolyn.wyborny@intel.com>
 +R:    Don Skidmore <donald.c.skidmore@intel.com>
 +R:    Matthew Vick <matthew.vick@intel.com>
 +R:    John Ronciak <john.ronciak@intel.com>
 +R:    Mitch Williams <mitch.a.williams@intel.com>
 +L:    intel-wired-lan@lists.osuosl.org
  W:    http://www.intel.com/support/feedback.htm
  W:    http://e1000.sourceforge.net/
 -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net.git
 -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-next.git
 +Q:    http://patchwork.ozlabs.org/project/intel-wired-lan/list/
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue.git
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git
  S:    Supported
  F:    Documentation/networking/e100.txt
  F:    Documentation/networking/e1000.txt
@@@ -5591,6 -5575,8 +5591,8 @@@ S:      Supporte
  F:    Documentation/*/kvm*.txt
  F:    Documentation/virtual/kvm/
  F:    arch/*/kvm/
+ F:    arch/x86/kernel/kvm.c
+ F:    arch/x86/kernel/kvmclock.c
  F:    arch/*/include/asm/kvm*
  F:    include/linux/kvm*
  F:    include/uapi/linux/kvm*
@@@ -8559,7 -8545,6 +8561,7 @@@ F:      include/uapi/linux/timex.
  F:    kernel/time/clocksource.c
  F:    kernel/time/time*.c
  F:    kernel/time/ntp.c
 +F:    tools/testing/selftests/timers/
  
  SC1200 WDT DRIVER
  M:    Zwane Mwaikambo <zwanem@gmail.com>
@@@ -10214,13 -10199,6 +10216,13 @@@ S: Maintaine
  F:    Documentation/usb/ohci.txt
  F:    drivers/usb/host/ohci*
  
 +USB OTG FSM (Finite State Machine)
 +M:    Peter Chen <Peter.Chen@freescale.com>
 +T:    git git://github.com/hzpeterchen/linux-usb.git
 +L:    linux-usb@vger.kernel.org
 +S:    Maintained
 +F:    drivers/usb/common/usb-otg-fsm.c
 +
  USB OVER IP DRIVER
  M:    Valentina Manea <valentina.manea.m@gmail.com>
  M:    Shuah Khan <shuah.kh@samsung.com>
diff --combined arch/s390/kvm/kvm-s390.c
index 19e17bd7aec09b2662874a3925e3d55f4e4207f4,3040b14751b8154e1bd9bc64f7fac964e5cc4d2c..afa2bd750ffc814d36300bc491a0a6eff516bda4
  #include <linux/random.h>
  #include <linux/slab.h>
  #include <linux/timer.h>
+ #include <linux/vmalloc.h>
  #include <asm/asm-offsets.h>
  #include <asm/lowcore.h>
  #include <asm/pgtable.h>
  #include <asm/nmi.h>
  #include <asm/switch_to.h>
+ #include <asm/isc.h>
  #include <asm/sclp.h>
  #include "kvm-s390.h"
  #include "gaccess.h"
  #include "trace.h"
  #include "trace-s390.h"
  
+ #define MEM_OP_MAX_SIZE 65536 /* Maximum transfer size for KVM_S390_MEM_OP */
+ #define LOCAL_IRQS 32
+ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
+                          (KVM_MAX_VCPUS + LOCAL_IRQS))
  #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  
  struct kvm_stats_debugfs_item debugfs_entries[] = {
@@@ -87,6 -94,7 +94,7 @@@
        { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
        { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) },
        { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) },
+       { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) },
        { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
        { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
        { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
  
  /* upper facilities limit for kvm */
  unsigned long kvm_s390_fac_list_mask[] = {
-       0xff82fffbf4fc2000UL,
-       0x005c000000000000UL,
+       0xffe6fffbfcfdfc40UL,
+       0x205c800000000000UL,
  };
  
  unsigned long kvm_s390_fac_list_mask_size(void)
@@@ -165,15 -173,23 +173,22 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_ONE_REG:
        case KVM_CAP_ENABLE_CAP:
        case KVM_CAP_S390_CSS_SUPPORT:
 -      case KVM_CAP_IRQFD:
        case KVM_CAP_IOEVENTFD:
        case KVM_CAP_DEVICE_CTRL:
        case KVM_CAP_ENABLE_CAP_VM:
        case KVM_CAP_S390_IRQCHIP:
        case KVM_CAP_VM_ATTRIBUTES:
        case KVM_CAP_MP_STATE:
+       case KVM_CAP_S390_INJECT_IRQ:
        case KVM_CAP_S390_USER_SIGP:
+       case KVM_CAP_S390_USER_STSI:
+       case KVM_CAP_S390_SKEYS:
+       case KVM_CAP_S390_IRQ_STATE:
                r = 1;
                break;
+       case KVM_CAP_S390_MEM_OP:
+               r = MEM_OP_MAX_SIZE;
+               break;
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
                r = KVM_MAX_VCPUS;
        case KVM_CAP_S390_COW:
                r = MACHINE_HAS_ESOP;
                break;
+       case KVM_CAP_S390_VECTOR_REGISTERS:
+               r = MACHINE_HAS_VX;
+               break;
        default:
                r = 0;
        }
@@@ -264,6 -283,18 +282,18 @@@ static int kvm_vm_ioctl_enable_cap(stru
                kvm->arch.user_sigp = 1;
                r = 0;
                break;
+       case KVM_CAP_S390_VECTOR_REGISTERS:
+               if (MACHINE_HAS_VX) {
+                       set_kvm_facility(kvm->arch.model.fac->mask, 129);
+                       set_kvm_facility(kvm->arch.model.fac->list, 129);
+                       r = 0;
+               } else
+                       r = -EINVAL;
+               break;
+       case KVM_CAP_S390_USER_STSI:
+               kvm->arch.user_stsi = 1;
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
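
KVM_CAP_S390_VECTOR_REGISTERS above is deliberately opt-in so that older
userspace keeps the pre-SIMD register layout. A sketch of how userspace would
enable it per-VM before creating any vcpus (vm_fd assumed to come from
KVM_CREATE_VM; the helper name is illustrative):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Sketch: enable guest vector registers (facility 129) for a VM.
 * Fails with errno EINVAL when the host lacks the vector facility,
 * matching the MACHINE_HAS_VX check above. */
static int enable_guest_vx(int vm_fd)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_S390_VECTOR_REGISTERS;

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
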
@@@ -708,6 -739,108 +738,108 @@@ static int kvm_s390_vm_has_attr(struct 
        return ret;
  }
  
+ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
+ {
+       uint8_t *keys;
+       uint64_t hva;
+       unsigned long curkey;
+       int i, r = 0;
+       if (args->flags != 0)
+               return -EINVAL;
+       /* Is this guest using storage keys? */
+       if (!mm_use_skey(current->mm))
+               return KVM_S390_GET_SKEYS_NONE;
+       /* Enforce sane limit on memory allocation */
+       if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
+               return -EINVAL;
+       keys = kmalloc_array(args->count, sizeof(uint8_t),
+                            GFP_KERNEL | __GFP_NOWARN);
+       if (!keys)
+               keys = vmalloc(sizeof(uint8_t) * args->count);
+       if (!keys)
+               return -ENOMEM;
+       for (i = 0; i < args->count; i++) {
+               hva = gfn_to_hva(kvm, args->start_gfn + i);
+               if (kvm_is_error_hva(hva)) {
+                       r = -EFAULT;
+                       goto out;
+               }
+               curkey = get_guest_storage_key(current->mm, hva);
+               if (IS_ERR_VALUE(curkey)) {
+                       r = curkey;
+                       goto out;
+               }
+               keys[i] = curkey;
+       }
+       r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+                        sizeof(uint8_t) * args->count);
+       if (r)
+               r = -EFAULT;
+ out:
+       kvfree(keys);
+       return r;
+ }
+
+ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
+ {
+       uint8_t *keys;
+       uint64_t hva;
+       int i, r = 0;
+       if (args->flags != 0)
+               return -EINVAL;
+       /* Enforce sane limit on memory allocation */
+       if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
+               return -EINVAL;
+       keys = kmalloc_array(args->count, sizeof(uint8_t),
+                            GFP_KERNEL | __GFP_NOWARN);
+       if (!keys)
+               keys = vmalloc(sizeof(uint8_t) * args->count);
+       if (!keys)
+               return -ENOMEM;
+       r = copy_from_user(keys, (uint8_t __user *)args->skeydata_addr,
+                          sizeof(uint8_t) * args->count);
+       if (r) {
+               r = -EFAULT;
+               goto out;
+       }
+       /* Enable storage key handling for the guest */
+       s390_enable_skey();
+       for (i = 0; i < args->count; i++) {
+               hva = gfn_to_hva(kvm, args->start_gfn + i);
+               if (kvm_is_error_hva(hva)) {
+                       r = -EFAULT;
+                       goto out;
+               }
+               /* Lowest order bit is reserved */
+               if (keys[i] & 0x01) {
+                       r = -EINVAL;
+                       goto out;
+               }
+               r = set_guest_storage_key(current->mm, hva,
+                                         (unsigned long)keys[i], 0);
+               if (r)
+                       goto out;
+       }
+ out:
+       kvfree(keys);
+       return r;
+ }
+
  long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
  {
                r = kvm_s390_vm_has_attr(kvm, &attr);
                break;
        }
+       case KVM_S390_GET_SKEYS: {
+               struct kvm_s390_skeys args;
+               r = -EFAULT;
+               if (copy_from_user(&args, argp,
+                                  sizeof(struct kvm_s390_skeys)))
+                       break;
+               r = kvm_s390_get_skeys(kvm, &args);
+               break;
+       }
+       case KVM_S390_SET_SKEYS: {
+               struct kvm_s390_skeys args;
+               r = -EFAULT;
+               if (copy_from_user(&args, argp,
+                                  sizeof(struct kvm_s390_skeys)))
+                       break;
+               r = kvm_s390_set_skeys(kvm, &args);
+               break;
+       }
        default:
                r = -ENOTTY;
        }
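
A hedged userspace counterpart to the two dispatch cases above: pull a range
of guest storage keys, e.g. for migration. Field names match what
kvm_s390_get_skeys() consumes; the helper name and error handling are
illustrative assumptions:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Sketch: fetch 'count' storage keys starting at guest frame 'start_gfn'.
 * Returns 0 on success, or KVM_S390_GET_SKEYS_NONE when the guest never
 * enabled storage keys; 'keybuf' must hold 'count' bytes. */
static int get_guest_skeys(int vm_fd, unsigned long long start_gfn,
                           unsigned long long count, unsigned char *keybuf)
{
        struct kvm_s390_skeys args;

        memset(&args, 0, sizeof(args));
        args.start_gfn     = start_gfn;
        args.count         = count;   /* 1..KVM_S390_SKEYS_MAX */
        args.skeydata_addr = (unsigned long)keybuf;

        return ioctl(vm_fd, KVM_S390_GET_SKEYS, &args);
}
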
@@@ -887,7 -1040,7 +1039,7 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
  
        kvm->arch.dbf = debug_register(debug_name, 8, 2, 8 * sizeof(long));
        if (!kvm->arch.dbf)
-               goto out_nodbf;
+               goto out_err;
  
        /*
         * The architectural maximum amount of facilities is 16 kbit. To store
        kvm->arch.model.fac =
                (struct kvm_s390_fac *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
        if (!kvm->arch.model.fac)
-               goto out_nofac;
+               goto out_err;
  
        /* Populate the facility mask initially. */
        memcpy(kvm->arch.model.fac->mask, S390_lowcore.stfle_fac_list,
        kvm->arch.model.ibc = sclp_get_ibc() & 0x0fff;
  
        if (kvm_s390_crypto_init(kvm) < 0)
-               goto out_crypto;
+               goto out_err;
  
        spin_lock_init(&kvm->arch.float_int.lock);
-       INIT_LIST_HEAD(&kvm->arch.float_int.list);
+       for (i = 0; i < FIRQ_LIST_COUNT; i++)
+               INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]);
        init_waitqueue_head(&kvm->arch.ipte_wq);
        mutex_init(&kvm->arch.ipte_mutex);
  
        } else {
                kvm->arch.gmap = gmap_alloc(current->mm, (1UL << 44) - 1);
                if (!kvm->arch.gmap)
-                       goto out_nogmap;
+                       goto out_err;
                kvm->arch.gmap->private = kvm;
                kvm->arch.gmap->pfault_enabled = 0;
        }
        spin_lock_init(&kvm->arch.start_stop_lock);
  
        return 0;
- out_nogmap:
+ out_err:
        kfree(kvm->arch.crypto.crycb);
- out_crypto:
        free_page((unsigned long)kvm->arch.model.fac);
- out_nofac:
        debug_unregister(kvm->arch.dbf);
- out_nodbf:
        free_page((unsigned long)(kvm->arch.sca));
- out_err:
        return rc;
  }
  
@@@ -1034,6 -1184,8 +1183,8 @@@ int kvm_arch_vcpu_init(struct kvm_vcpu 
                                    KVM_SYNC_CRS |
                                    KVM_SYNC_ARCH0 |
                                    KVM_SYNC_PFAULT;
+       if (test_kvm_facility(vcpu->kvm, 129))
+               vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
  
        if (kvm_is_ucontrol(vcpu->kvm))
                return __kvm_ucontrol_vcpu_init(vcpu);
  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  {
        save_fp_ctl(&vcpu->arch.host_fpregs.fpc);
-       save_fp_regs(vcpu->arch.host_fpregs.fprs);
+       if (test_kvm_facility(vcpu->kvm, 129))
+               save_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs);
+       else
+               save_fp_regs(vcpu->arch.host_fpregs.fprs);
        save_access_regs(vcpu->arch.host_acrs);
-       restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-       restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               restore_fp_ctl(&vcpu->run->s.regs.fpc);
+               restore_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
+       } else {
+               restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
+               restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       }
        restore_access_regs(vcpu->run->s.regs.acrs);
        gmap_enable(vcpu->arch.gmap);
        atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
@@@ -1057,11 -1217,19 +1216,19 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
  {
        atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        gmap_disable(vcpu->arch.gmap);
-       save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-       save_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               save_fp_ctl(&vcpu->run->s.regs.fpc);
+               save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
+       } else {
+               save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
+               save_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       }
        save_access_regs(vcpu->run->s.regs.acrs);
        restore_fp_ctl(&vcpu->arch.host_fpregs.fpc);
-       restore_fp_regs(vcpu->arch.host_fpregs.fprs);
+       if (test_kvm_facility(vcpu->kvm, 129))
+               restore_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs);
+       else
+               restore_fp_regs(vcpu->arch.host_fpregs.fprs);
        restore_access_regs(vcpu->arch.host_acrs);
  }
  
@@@ -1129,6 -1297,15 +1296,15 @@@ int kvm_s390_vcpu_setup_cmma(struct kvm
        return 0;
  }
  
+ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_s390_cpu_model *model = &vcpu->kvm->arch.model;
+       vcpu->arch.cpu_id = model->cpu_id;
+       vcpu->arch.sie_block->ibc = model->ibc;
+       vcpu->arch.sie_block->fac = (int) (long) model->fac->list;
+ }
+
  int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
  {
        int rc = 0;
                                                    CPUSTAT_SM |
                                                    CPUSTAT_STOPPED |
                                                    CPUSTAT_GED);
+       kvm_s390_vcpu_setup_model(vcpu);
        vcpu->arch.sie_block->ecb   = 6;
        if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
                vcpu->arch.sie_block->ecb |= 0x10;
                vcpu->arch.sie_block->eca |= 1;
        if (sclp_has_sigpif())
                vcpu->arch.sie_block->eca |= 0x10000000U;
-       vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE |
-                                     ICTL_TPROT;
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               vcpu->arch.sie_block->eca |= 0x00020000;
+               vcpu->arch.sie_block->ecd |= 0x20000000;
+       }
+       vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
  
        if (kvm_s390_cmma_enabled(vcpu->kvm)) {
                rc = kvm_s390_vcpu_setup_cmma(vcpu);
        hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
  
-       mutex_lock(&vcpu->kvm->lock);
-       vcpu->arch.cpu_id = vcpu->kvm->arch.model.cpu_id;
-       vcpu->arch.sie_block->ibc = vcpu->kvm->arch.model.ibc;
-       mutex_unlock(&vcpu->kvm->lock);
        kvm_s390_vcpu_crypto_setup(vcpu);
  
        return rc;
@@@ -1190,6 -1367,7 +1366,7 @@@ struct kvm_vcpu *kvm_arch_vcpu_create(s
  
        vcpu->arch.sie_block = &sie_page->sie_block;
        vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
+       vcpu->arch.host_vregs = &sie_page->vregs;
  
        vcpu->arch.sie_block->icpua = id;
        if (!kvm_is_ucontrol(kvm)) {
                vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
                set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
        }
-       vcpu->arch.sie_block->fac = (int) (long) kvm->arch.model.fac->list;
  
        spin_lock_init(&vcpu->arch.local_int.lock);
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@@ -1725,6 -1902,31 +1901,31 @@@ static int vcpu_pre_run(struct kvm_vcp
        return 0;
  }
  
+ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
+ {
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       u8 opcode;
+       int rc;
+       VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
+       trace_kvm_s390_sie_fault(vcpu);
+       /*
+        * We want to inject an addressing exception, which is defined as a
+        * suppressing or terminating exception. However, since we came here
+        * by a DAT access exception, the PSW still points to the faulting
+        * instruction since DAT exceptions are nullifying. So we've got
+        * to look up the current opcode to get the length of the instruction
+        * to be able to forward the PSW.
+        */
+       rc = read_guest(vcpu, psw->addr, 0, &opcode, 1);
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
+       psw->addr = __rewind_psw(*psw, -insn_length(opcode));
+       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ }
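
The PSW rewind above works because s390 encodes an instruction's length in
the two top bits of its first opcode byte, which is what the kernel's
insn_length() helper decodes. A standalone restatement of that rule
(illustrative, not the kernel implementation):

/* s390 instruction length from bits 0-1 of the first opcode byte:
 * 00 -> 2 bytes, 01/10 -> 4 bytes, 11 -> 6 bytes. */
static int toy_insn_length(unsigned char opcode)
{
        switch (opcode >> 6) {
        case 0:
                return 2;
        case 3:
                return 6;
        default:
                return 4;
        }
}
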
  static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
  {
        int rc = -1;
                }
        }
  
-       if (rc == -1) {
-               VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
-               trace_kvm_s390_sie_fault(vcpu);
-               rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-       }
+       if (rc == -1)
+               rc = vcpu_post_run_fault_in_sie(vcpu);
  
        memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
  
@@@ -1976,6 -2175,35 +2174,35 @@@ int kvm_s390_vcpu_store_status(struct k
        return kvm_s390_store_status_unloaded(vcpu, addr);
  }
  
+ /*
+  * store additional status at address
+  */
+ int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
+                                       unsigned long gpa)
+ {
+       /* Only bits 0-53 are used for address formation */
+       if (!(gpa & ~0x3ff))
+               return 0;
+       return write_guest_abs(vcpu, gpa & ~0x3ff,
+                              (void *)&vcpu->run->s.regs.vrs, 512);
+ }
+
+ int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
+ {
+       if (!test_kvm_facility(vcpu->kvm, 129))
+               return 0;
+       /*
+        * The guest VXRS are in the host VXRs due to the lazy
+        * copying in vcpu load/put. Let's update our copies before we save
+        * it into the save area.
+        */
+       save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
+       return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
+ }
+
  static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
  {
        kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
@@@ -2100,6 -2328,65 +2327,65 @@@ static int kvm_vcpu_ioctl_enable_cap(st
        return r;
  }
  
+ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
+                                 struct kvm_s390_mem_op *mop)
+ {
+       void __user *uaddr = (void __user *)mop->buf;
+       void *tmpbuf = NULL;
+       int r, srcu_idx;
+       const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION
+                                   | KVM_S390_MEMOP_F_CHECK_ONLY;
+       if (mop->flags & ~supported_flags)
+               return -EINVAL;
+       if (mop->size > MEM_OP_MAX_SIZE)
+               return -E2BIG;
+       if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
+               tmpbuf = vmalloc(mop->size);
+               if (!tmpbuf)
+                       return -ENOMEM;
+       }
+       srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+       switch (mop->op) {
+       case KVM_S390_MEMOP_LOGICAL_READ:
+               if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, false);
+                       break;
+               }
+               r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
+               if (r == 0) {
+                       if (copy_to_user(uaddr, tmpbuf, mop->size))
+                               r = -EFAULT;
+               }
+               break;
+       case KVM_S390_MEMOP_LOGICAL_WRITE:
+               if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, true);
+                       break;
+               }
+               if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+                       r = -EFAULT;
+                       break;
+               }
+               r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
+               break;
+       default:
+               r = -EINVAL;
+       }
+       srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+       if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
+               kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+       vfree(tmpbuf);
+       return r;
+ }
+
  long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
  {
        long r;
  
        switch (ioctl) {
+       case KVM_S390_IRQ: {
+               struct kvm_s390_irq s390irq;
+               r = -EFAULT;
+               if (copy_from_user(&s390irq, argp, sizeof(s390irq)))
+                       break;
+               r = kvm_s390_inject_vcpu(vcpu, &s390irq);
+               break;
+       }
        case KVM_S390_INTERRUPT: {
                struct kvm_s390_interrupt s390int;
                struct kvm_s390_irq s390irq;
                r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
                break;
        }
+       case KVM_S390_MEM_OP: {
+               struct kvm_s390_mem_op mem_op;
+               if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
+                       r = kvm_s390_guest_mem_op(vcpu, &mem_op);
+               else
+                       r = -EFAULT;
+               break;
+       }
+       case KVM_S390_SET_IRQ_STATE: {
+               struct kvm_s390_irq_state irq_state;
+               r = -EFAULT;
+               if (copy_from_user(&irq_state, argp, sizeof(irq_state)))
+                       break;
+               if (irq_state.len > VCPU_IRQS_MAX_BUF ||
+                   irq_state.len == 0 ||
+                   irq_state.len % sizeof(struct kvm_s390_irq) > 0) {
+                       r = -EINVAL;
+                       break;
+               }
+               r = kvm_s390_set_irq_state(vcpu,
+                                          (void __user *) irq_state.buf,
+                                          irq_state.len);
+               break;
+       }
+       case KVM_S390_GET_IRQ_STATE: {
+               struct kvm_s390_irq_state irq_state;
+               r = -EFAULT;
+               if (copy_from_user(&irq_state, argp, sizeof(irq_state)))
+                       break;
+               if (irq_state.len == 0) {
+                       r = -EINVAL;
+                       break;
+               }
+               r = kvm_s390_get_irq_state(vcpu,
+                                          (__u8 __user *)  irq_state.buf,
+                                          irq_state.len);
+               break;
+       }
        default:
                r = -ENOTTY;
        }
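
During migration the two new irq-state cases above pair up: drain pending
local interrupts on the source, replay them on the destination. A sketch,
with fds and buffer sizing assumed by the caller (the kernel bounds the
length by VCPU_IRQS_MAX_BUF, defined at the top of this file; the return
value of the GET ioctl is, per the API documentation, the number of bytes
written):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Sketch: save a vcpu's pending local interrupts on the source host. */
static int save_irq_state(int vcpu_fd, void *buf, unsigned int buflen)
{
        struct kvm_s390_irq_state st;

        memset(&st, 0, sizeof(st));
        st.buf = (unsigned long)buf;
        st.len = buflen;

        return ioctl(vcpu_fd, KVM_S390_GET_IRQ_STATE, &st);
}

/* ...and replay them on the destination; 'len' must be a non-zero
 * multiple of sizeof(struct kvm_s390_irq), as checked above. */
static int restore_irq_state(int vcpu_fd, void *buf, unsigned int len)
{
        struct kvm_s390_irq_state st;

        memset(&st, 0, sizeof(st));
        st.buf = (unsigned long)buf;
        st.len = len;

        return ioctl(vcpu_fd, KVM_S390_SET_IRQ_STATE, &st);
}
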
diff --combined arch/x86/kvm/ioapic.c
index 46d4449772bc714daa658ea6424fb45659095c70,51889ec847b0a4864f021dc8c9c7b1d11b8bd3f3..28146f03c51421ce12f728d69613ded0a65699fd
@@@ -206,6 -206,8 +206,8 @@@ static int ioapic_set_irq(struct kvm_io
  
        old_irr = ioapic->irr;
        ioapic->irr |= mask;
+       if (edge)
+               ioapic->irr_delivered &= ~mask;
        if ((edge && old_irr == ioapic->irr) ||
            (!edge && entry.fields.remote_irr)) {
                ret = 0;
@@@ -349,7 -351,7 +351,7 @@@ static int ioapic_service(struct kvm_io
        irqe.shorthand = 0;
  
        if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
-               ioapic->irr &= ~(1 << irq);
+               ioapic->irr_delivered |= 1 << irq;
  
        if (irq == RTC_GSI && line_status) {
                /*
@@@ -422,7 -424,6 +424,7 @@@ static void __kvm_ioapic_update_eoi(str
                        struct kvm_ioapic *ioapic, int vector, int trigger_mode)
  {
        int i;
 +      struct kvm_lapic *apic = vcpu->arch.apic;
  
        for (i = 0; i < IOAPIC_NUM_PINS; i++) {
                union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
                kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
                spin_lock(&ioapic->lock);
  
 -              if (trigger_mode != IOAPIC_LEVEL_TRIG)
 +              if (trigger_mode != IOAPIC_LEVEL_TRIG ||
 +                  kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
                        continue;
  
                ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
        }
  }
  
- bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
- {
-       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-       smp_rmb();
-       return test_bit(vector, ioapic->handled_vectors);
- }
  void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
  {
        struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
@@@ -500,8 -493,8 +495,8 @@@ static inline int ioapic_in_range(struc
                 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
  }
  
- static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
-                           void *val)
+ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+                               gpa_t addr, int len, void *val)
  {
        struct kvm_ioapic *ioapic = to_ioapic(this);
        u32 result;
        return 0;
  }
  
- static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
-                            const void *val)
+ static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+                                gpa_t addr, int len, const void *val)
  {
        struct kvm_ioapic *ioapic = to_ioapic(this);
        u32 data;
@@@ -599,6 -592,7 +594,7 @@@ static void kvm_ioapic_reset(struct kvm
        ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
        ioapic->ioregsel = 0;
        ioapic->irr = 0;
+       ioapic->irr_delivered = 0;
        ioapic->id = 0;
        memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
        rtc_irq_eoi_tracking_reset(ioapic);
@@@ -656,6 -650,7 +652,7 @@@ int kvm_get_ioapic(struct kvm *kvm, str
  
        spin_lock(&ioapic->lock);
        memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+       state->irr &= ~ioapic->irr_delivered;
        spin_unlock(&ioapic->lock);
        return 0;
  }
@@@ -669,6 -664,7 +666,7 @@@ int kvm_set_ioapic(struct kvm *kvm, str
        spin_lock(&ioapic->lock);
        memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
        ioapic->irr = 0;
+       ioapic->irr_delivered = 0;
        update_handled_vectors(ioapic);
        kvm_vcpu_request_scan_ioapic(kvm);
        kvm_ioapic_inject_all(ioapic, state->irr);
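
Taken together, the irr_delivered changes in this file mean a delivered edge
interrupt stays latched in irr (so the remote_irr logic still sees it) but is
masked out of KVM_GET_IOAPIC, so migration does not replay it. A toy model of
that bookkeeping (hypothetical standalone code, not the kernel's):

/* Toy model of the irr/irr_delivered interplay above. */
struct toy_ioapic {
        unsigned int irr;            /* pins currently latched */
        unsigned int irr_delivered;  /* edge IRQs already sent to a LAPIC */
};

static void toy_edge_assert(struct toy_ioapic *io, int pin)
{
        io->irr |= 1u << pin;
        io->irr_delivered &= ~(1u << pin);   /* a fresh edge is undelivered */
}

static void toy_service(struct toy_ioapic *io, int pin)
{
        io->irr_delivered |= 1u << pin;      /* delivered, but irr stays set */
}

static unsigned int toy_migration_irr(const struct toy_ioapic *io)
{
        return io->irr & ~io->irr_delivered; /* what KVM_GET_IOAPIC reports */
}
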
diff --combined arch/x86/kvm/lapic.c
index 4ee827d7bf36f730c25d358f709aa99cda93260a,4a6e58a967f7bef87788debf89eee0a7a7e37da0..d67206a7b99a689a4d7361de8bd8fc1b9ab02c1a
@@@ -133,6 -133,28 +133,28 @@@ static inline int kvm_apic_id(struct kv
        return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
  }
  
+ /* The logical map is definitely wrong if we have multiple
+  * modes at the same time.  (Physical map is always right.)
+  */
+ static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
+ {
+       return !(map->mode & (map->mode - 1));
+ }
+
+ static inline void
+ apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+ {
+       unsigned lid_bits;
+       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER !=  4);
+       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT    !=  8);
+       BUILD_BUG_ON(KVM_APIC_MODE_X2APIC        != 16);
+       lid_bits = map->mode;
+       *cid = dest_id >> lid_bits;
+       *lid = dest_id & ((1 << lid_bits) - 1);
+ }
+
  static void recalculate_apic_map(struct kvm *kvm)
  {
        struct kvm_apic_map *new, *old = NULL;
        if (!new)
                goto out;
  
-       new->ldr_bits = 8;
-       /* flat mode is default */
-       new->cid_shift = 8;
-       new->cid_mask = 0;
-       new->lid_mask = 0xff;
-       new->broadcast = APIC_BROADCAST;
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               struct kvm_lapic *apic = vcpu->arch.apic;
-               if (!kvm_apic_present(vcpu))
-                       continue;
-               if (apic_x2apic_mode(apic)) {
-                       new->ldr_bits = 32;
-                       new->cid_shift = 16;
-                       new->cid_mask = new->lid_mask = 0xffff;
-                       new->broadcast = X2APIC_BROADCAST;
-               } else if (kvm_apic_get_reg(apic, APIC_LDR)) {
-                       if (kvm_apic_get_reg(apic, APIC_DFR) ==
-                                                       APIC_DFR_CLUSTER) {
-                               new->cid_shift = 4;
-                               new->cid_mask = 0xf;
-                               new->lid_mask = 0xf;
-                       } else {
-                               new->cid_shift = 8;
-                               new->cid_mask = 0;
-                               new->lid_mask = 0xff;
-                       }
-               }
-               /*
-                * All APICs have to be configured in the same mode by an OS.
-                * We take advatage of this while building logical id loockup
-                * table. After reset APICs are in software disabled mode, so if
-                * we find apic with different setting we assume this is the mode
-                * OS wants all apics to be in; build lookup table accordingly.
-                */
-               if (kvm_apic_sw_enabled(apic))
-                       break;
-       }
        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvm_lapic *apic = vcpu->arch.apic;
                u16 cid, lid;
  
                aid = kvm_apic_id(apic);
                ldr = kvm_apic_get_reg(apic, APIC_LDR);
-               cid = apic_cluster_id(new, ldr);
-               lid = apic_logical_id(new, ldr);
  
                if (aid < ARRAY_SIZE(new->phys_map))
                        new->phys_map[aid] = apic;
+               if (apic_x2apic_mode(apic)) {
+                       new->mode |= KVM_APIC_MODE_X2APIC;
+               } else if (ldr) {
+                       ldr = GET_APIC_LOGICAL_ID(ldr);
+                       if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+                               new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
+                       else
+                               new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+               }
+               if (!kvm_apic_logical_map_valid(new))
+                       continue;
+               apic_logical_id(new, ldr, &cid, &lid);
                if (lid && cid < ARRAY_SIZE(new->logical_map))
                        new->logical_map[cid][ffs(lid) - 1] = apic;
        }
@@@ -588,15 -582,23 +582,23 @@@ static void apic_set_tpr(struct kvm_lap
        apic_update_ppr(apic);
  }
  
- static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+ static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
  {
-       return dest == (apic_x2apic_mode(apic) ?
-                       X2APIC_BROADCAST : APIC_BROADCAST);
+       if (apic_x2apic_mode(apic))
+               return mda == X2APIC_BROADCAST;
+       return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
  }
  
- static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
  {
-       return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
+       if (kvm_apic_broadcast(apic, mda))
+               return true;
+       if (apic_x2apic_mode(apic))
+               return mda == kvm_apic_id(apic);
+       return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
  }
  
  static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
                       && (logical_id & mda & 0xffff) != 0;
  
        logical_id = GET_APIC_LOGICAL_ID(logical_id);
+       mda = GET_APIC_DEST_FIELD(mda);
  
        switch (kvm_apic_get_reg(apic, APIC_DFR)) {
        case APIC_DFR_FLAT:
        }
  }
  
+ /* KVM APIC implementation has two quirks
+  *  - dest always begins at 0 while xAPIC MDA has offset 24,
+  *  - IOxAPIC messages have to be delivered (directly) to x2APIC.
+  */
+ static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
+                                               struct kvm_lapic *target)
+ {
+       bool ipi = source != NULL;
+       bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
+       if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+               return X2APIC_BROADCAST;
+       return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
+ }
+
  bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int short_hand, unsigned int dest, int dest_mode)
  {
        struct kvm_lapic *target = vcpu->arch.apic;
+       u32 mda = kvm_apic_mda(dest, source, target);
  
        apic_debug("target %p, source %p, dest 0x%x, "
                   "dest_mode 0x%x, short_hand 0x%x\n",
        switch (short_hand) {
        case APIC_DEST_NOSHORT:
                if (dest_mode == APIC_DEST_PHYSICAL)
-                       return kvm_apic_match_physical_addr(target, dest);
+                       return kvm_apic_match_physical_addr(target, mda);
                else
-                       return kvm_apic_match_logical_addr(target, dest);
+                       return kvm_apic_match_logical_addr(target, mda);
        case APIC_DEST_SELF:
                return target == source;
        case APIC_DEST_ALLINC:
@@@ -664,6 -684,7 +684,7 @@@ bool kvm_irq_delivery_to_apic_fast(stru
        struct kvm_lapic **dst;
        int i;
        bool ret = false;
+       bool x2apic_ipi = src && apic_x2apic_mode(src);
  
        *r = -1;
  
        if (irq->shorthand)
                return false;
  
+       if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+               return false;
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.apic_map);
  
        if (!map)
                goto out;
  
-       if (irq->dest_id == map->broadcast)
-               goto out;
        ret = true;
  
        if (irq->dest_mode == APIC_DEST_PHYSICAL) {
  
                dst = &map->phys_map[irq->dest_id];
        } else {
-               u32 mda = irq->dest_id << (32 - map->ldr_bits);
-               u16 cid = apic_cluster_id(map, mda);
+               u16 cid;
+               if (!kvm_apic_logical_map_valid(map)) {
+                       ret = false;
+                       goto out;
+               }
+               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
  
                if (cid >= ARRAY_SIZE(map->logical_map))
                        goto out;
  
                dst = map->logical_map[cid];
  
-               bitmap = apic_logical_id(map, mda);
                if (irq->delivery_mode == APIC_DM_LOWEST) {
                        int l = -1;
                        for_each_set_bit(i, &bitmap, 16) {
@@@ -833,7 -858,8 +858,7 @@@ int kvm_apic_compare_prio(struct kvm_vc
  
  static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
  {
 -      if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
 -          kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
 +      if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
                int trigger_mode;
                if (apic_test_vector(vector, apic->regs + APIC_TMR))
                        trigger_mode = IOAPIC_LEVEL_TRIG;
@@@ -1037,7 -1063,7 +1062,7 @@@ static int apic_mmio_in_range(struct kv
            addr < apic->base_address + LAPIC_MMIO_LENGTH;
  }
  
- static int apic_mmio_read(struct kvm_io_device *this,
+ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
                           gpa_t address, int len, void *data)
  {
        struct kvm_lapic *apic = to_lapic(this);
@@@ -1357,7 -1383,7 +1382,7 @@@ static int apic_reg_write(struct kvm_la
        return ret;
  }
  
- static int apic_mmio_write(struct kvm_io_device *this,
+ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
                            gpa_t address, int len, const void *data)
  {
        struct kvm_lapic *apic = to_lapic(this);
@@@ -1497,8 -1523,6 +1522,6 @@@ void kvm_lapic_set_base(struct kvm_vcp
                return;
        }
  
-       if (!kvm_vcpu_is_bsp(apic->vcpu))
-               value &= ~MSR_IA32_APICBASE_BSP;
        vcpu->arch.apic_base = value;
  
        /* update jump label if enable bit changes */
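
The MDA handling added in this file reduces to a little bit arithmetic; a
hedged restatement of kvm_apic_mda()'s rule, with the constants spelled out
as in the kernel's apicdef.h and lapic.h:

#define SET_APIC_DEST_FIELD(x)  ((x) << 24) /* xAPIC MDA sits in bits 31:24 */
#define APIC_BROADCAST          0xFFu
#define X2APIC_BROADCAST        0xFFFFFFFFu

/* IPIs are interpreted in the *source* APIC's mode, IOxAPIC/MSI messages
 * in the target's; an xAPIC-style broadcast aimed at an x2APIC target is
 * widened to the x2APIC broadcast ID. */
static unsigned int toy_mda(unsigned int dest_id, int is_ipi,
                            int mda_is_x2apic)
{
        if (!is_ipi && dest_id == APIC_BROADCAST && mda_is_x2apic)
                return X2APIC_BROADCAST;

        return mda_is_x2apic ? dest_id : SET_APIC_DEST_FIELD(dest_id);
}
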
diff --combined arch/x86/kvm/vmx.c
index ae4f6d35d19c268315745741150dd6d1a7df5222,8c14d6a455b00ed446c7b97b6bdd5288214a728c..f5e8dce8046c56b5273e9aa043754f98a7dee7d7
@@@ -2168,10 -2168,7 +2168,10 @@@ static void vmx_set_msr_bitmap(struct k
  {
        unsigned long *msr_bitmap;
  
 -      if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
 +      if (is_guest_mode(vcpu))
 +              msr_bitmap = vmx_msr_bitmap_nested;
 +      else if (irqchip_in_kernel(vcpu->kvm) &&
 +              apic_x2apic_mode(vcpu->arch.apic)) {
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
                else
@@@ -2470,6 -2467,7 +2470,7 @@@ static void nested_vmx_setup_ctls_msrs(
        vmx->nested.nested_vmx_secondary_ctls_low = 0;
        vmx->nested.nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               SECONDARY_EXEC_RDTSCP |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
                vmx->nested.nested_vmx_secondary_ctls_high |=
 -                      SECONDARY_EXEC_ENABLE_EPT |
 -                      SECONDARY_EXEC_UNRESTRICTED_GUEST;
 +                      SECONDARY_EXEC_ENABLE_EPT;
                vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
        } else
                vmx->nested.nested_vmx_ept_caps = 0;
  
 +      if (enable_unrestricted_guest)
 +              vmx->nested.nested_vmx_secondary_ctls_high |=
 +                      SECONDARY_EXEC_UNRESTRICTED_GUEST;
 +
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC,
                vmx->nested.nested_vmx_misc_low,
@@@ -3268,8 -3263,8 +3269,8 @@@ static void fix_pmode_seg(struct kvm_vc
                 * default value.
                 */
                if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
-                       save->selector &= ~SELECTOR_RPL_MASK;
-               save->dpl = save->selector & SELECTOR_RPL_MASK;
+                       save->selector &= ~SEGMENT_RPL_MASK;
+               save->dpl = save->selector & SEGMENT_RPL_MASK;
                save->s = 1;
        }
        vmx_set_segment(vcpu, save, seg);
@@@ -3842,7 -3837,7 +3843,7 @@@ static bool code_segment_valid(struct k
        unsigned int cs_rpl;
  
        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-       cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+       cs_rpl = cs.selector & SEGMENT_RPL_MASK;
  
        if (cs.unusable)
                return false;
@@@ -3870,7 -3865,7 +3871,7 @@@ static bool stack_segment_valid(struct 
        unsigned int ss_rpl;
  
        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-       ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+       ss_rpl = ss.selector & SEGMENT_RPL_MASK;
  
        if (ss.unusable)
                return true;
@@@ -3892,7 -3887,7 +3893,7 @@@ static bool data_segment_valid(struct k
        unsigned int rpl;
  
        vmx_get_segment(vcpu, &var, seg);
-       rpl = var.selector & SELECTOR_RPL_MASK;
+       rpl = var.selector & SEGMENT_RPL_MASK;
  
        if (var.unusable)
                return true;
@@@ -3919,7 -3914,7 +3920,7 @@@ static bool tr_valid(struct kvm_vcpu *v
  
        if (tr.unusable)
                return false;
-       if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
+       if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
                return false;
        if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
                return false;
@@@ -3937,7 -3932,7 +3938,7 @@@ static bool ldtr_valid(struct kvm_vcpu 
  
        if (ldtr.unusable)
                return true;
-       if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
+       if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
                return false;
        if (ldtr.type != 2)
                return false;
@@@ -3954,8 -3949,8 +3955,8 @@@ static bool cs_ss_rpl_check(struct kvm_
        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
  
-       return ((cs.selector & SELECTOR_RPL_MASK) ==
-                (ss.selector & SELECTOR_RPL_MASK));
+       return ((cs.selector & SEGMENT_RPL_MASK) ==
+                (ss.selector & SEGMENT_RPL_MASK));
  }
  
  /*
@@@ -4711,7 -4706,7 +4712,7 @@@ static void vmx_vcpu_reset(struct kvm_v
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        kvm_set_cr8(&vmx->vcpu, 0);
        apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
-       if (kvm_vcpu_is_bsp(&vmx->vcpu))
+       if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
                apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
        apic_base_msr.host_initiated = true;
        kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
@@@ -5006,7 -5001,7 +5007,7 @@@ static int handle_rmode_exception(struc
                if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
                        if (vcpu->arch.halt_request) {
                                vcpu->arch.halt_request = 0;
-                               return kvm_emulate_halt(vcpu);
+                               return kvm_vcpu_halt(vcpu);
                        }
                        return 1;
                }
@@@ -5071,6 -5066,10 +5072,10 @@@ static int handle_exception(struct kvm_
        }
  
        if (is_invalid_opcode(intr_info)) {
+               if (is_guest_mode(vcpu)) {
+                       kvm_queue_exception(vcpu, UD_VECTOR);
+                       return 1;
+               }
                er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
                if (er != EMULATE_DONE)
                        kvm_queue_exception(vcpu, UD_VECTOR);
            !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-               vcpu->run->internal.ndata = 2;
+               vcpu->run->internal.ndata = 3;
                vcpu->run->internal.data[0] = vect_info;
                vcpu->run->internal.data[1] = intr_info;
+               vcpu->run->internal.data[2] = error_code;
                return 0;
        }
  
@@@ -5533,13 -5533,11 +5539,11 @@@ static int handle_interrupt_window(stru
  
  static int handle_halt(struct kvm_vcpu *vcpu)
  {
-       skip_emulated_instruction(vcpu);
        return kvm_emulate_halt(vcpu);
  }
  
  static int handle_vmcall(struct kvm_vcpu *vcpu)
  {
-       skip_emulated_instruction(vcpu);
        kvm_emulate_hypercall(vcpu);
        return 1;
  }
@@@ -5570,7 -5568,6 +5574,6 @@@ static int handle_rdpmc(struct kvm_vcp
  
  static int handle_wbinvd(struct kvm_vcpu *vcpu)
  {
-       skip_emulated_instruction(vcpu);
        kvm_emulate_wbinvd(vcpu);
        return 1;
  }
@@@ -5828,7 -5825,7 +5831,7 @@@ static int handle_ept_misconfig(struct 
        gpa_t gpa;
  
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-       if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+       if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                skip_emulated_instruction(vcpu);
                return 1;
        }
@@@ -5909,7 -5906,7 +5912,7 @@@ static int handle_invalid_guest_state(s
  
                if (vcpu->arch.halt_request) {
                        vcpu->arch.halt_request = 0;
-                       ret = kvm_emulate_halt(vcpu);
+                       ret = kvm_vcpu_halt(vcpu);
                        goto out;
                }
  
@@@ -7318,21 -7315,21 +7321,21 @@@ static bool nested_vmx_exit_handled_io(
                else if (port < 0x10000)
                        bitmap = vmcs12->io_bitmap_b;
                else
-                       return 1;
+                       return true;
                bitmap += (port & 0x7fff) / 8;
  
                if (last_bitmap != bitmap)
                        if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
-                               return 1;
+                               return true;
                if (b & (1 << (port & 7)))
-                       return 1;
+                       return true;
  
                port++;
                size--;
                last_bitmap = bitmap;
        }
  
-       return 0;
+       return false;
  }
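
For reference, the loop above walks the VMX I/O bitmaps in their
architectural layout: bitmap A covers ports 0x0000-0x7fff, bitmap B covers
0x8000-0xffff, one bit per port. A single-port version of the same test,
assuming the bitmaps are already copied into our address space (the kernel
instead fetches each byte with kvm_read_guest()):

#include <stdbool.h>

static bool toy_io_intercepted(const unsigned char *bitmap_a,
                               const unsigned char *bitmap_b,
                               unsigned short port)
{
        const unsigned char *bitmap = (port < 0x8000) ? bitmap_a : bitmap_b;

        return bitmap[(port & 0x7fff) / 8] & (1 << (port & 7));
}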
  
  /*
@@@ -7348,7 -7345,7 +7351,7 @@@ static bool nested_vmx_exit_handled_msr
        gpa_t bitmap;
  
        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
-               return 1;
+               return true;
  
        /*
         * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
        if (msr_index < 1024*8) {
                unsigned char b;
                if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
-                       return 1;
+                       return true;
                return 1 & (b >> (msr_index & 7));
        } else
-               return 1; /* let L1 handle the wrong parameter */
+               return true; /* let L1 handle the wrong parameter */
  }
  
  /*
@@@ -7392,7 -7389,7 +7395,7 @@@ static bool nested_vmx_exit_handled_cr(
                case 0:
                        if (vmcs12->cr0_guest_host_mask &
                            (val ^ vmcs12->cr0_read_shadow))
-                               return 1;
+                               return true;
                        break;
                case 3:
                        if ((vmcs12->cr3_target_count >= 1 &&
                                        vmcs12->cr3_target_value2 == val) ||
                                (vmcs12->cr3_target_count >= 4 &&
                                        vmcs12->cr3_target_value3 == val))
-                               return 0;
+                               return false;
                        if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
-                               return 1;
+                               return true;
                        break;
                case 4:
                        if (vmcs12->cr4_guest_host_mask &
                            (vmcs12->cr4_read_shadow ^ val))
-                               return 1;
+                               return true;
                        break;
                case 8:
                        if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
-                               return 1;
+                               return true;
                        break;
                }
                break;
        case 2: /* clts */
                if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
                    (vmcs12->cr0_read_shadow & X86_CR0_TS))
-                       return 1;
+                       return true;
                break;
        case 1: /* mov from cr */
                switch (cr) {
                case 3:
                        if (vmcs12->cpu_based_vm_exec_control &
                            CPU_BASED_CR3_STORE_EXITING)
-                               return 1;
+                               return true;
                        break;
                case 8:
                        if (vmcs12->cpu_based_vm_exec_control &
                            CPU_BASED_CR8_STORE_EXITING)
-                               return 1;
+                               return true;
                        break;
                }
                break;
                 */
                if (vmcs12->cr0_guest_host_mask & 0xe &
                    (val ^ vmcs12->cr0_read_shadow))
-                       return 1;
+                       return true;
                if ((vmcs12->cr0_guest_host_mask & 0x1) &&
                    !(vmcs12->cr0_read_shadow & 0x1) &&
                    (val & 0x1))
-                       return 1;
+                       return true;
                break;
        }
-       return 0;
+       return false;
  }
  
  /*
@@@ -7474,48 -7471,48 +7477,48 @@@ static bool nested_vmx_exit_handled(str
                                KVM_ISA_VMX);
  
        if (vmx->nested.nested_run_pending)
-               return 0;
+               return false;
  
        if (unlikely(vmx->fail)) {
                pr_info_ratelimited("%s failed vm entry %x\n", __func__,
                                    vmcs_read32(VM_INSTRUCTION_ERROR));
-               return 1;
+               return true;
        }
  
        switch (exit_reason) {
        case EXIT_REASON_EXCEPTION_NMI:
                if (!is_exception(intr_info))
-                       return 0;
+                       return false;
                else if (is_page_fault(intr_info))
                        return enable_ept;
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
-                       return 0;
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
-               return 0;
+               return false;
        case EXIT_REASON_TRIPLE_FAULT:
-               return 1;
+               return true;
        case EXIT_REASON_PENDING_INTERRUPT:
                return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
        case EXIT_REASON_NMI_WINDOW:
                return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
        case EXIT_REASON_TASK_SWITCH:
-               return 1;
+               return true;
        case EXIT_REASON_CPUID:
                if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
-                       return 0;
-               return 1;
+                       return false;
+               return true;
        case EXIT_REASON_HLT:
                return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
        case EXIT_REASON_INVD:
-               return 1;
+               return true;
        case EXIT_REASON_INVLPG:
                return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
        case EXIT_REASON_RDPMC:
                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
-       case EXIT_REASON_RDTSC:
+       case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
        case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
                /*
                 * VMX instructions trap unconditionally. This allows L1 to
                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
                 */
-               return 1;
+               return true;
        case EXIT_REASON_CR_ACCESS:
                return nested_vmx_exit_handled_cr(vcpu, vmcs12);
        case EXIT_REASON_DR_ACCESS:
                return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
        case EXIT_REASON_IO_INSTRUCTION:
                return nested_vmx_exit_handled_io(vcpu, vmcs12);
        case EXIT_REASON_MSR_READ:
        case EXIT_REASON_MSR_WRITE:
                return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
        case EXIT_REASON_INVALID_STATE:
-               return 1;
+               return true;
        case EXIT_REASON_MWAIT_INSTRUCTION:
                return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
        case EXIT_REASON_MONITOR_INSTRUCTION:
                return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
        case EXIT_REASON_PAUSE_INSTRUCTION:
                return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
                        nested_cpu_has2(vmcs12,
                                SECONDARY_EXEC_PAUSE_LOOP_EXITING);
        case EXIT_REASON_MCE_DURING_VMENTRY:
-               return 0;
+               return false;
        case EXIT_REASON_TPR_BELOW_THRESHOLD:
                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
        case EXIT_REASON_APIC_ACCESS:
        case EXIT_REASON_APIC_WRITE:
        case EXIT_REASON_EOI_INDUCED:
                /* apic_write and eoi_induced should exit unconditionally. */
-               return 1;
+               return true;
        case EXIT_REASON_EPT_VIOLATION:
                /*
                 * L0 always deals with the EPT violation. If nested EPT is
                 * used, and the nested mmu code discovers that the address is
                 * missing in the guest EPT table (EPT12), the EPT violation
                 * will be injected with nested_ept_inject_page_fault()
                 */
-               return 0;
+               return false;
        case EXIT_REASON_EPT_MISCONFIG:
                /*
                 * L2 never uses directly L1's EPT, but rather L0's own EPT
                 * (EPT on EPT). So any problems with the structure of the
                 * table is L0's fault.
                 */
-               return 0;
+               return false;
        case EXIT_REASON_WBINVD:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
-               return 1;
+               return true;
        case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
                /*
                 * This should never happen, since it is not possible to
                 * set XSS to a non-zero value---neither in L1 nor in L2.
                 * If it were, XSS would have to be checked against
                 * the XSS exit bitmap in vmcs12.
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
        default:
-               return 1;
+               return true;
        }
  }
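
Most arms of the switch above reduce to "did L1 enable the matching
exiting control in vmcs12?". For reference, the nested_cpu_has()
helpers used throughout are essentially the following (a sketch of the
vmx.c helpers of this era, not an authoritative copy):

        static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
        {
                return vmcs12->cpu_based_vm_exec_control & bit;
        }

        static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
        {
                return (vmcs12->cpu_based_vm_exec_control &
                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
                        (vmcs12->secondary_vm_exec_control & bit);
        }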
  
@@@ -8522,6 -8519,9 +8525,9 @@@ static void vmx_cpuid_update(struct kvm
                                                exec_control);
                        }
                }
+               if (nested && !vmx->rdtscp_enabled)
+                       vmx->nested.nested_vmx_secondary_ctls_high &=
+                               ~SECONDARY_EXEC_RDTSCP;
        }
  
        /* Exposing INVPCID only when PCID is exposed */
@@@ -8622,10 -8622,11 +8628,11 @@@ static bool nested_get_vmcs12_pages(str
                                        struct vmcs12 *vmcs12)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int maxphyaddr = cpuid_maxphyaddr(vcpu);
  
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               /* TODO: Also verify bits beyond physical address width are 0 */
-               if (!PAGE_ALIGNED(vmcs12->apic_access_addr))
+               if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
+                   vmcs12->apic_access_addr >> maxphyaddr)
                        return false;
  
                /*
        }
  
        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-               /* TODO: Also verify bits beyond physical address width are 0 */
-               if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr))
+               if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
+                   vmcs12->virtual_apic_page_addr >> maxphyaddr)
                        return false;
  
                if (vmx->nested.virtual_apic_page) /* shouldn't happen */
        }
  
        if (nested_cpu_has_posted_intr(vmcs12)) {
-               if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+               if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
+                   vmcs12->posted_intr_desc_addr >> maxphyaddr)
                        return false;
  
                if (vmx->nested.pi_desc_page) { /* shouldn't happen */
@@@ -8864,9 -8866,9 +8872,9 @@@ static int nested_vmx_check_apicv_contr
  
  static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
                                       unsigned long count_field,
-                                      unsigned long addr_field,
-                                      int maxphyaddr)
+                                      unsigned long addr_field)
  {
+       int maxphyaddr;
        u64 count, addr;
  
        if (vmcs12_read_any(vcpu, count_field, &count) ||
        }
        if (count == 0)
                return 0;
+       maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
                pr_warn_ratelimited(
  static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
                                                struct vmcs12 *vmcs12)
  {
-       int maxphyaddr;
        if (vmcs12->vm_exit_msr_load_count == 0 &&
            vmcs12->vm_exit_msr_store_count == 0 &&
            vmcs12->vm_entry_msr_load_count == 0)
                return 0; /* Fast path */
-       maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
-                                       VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+                                       VM_EXIT_MSR_LOAD_ADDR) ||
            nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
-                                       VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+                                       VM_EXIT_MSR_STORE_ADDR) ||
            nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
-                                       VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+                                       VM_ENTRY_MSR_LOAD_ADDR))
                return -EINVAL;
        return 0;
  }
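
The address checks added above all follow one pattern: a guest-physical
region is acceptable only if its base is suitably aligned and neither
its first nor its last byte has bits set at or above the guest's
MAXPHYADDR. A self-contained sketch of that predicate (the helper name
is hypothetical; assumes align is a power of two and size >= 1):

        #include <stdbool.h>
        #include <stdint.h>

        static bool gpa_region_valid(uint64_t addr, uint64_t size,
                                     uint64_t align, int maxphyaddr)
        {
                if (addr & (align - 1))                 /* misaligned base */
                        return false;
                if (addr >> maxphyaddr)                 /* base beyond MAXPHYADDR */
                        return false;
                if ((addr + size - 1) >> maxphyaddr)    /* end beyond MAXPHYADDR */
                        return false;
                return true;
        }
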
@@@ -9151,8 -9151,9 +9157,9 @@@ static void prepare_vmcs02(struct kvm_v
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
                /* Take the following fields only from vmcs12 */
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                                 SECONDARY_EXEC_RDTSCP |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-                                   SECONDARY_EXEC_APIC_REGISTER_VIRT);
+                                 SECONDARY_EXEC_APIC_REGISTER_VIRT);
                if (nested_cpu_has(vmcs12,
                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
        }
  
        if (cpu_has_vmx_msr_bitmap() &&
 -          exec_control & CPU_BASED_USE_MSR_BITMAPS &&
 -          nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
 -              vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
 +          exec_control & CPU_BASED_USE_MSR_BITMAPS) {
 +              nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
 +              /* MSR_BITMAP will be set by following vmx_set_efer. */
        } else
                exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
  
@@@ -9385,7 -9386,6 +9392,6 @@@ static int nested_vmx_run(struct kvm_vc
        }
  
        if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
-               /*TODO: Also verify bits beyond physical address width are 0*/
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
        vmcs12->launch_state = 1;
  
        if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
-               return kvm_emulate_halt(vcpu);
+               return kvm_vcpu_halt(vcpu);
  
        vmx->nested.nested_run_pending = 1;
  
diff --combined arch/x86/kvm/x86.c
index 32bf19ef3115f65c9dffc23a655be2763babcaff,b8cb1d09169740c415116557f7524e0ede5a9a7d..2b2dd030ea3be3b7d5599be94fdddded0077db9e
@@@ -801,6 -801,17 +801,17 @@@ unsigned long kvm_get_cr8(struct kvm_vc
  }
  EXPORT_SYMBOL_GPL(kvm_get_cr8);
  
+ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
+ {
+       int i;
+       if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+               for (i = 0; i < KVM_NR_DB_REGS; i++)
+                       vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
+       }
+ }
  static void kvm_update_dr6(struct kvm_vcpu *vcpu)
  {
        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
@@@ -2744,6 -2755,7 +2755,6 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_USER_NMI:
        case KVM_CAP_REINJECT_CONTROL:
        case KVM_CAP_IRQ_INJECT_STATUS:
 -      case KVM_CAP_IRQFD:
        case KVM_CAP_IOEVENTFD:
        case KVM_CAP_IOEVENTFD_NO_LENGTH:
        case KVM_CAP_PIT2:
@@@ -3149,6 -3161,7 +3160,7 @@@ static int kvm_vcpu_ioctl_x86_set_debug
                return -EINVAL;
  
        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+       kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = dbgregs->dr6;
        kvm_update_dr6(vcpu);
        vcpu->arch.dr7 = dbgregs->dr7;
@@@ -4114,8 -4127,8 +4126,8 @@@ static int vcpu_mmio_write(struct kvm_v
        do {
                n = min(len, 8);
                if (!(vcpu->arch.apic &&
-                     !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
-                   && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+                     !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
+                   && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
                handled += n;
                addr += n;
@@@ -4134,8 -4147,9 +4146,9 @@@ static int vcpu_mmio_read(struct kvm_vc
        do {
                n = min(len, 8);
                if (!(vcpu->arch.apic &&
-                     !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
-                   && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+                     !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+                                        addr, n, v))
+                   && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
                trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
                handled += n;
@@@ -4475,7 -4489,8 +4488,8 @@@ mmio
        return X86EMUL_CONTINUE;
  }
  
- int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
+                       unsigned long addr,
                        void *val, unsigned int bytes,
                        struct x86_exception *exception,
                        const struct read_write_emulator_ops *ops)
@@@ -4538,7 -4553,7 +4552,7 @@@ static int emulator_read_emulated(struc
                                   exception, &read_emultor);
  }
  
- int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
                            unsigned long addr,
                            const void *val,
                            unsigned int bytes,
@@@ -4629,10 -4644,10 +4643,10 @@@ static int kernel_pio(struct kvm_vcpu *
        int r;
  
        if (vcpu->arch.pio.in)
-               r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+               r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
                                    vcpu->arch.pio.size, pd);
        else
-               r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+               r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
                                     vcpu->arch.pio.port, vcpu->arch.pio.size,
                                     pd);
        return r;
@@@ -4705,7 -4720,7 +4719,7 @@@ static void emulator_invlpg(struct x86_
        kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
  }
  
- int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+ int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
  {
        if (!need_emulate_wbinvd(vcpu))
                return X86EMUL_CONTINUE;
                wbinvd();
        return X86EMUL_CONTINUE;
  }
+ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+ {
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
+       return kvm_emulate_wbinvd_noskip(vcpu);
+ }
  EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
  
  static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
  {
-       kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
+       kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
  }
  
- int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+ static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+                          unsigned long *dest)
  {
        return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
  }
  
- int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+ static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+                          unsigned long value)
  {
  
        return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
@@@ -5816,7 -5841,7 +5840,7 @@@ void kvm_arch_exit(void
        free_percpu(shared_msrs);
  }
  
- int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+ int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
  {
        ++vcpu->stat.halt_exits;
        if (irqchip_in_kernel(vcpu->kvm)) {
                return 0;
        }
  }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+ {
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
+       return kvm_vcpu_halt(vcpu);
+ }
  EXPORT_SYMBOL_GPL(kvm_emulate_halt);
  
  int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
@@@ -5903,7 -5935,7 +5934,7 @@@ static void kvm_pv_kick_cpu_op(struct k
        lapic_irq.dest_id = apicid;
  
        lapic_irq.delivery_mode = APIC_DM_REMRD;
-       kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
+       kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
  }
  
  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        unsigned long nr, a0, a1, a2, a3, ret;
        int op_64_bit, r = 1;
  
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
        if (kvm_hv_hypercall_enabled(vcpu->kvm))
                return kvm_hv_hypercall(vcpu);
  
@@@ -6164,7 -6198,7 +6197,7 @@@ void kvm_arch_mmu_notifier_invalidate_p
  }
  
  /*
-  * Returns 1 to let __vcpu_run() continue the guest execution loop without
+  * Returns 1 to let vcpu_run() continue the guest execution loop without
   * exiting to the userspace.  Otherwise, the value will be returned to the
   * userspace.
   */
@@@ -6301,6 -6335,7 +6334,7 @@@ static int vcpu_enter_guest(struct kvm_
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
                set_debugreg(vcpu->arch.dr6, 6);
+               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
  
        trace_kvm_entry(vcpu->vcpu_id);
        return r;
  }
  
+ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+ {
+       if (!kvm_arch_vcpu_runnable(vcpu)) {
+               srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+               kvm_vcpu_block(vcpu);
+               vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+               if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+                       return 1;
+       }
+       kvm_apic_accept_events(vcpu);
+       switch(vcpu->arch.mp_state) {
+       case KVM_MP_STATE_HALTED:
+               vcpu->arch.pv.pv_unhalted = false;
+               vcpu->arch.mp_state =
+                       KVM_MP_STATE_RUNNABLE;
+       case KVM_MP_STATE_RUNNABLE:
+               vcpu->arch.apf.halted = false;
+               break;
+       case KVM_MP_STATE_INIT_RECEIVED:
+               break;
+       default:
+               return -EINTR;
+               break;
+       }
+       return 1;
+ }
  
- static int __vcpu_run(struct kvm_vcpu *vcpu)
+ static int vcpu_run(struct kvm_vcpu *vcpu)
  {
        int r;
        struct kvm *kvm = vcpu->kvm;
  
        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
  
-       r = 1;
-       while (r > 0) {
+       for (;;) {
                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                    !vcpu->arch.apf.halted)
                        r = vcpu_enter_guest(vcpu);
-               else {
-                       srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-                       kvm_vcpu_block(vcpu);
-                       vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-                       if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
-                               kvm_apic_accept_events(vcpu);
-                               switch(vcpu->arch.mp_state) {
-                               case KVM_MP_STATE_HALTED:
-                                       vcpu->arch.pv.pv_unhalted = false;
-                                       vcpu->arch.mp_state =
-                                               KVM_MP_STATE_RUNNABLE;
-                               case KVM_MP_STATE_RUNNABLE:
-                                       vcpu->arch.apf.halted = false;
-                                       break;
-                               case KVM_MP_STATE_INIT_RECEIVED:
-                                       break;
-                               default:
-                                       r = -EINTR;
-                                       break;
-                               }
-                       }
-               }
+               else
+                       r = vcpu_block(kvm, vcpu);
                if (r <= 0)
                        break;
  
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.request_irq_exits;
+                       break;
                }
  
                kvm_check_async_pf_completion(vcpu);
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.signal_exits;
+                       break;
                }
                if (need_resched()) {
                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@@ -6568,7 -6610,7 +6609,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
        } else
                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
  
-       r = __vcpu_run(vcpu);
+       r = vcpu_run(vcpu);
  
  out:
        post_kvm_run_save(vcpu);
@@@ -7075,11 -7117,14 +7116,14 @@@ void kvm_vcpu_reset(struct kvm_vcpu *vc
        kvm_clear_exception_queue(vcpu);
  
        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+       kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = DR6_INIT;
        kvm_update_dr6(vcpu);
        vcpu->arch.dr7 = DR7_FIXED_1;
        kvm_update_dr7(vcpu);
  
+       vcpu->arch.cr2 = 0;
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        vcpu->arch.apf.msr_val = 0;
        vcpu->arch.st.msr_val = 0;
@@@ -7240,7 -7285,7 +7284,7 @@@ int kvm_arch_vcpu_init(struct kvm_vcpu 
  
        vcpu->arch.pv.pv_unhalted = false;
        vcpu->arch.emulate_ctxt.ops = &emulate_ops;
-       if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
+       if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
        vcpu->arch.guest_supported_xcr0 = 0;
        vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
  
+       vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
        kvm_async_pf_hash_reset(vcpu);
        kvm_pmu_init(vcpu);
  
@@@ -7428,7 -7475,7 +7474,7 @@@ void kvm_arch_free_memslot(struct kvm *
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
-                       kvm_kvfree(free->arch.rmap[i]);
+                       kvfree(free->arch.rmap[i]);
                        free->arch.rmap[i] = NULL;
                }
                if (i == 0)
  
                if (!dont || free->arch.lpage_info[i - 1] !=
                             dont->arch.lpage_info[i - 1]) {
-                       kvm_kvfree(free->arch.lpage_info[i - 1]);
+                       kvfree(free->arch.lpage_info[i - 1]);
                        free->arch.lpage_info[i - 1] = NULL;
                }
        }
@@@ -7490,12 -7537,12 +7536,12 @@@ int kvm_arch_create_memslot(struct kvm 
  
  out_free:
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-               kvm_kvfree(slot->arch.rmap[i]);
+               kvfree(slot->arch.rmap[i]);
                slot->arch.rmap[i] = NULL;
                if (i == 0)
                        continue;
  
-               kvm_kvfree(slot->arch.lpage_info[i - 1]);
+               kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
        return -ENOMEM;
@@@ -7617,6 -7664,23 +7663,23 @@@ void kvm_arch_commit_memory_region(stru
        /* It's OK to get 'new' slot here as it has already been installed */
        new = id_to_memslot(kvm->memslots, mem->slot);
  
+       /*
+        * Dirty logging tracks sptes in 4k granularity, meaning that large
+        * sptes have to be split.  If live migration is successful, the guest
+        * in the source machine will be destroyed and large sptes will be
+        * created in the destination. However, if the guest continues to run
+        * in the source machine (for example if live migration fails), small
+        * sptes will remain around and cause bad performance.
+        *
+        * Scan sptes if dirty logging has been stopped, dropping those
+        * which can be collapsed into a single large-page spte.  Later
+        * page faults will create the large-page sptes.
+        */
+       if ((change != KVM_MR_DELETE) &&
+               (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+               !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+               kvm_mmu_zap_collapsible_sptes(kvm, new);
        /*
         * Set up write protection and/or dirty logging for the new slot.
         *
diff --combined include/linux/sched.h
index a419b65770d669c3a51c88a86a145abbcd3db339,be98910cc1e2c9548ffa3e04df3bc975f6ae8692..51348f77e4311d028c769497275e3367b72e4a89
@@@ -176,6 -176,14 +176,14 @@@ extern void get_iowait_load(unsigned lo
  extern void calc_global_load(unsigned long ticks);
  extern void update_cpu_load_nohz(void);
  
+ /* Notifier for when a task gets migrated to a new CPU */
+ struct task_migration_notifier {
+       struct task_struct *task;
+       int from_cpu;
+       int to_cpu;
+ };
+ extern void register_task_migration_notifier(struct notifier_block *n);
  extern unsigned long get_parent_ip(unsigned long addr);
  
  extern void dump_cpu_task(int cpu);
@@@ -1625,11 -1633,11 +1633,11 @@@ struct task_struct 
  
        /*
         * numa_faults_locality tracks if faults recorded during the last
 -       * scan window were remote/local. The task scan period is adapted
 -       * based on the locality of the faults with different weights
 -       * depending on whether they were shared or private faults
 +       * scan window were remote/local or failed to migrate. The task scan
 +       * period is adapted based on the locality of the faults with different
 +       * weights depending on whether they were shared or private faults
         */
 -      unsigned long numa_faults_locality[2];
 +      unsigned long numa_faults_locality[3];
  
        unsigned long numa_pages_migrated;
  #endif /* CONFIG_NUMA_BALANCING */
  #define TNF_NO_GROUP  0x02
  #define TNF_SHARED    0x04
  #define TNF_FAULT_LOCAL       0x08
 +#define TNF_MIGRATE_FAIL 0x10
  
  #ifdef CONFIG_NUMA_BALANCING
  extern void task_numa_fault(int last_node, int node, int pages, int flags);
diff --combined kernel/sched/core.c
index 62671f53202ac7d4de8037dce950c934c7a4ddbc,d0c4209bb836a8ccc13688bf8eb0a21e48892f03..3d5f6f6d14c2f152dedb56393bedd2c312724c34
@@@ -996,6 -996,13 +996,13 @@@ void check_preempt_curr(struct rq *rq, 
                rq_clock_skip_update(rq, true);
  }
  
+ static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+ void register_task_migration_notifier(struct notifier_block *n)
+ {
+       atomic_notifier_chain_register(&task_migration_notifier, n);
+ }
  #ifdef CONFIG_SMP
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
        trace_sched_migrate_task(p, new_cpu);
  
        if (task_cpu(p) != new_cpu) {
+               struct task_migration_notifier tmn;
                if (p->sched_class->migrate_task_rq)
                        p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
                perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+               tmn.task = p;
+               tmn.from_cpu = task_cpu(p);
+               tmn.to_cpu = new_cpu;
+               atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
        }
  
        __set_task_cpu(p, new_cpu);
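
A hypothetical consumer of the new hook, to illustrate the intended
usage (the callback and notifier_block names are invented; the pvclock
fix elsewhere in this merge is the real user):

        #include <linux/notifier.h>
        #include <linux/sched.h>

        static int on_task_migrate(struct notifier_block *nb,
                                   unsigned long action, void *data)
        {
                struct task_migration_notifier *tmn = data;

                /* tmn->task is moving from tmn->from_cpu to tmn->to_cpu. */
                return NOTIFY_OK;
        }

        static struct notifier_block migrate_nb = {
                .notifier_call = on_task_migrate,
        };

        /* During init: register_task_migration_notifier(&migrate_nb); */
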
@@@ -3034,8 -3049,6 +3049,8 @@@ void rt_mutex_setprio(struct task_struc
        } else {
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
 +              if (rt_prio(oldprio))
 +                      p->rt.timeout = 0;
                p->sched_class = &fair_sched_class;
        }
  
diff --combined virt/kvm/kvm_main.c
index cc6a25d95fbff532bf5b00b0c339bec91ddc5bcf,91a36e21c0fb348344299a8bf17f3909d388f07d..d3fc9399062a5034b99eaa3d12c855699fdbf608
@@@ -16,7 -16,7 +16,7 @@@
   *
   */
  
- #include "iodev.h"
+ #include <kvm/iodev.h>
  
  #include <linux/kvm_host.h>
  #include <linux/kvm.h>
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
- unsigned int halt_poll_ns = 0;
+ static unsigned int halt_poll_ns;
  module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
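
halt_poll_ns bounds how long kvm_vcpu_block() busy-waits for a wakeup
condition before giving up the CPU. Roughly, the logic looks like the
sketch below (simplified from the kvm_vcpu_block() hunk further down,
not the verbatim kernel loop):

        static void vcpu_block_sketch(struct kvm_vcpu *vcpu)
        {
                if (halt_poll_ns) {
                        ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);

                        do {
                                /* sets KVM_REQ_UNHALT and fails on a wakeup */
                                if (kvm_vcpu_check_block(vcpu) < 0)
                                        return; /* woke up without sleeping */
                        } while (ktime_before(ktime_get(), stop));
                }
                /* otherwise fall back to prepare_to_wait() + schedule() */
        }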
  
  /*
   * Ordering of locks:
   *
-  *            kvm->lock --> kvm->slots_lock --> kvm->irq_lock
+  *    kvm->lock --> kvm->slots_lock --> kvm->irq_lock
   */
  
  DEFINE_SPINLOCK(kvm_lock);
@@@ -80,7 -80,7 +80,7 @@@ static DEFINE_RAW_SPINLOCK(kvm_count_lo
  LIST_HEAD(vm_list);
  
  static cpumask_var_t cpus_hardware_enabled;
- static int kvm_usage_count = 0;
+ static int kvm_usage_count;
  static atomic_t hardware_enable_failed;
  
  struct kmem_cache *kvm_vcpu_cache;
@@@ -471,7 -471,7 +471,7 @@@ static struct kvm *kvm_create_vm(unsign
        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
  
        r = -ENOMEM;
 -      kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 +      kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots));
        if (!kvm->memslots)
                goto out_err_no_srcu;
  
@@@ -522,7 -522,7 +522,7 @@@ out_err_no_srcu
  out_err_no_disable:
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm->buses[i]);
 -      kfree(kvm->memslots);
 +      kvfree(kvm->memslots);
        kvm_arch_free_vm(kvm);
        return ERR_PTR(r);
  }
@@@ -539,20 -539,12 +539,12 @@@ void *kvm_kvzalloc(unsigned long size
                return kzalloc(size, GFP_KERNEL);
  }
  
- void kvm_kvfree(const void *addr)
- {
-       if (is_vmalloc_addr(addr))
-               vfree(addr);
-       else
-               kfree(addr);
- }
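
kvm_kvfree() can go because mm now provides an identical generic
helper; kvfree() in mm/util.c is, in essence:

        void kvfree(const void *addr)
        {
                if (is_vmalloc_addr(addr))
                        vfree(addr);
                else
                        kfree(addr);
        }

which is why every kvm_kvfree() call site in this merge converts
one-for-one.
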
  static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
  {
        if (!memslot->dirty_bitmap)
                return;
  
-       kvm_kvfree(memslot->dirty_bitmap);
+       kvfree(memslot->dirty_bitmap);
        memslot->dirty_bitmap = NULL;
  }
  
@@@ -578,7 -570,7 +570,7 @@@ static void kvm_free_physmem(struct kv
        kvm_for_each_memslot(memslot, slots)
                kvm_free_physmem_slot(kvm, memslot, NULL);
  
 -      kfree(kvm->memslots);
 +      kvfree(kvm->memslots);
  }
  
  static void kvm_destroy_devices(struct kvm *kvm)
@@@ -871,10 -863,10 +863,10 @@@ int __kvm_set_memory_region(struct kvm 
                        goto out_free;
        }
  
 -      slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
 -                      GFP_KERNEL);
 +      slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
        if (!slots)
                goto out_free;
 +      memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
  
        if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
                slot = id_to_memslot(slots, mem->slot);
                /* From this point no new shadow pages pointing to a deleted,
                 * or moved, memslot will be created.
                 *
                 * validation of sp->gfn happens in:
-                *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-                *      - kvm_is_visible_gfn (mmu_check_roots)
+                *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+                *      - kvm_is_visible_gfn (mmu_check_roots)
                 */
                kvm_arch_flush_shadow_memslot(kvm, slot);
  
        kvm_arch_commit_memory_region(kvm, mem, &old, change);
  
        kvm_free_physmem_slot(kvm, &old, &new);
 -      kfree(old_memslots);
 +      kvfree(old_memslots);
  
        /*
         * IOMMU mapping:  New slots need to be mapped.  Old slots need to be
        return 0;
  
  out_slots:
 -      kfree(slots);
 +      kvfree(slots);
  out_free:
        kvm_free_physmem_slot(kvm, &new, &old);
  out:
@@@ -1061,9 -1053,11 +1053,11 @@@ int kvm_get_dirty_log_protect(struct kv
                mask = xchg(&dirty_bitmap[i], 0);
                dirty_bitmap_buffer[i] = mask;
  
-               offset = i * BITS_PER_LONG;
-               kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset,
-                                                               mask);
+               if (mask) {
+                       offset = i * BITS_PER_LONG;
+                       kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+                                                               offset, mask);
+               }
        }
  
        spin_unlock(&kvm->mmu_lock);
@@@ -1193,16 -1187,6 +1187,6 @@@ unsigned long gfn_to_hva_prot(struct kv
        return gfn_to_hva_memslot_prot(slot, gfn, writable);
  }
  
- static int kvm_read_hva(void *data, void __user *hva, int len)
- {
-       return __copy_from_user(data, hva, len);
- }
- static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
- {
-       return __copy_from_user_inatomic(data, hva, len);
- }
  static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long start, int write, struct page **page)
  {
@@@ -1481,7 -1465,6 +1465,6 @@@ struct page *gfn_to_page(struct kvm *kv
  
        return kvm_pfn_to_page(pfn);
  }
  EXPORT_SYMBOL_GPL(gfn_to_page);
  
  void kvm_release_page_clean(struct page *page)
@@@ -1517,6 -1500,7 +1500,7 @@@ void kvm_set_pfn_dirty(pfn_t pfn
  {
        if (!kvm_is_reserved_pfn(pfn)) {
                struct page *page = pfn_to_page(pfn);
                if (!PageReserved(page))
                        SetPageDirty(page);
        }
@@@ -1554,7 -1538,7 +1538,7 @@@ int kvm_read_guest_page(struct kvm *kvm
        addr = gfn_to_hva_prot(kvm, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
-       r = kvm_read_hva(data, (void __user *)addr + offset, len);
+       r = __copy_from_user(data, (void __user *)addr + offset, len);
        if (r)
                return -EFAULT;
        return 0;
@@@ -1593,7 -1577,7 +1577,7 @@@ int kvm_read_guest_atomic(struct kvm *k
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        pagefault_disable();
-       r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
+       r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
        pagefault_enable();
        if (r)
                return -EFAULT;
@@@ -1653,8 -1637,8 +1637,8 @@@ int kvm_gfn_to_hva_cache_init(struct kv
        ghc->generation = slots->generation;
        ghc->len = len;
        ghc->memslot = gfn_to_memslot(kvm, start_gfn);
-       ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail);
-       if (!kvm_is_error_hva(ghc->hva) && nr_pages_avail >= nr_pages_needed) {
+       ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
+       if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
                ghc->hva += offset;
        } else {
                /*
@@@ -1742,7 -1726,7 +1726,7 @@@ int kvm_clear_guest(struct kvm *kvm, gp
        int offset = offset_in_page(gpa);
        int ret;
  
-         while ((seg = next_segment(len, offset)) != 0) {
+       while ((seg = next_segment(len, offset)) != 0) {
                ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
                if (ret < 0)
                        return ret;
@@@ -1800,6 -1784,7 +1784,7 @@@ void kvm_vcpu_block(struct kvm_vcpu *vc
        start = cur = ktime_get();
        if (halt_poll_ns) {
                ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
                do {
                        /*
                         * This sets KVM_REQ_UNHALT if an interrupt
                         * arrives.
                         */
@@@ -2118,7 -2103,7 +2103,7 @@@ static long kvm_vcpu_ioctl(struct file 
         * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
         * so vcpu_load() would break it.
         */
-       if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
+       if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
                return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
  #endif
  
                        /* The thread running this VCPU changed. */
                        struct pid *oldpid = vcpu->pid;
                        struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
                        rcu_assign_pointer(vcpu->pid, newpid);
                        if (oldpid)
                                synchronize_rcu();
@@@ -2205,7 -2191,7 +2191,7 @@@ out_free1
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &mp_state, sizeof mp_state))
+               if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
                        goto out;
                r = 0;
                break;
                struct kvm_mp_state mp_state;
  
                r = -EFAULT;
-               if (copy_from_user(&mp_state, argp, sizeof mp_state))
+               if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
                break;
                struct kvm_translation tr;
  
                r = -EFAULT;
-               if (copy_from_user(&tr, argp, sizeof tr))
+               if (copy_from_user(&tr, argp, sizeof(tr)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &tr, sizeof tr))
+               if (copy_to_user(argp, &tr, sizeof(tr)))
                        goto out;
                r = 0;
                break;
                struct kvm_guest_debug dbg;
  
                r = -EFAULT;
-               if (copy_from_user(&dbg, argp, sizeof dbg))
+               if (copy_from_user(&dbg, argp, sizeof(dbg)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
                break;
                if (argp) {
                        r = -EFAULT;
                        if (copy_from_user(&kvm_sigmask, argp,
-                                          sizeof kvm_sigmask))
+                                          sizeof(kvm_sigmask)))
                                goto out;
                        r = -EINVAL;
-                       if (kvm_sigmask.len != sizeof sigset)
+                       if (kvm_sigmask.len != sizeof(sigset))
                                goto out;
                        r = -EFAULT;
                        if (copy_from_user(&sigset, sigmask_arg->sigset,
-                                          sizeof sigset))
+                                          sizeof(sigset)))
                                goto out;
                        p = &sigset;
                }
@@@ -2321,14 -2307,14 +2307,14 @@@ static long kvm_vcpu_compat_ioctl(struc
                if (argp) {
                        r = -EFAULT;
                        if (copy_from_user(&kvm_sigmask, argp,
-                                          sizeof kvm_sigmask))
+                                          sizeof(kvm_sigmask)))
                                goto out;
                        r = -EINVAL;
-                       if (kvm_sigmask.len != sizeof csigset)
+                       if (kvm_sigmask.len != sizeof(csigset))
                                goto out;
                        r = -EFAULT;
                        if (copy_from_user(&csigset, sigmask_arg->sigset,
-                                          sizeof csigset))
+                                          sizeof(csigset)))
                                goto out;
                        sigset_from_compat(&sigset, &csigset);
                        r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
@@@ -2492,7 -2478,6 +2478,7 @@@ static long kvm_vm_ioctl_check_extensio
        case KVM_CAP_SIGNAL_MSI:
  #endif
  #ifdef CONFIG_HAVE_KVM_IRQFD
 +      case KVM_CAP_IRQFD:
        case KVM_CAP_IRQFD_RESAMPLE:
  #endif
        case KVM_CAP_CHECK_EXTENSION_VM:
@@@ -2525,7 -2510,7 +2511,7 @@@ static long kvm_vm_ioctl(struct file *f
  
                r = -EFAULT;
                if (copy_from_user(&kvm_userspace_mem, argp,
-                                               sizeof kvm_userspace_mem))
+                                               sizeof(kvm_userspace_mem)))
                        goto out;
  
                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
                struct kvm_dirty_log log;
  
                r = -EFAULT;
-               if (copy_from_user(&log, argp, sizeof log))
+               if (copy_from_user(&log, argp, sizeof(log)))
                        goto out;
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                break;
  #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        case KVM_REGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;
                r = -EFAULT;
-               if (copy_from_user(&zone, argp, sizeof zone))
+               if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
                break;
        }
        case KVM_UNREGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;
                r = -EFAULT;
-               if (copy_from_user(&zone, argp, sizeof zone))
+               if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
                break;
                struct kvm_irqfd data;
  
                r = -EFAULT;
-               if (copy_from_user(&data, argp, sizeof data))
+               if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_irqfd(kvm, &data);
                break;
                struct kvm_ioeventfd data;
  
                r = -EFAULT;
-               if (copy_from_user(&data, argp, sizeof data))
+               if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_ioeventfd(kvm, &data);
                break;
                struct kvm_msi msi;
  
                r = -EFAULT;
-               if (copy_from_user(&msi, argp, sizeof msi))
+               if (copy_from_user(&msi, argp, sizeof(msi)))
                        goto out;
                r = kvm_send_userspace_msi(kvm, &msi);
                break;
                struct kvm_irq_level irq_event;
  
                r = -EFAULT;
-               if (copy_from_user(&irq_event, argp, sizeof irq_event))
+               if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
                        goto out;
  
                r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
  
                r = -EFAULT;
                if (ioctl == KVM_IRQ_LINE_STATUS) {
-                       if (copy_to_user(argp, &irq_event, sizeof irq_event))
+                       if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
                                goto out;
                }
  
                        goto out_free_irq_routing;
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
                                        routing.flags);
-       out_free_irq_routing:
+ out_free_irq_routing:
                vfree(entries);
                break;
        }
@@@ -2822,8 -2809,7 +2810,7 @@@ static void hardware_enable_nolock(voi
        if (r) {
                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
                atomic_inc(&hardware_enable_failed);
-               printk(KERN_INFO "kvm: enabling virtualization on "
-                                "CPU%d failed\n", cpu);
+               pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
        }
  }
  
@@@ -2899,12 -2885,12 +2886,12 @@@ static int kvm_cpu_hotplug(struct notif
        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
-               printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+               pr_info("kvm: disabling virtualization on CPU%d\n",
                       cpu);
                hardware_disable();
                break;
        case CPU_STARTING:
-               printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+               pr_info("kvm: enabling virtualization on CPU%d\n",
                       cpu);
                hardware_enable();
                break;
@@@ -2921,7 -2907,7 +2908,7 @@@ static int kvm_reboot(struct notifier_b
         *
         * And Intel TXT requires VMX to be off on all CPUs at system shutdown.
         */
-       printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+       pr_info("kvm: exiting hardware virtualization\n");
        kvm_rebooting = true;
        on_each_cpu(hardware_disable_nolock, NULL, 1);
        return NOTIFY_OK;
@@@ -2945,7 -2931,7 +2932,7 @@@ static void kvm_io_bus_destroy(struct k
  }
  
  static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
-                                  const struct kvm_io_range *r2)
+                                const struct kvm_io_range *r2)
  {
        if (r1->addr < r2->addr)
                return -1;
@@@ -2998,7 -2984,7 +2985,7 @@@ static int kvm_io_bus_get_first_dev(str
        return off;
  }
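
kvm_io_bus_get_first_dev() depends on bus->range[] staying sorted under
kvm_io_bus_cmp(): it locates a candidate entry (the kernel uses
bsearch()) and backs up to the first one that compares equal, so the
linear scans in __kvm_io_bus_write()/__kvm_io_bus_read() below visit
every overlapping device. A standalone lower-bound sketch of the same
idea (hypothetical helper, not the kernel implementation):

        static int first_matching_range(const struct kvm_io_range *ranges,
                                        int n, const struct kvm_io_range *key)
        {
                int lo = 0, hi = n;

                while (lo < hi) {               /* lower bound on 'key' */
                        int mid = lo + (hi - lo) / 2;

                        if (kvm_io_bus_cmp(&ranges[mid], key) < 0)
                                lo = mid + 1;
                        else
                                hi = mid;
                }
                if (lo == n || kvm_io_bus_cmp(key, &ranges[lo]) != 0)
                        return -1;              /* no matching range */
                return lo;
        }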
  
- static int __kvm_io_bus_write(struct kvm_io_bus *bus,
+ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
                              struct kvm_io_range *range, const void *val)
  {
        int idx;
  
        while (idx < bus->dev_count &&
                kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
-               if (!kvm_iodevice_write(bus->range[idx].dev, range->addr,
+               if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
                                        range->len, val))
                        return idx;
                idx++;
  }
  
  /* kvm_io_bus_write - called under kvm->slots_lock */
- int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val)
  {
        struct kvm_io_bus *bus;
                .len = len,
        };
  
-       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       r = __kvm_io_bus_write(bus, &range, val);
+       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+       r = __kvm_io_bus_write(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
  }
  
  /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
- int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-                           int len, const void *val, long cookie)
+ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
+                           gpa_t addr, int len, const void *val, long cookie)
  {
        struct kvm_io_bus *bus;
        struct kvm_io_range range;
                .len = len,
        };
  
-       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
  
        /* First try the device referenced by cookie. */
        if ((cookie >= 0) && (cookie < bus->dev_count) &&
            (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
-               if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len,
+               if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
                                        val))
                        return cookie;
  
         * cookie contained garbage; fall back to search and return the
         * correct cookie value.
         */
-       return __kvm_io_bus_write(bus, &range, val);
+       return __kvm_io_bus_write(vcpu, bus, &range, val);
  }
  
- static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
-                            void *val)
+ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
+                            struct kvm_io_range *range, void *val)
  {
        int idx;
  
  
        while (idx < bus->dev_count &&
                kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
-               if (!kvm_iodevice_read(bus->range[idx].dev, range->addr,
+               if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
                                       range->len, val))
                        return idx;
                idx++;
  EXPORT_SYMBOL_GPL(kvm_io_bus_write);
  
  /* kvm_io_bus_read - called under kvm->slots_lock */
- int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
  {
        struct kvm_io_bus *bus;
                .len = len,
        };
  
-       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       r = __kvm_io_bus_read(bus, &range, val);
+       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+       r = __kvm_io_bus_read(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
  }
  
@@@ -3269,6 -3255,7 +3256,7 @@@ struct kvm_vcpu *preempt_notifier_to_vc
  static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
  {
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
        if (vcpu->preempted)
                vcpu->preempted = false;
  
@@@ -3350,7 -3337,7 +3338,7 @@@ int kvm_init(void *opaque, unsigned vcp
  
        r = misc_register(&kvm_dev);
        if (r) {
-               printk(KERN_ERR "kvm: misc device register failed\n");
+               pr_err("kvm: misc device register failed\n");
                goto out_unreg;
        }
  
  
        r = kvm_init_debug();
        if (r) {
-               printk(KERN_ERR "kvm: create debugfs files failed\n");
+               pr_err("kvm: create debugfs files failed\n");
                goto out_undebugfs;
        }