Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 30 Jan 2008 22:30:10 +0000 (09:30 +1100)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 30 Jan 2008 22:30:10 +0000 (09:30 +1100)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (249 commits)
  KVM: Move apic timer migration away from critical section
  KVM: Put kvm_para.h include outside __KERNEL__
  KVM: Fix unbounded preemption latency
  KVM: Initialize the mmu caches only after verifying cpu support
  KVM: MMU: Fix dirty page setting for pages removed from rmap
  KVM: Portability: Move kvm_fpu to asm-x86/kvm.h
  KVM: x86 emulator: Only allow VMCALL/VMMCALL trapped by #UD
  KVM: MMU: Merge shadow level check in FNAME(fetch)
  KVM: MMU: Move kvm_free_some_pages() into critical section
  KVM: MMU: Switch to mmu spinlock
  KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte()
  KVM: Add kvm_read_guest_atomic()
  KVM: MMU: Concurrent guest walkers
  KVM: Disable vapic support on Intel machines with FlexPriority
  KVM: Accelerated apic support
  KVM: local APIC TPR access reporting facility
  KVM: Print data for unimplemented wrmsr
  KVM: MMU: Add cache miss statistic
  KVM: MMU: Coalesce remote tlb flushes
  KVM: Expose ioapic to ia64 save/restore APIs
  ...

41 files changed:
arch/x86/Kconfig
arch/x86/Makefile
arch/x86/kvm/Kconfig [moved from drivers/kvm/Kconfig with 94% similarity]
arch/x86/kvm/Makefile [moved from drivers/kvm/Makefile with 51% similarity]
arch/x86/kvm/i8259.c [moved from drivers/kvm/i8259.c with 98% similarity]
arch/x86/kvm/irq.c [moved from drivers/kvm/irq.c with 81% similarity]
arch/x86/kvm/irq.h [new file with mode: 0644]
arch/x86/kvm/kvm_svm.h [moved from drivers/kvm/kvm_svm.h with 96% similarity]
arch/x86/kvm/lapic.c [moved from drivers/kvm/lapic.c with 83% similarity]
arch/x86/kvm/lapic.h [new file with mode: 0644]
arch/x86/kvm/mmu.c [new file with mode: 0644]
arch/x86/kvm/mmu.h [new file with mode: 0644]
arch/x86/kvm/paging_tmpl.h [new file with mode: 0644]
arch/x86/kvm/segment_descriptor.h [moved from drivers/kvm/segment_descriptor.h with 53% similarity]
arch/x86/kvm/svm.c [moved from drivers/kvm/svm.c with 84% similarity]
arch/x86/kvm/svm.h [moved from drivers/kvm/svm.h with 98% similarity]
arch/x86/kvm/vmx.c [moved from drivers/kvm/vmx.c with 75% similarity]
arch/x86/kvm/vmx.h [moved from drivers/kvm/vmx.h with 96% similarity]
arch/x86/kvm/x86.c [moved from drivers/kvm/kvm_main.c with 52% similarity]
arch/x86/kvm/x86_emulate.c [new file with mode: 0644]
drivers/Kconfig
drivers/Makefile
drivers/kvm/irq.h [deleted file]
drivers/kvm/mmu.c [deleted file]
drivers/kvm/paging_tmpl.h [deleted file]
drivers/kvm/x86_emulate.c [deleted file]
include/asm-x86/Kbuild
include/asm-x86/kvm.h [new file with mode: 0644]
include/asm-x86/kvm_host.h [moved from drivers/kvm/kvm.h with 64% similarity]
include/asm-x86/kvm_para.h [new file with mode: 0644]
include/asm-x86/kvm_x86_emulate.h [moved from drivers/kvm/x86_emulate.h with 83% similarity]
include/linux/Kbuild
include/linux/kvm.h
include/linux/kvm_host.h [new file with mode: 0644]
include/linux/kvm_para.h
include/linux/kvm_types.h [new file with mode: 0644]
kernel/fork.c
virt/kvm/ioapic.c [moved from drivers/kvm/ioapic.c with 83% similarity]
virt/kvm/ioapic.h [new file with mode: 0644]
virt/kvm/iodev.h [new file with mode: 0644]
virt/kvm/kvm_main.c [new file with mode: 0644]

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb3eea3e38ee192af15126169939988828bf3afe..65b449134cf7b15cbc2bdf66993b813fa6f409f0 100644 (file)
@@ -107,6 +107,7 @@ config ARCH_SUPPORTS_OPROFILE
        bool
        default y
 
+select HAVE_KVM
 
 config ZONE_DMA32
        bool
@@ -1598,4 +1599,6 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "arch/x86/kvm/Kconfig"
+
 source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f18261df662a4525ac7af2c73fec5b3e776ee..da8f4129780bd8d25801eeee7742986ec92dddbc 100644 (file)
@@ -7,6 +7,8 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
+core-$(CONFIG_KVM) += arch/x86/kvm/
+
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig
similarity index 94%
rename from drivers/kvm/Kconfig
rename to arch/x86/kvm/Kconfig
index 656920636cb2f6dafba5ffbd0dc41c7c177e601e..c83e1c9b5129b3cc96ecfcb8992e378cd7d4119e 100644 (file)
@@ -1,9 +1,12 @@
 #
 # KVM configuration
 #
+config HAVE_KVM
+       bool
+
 menuconfig VIRTUALIZATION
        bool "Virtualization"
-       depends on X86
+       depends on HAVE_KVM || X86
        default y
        ---help---
          Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +19,7 @@ if VIRTUALIZATION
 
 config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
-       depends on X86 && EXPERIMENTAL
+       depends on HAVE_KVM && EXPERIMENTAL
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        ---help---
diff --git a/drivers/kvm/Makefile b/arch/x86/kvm/Makefile
similarity index 51%
rename from drivers/kvm/Makefile
rename to arch/x86/kvm/Makefile
index e5a8f4d3e97386f0ba73e629b8d2a13341fc5dd0..ffdd0b310784059527a4837ca040400c170db0d2 100644 (file)
@@ -2,7 +2,11 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c
similarity index 98%
rename from drivers/kvm/i8259.c
rename to arch/x86/kvm/i8259.c
index a679157bc599ae0a54979ca9e75f9a4e6ff610dc..ab29cf2def47cc0180c903491ef71ea52684934f 100644 (file)
@@ -28,6 +28,8 @@
 #include <linux/mm.h>
 #include "irq.h"
 
+#include <linux/kvm_host.h>
+
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
        return intno;
 }
 
-static void pic_reset(void *opaque)
+void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-       struct kvm_kpic_state *s = opaque;
-
        s->last_irr = 0;
        s->irr = 0;
        s->imr = 0;
@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
        addr &= 1;
        if (addr == 0) {
                if (val & 0x10) {
-                       pic_reset(s);   /* init */
+                       kvm_pic_reset(s);       /* init */
                        /*
                         * deassert a pending interrupt
                         */
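
The "set irq level" comment at the top of the i8259.c hunk above describes the PIC's edge detection: a request is latched into the IRR only on a 0->1 transition of the input line. Below is a minimal sketch of that idea for one 8259 chip (irqs 0-7), using field names from struct kvm_kpic_state declared in irq.h further down; the helper name is hypothetical and the real function body is not shown in this hunk.

/* Illustrative sketch, not part of this patch. */
static void example_edge_detect(struct kvm_kpic_state *s, int irq, int level)
{
        u8 mask = 1 << irq;

        if (level) {
                if (!(s->last_irr & mask))      /* 0 -> 1 transition */
                        s->irr |= mask;         /* latch the request in the IRR */
                s->last_irr |= mask;
        } else {
                s->last_irr &= ~mask;           /* remember the line went low */
        }
}
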
diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c
similarity index 81%
rename from drivers/kvm/irq.c
rename to arch/x86/kvm/irq.c
index 7628c7ff628ff1db90f3a9a4d20df792bb59f4ad..e5714759e97fcee29989eda9a262fef7cd5183fb 100644 (file)
@@ -20,8 +20,8 @@
  */
 
 #include <linux/module.h>
+#include <linux/kvm_host.h>
 
-#include "kvm.h"
 #include "irq.h"
 
 /*
@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-       struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-       printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-       int ipi_pcpu = vcpu->cpu;
-
-       if (waitqueue_active(&vcpu->wq)) {
-               wake_up_interruptible(&vcpu->wq);
-               ++vcpu->stat.halt_wakeup;
-       }
-       if (vcpu->guest_mode)
-               smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
        kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644 (file)
index 0000000..fa5ed5d
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * irq.h: in-kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+       u8 last_irr;    /* edge detection */
+       u8 irr;         /* interrupt request register */
+       u8 imr;         /* interrupt mask register */
+       u8 isr;         /* interrupt service register */
+       u8 priority_add;        /* highest irq priority */
+       u8 irq_base;
+       u8 read_reg_select;
+       u8 poll;
+       u8 special_mask;
+       u8 init_state;
+       u8 auto_eoi;
+       u8 rotate_on_auto_eoi;
+       u8 special_fully_nested_mode;
+       u8 init4;               /* true if 4 byte init */
+       u8 elcr;                /* PIIX edge/trigger selection */
+       u8 elcr_mask;
+       struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+       struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+       irq_request_func *irq_request;
+       void *irq_request_opaque;
+       int output;             /* intr from master PIC */
+       struct kvm_io_device dev;
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+       return kvm->arch.vpic;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+       return pic_irqchip(kvm) != NULL;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+
+#endif
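
irq.h above models the in-kernel i8259 pair (struct kvm_kpic_state mirrors the chip's registers) and provides accessors for reaching it through kvm->arch.vpic. The sketch below shows one way a caller might use only the declarations shown above; the wrapper name is hypothetical.

/* Illustrative sketch, not part of this patch. */
static void example_set_isa_irq(struct kvm *kvm, int irq, int level)
{
        if (!irqchip_in_kernel(kvm))
                return;         /* interrupt controller is emulated in userspace */
        kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
}
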
diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
similarity index 96%
rename from drivers/kvm/kvm_svm.h
rename to arch/x86/kvm/kvm_svm.h
index a0e415daef5b0142ba00cf2b5cec9b9e12b53d5b..ecdfe97e4635393eb0df2ba755ffd0150443e7dd 100644 (file)
@@ -4,10 +4,10 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/kvm_host.h>
 #include <asm/msr.h>
 
 #include "svm.h"
-#include "kvm.h"
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c
similarity index 83%
rename from drivers/kvm/lapic.c
rename to arch/x86/kvm/lapic.c
index 238fcad3ceceee4fc32f8ebc2396ed455b4107ae..2cbee9479ce423850a99df39290e3efbbe51ae48 100644 (file)
@@ -17,7 +17,7 @@
  * the COPYING file in the top-level directory.
  */
 
-#include "kvm.h"
+#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
@@ -56,6 +56,7 @@
 
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
+
 static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
 {
        return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
 
 static inline int apic_hw_enabled(struct kvm_lapic *apic)
 {
-       return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
+       return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
 }
 
 static inline int  apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
 
        if (!apic)
@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
 {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
        if (!apic_test_and_set_irr(vec, apic)) {
                /* a new pending irq is set in IRR */
                if (trig)
@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int short_hand, int dest, int dest_mode)
 {
        int result = 0;
-       struct kvm_lapic *target = vcpu->apic;
+       struct kvm_lapic *target = vcpu->arch.apic;
 
        apic_debug("target %p, source %p, dest 0x%x, "
                   "dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                } else
                        apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-               if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+               if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
                        kvm_vcpu_kick(vcpu);
-               else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
-                       vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+               else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+                       vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
                        if (waitqueue_active(&vcpu->wq))
                                wake_up_interruptible(&vcpu->wq);
                }
@@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
        case APIC_DM_INIT:
                if (level) {
-                       if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+                       if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
                                printk(KERN_DEBUG
                                       "INIT on a runnable vcpu %d\n",
                                       vcpu->vcpu_id);
-                       vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+                       vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
                        kvm_vcpu_kick(vcpu);
                } else {
                        printk(KERN_DEBUG
@@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        case APIC_DM_STARTUP:
                printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
                       vcpu->vcpu_id, vector);
-               if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
-                       vcpu->sipi_vector = vector;
-                       vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+               if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+                       vcpu->arch.sipi_vector = vector;
+                       vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
                        if (waitqueue_active(&vcpu->wq))
                                wake_up_interruptible(&vcpu->wq);
                }
@@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        return result;
 }
 
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
                                       unsigned long bitmap)
 {
-       int vcpu_id;
        int last;
        int next;
-       struct kvm_lapic *apic;
+       struct kvm_lapic *apic = NULL;
 
-       last = kvm->round_robin_prev_vcpu;
+       last = kvm->arch.round_robin_prev_vcpu;
        next = last;
 
        do {
@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
                        next = 0;
                if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
                        continue;
-               apic = kvm->vcpus[next]->apic;
+               apic = kvm->vcpus[next]->arch.apic;
                if (apic && apic_enabled(apic))
                        break;
                apic = NULL;
        } while (next != last);
-       kvm->round_robin_prev_vcpu = next;
+       kvm->arch.round_robin_prev_vcpu = next;
 
-       if (!apic) {
-               vcpu_id = ffs(bitmap) - 1;
-               if (vcpu_id < 0) {
-                       vcpu_id = 0;
-                       printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-               }
-               apic = kvm->vcpus[vcpu_id]->apic;
-       }
+       if (!apic)
+               printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
 
        return apic;
 }
 
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+               unsigned long bitmap)
+{
+       struct kvm_lapic *apic;
+
+       apic = kvm_apic_round_robin(kvm, vector, bitmap);
+       if (apic)
+               return apic->vcpu;
+       return NULL;
+}
+
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
        int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
        unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
        unsigned int vector = icr_low & APIC_VECTOR_MASK;
 
-       struct kvm_lapic *target;
+       struct kvm_vcpu *target;
        struct kvm_vcpu *vcpu;
        unsigned long lpr_map = 0;
        int i;
@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
                if (!vcpu)
                        continue;
 
-               if (vcpu->apic &&
+               if (vcpu->arch.apic &&
                    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
                        if (delivery_mode == APIC_DM_LOWEST)
                                set_bit(vcpu->vcpu_id, &lpr_map);
                        else
-                               __apic_accept_irq(vcpu->apic, delivery_mode,
+                               __apic_accept_irq(vcpu->arch.apic, delivery_mode,
                                                  vector, level, trig_mode);
                }
        }
 
        if (delivery_mode == APIC_DM_LOWEST) {
-               target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
+               target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
                if (target != NULL)
-                       __apic_accept_irq(target, delivery_mode,
+                       __apic_accept_irq(target->arch.apic, delivery_mode,
                                          vector, level, trig_mode);
        }
 }
@@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
        return tmcct;
 }
 
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       struct kvm_run *run = vcpu->run;
+
+       set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+       kvm_x86_ops->cache_regs(vcpu);
+       run->tpr_access.rip = vcpu->arch.rip;
+       run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+       if (apic->vcpu->arch.tpr_access_reporting)
+               __report_tpr_access(apic, write);
+}
+
 static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 {
        u32 val = 0;
@@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
                val = apic_get_tmcct(apic);
                break;
 
+       case APIC_TASKPRI:
+               report_tpr_access(apic, false);
+               /* fall thru */
        default:
                apic_update_ppr(apic);
                val = apic_get_reg(apic, offset);
@@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
                break;
 
        case APIC_TASKPRI:
+               report_tpr_access(apic, true);
                apic_set_tpr(apic, val & 0xff);
                break;
 
@@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
        return ret;
 }
 
-void kvm_free_apic(struct kvm_lapic *apic)
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
-       if (!apic)
+       if (!vcpu->arch.apic)
                return;
 
-       hrtimer_cancel(&apic->timer.dev);
+       hrtimer_cancel(&vcpu->arch.apic->timer.dev);
 
-       if (apic->regs_page) {
-               __free_page(apic->regs_page);
-               apic->regs_page = 0;
-       }
+       if (vcpu->arch.apic->regs_page)
+               __free_page(vcpu->arch.apic->regs_page);
 
-       kfree(apic);
+       kfree(vcpu->arch.apic);
 }
 
 /*
@@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic)
 
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (!apic)
                return;
-       apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+       apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
+                    | (apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
 
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        u64 tpr;
 
        if (!apic)
@@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
 
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (!apic) {
                value |= MSR_IA32_APICBASE_BSP;
-               vcpu->apic_base = value;
+               vcpu->arch.apic_base = value;
                return;
        }
        if (apic->vcpu->vcpu_id)
                value &= ~MSR_IA32_APICBASE_BSP;
 
-       vcpu->apic_base = value;
-       apic->base_address = apic->vcpu->apic_base &
+       vcpu->arch.apic_base = value;
+       apic->base_address = apic->vcpu->arch.apic_base &
                             MSR_IA32_APICBASE_BASE;
 
        /* with FSB delivery interrupt, we can restart APIC functionality */
        apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
-                  "0x%lx.\n", apic->apic_base, apic->base_address);
+                  "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
 
 }
 
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
 {
-       return vcpu->apic_base;
+       return vcpu->arch.apic_base;
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
 
@@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        apic_debug("%s\n", __FUNCTION__);
 
        ASSERT(vcpu);
-       apic = vcpu->apic;
+       apic = vcpu->arch.apic;
        ASSERT(apic != NULL);
 
        /* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +899,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        update_divide_count(apic);
        atomic_set(&apic->timer.pending, 0);
        if (vcpu->vcpu_id == 0)
-               vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
+               vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
        apic_update_ppr(apic);
 
        apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
                   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
                   vcpu, kvm_apic_id(apic),
-                  vcpu->apic_base, apic->base_address);
+                  vcpu->arch.apic_base, apic->base_address);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_reset);
 
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        int ret = 0;
 
        if (!apic)
@@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
        wait_queue_head_t *q = &apic->vcpu->wq;
 
        atomic_inc(&apic->timer.pending);
-       if (waitqueue_active(q))
-       {
-               apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+       if (waitqueue_active(q)) {
+               apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
                wake_up_interruptible(q);
        }
        if (apic_lvtt_period(apic)) {
@@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        if (!apic)
                goto nomem;
 
-       vcpu->apic = apic;
+       vcpu->arch.apic = apic;
 
        apic->regs_page = alloc_page(GFP_KERNEL);
        if (apic->regs_page == NULL) {
                printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
                       vcpu->vcpu_id);
-               goto nomem;
+               goto nomem_free_apic;
        }
        apic->regs = page_address(apic->regs_page);
        memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +997,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        apic->timer.dev.function = apic_timer_fn;
        apic->base_address = APIC_DEFAULT_PHYS_BASE;
-       vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
+       vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
 
        kvm_lapic_reset(vcpu);
        apic->dev.read = apic_mmio_read;
@@ -980,15 +1006,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        apic->dev.private = apic;
 
        return 0;
+nomem_free_apic:
+       kfree(apic);
 nomem:
-       kvm_free_apic(apic);
        return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(kvm_create_lapic);
 
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
 
        if (!apic || !apic_enabled(apic))
@@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 {
-       u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
+       u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
        int r = 0;
 
        if (vcpu->vcpu_id == 0) {
-               if (!apic_hw_enabled(vcpu->apic))
+               if (!apic_hw_enabled(vcpu->arch.apic))
                        r = 1;
                if ((lvt0 & APIC_LVT_MASKED) == 0 &&
                    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1046,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
                atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 
 void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
                apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 {
        int vector = kvm_apic_has_interrupt(vcpu);
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (vector == -1)
                return -1;
@@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
-       apic->base_address = vcpu->apic_base &
+       apic->base_address = vcpu->arch.apic_base &
                             MSR_IA32_APICBASE_BASE;
        apic_set_reg(apic, APIC_LVR, APIC_VERSION);
        apic_update_ppr(apic);
@@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
        start_apic_timer(apic);
 }
 
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        struct hrtimer *timer;
 
        if (!apic)
@@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
        if (hrtimer_cancel(timer))
                hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
 }
-EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+       u32 data;
+       void *vapic;
+
+       if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+               return;
+
+       vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+       data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
+       kunmap_atomic(vapic, KM_USER0);
+
+       apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+       u32 data, tpr;
+       int max_irr, max_isr;
+       struct kvm_lapic *apic;
+       void *vapic;
+
+       if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+               return;
+
+       apic = vcpu->arch.apic;
+       tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+       max_irr = apic_find_highest_irr(apic);
+       if (max_irr < 0)
+               max_irr = 0;
+       max_isr = apic_find_highest_isr(apic);
+       if (max_isr < 0)
+               max_isr = 0;
+       data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+       vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+       *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
+       kunmap_atomic(vapic, KM_USER0);
+}
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+       if (!irqchip_in_kernel(vcpu->kvm))
+               return;
+
+       vcpu->arch.apic->vapic_addr = vapic_addr;
+}
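
kvm_lapic_sync_to_vapic() above packs the task priority (bits 0-7), the high nibble of the highest in-service vector (bits 8-15), and the highest pending IRR vector (bits 24-31) into a single 32-bit word stored at vapic_addr. The sketch below shows how that layout reads back; the type and helper names are hypothetical.

/* Illustrative sketch, not part of this patch. */
struct example_vapic_word {
        u8 tpr;         /* bits  0-7:  APIC_TASKPRI & 0xff              */
        u8 isr_nibble;  /* bits  8-15: highest in-service vector & 0xf0 */
        u8 max_irr;     /* bits 24-31: highest pending IRR vector       */
};

static struct example_vapic_word example_unpack_vapic(u32 data)
{
        struct example_vapic_word w = {
                .tpr        = data & 0xff,
                .isr_nibble = (data >> 8) & 0xff,
                .max_irr    = (data >> 24) & 0xff,
        };
        return w;
}
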
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644 (file)
index 0000000..676c396
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+
+struct kvm_lapic {
+       unsigned long base_address;
+       struct kvm_io_device dev;
+       struct {
+               atomic_t pending;
+               s64 period;     /* unit: ns */
+               u32 divide_count;
+               ktime_t last_update;
+               struct hrtimer dev;
+       } timer;
+       struct kvm_vcpu *vcpu;
+       struct page *regs_page;
+       void *regs;
+       gpa_t vapic_addr;
+       struct page *vapic_page;
+};
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644 (file)
index 0000000..8efdcdb
--- /dev/null
@@ -0,0 +1,1885 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "vmx.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+
+#include <asm/page.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
+
+#ifndef MMU_DEBUG
+#define ASSERT(x) do { } while (0)
+#else
+#define ASSERT(x)                                                      \
+       if (!(x)) {                                                     \
+               printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
+                      __FILE__, __LINE__, #x);                         \
+       }
+#endif
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_MASK (1ULL << 5)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+       (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+               (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_LEVEL_MASK(level) \
+               (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
+
+#define PT64_INDEX(address, level)\
+       (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+               (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LEVEL_MASK(level) \
+               (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+
+#define PT32_INDEX(address, level)\
+       (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_DIR_BASE_ADDR_MASK \
+       (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+       (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
+                       | PT64_NX_MASK)
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_FETCH_MASK (1U << 4)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define RMAP_EXT 4
+
+#define ACC_EXEC_MASK    1
+#define ACC_WRITE_MASK   PT_WRITABLE_MASK
+#define ACC_USER_MASK    PT_USER_MASK
+#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+struct kvm_rmap_desc {
+       u64 *shadow_ptes[RMAP_EXT];
+       struct kvm_rmap_desc *more;
+};
+
+static struct kmem_cache *pte_chain_cache;
+static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+       shadow_trap_nonpresent_pte = trap_pte;
+       shadow_notrap_nonpresent_pte = notrap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
+static int is_write_protection(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr0 & X86_CR0_WP;
+}
+
+static int is_cpuid_PSE36(void)
+{
+       return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.shadow_efer & EFER_NX;
+}
+
+static int is_present_pte(unsigned long pte)
+{
+       return pte & PT_PRESENT_MASK;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+       pte &= ~PT_SHADOW_IO_MARK;
+       return pte != shadow_trap_nonpresent_pte
+               && pte != shadow_notrap_nonpresent_pte;
+}
+
+static int is_writeble_pte(unsigned long pte)
+{
+       return pte & PT_WRITABLE_MASK;
+}
+
+static int is_dirty_pte(unsigned long pte)
+{
+       return pte & PT_DIRTY_MASK;
+}
+
+static int is_io_pte(unsigned long pte)
+{
+       return pte & PT_SHADOW_IO_MARK;
+}
+
+static int is_rmap_pte(u64 pte)
+{
+       return pte != shadow_trap_nonpresent_pte
+               && pte != shadow_notrap_nonpresent_pte;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+       int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+       return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+static void set_shadow_pte(u64 *sptep, u64 spte)
+{
+#ifdef CONFIG_X86_64
+       set_64bit((unsigned long *)sptep, spte);
+#else
+       set_64bit((unsigned long long *)sptep, spte);
+#endif
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+                                 struct kmem_cache *base_cache, int min)
+{
+       void *obj;
+
+       if (cache->nobjs >= min)
+               return 0;
+       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+               obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+               if (!obj)
+                       return -ENOMEM;
+               cache->objects[cache->nobjs++] = obj;
+       }
+       return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+       while (mc->nobjs)
+               kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+                                      int min)
+{
+       struct page *page;
+
+       if (cache->nobjs >= min)
+               return 0;
+       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+               page = alloc_page(GFP_KERNEL);
+               if (!page)
+                       return -ENOMEM;
+               set_page_private(page, 0);
+               cache->objects[cache->nobjs++] = page_address(page);
+       }
+       return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+       while (mc->nobjs)
+               free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+                                  pte_chain_cache, 4);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+                                  rmap_desc_cache, 1);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+                                  mmu_page_header_cache, 4);
+out:
+       return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+       mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+                                   size_t size)
+{
+       void *p;
+
+       BUG_ON(!mc->nobjs);
+       p = mc->objects[--mc->nobjs];
+       memset(p, 0, size);
+       return p;
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
+                                     sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+{
+       kfree(pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+       return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
+                                     sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+{
+       kfree(rd);
+}
+
+/*
+ * Take gfn and return the reverse mapping to it.
+ * Note: gfn must be unaliased before this function gets called
+ */
+
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_memory_slot *slot;
+
+       slot = gfn_to_memslot(kvm, gfn);
+       return &slot->rmap[gfn - slot->base_gfn];
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
+ * that points to page_address(page).
+ *
+ * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
+ * containing more mappings.
+ */
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+       struct kvm_mmu_page *sp;
+       struct kvm_rmap_desc *desc;
+       unsigned long *rmapp;
+       int i;
+
+       if (!is_rmap_pte(*spte))
+               return;
+       gfn = unalias_gfn(vcpu->kvm, gfn);
+       sp = page_header(__pa(spte));
+       sp->gfns[spte - sp->spt] = gfn;
+       rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+       if (!*rmapp) {
+               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+               *rmapp = (unsigned long)spte;
+       } else if (!(*rmapp & 1)) {
+               rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+               desc = mmu_alloc_rmap_desc(vcpu);
+               desc->shadow_ptes[0] = (u64 *)*rmapp;
+               desc->shadow_ptes[1] = spte;
+               *rmapp = (unsigned long)desc | 1;
+       } else {
+               rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+               while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+                       desc = desc->more;
+               if (desc->shadow_ptes[RMAP_EXT-1]) {
+                       desc->more = mmu_alloc_rmap_desc(vcpu);
+                       desc = desc->more;
+               }
+               for (i = 0; desc->shadow_ptes[i]; ++i)
+                       ;
+               desc->shadow_ptes[i] = spte;
+       }
+}
+
+static void rmap_desc_remove_entry(unsigned long *rmapp,
+                                  struct kvm_rmap_desc *desc,
+                                  int i,
+                                  struct kvm_rmap_desc *prev_desc)
+{
+       int j;
+
+       for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+               ;
+       desc->shadow_ptes[i] = desc->shadow_ptes[j];
+       desc->shadow_ptes[j] = NULL;
+       if (j != 0)
+               return;
+       if (!prev_desc && !desc->more)
+               *rmapp = (unsigned long)desc->shadow_ptes[0];
+       else
+               if (prev_desc)
+                       prev_desc->more = desc->more;
+               else
+                       *rmapp = (unsigned long)desc->more | 1;
+       mmu_free_rmap_desc(desc);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+       struct kvm_rmap_desc *desc;
+       struct kvm_rmap_desc *prev_desc;
+       struct kvm_mmu_page *sp;
+       struct page *page;
+       unsigned long *rmapp;
+       int i;
+
+       if (!is_rmap_pte(*spte))
+               return;
+       sp = page_header(__pa(spte));
+       page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+       mark_page_accessed(page);
+       if (is_writeble_pte(*spte))
+               kvm_release_page_dirty(page);
+       else
+               kvm_release_page_clean(page);
+       rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+       if (!*rmapp) {
+               printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+               BUG();
+       } else if (!(*rmapp & 1)) {
+               rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
+               if ((u64 *)*rmapp != spte) {
+                       printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
+                              spte, *spte);
+                       BUG();
+               }
+               *rmapp = 0;
+       } else {
+               rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+               prev_desc = NULL;
+               while (desc) {
+                       for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+                               if (desc->shadow_ptes[i] == spte) {
+                                       rmap_desc_remove_entry(rmapp,
+                                                              desc, i,
+                                                              prev_desc);
+                                       return;
+                               }
+                       prev_desc = desc;
+                       desc = desc->more;
+               }
+               BUG();
+       }
+}
+
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+{
+       struct kvm_rmap_desc *desc;
+       struct kvm_rmap_desc *prev_desc;
+       u64 *prev_spte;
+       int i;
+
+       if (!*rmapp)
+               return NULL;
+       else if (!(*rmapp & 1)) {
+               if (!spte)
+                       return (u64 *)*rmapp;
+               return NULL;
+       }
+       desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+       prev_desc = NULL;
+       prev_spte = NULL;
+       while (desc) {
+               for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+                       if (prev_spte == spte)
+                               return desc->shadow_ptes[i];
+                       prev_spte = desc->shadow_ptes[i];
+               }
+               desc = desc->more;
+       }
+       return NULL;
+}
+
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+       unsigned long *rmapp;
+       u64 *spte;
+       int write_protected = 0;
+
+       gfn = unalias_gfn(kvm, gfn);
+       rmapp = gfn_to_rmap(kvm, gfn);
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               BUG_ON(!spte);
+               BUG_ON(!(*spte & PT_PRESENT_MASK));
+               rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+               if (is_writeble_pte(*spte)) {
+                       set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+                       write_protected = 1;
+               }
+               spte = rmap_next(kvm, rmapp, spte);
+       }
+       if (write_protected)
+               kvm_flush_remote_tlbs(kvm);
+}
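/*
 * Editor's illustration, not part of this patch: decoding an rmapp word as
 * described in the "Reverse mapping data structures" comment above, mirroring
 * the first step of rmap_next().  The helper name is hypothetical.
 */
static u64 *example_first_spte(unsigned long rmapp)
{
        struct kvm_rmap_desc *desc;

        if (!rmapp)
                return NULL;                    /* the gfn has no shadow mappings */
        if (!(rmapp & 1))
                return (u64 *)rmapp;            /* bit 0 clear: a single spte pointer */
        desc = (struct kvm_rmap_desc *)(rmapp & ~1ul);  /* bit 0 set: descriptor chain */
        return desc->shadow_ptes[0];
}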
+
+#ifdef MMU_DEBUG
+static int is_empty_shadow_page(u64 *spt)
+{
+       u64 *pos;
+       u64 *end;
+
+       for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+               if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
+                       printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+                              pos, *pos);
+                       return 0;
+               }
+       return 1;
+}
+#endif
+
+static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       ASSERT(is_empty_shadow_page(sp->spt));
+       list_del(&sp->link);
+       __free_page(virt_to_page(sp->spt));
+       __free_page(virt_to_page(sp->gfns));
+       kfree(sp);
+       ++kvm->arch.n_free_mmu_pages;
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+       return gfn;
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+                                              u64 *parent_pte)
+{
+       struct kvm_mmu_page *sp;
+
+       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+       list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+       ASSERT(is_empty_shadow_page(sp->spt));
+       sp->slot_bitmap = 0;
+       sp->multimapped = 0;
+       sp->parent_pte = parent_pte;
+       --vcpu->kvm->arch.n_free_mmu_pages;
+       return sp;
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+                                   struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+       struct kvm_pte_chain *pte_chain;
+       struct hlist_node *node;
+       int i;
+
+       if (!parent_pte)
+               return;
+       if (!sp->multimapped) {
+               u64 *old = sp->parent_pte;
+
+               if (!old) {
+                       sp->parent_pte = parent_pte;
+                       return;
+               }
+               sp->multimapped = 1;
+               pte_chain = mmu_alloc_pte_chain(vcpu);
+               INIT_HLIST_HEAD(&sp->parent_ptes);
+               hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+               pte_chain->parent_ptes[0] = old;
+       }
+       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
+               if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+                       continue;
+               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+                       if (!pte_chain->parent_ptes[i]) {
+                               pte_chain->parent_ptes[i] = parent_pte;
+                               return;
+                       }
+       }
+       pte_chain = mmu_alloc_pte_chain(vcpu);
+       BUG_ON(!pte_chain);
+       hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+       pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+                                      u64 *parent_pte)
+{
+       struct kvm_pte_chain *pte_chain;
+       struct hlist_node *node;
+       int i;
+
+       if (!sp->multimapped) {
+               BUG_ON(sp->parent_pte != parent_pte);
+               sp->parent_pte = NULL;
+               return;
+       }
+       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+                       if (!pte_chain->parent_ptes[i])
+                               break;
+                       if (pte_chain->parent_ptes[i] != parent_pte)
+                               continue;
+                       while (i + 1 < NR_PTE_CHAIN_ENTRIES
+                               && pte_chain->parent_ptes[i + 1]) {
+                               pte_chain->parent_ptes[i]
+                                       = pte_chain->parent_ptes[i + 1];
+                               ++i;
+                       }
+                       pte_chain->parent_ptes[i] = NULL;
+                       if (i == 0) {
+                               hlist_del(&pte_chain->link);
+                               mmu_free_pte_chain(pte_chain);
+                               if (hlist_empty(&sp->parent_ptes)) {
+                                       sp->multimapped = 0;
+                                       sp->parent_pte = NULL;
+                               }
+                       }
+                       return;
+               }
+       BUG();
+}
+
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
+{
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry(sp, node, bucket, hash_link)
+               if (sp->gfn == gfn && !sp->role.metaphysical) {
+                       pgprintk("%s: found role %x\n",
+                                __FUNCTION__, sp->role.word);
+                       return sp;
+               }
+       return NULL;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+                                            gfn_t gfn,
+                                            gva_t gaddr,
+                                            unsigned level,
+                                            int metaphysical,
+                                            unsigned access,
+                                            u64 *parent_pte,
+                                            bool *new_page)
+{
+       union kvm_mmu_page_role role;
+       unsigned index;
+       unsigned quadrant;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node;
+
+       role.word = 0;
+       role.glevels = vcpu->arch.mmu.root_level;
+       role.level = level;
+       role.metaphysical = metaphysical;
+       role.access = access;
+       if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+               quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+               quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+               role.quadrant = quadrant;
+       }
+       pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+                gfn, role.word);
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry(sp, node, bucket, hash_link)
+               if (sp->gfn == gfn && sp->role.word == role.word) {
+                       mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+                       pgprintk("%s: found\n", __FUNCTION__);
+                       return sp;
+               }
+       ++vcpu->kvm->stat.mmu_cache_miss;
+       sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+       if (!sp)
+               return sp;
+       pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+       sp->gfn = gfn;
+       sp->role = role;
+       hlist_add_head(&sp->hash_link, bucket);
+       vcpu->arch.mmu.prefetch_page(vcpu, sp);
+       if (!metaphysical)
+               rmap_write_protect(vcpu->kvm, gfn);
+       if (new_page)
+               *new_page = 1;
+       return sp;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+                                        struct kvm_mmu_page *sp)
+{
+       unsigned i;
+       u64 *pt;
+       u64 ent;
+
+       pt = sp->spt;
+
+       if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                       if (is_shadow_present_pte(pt[i]))
+                               rmap_remove(kvm, &pt[i]);
+                       pt[i] = shadow_trap_nonpresent_pte;
+               }
+               kvm_flush_remote_tlbs(kvm);
+               return;
+       }
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+               ent = pt[i];
+
+               pt[i] = shadow_trap_nonpresent_pte;
+               if (!is_shadow_present_pte(ent))
+                       continue;
+               ent &= PT64_BASE_ADDR_MASK;
+               mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
+       }
+       kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+       mmu_page_remove_parent_pte(sp, parent_pte);
+}
+
+static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
+{
+       int i;
+
+       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+               if (kvm->vcpus[i])
+                       kvm->vcpus[i]->arch.last_pte_updated = NULL;
+}
+
+static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       u64 *parent_pte;
+
+       ++kvm->stat.mmu_shadow_zapped;
+       while (sp->multimapped || sp->parent_pte) {
+               if (!sp->multimapped)
+                       parent_pte = sp->parent_pte;
+               else {
+                       struct kvm_pte_chain *chain;
+
+                       chain = container_of(sp->parent_ptes.first,
+                                            struct kvm_pte_chain, link);
+                       parent_pte = chain->parent_ptes[0];
+               }
+               BUG_ON(!parent_pte);
+               kvm_mmu_put_page(sp, parent_pte);
+               set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+       }
+       kvm_mmu_page_unlink_children(kvm, sp);
+       if (!sp->root_count) {
+               hlist_del(&sp->hash_link);
+               kvm_mmu_free_page(kvm, sp);
+       } else
+               list_move(&sp->link, &kvm->arch.active_mmu_pages);
+       kvm_mmu_reset_last_pte_updated(kvm);
+}
+
+/*
+ * Change the number of mmu pages allocated to the vm.
+ * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+{
+       /*
+        * If we set the number of mmu pages to be smaller than the
+        * number of active pages, we must free some mmu pages before we
+        * can change the value.
+        */
+
+       if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
+           kvm_nr_mmu_pages) {
+               int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
+                                      - kvm->arch.n_free_mmu_pages;
+
+               while (n_used_mmu_pages > kvm_nr_mmu_pages) {
+                       struct kvm_mmu_page *page;
+
+                       page = container_of(kvm->arch.active_mmu_pages.prev,
+                                           struct kvm_mmu_page, link);
+                       kvm_mmu_zap_page(kvm, page);
+                       n_used_mmu_pages--;
+               }
+               kvm->arch.n_free_mmu_pages = 0;
+       } else
+               kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
+                                        - kvm->arch.n_alloc_mmu_pages;
+
+       kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+}
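
The accounting above is easier to follow with concrete numbers; the values below are invented purely for illustration.

/*
 * Example (invented values): n_alloc_mmu_pages = 512, n_free_mmu_pages = 100,
 * new target kvm_nr_mmu_pages = 300.  used = 512 - 100 = 412 > 300, so 112
 * shadow pages are zapped from the tail of active_mmu_pages and
 * n_free_mmu_pages becomes 0.  Growing instead (target 600) simply does
 * n_free_mmu_pages += 600 - 512 = 88.  Either way n_alloc_mmu_pages ends up
 * equal to the new target.
 */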
+
+static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node, *n;
+       int r;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       r = 0;
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
+               if (sp->gfn == gfn && !sp->role.metaphysical) {
+                       pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+                                sp->role.word);
+                       kvm_mmu_zap_page(kvm, sp);
+                       r = 1;
+               }
+       return r;
+}
+
+static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_mmu_page *sp;
+
+       while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
+               pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+               kvm_mmu_zap_page(kvm, sp);
+       }
+}
+
+static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
+{
+       int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
+       struct kvm_mmu_page *sp = page_header(__pa(pte));
+
+       __set_bit(slot, &sp->slot_bitmap);
+}
+
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+
+       if (gpa == UNMAPPED_GVA)
+               return NULL;
+       return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+                        unsigned pt_access, unsigned pte_access,
+                        int user_fault, int write_fault, int dirty,
+                        int *ptwrite, gfn_t gfn, struct page *page)
+{
+       u64 spte;
+       int was_rmapped = is_rmap_pte(*shadow_pte);
+       int was_writeble = is_writeble_pte(*shadow_pte);
+
+       pgprintk("%s: spte %llx access %x write_fault %d"
+                " user_fault %d gfn %lx\n",
+                __FUNCTION__, *shadow_pte, pt_access,
+                write_fault, user_fault, gfn);
+
+       /*
+        * We don't set the accessed bit, since we sometimes want to see
+        * whether the guest actually used the pte (in order to detect
+        * demand paging).
+        */
+       spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+       if (!dirty)
+               pte_access &= ~ACC_WRITE_MASK;
+       if (!(pte_access & ACC_EXEC_MASK))
+               spte |= PT64_NX_MASK;
+
+       spte |= PT_PRESENT_MASK;
+       if (pte_access & ACC_USER_MASK)
+               spte |= PT_USER_MASK;
+
+       if (is_error_page(page)) {
+               set_shadow_pte(shadow_pte,
+                              shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
+               kvm_release_page_clean(page);
+               return;
+       }
+
+       spte |= page_to_phys(page);
+
+       if ((pte_access & ACC_WRITE_MASK)
+           || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+               struct kvm_mmu_page *shadow;
+
+               spte |= PT_WRITABLE_MASK;
+               if (user_fault) {
+                       mmu_unshadow(vcpu->kvm, gfn);
+                       goto unshadowed;
+               }
+
+               shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+               if (shadow) {
+                       pgprintk("%s: found shadow page for %lx, marking ro\n",
+                                __FUNCTION__, gfn);
+                       pte_access &= ~ACC_WRITE_MASK;
+                       if (is_writeble_pte(spte)) {
+                               spte &= ~PT_WRITABLE_MASK;
+                               kvm_x86_ops->tlb_flush(vcpu);
+                       }
+                       if (write_fault)
+                               *ptwrite = 1;
+               }
+       }
+
+unshadowed:
+
+       if (pte_access & ACC_WRITE_MASK)
+               mark_page_dirty(vcpu->kvm, gfn);
+
+       pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+       set_shadow_pte(shadow_pte, spte);
+       page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+       if (!was_rmapped) {
+               rmap_add(vcpu, shadow_pte, gfn);
+               if (!is_rmap_pte(*shadow_pte))
+                       kvm_release_page_clean(page);
+       } else {
+               if (was_writeble)
+                       kvm_release_page_dirty(page);
+               else
+                       kvm_release_page_clean(page);
+       }
+       if (!ptwrite || !*ptwrite)
+               vcpu->arch.last_pte_updated = shadow_pte;
+}
+
+static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
+                          gfn_t gfn, struct page *page)
+{
+       int level = PT32E_ROOT_LEVEL;
+       hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+       int pt_write = 0;
+
+       for (; ; level--) {
+               u32 index = PT64_INDEX(v, level);
+               u64 *table;
+
+               ASSERT(VALID_PAGE(table_addr));
+               table = __va(table_addr);
+
+               if (level == 1) {
+                       mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+                                    0, write, 1, &pt_write, gfn, page);
+                       return pt_write || is_io_pte(table[index]);
+               }
+
+               if (table[index] == shadow_trap_nonpresent_pte) {
+                       struct kvm_mmu_page *new_table;
+                       gfn_t pseudo_gfn;
+
+                       pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
+                               >> PAGE_SHIFT;
+                       new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
+                                                    v, level - 1,
+                                                    1, ACC_ALL, &table[index],
+                                                    NULL);
+                       if (!new_table) {
+                               pgprintk("nonpaging_map: ENOMEM\n");
+                               kvm_release_page_clean(page);
+                               return -ENOMEM;
+                       }
+
+                       table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
+                               | PT_WRITABLE_MASK | PT_USER_MASK;
+               }
+               table_addr = table[index] & PT64_BASE_ADDR_MASK;
+       }
+}
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+{
+       int r;
+
+       struct page *page;
+
+       down_read(&current->mm->mmap_sem);
+       page = gfn_to_page(vcpu->kvm, gfn);
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_free_some_pages(vcpu);
+       r = __nonpaging_map(vcpu, v, write, gfn, page);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+
+       up_read(&current->mm->mmap_sem);
+
+       return r;
+}
+
+
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+                                   struct kvm_mmu_page *sp)
+{
+       int i;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               sp->spt[i] = shadow_trap_nonpresent_pte;
+}
+
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_mmu_page *sp;
+
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+       spin_lock(&vcpu->kvm->mmu_lock);
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->arch.mmu.root_hpa;
+
+               sp = page_header(root);
+               --sp->root_count;
+               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+               spin_unlock(&vcpu->kvm->mmu_lock);
+               return;
+       }
+#endif
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+               if (root) {
+                       root &= PT64_BASE_ADDR_MASK;
+                       sp = page_header(root);
+                       --sp->root_count;
+               }
+               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+       }
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+}
+
+static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+       int i;
+       gfn_t root_gfn;
+       struct kvm_mmu_page *sp;
+
+       root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->arch.mmu.root_hpa;
+
+               ASSERT(!VALID_PAGE(root));
+               sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+                                     PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
+               root = __pa(sp->spt);
+               ++sp->root_count;
+               vcpu->arch.mmu.root_hpa = root;
+               return;
+       }
+#endif
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+               ASSERT(!VALID_PAGE(root));
+               if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+                       if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+                               vcpu->arch.mmu.pae_root[i] = 0;
+                               continue;
+                       }
+                       root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+               } else if (vcpu->arch.mmu.root_level == 0)
+                       root_gfn = 0;
+               sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+                                     PT32_ROOT_LEVEL, !is_paging(vcpu),
+                                     ACC_ALL, NULL, NULL);
+               root = __pa(sp->spt);
+               ++sp->root_count;
+               vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+       }
+       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+       return vaddr;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+                               u32 error_code)
+{
+       gfn_t gfn;
+       int r;
+
+       pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       ASSERT(vcpu);
+       ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       gfn = gva >> PAGE_SHIFT;
+
+       return nonpaging_map(vcpu, gva & PAGE_MASK,
+                            error_code & PFERR_WRITE_MASK, gfn);
+}
+
+static void nonpaging_free(struct kvm_vcpu *vcpu)
+{
+       mmu_free_roots(vcpu);
+}
+
+static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       context->new_cr3 = nonpaging_new_cr3;
+       context->page_fault = nonpaging_page_fault;
+       context->gva_to_gpa = nonpaging_gva_to_gpa;
+       context->free = nonpaging_free;
+       context->prefetch_page = nonpaging_prefetch_page;
+       context->root_level = 0;
+       context->shadow_root_level = PT32E_ROOT_LEVEL;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.tlb_flush;
+       kvm_x86_ops->tlb_flush(vcpu);
+}
+
+static void paging_new_cr3(struct kvm_vcpu *vcpu)
+{
+       pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+       mmu_free_roots(vcpu);
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+                             u64 addr,
+                             u32 err_code)
+{
+       kvm_inject_page_fault(vcpu, addr, err_code);
+}
+
+static void paging_free(struct kvm_vcpu *vcpu)
+{
+       nonpaging_free(vcpu);
+}
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
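
paging_tmpl.h is included twice on purpose: PTTYPE selects 32-bit or 64-bit guest pte handling and FNAME() pastes the size into every generated symbol. A minimal stand-alone sketch of the same preprocessor-template idiom, with invented names and types, might look like this (two pieces shown in one listing):

/* --- walker_tmpl.h (imaginary header, compiled once per inclusion) --- */
#if WBITS == 64
#define went_t unsigned long long
#define WNAME(name) walk64_##name
#else
#define went_t unsigned int
#define WNAME(name) walk32_##name
#endif

/* strip the low 12 page-offset bits to get a frame number */
static inline went_t WNAME(frame)(went_t pte)
{
	return pte >> 12;
}

#undef went_t
#undef WNAME
/* --- end walker_tmpl.h --- */

/* --- consumer, mirroring the pair of includes in mmu.c above --- */
#define WBITS 64
#include "walker_tmpl.h"
#undef WBITS

#define WBITS 32
#include "walker_tmpl.h"
#undef WBITS
/* both walk64_frame() and walk32_frame() now exist in this translation unit */

As with FNAME(), the size is hard-coded inside WNAME() rather than pasted from WBITS, because token pasting would suppress the macro expansion.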
+
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       ASSERT(is_pae(vcpu));
+       context->new_cr3 = paging_new_cr3;
+       context->page_fault = paging64_page_fault;
+       context->gva_to_gpa = paging64_gva_to_gpa;
+       context->prefetch_page = paging64_prefetch_page;
+       context->free = paging_free;
+       context->root_level = level;
+       context->shadow_root_level = level;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{
+       return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+}
+
+static int paging32_init_context(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       context->new_cr3 = paging_new_cr3;
+       context->page_fault = paging32_page_fault;
+       context->gva_to_gpa = paging32_gva_to_gpa;
+       context->free = paging_free;
+       context->prefetch_page = paging32_prefetch_page;
+       context->root_level = PT32_ROOT_LEVEL;
+       context->shadow_root_level = PT32E_ROOT_LEVEL;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+static int paging32E_init_context(struct kvm_vcpu *vcpu)
+{
+       return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+}
+
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       if (!is_paging(vcpu))
+               return nonpaging_init_context(vcpu);
+       else if (is_long_mode(vcpu))
+               return paging64_init_context(vcpu);
+       else if (is_pae(vcpu))
+               return paging32E_init_context(vcpu);
+       else
+               return paging32_init_context(vcpu);
+}
+
+static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+               vcpu->arch.mmu.free(vcpu);
+               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+       }
+}
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+       destroy_kvm_mmu(vcpu);
+       return init_kvm_mmu(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               goto out;
+       spin_lock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_free_some_pages(vcpu);
+       mmu_alloc_roots(vcpu);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+       kvm_mmu_flush_tlb(vcpu);
+out:
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+       mmu_free_roots(vcpu);
+}
+
+static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp,
+                                 u64 *spte)
+{
+       u64 pte;
+       struct kvm_mmu_page *child;
+
+       pte = *spte;
+       if (is_shadow_present_pte(pte)) {
+               if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+                       rmap_remove(vcpu->kvm, spte);
+               else {
+                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       mmu_page_remove_parent_pte(child, spte);
+               }
+       }
+       set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+}
+
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp,
+                                 u64 *spte,
+                                 const void *new, int bytes,
+                                 int offset_in_pte)
+{
+       if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+               ++vcpu->kvm->stat.mmu_pde_zapped;
+               return;
+       }
+
+       ++vcpu->kvm->stat.mmu_pte_updated;
+       if (sp->role.glevels == PT32_ROOT_LEVEL)
+               paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+       else
+               paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+}
+
+static bool need_remote_flush(u64 old, u64 new)
+{
+       if (!is_shadow_present_pte(old))
+               return false;
+       if (!is_shadow_present_pte(new))
+               return true;
+       if ((old ^ new) & PT64_BASE_ADDR_MASK)
+               return true;
+       old ^= PT64_NX_MASK;
+       new ^= PT64_NX_MASK;
+       return (old & ~new & PT64_PERM_MASK) != 0;
+}
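
A plain-language reading of the predicate above, for reviewers skimming the hunk:

/*
 * A remote flush is needed only when another vcpu may hold a stale, more
 * permissive translation: the old spte was present and the new one drops
 * the mapping, points at a different frame, or loses a permission bit.
 * The NX flip turns "gains NX" into "loses execute permission" so it is
 * caught by the same old & ~new test.
 */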
+
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+{
+       if (need_remote_flush(old, new))
+               kvm_flush_remote_tlbs(vcpu->kvm);
+       else
+               kvm_mmu_flush_tlb(vcpu);
+}
+
+static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+{
+       u64 *spte = vcpu->arch.last_pte_updated;
+
+       return !!(spte && (*spte & PT_ACCESSED_MASK));
+}
+
+static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                         const u8 *new, int bytes)
+{
+       gfn_t gfn;
+       int r;
+       u64 gpte = 0;
+
+       if (bytes != 4 && bytes != 8)
+               return;
+
+       /*
+        * Assume that the pte write is to a page table of the same type
+        * as the current vcpu paging mode.  This is nearly always true
+        * (it might be false while the guest is changing modes).  Note that
+        * it is verified later by update_pte().
+        */
+       if (is_pae(vcpu)) {
+               /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+               if ((bytes == 4) && (gpa % 4 == 0)) {
+                       r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
+                       if (r)
+                               return;
+                       memcpy((void *)&gpte + (gpa % 8), new, 4);
+               } else if ((bytes == 8) && (gpa % 8 == 0)) {
+                       memcpy((void *)&gpte, new, 8);
+               }
+       } else {
+               if ((bytes == 4) && (gpa % 4 == 0))
+                       memcpy((void *)&gpte, new, 4);
+       }
+       if (!is_present_pte(gpte))
+               return;
+       gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+       vcpu->arch.update_pte.gfn = gfn;
+       vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
+}
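
A concrete (made-up) instance of the PAE branch above:

/*
 * Example (invented gpa): a PAE guest writes the high half of a 64-bit gpte
 * with gpa = 0x1004, bytes = 4.  The full 8-byte gpte is read back from
 * gpa & ~7 = 0x1000 and the new 4 bytes are copied in at offset
 * gpa % 8 = 4, reconstructing the value the guest pte will hold after the
 * write so that gfn_to_page() can be called before the mmu spinlock is
 * taken in kvm_mmu_pte_write().
 */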
+
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                      const u8 *new, int bytes)
+{
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node, *n;
+       struct hlist_head *bucket;
+       unsigned index;
+       u64 entry;
+       u64 *spte;
+       unsigned offset = offset_in_page(gpa);
+       unsigned pte_size;
+       unsigned page_offset;
+       unsigned misaligned;
+       unsigned quadrant;
+       int level;
+       int flooded = 0;
+       int npte;
+
+       pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+       mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+       spin_lock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_free_some_pages(vcpu);
+       ++vcpu->kvm->stat.mmu_pte_write;
+       kvm_mmu_audit(vcpu, "pre pte write");
+       if (gfn == vcpu->arch.last_pt_write_gfn
+           && !last_updated_pte_accessed(vcpu)) {
+               ++vcpu->arch.last_pt_write_count;
+               if (vcpu->arch.last_pt_write_count >= 3)
+                       flooded = 1;
+       } else {
+               vcpu->arch.last_pt_write_gfn = gfn;
+               vcpu->arch.last_pt_write_count = 1;
+               vcpu->arch.last_pte_updated = NULL;
+       }
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
+               if (sp->gfn != gfn || sp->role.metaphysical)
+                       continue;
+               pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+               misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+               misaligned |= bytes < 4;
+               if (misaligned || flooded) {
+                       /*
+                        * Misaligned accesses are too much trouble to fix
+                        * up; also, they usually indicate a page is not used
+                        * as a page table.
+                        *
+                        * If we're seeing too many writes to a page,
+                        * it may no longer be a page table, or we may be
+                        * forking, in which case it is better to unmap the
+                        * page.
+                        */
+                       pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+                                gpa, bytes, sp->role.word);
+                       kvm_mmu_zap_page(vcpu->kvm, sp);
+                       ++vcpu->kvm->stat.mmu_flooded;
+                       continue;
+               }
+               page_offset = offset;
+               level = sp->role.level;
+               npte = 1;
+               if (sp->role.glevels == PT32_ROOT_LEVEL) {
+                       page_offset <<= 1;      /* 32->64 */
+                       /*
+                        * A 32-bit pde maps 4MB while the shadow pdes map
+                        * only 2MB.  So we need to double the offset again
+                        * and zap two pdes instead of one.
+                        */
+                       if (level == PT32_ROOT_LEVEL) {
+                               page_offset &= ~7; /* kill rounding error */
+                               page_offset <<= 1;
+                               npte = 2;
+                       }
+                       quadrant = page_offset >> PAGE_SHIFT;
+                       page_offset &= ~PAGE_MASK;
+                       if (quadrant != sp->role.quadrant)
+                               continue;
+               }
+               spte = &sp->spt[page_offset / sizeof(*spte)];
+               while (npte--) {
+                       entry = *spte;
+                       mmu_pte_write_zap_pte(vcpu, sp, spte);
+                       mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
+                                             page_offset & (pte_size - 1));
+                       mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+                       ++spte;
+               }
+       }
+       kvm_mmu_audit(vcpu, "post pte write");
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       if (vcpu->arch.update_pte.page) {
+               kvm_release_page_clean(vcpu->arch.update_pte.page);
+               vcpu->arch.update_pte.page = NULL;
+       }
+}
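
A worked example of the 32-bit offset scaling above (offsets invented):

/*
 * Example: a 32-bit guest writes the pte at byte offset 0x14 of one of its
 * page tables (guest index 0x14 / 4 = 5).  Shadow ptes are 8 bytes, so
 * page_offset is doubled to 0x28 and the update hits sp->spt[0x28 / 8],
 * i.e. the same index 5.  For a guest pde the offset is doubled once more
 * and npte = 2, since one 4MB guest pde is shadowed by two 2MB shadow pdes.
 */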
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa;
+       int r;
+
+       down_read(&current->mm->mmap_sem);
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+       up_read(&current->mm->mmap_sem);
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+       r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       return r;
+}
+
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+       while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+               struct kvm_mmu_page *sp;
+
+               sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
+                                 struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu->kvm, sp);
+               ++vcpu->kvm->stat.mmu_recycled;
+       }
+}
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+{
+       int r;
+       enum emulation_result er;
+
+       r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+       if (r < 0)
+               goto out;
+
+       if (!r) {
+               r = 1;
+               goto out;
+       }
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               goto out;
+
+       er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+
+       switch (er) {
+       case EMULATE_DONE:
+               return 1;
+       case EMULATE_DO_MMIO:
+               ++vcpu->stat.mmio_exits;
+               return 0;
+       case EMULATE_FAIL:
+               kvm_report_emulation_failure(vcpu, "pagetable");
+               return 1;
+       default:
+               BUG();
+       }
+out:
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
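
A note on the return convention, since the "!r -> r = 1" dance above is easy to misread:

/*
 * Return contract for the arch #PF intercepts: < 0 is an internal error to
 * be propagated, 1 means the fault was fixed (or the faulting instruction
 * emulated) and the guest can be resumed, 0 means an mmio access that must
 * be completed in userspace.
 */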
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu_page *sp;
+
+       while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
+               sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
+                                 struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu->kvm, sp);
+       }
+       free_page((unsigned long)vcpu->arch.mmu.pae_root);
+}
+
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+{
+       struct page *page;
+       int i;
+
+       ASSERT(vcpu);
+
+       if (vcpu->kvm->arch.n_requested_mmu_pages)
+               vcpu->kvm->arch.n_free_mmu_pages =
+                                       vcpu->kvm->arch.n_requested_mmu_pages;
+       else
+               vcpu->kvm->arch.n_free_mmu_pages =
+                                       vcpu->kvm->arch.n_alloc_mmu_pages;
+       /*
+        * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+        * Therefore we need to allocate shadow page tables in the first
+        * 4GB of memory, which happens to fit the DMA32 zone.
+        */
+       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+       if (!page)
+               goto error_1;
+       vcpu->arch.mmu.pae_root = page_address(page);
+       for (i = 0; i < 4; ++i)
+               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+       return 0;
+
+error_1:
+       free_mmu_pages(vcpu);
+       return -ENOMEM;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       return alloc_mmu_pages(vcpu);
+}
+
+int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       return init_kvm_mmu(vcpu);
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+
+       destroy_kvm_mmu(vcpu);
+       free_mmu_pages(vcpu);
+       mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+{
+       struct kvm_mmu_page *sp;
+
+       list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
+               int i;
+               u64 *pt;
+
+               if (!test_bit(slot, &sp->slot_bitmap))
+                       continue;
+
+               pt = sp->spt;
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+                       /* avoid RMW */
+                       if (pt[i] & PT_WRITABLE_MASK)
+                               pt[i] &= ~PT_WRITABLE_MASK;
+       }
+}
+
+void kvm_mmu_zap_all(struct kvm *kvm)
+{
+       struct kvm_mmu_page *sp, *node;
+
+       spin_lock(&kvm->mmu_lock);
+       list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+               kvm_mmu_zap_page(kvm, sp);
+       spin_unlock(&kvm->mmu_lock);
+
+       kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_mmu_module_exit(void)
+{
+       if (pte_chain_cache)
+               kmem_cache_destroy(pte_chain_cache);
+       if (rmap_desc_cache)
+               kmem_cache_destroy(rmap_desc_cache);
+       if (mmu_page_header_cache)
+               kmem_cache_destroy(mmu_page_header_cache);
+}
+
+int kvm_mmu_module_init(void)
+{
+       pte_chain_cache = kmem_cache_create("kvm_pte_chain",
+                                           sizeof(struct kvm_pte_chain),
+                                           0, 0, NULL);
+       if (!pte_chain_cache)
+               goto nomem;
+       rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
+                                           sizeof(struct kvm_rmap_desc),
+                                           0, 0, NULL);
+       if (!rmap_desc_cache)
+               goto nomem;
+
+       mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+                                                 sizeof(struct kvm_mmu_page),
+                                                 0, 0, NULL);
+       if (!mmu_page_header_cache)
+               goto nomem;
+
+       return 0;
+
+nomem:
+       kvm_mmu_module_exit();
+       return -ENOMEM;
+}
+
+/*
+ * Calculate the number of mmu pages needed for kvm.
+ */
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+       int i;
+       unsigned int nr_mmu_pages;
+       unsigned int  nr_pages = 0;
+
+       for (i = 0; i < kvm->nmemslots; i++)
+               nr_pages += kvm->memslots[i].npages;
+
+       nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+       nr_mmu_pages = max(nr_mmu_pages,
+                       (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+
+       return nr_mmu_pages;
+}
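
A worked instance of the sizing formula; KVM_PERMILLE_MMU_PAGES = 20 and KVM_MIN_ALLOC_MMU_PAGES = 64 are assumed here for illustration only.

/*
 * Example (constants assumed as above): a 512MB guest has 131072 4K pages,
 * so nr_mmu_pages = 131072 * 20 / 1000 = 2621.  An 8MB guest would compute
 * 2048 * 20 / 1000 = 40 and be raised to the 64-page floor.
 */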
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+       gva = (long long)(gva << 16) >> 16;
+#endif
+       return gva;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+                               gva_t va, int level)
+{
+       u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+       int i;
+       gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+               u64 ent = pt[i];
+
+               if (ent == shadow_trap_nonpresent_pte)
+                       continue;
+
+               va = canonicalize(va);
+               if (level > 1) {
+                       if (ent == shadow_notrap_nonpresent_pte)
+                               printk(KERN_ERR "audit: (%s) nontrapping pte"
+                                      " in nonleaf level: levels %d gva %lx"
+                                      " level %d pte %llx\n", audit_msg,
+                                      vcpu->arch.mmu.root_level, va, level, ent);
+
+                       audit_mappings_page(vcpu, ent, va, level - 1);
+               } else {
+                       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+                       struct page *page = gpa_to_page(vcpu, gpa);
+                       hpa_t hpa = page_to_phys(page);
+
+                       if (is_shadow_present_pte(ent)
+                           && (ent & PT64_BASE_ADDR_MASK) != hpa)
+                               printk(KERN_ERR "xx audit error: (%s) levels %d"
+                                      " gva %lx gpa %llx hpa %llx ent %llx %d\n",
+                                      audit_msg, vcpu->arch.mmu.root_level,
+                                      va, gpa, hpa, ent,
+                                      is_shadow_present_pte(ent));
+                       else if (ent == shadow_notrap_nonpresent_pte
+                                && !is_error_hpa(hpa))
+                               printk(KERN_ERR "audit: (%s) notrap shadow,"
+                                      " valid guest gva %lx\n", audit_msg, va);
+                       kvm_release_page_clean(page);
+
+               }
+       }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+       unsigned i;
+
+       if (vcpu->arch.mmu.root_level == 4)
+               audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+       else
+               for (i = 0; i < 4; ++i)
+                       if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+                               audit_mappings_page(vcpu,
+                                                   vcpu->arch.mmu.pae_root[i],
+                                                   i << 30,
+                                                   2);
+}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+       int nmaps = 0;
+       int i, j, k;
+
+       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+               struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+               struct kvm_rmap_desc *d;
+
+               for (j = 0; j < m->npages; ++j) {
+                       unsigned long *rmapp = &m->rmap[j];
+
+                       if (!*rmapp)
+                               continue;
+                       if (!(*rmapp & 1)) {
+                               ++nmaps;
+                               continue;
+                       }
+                       d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+                       while (d) {
+                               for (k = 0; k < RMAP_EXT; ++k)
+                                       if (d->shadow_ptes[k])
+                                               ++nmaps;
+                                       else
+                                               break;
+                               d = d->more;
+                       }
+               }
+       }
+       return nmaps;
+}
+
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+       int nmaps = 0;
+       struct kvm_mmu_page *sp;
+       int i;
+
+       list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+               u64 *pt = sp->spt;
+
+               if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+                       continue;
+
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                       u64 ent = pt[i];
+
+                       if (!(ent & PT_PRESENT_MASK))
+                               continue;
+                       if (!(ent & PT_WRITABLE_MASK))
+                               continue;
+                       ++nmaps;
+               }
+       }
+       return nmaps;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+       int n_rmap = count_rmaps(vcpu);
+       int n_actual = count_writable_mappings(vcpu);
+
+       if (n_rmap != n_actual)
+               printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+                      __FUNCTION__, audit_msg, n_rmap, n_actual);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu_page *sp;
+       struct kvm_memory_slot *slot;
+       unsigned long *rmapp;
+       gfn_t gfn;
+
+       list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+               if (sp->role.metaphysical)
+                       continue;
+
+               slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+               gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+               rmapp = &slot->rmap[gfn - slot->base_gfn];
+               if (*rmapp)
+                       printk(KERN_ERR "%s: (%s) shadow page has writable"
+                              " mappings: gfn %lx role %x\n",
+                              __FUNCTION__, audit_msg, sp->gfn,
+                              sp->role.word);
+       }
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+       int olddbg = dbg;
+
+       dbg = 0;
+       audit_msg = msg;
+       audit_rmap(vcpu);
+       audit_write_protection(vcpu);
+       audit_mappings(vcpu);
+       dbg = olddbg;
+}
+
+#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644 (file)
index 0000000..1fce19e
--- /dev/null
@@ -0,0 +1,44 @@
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+               __kvm_mmu_free_some_pages(vcpu);
+}
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+       if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+               return 0;
+
+       return kvm_mmu_load(vcpu);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+       return vcpu->arch.shadow_efer & EFER_LME;
+#else
+       return 0;
+#endif
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr4 & X86_CR4_PAE;
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr4 & X86_CR4_PSE;
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr0 & X86_CR0_PG;
+}
+
+#endif
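
A rough sketch of how these inlines are meant to be used; the caller below is hypothetical (the real call site presumably sits in the x86.c vcpu-run path).

/* Hypothetical caller, for illustration only */
static int enter_guest_sketch(struct kvm_vcpu *vcpu)
{
	int r;

	r = kvm_mmu_reload(vcpu);	/* no-op while root_hpa is still valid */
	if (r)
		return r;		/* shadow roots could not be rebuilt */

	/* ... arch code launches the guest here ... */
	return 0;
}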
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644 (file)
index 0000000..03ba860
--- /dev/null
@@ -0,0 +1,484 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ */
+
+#if PTTYPE == 64
+       #define pt_element_t u64
+       #define guest_walker guest_walker64
+       #define FNAME(name) paging##64_##name
+       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+       #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+       #define PT_LEVEL_BITS PT64_LEVEL_BITS
+       #ifdef CONFIG_X86_64
+       #define PT_MAX_FULL_LEVELS 4
+       #define CMPXCHG cmpxchg
+       #else
+       #define CMPXCHG cmpxchg64
+       #define PT_MAX_FULL_LEVELS 2
+       #endif
+#elif PTTYPE == 32
+       #define pt_element_t u32
+       #define guest_walker guest_walker32
+       #define FNAME(name) paging##32_##name
+       #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+       #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+       #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+       #define PT_LEVEL_BITS PT32_LEVEL_BITS
+       #define PT_MAX_FULL_LEVELS 2
+       #define CMPXCHG cmpxchg
+#else
+       #error Invalid PTTYPE value
+#endif
+
+#define gpte_to_gfn FNAME(gpte_to_gfn)
+#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+       int level;
+       gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+       pt_element_t ptes[PT_MAX_FULL_LEVELS];
+       gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+       unsigned pt_access;
+       unsigned pte_access;
+       gfn_t gfn;
+       u32 error_code;
+};
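
A worked walk for the 32-bit instantiation, with an arbitrary address, may help when reading walk_addr() below:

/*
 * Example (PTTYPE == 32, addr = 0x12345678):
 *   level 2: index = addr >> 22           = 0x048  (guest page directory)
 *   level 1: index = (addr >> 12) & 0x3ff = 0x345  (guest page table)
 * walker->gfn ends up as the frame the final pte points at; the low 12
 * bits of addr (0x678) are added back by the caller, e.g. gva_to_gpa().
 */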
+
+static gfn_t gpte_to_gfn(pt_element_t gpte)
+{
+       return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
+{
+       return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+                        gfn_t table_gfn, unsigned index,
+                        pt_element_t orig_pte, pt_element_t new_pte)
+{
+       pt_element_t ret;
+       pt_element_t *table;
+       struct page *page;
+
+       page = gfn_to_page(kvm, table_gfn);
+       table = kmap_atomic(page, KM_USER0);
+
+       ret = CMPXCHG(&table[index], orig_pte, new_pte);
+
+       kunmap_atomic(table, KM_USER0);
+
+       kvm_release_page_dirty(page);
+
+       return (ret != orig_pte);
+}
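
Worth noting for reviewers: this helper is what lets walk_addr() update guest pte bits atomically against a concurrent guest write.

/*
 * Used below to set PT_ACCESSED_MASK / PT_DIRTY_MASK in the guest pte; a
 * non-zero return means the guest changed the pte under us, and the walk
 * is restarted from the top ("goto walk").
 */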
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+{
+       unsigned access;
+
+       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+#if PTTYPE == 64
+       if (is_nx(vcpu))
+               access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+       return access;
+}
+
+/*
+ * Fetch a guest pte for a guest virtual address
+ */
+static int FNAME(walk_addr)(struct guest_walker *walker,
+                           struct kvm_vcpu *vcpu, gva_t addr,
+                           int write_fault, int user_fault, int fetch_fault)
+{
+       pt_element_t pte;
+       gfn_t table_gfn;
+       unsigned index, pt_access, pte_access;
+       gpa_t pte_gpa;
+
+       pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+walk:
+       walker->level = vcpu->arch.mmu.root_level;
+       pte = vcpu->arch.cr3;
+#if PTTYPE == 64
+       if (!is_long_mode(vcpu)) {
+               pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
+               if (!is_present_pte(pte))
+                       goto not_present;
+               --walker->level;
+       }
+#endif
+       ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
+              (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+
+       pt_access = ACC_ALL;
+
+       for (;;) {
+               index = PT_INDEX(addr, walker->level);
+
+               table_gfn = gpte_to_gfn(pte);
+               pte_gpa = gfn_to_gpa(table_gfn);
+               pte_gpa += index * sizeof(pt_element_t);
+               walker->table_gfn[walker->level - 1] = table_gfn;
+               walker->pte_gpa[walker->level - 1] = pte_gpa;
+               pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+                        walker->level - 1, table_gfn);
+
+               kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+
+               if (!is_present_pte(pte))
+                       goto not_present;
+
+               if (write_fault && !is_writeble_pte(pte))
+                       if (user_fault || is_write_protection(vcpu))
+                               goto access_error;
+
+               if (user_fault && !(pte & PT_USER_MASK))
+                       goto access_error;
+
+#if PTTYPE == 64
+               if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+                       goto access_error;
+#endif
+
+               if (!(pte & PT_ACCESSED_MASK)) {
+                       mark_page_dirty(vcpu->kvm, table_gfn);
+                       if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+                           index, pte, pte|PT_ACCESSED_MASK))
+                               goto walk;
+                       pte |= PT_ACCESSED_MASK;
+               }
+
+               pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
+
+               walker->ptes[walker->level - 1] = pte;
+
+               if (walker->level == PT_PAGE_TABLE_LEVEL) {
+                       walker->gfn = gpte_to_gfn(pte);
+                       break;
+               }
+
+               if (walker->level == PT_DIRECTORY_LEVEL
+                   && (pte & PT_PAGE_SIZE_MASK)
+                   && (PTTYPE == 64 || is_pse(vcpu))) {
+                       walker->gfn = gpte_to_gfn_pde(pte);
+                       walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+                       if (PTTYPE == 32 && is_cpuid_PSE36())
+                               walker->gfn += pse36_gfn_delta(pte);
+                       break;
+               }
+
+               pt_access = pte_access;
+               --walker->level;
+       }
+
+       if (write_fault && !is_dirty_pte(pte)) {
+               bool ret;
+
+               mark_page_dirty(vcpu->kvm, table_gfn);
+               ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+                           pte|PT_DIRTY_MASK);
+               if (ret)
+                       goto walk;
+               pte |= PT_DIRTY_MASK;
+               kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+               walker->ptes[walker->level - 1] = pte;
+       }
+
+       walker->pt_access = pt_access;
+       walker->pte_access = pte_access;
+       pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+                __FUNCTION__, (u64)pte, pt_access, pte_access);
+       return 1;
+
+not_present:
+       walker->error_code = 0;
+       goto err;
+
+access_error:
+       walker->error_code = PFERR_PRESENT_MASK;
+
+err:
+       if (write_fault)
+               walker->error_code |= PFERR_WRITE_MASK;
+       if (user_fault)
+               walker->error_code |= PFERR_USER_MASK;
+       if (fetch_fault)
+               walker->error_code |= PFERR_FETCH_MASK;
+       return 0;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+                             u64 *spte, const void *pte, int bytes,
+                             int offset_in_pte)
+{
+       pt_element_t gpte;
+       unsigned pte_access;
+       struct page *npage;
+
+       gpte = *(const pt_element_t *)pte;
+       if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
+               if (!offset_in_pte && !is_present_pte(gpte))
+                       set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+               return;
+       }
+       if (bytes < sizeof(pt_element_t))
+               return;
+       pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+       pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+       if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
+               return;
+       npage = vcpu->arch.update_pte.page;
+       if (!npage)
+               return;
+       get_page(npage);
+       mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
+                    gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ */
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+                        struct guest_walker *walker,
+                        int user_fault, int write_fault, int *ptwrite,
+                        struct page *page)
+{
+       hpa_t shadow_addr;
+       int level;
+       u64 *shadow_ent;
+       unsigned access = walker->pt_access;
+
+       if (!is_present_pte(walker->ptes[walker->level - 1]))
+               return NULL;
+
+       shadow_addr = vcpu->arch.mmu.root_hpa;
+       level = vcpu->arch.mmu.shadow_root_level;
+       if (level == PT32E_ROOT_LEVEL) {
+               shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+               shadow_addr &= PT64_BASE_ADDR_MASK;
+               --level;
+       }
+
+       for (; ; level--) {
+               u32 index = SHADOW_PT_INDEX(addr, level);
+               struct kvm_mmu_page *shadow_page;
+               u64 shadow_pte;
+               int metaphysical;
+               gfn_t table_gfn;
+               bool new_page = 0;
+
+               shadow_ent = ((u64 *)__va(shadow_addr)) + index;
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
+               if (is_shadow_present_pte(*shadow_ent)) {
+                       shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
+                       continue;
+               }
+
+               if (level - 1 == PT_PAGE_TABLE_LEVEL
+                   && walker->level == PT_DIRECTORY_LEVEL) {
+                       metaphysical = 1;
+                       if (!is_dirty_pte(walker->ptes[level - 1]))
+                               access &= ~ACC_WRITE_MASK;
+                       table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
+               } else {
+                       metaphysical = 0;
+                       table_gfn = walker->table_gfn[level - 2];
+               }
+               shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+                                              metaphysical, access,
+                                              shadow_ent, &new_page);
+               if (new_page && !metaphysical) {
+                       int r;
+                       pt_element_t curr_pte;
+                       r = kvm_read_guest_atomic(vcpu->kvm,
+                                                 walker->pte_gpa[level - 2],
+                                                 &curr_pte, sizeof(curr_pte));
+                       if (r || curr_pte != walker->ptes[level - 2]) {
+                               kvm_release_page_clean(page);
+                               return NULL;
+                       }
+               }
+               shadow_addr = __pa(shadow_page->spt);
+               shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
+                       | PT_WRITABLE_MASK | PT_USER_MASK;
+               *shadow_ent = shadow_pte;
+       }
+
+       mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
+                    user_fault, write_fault,
+                    walker->ptes[walker->level-1] & PT_DIRTY_MASK,
+                    ptwrite, walker->gfn, page);
+
+       return shadow_ent;
+}
+
+/*
+ * Page fault handler.  There are several causes for a page fault:
+ *   - there is no shadow pte for the guest pte
+ *   - write access through a shadow pte marked read only so that we can set
+ *     the dirty bit
+ *   - write access to a shadow pte marked read only so we can update the page
+ *     dirty bitmap, when userspace requests it
+ *   - mmio access; in this case we will never install a present shadow pte
+ *   - normal guest page fault due to the guest pte marked not present, not
+ *     writable, or not executable
+ *
+ *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ *           a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
+                              u32 error_code)
+{
+       int write_fault = error_code & PFERR_WRITE_MASK;
+       int user_fault = error_code & PFERR_USER_MASK;
+       int fetch_fault = error_code & PFERR_FETCH_MASK;
+       struct guest_walker walker;
+       u64 *shadow_pte;
+       int write_pt = 0;
+       int r;
+       struct page *page;
+
+       pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+       kvm_mmu_audit(vcpu, "pre page fault");
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       down_read(&current->mm->mmap_sem);
+       /*
+        * Look up the shadow pte for the faulting address.
+        */
+       r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
+                            fetch_fault);
+
+       /*
+        * The page is not mapped by the guest.  Let the guest handle it.
+        */
+       if (!r) {
+               pgprintk("%s: guest page fault\n", __FUNCTION__);
+               inject_page_fault(vcpu, addr, walker.error_code);
+               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+               up_read(&current->mm->mmap_sem);
+               return 0;
+       }
+
+       page = gfn_to_page(vcpu->kvm, walker.gfn);
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_free_some_pages(vcpu);
+       shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+                                 &write_pt, page);
+       pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+                shadow_pte, *shadow_pte, write_pt);
+
+       if (!write_pt)
+               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+
+       /*
+        * mmio: emulate if accessible, otherwise it's a guest fault.
+        */
+       if (shadow_pte && is_io_pte(*shadow_pte)) {
+               spin_unlock(&vcpu->kvm->mmu_lock);
+               up_read(&current->mm->mmap_sem);
+               return 1;
+       }
+
+       ++vcpu->stat.pf_fixed;
+       kvm_mmu_audit(vcpu, "post page fault (fixed)");
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       up_read(&current->mm->mmap_sem);
+
+       return write_pt;
+}
+
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+       struct guest_walker walker;
+       gpa_t gpa = UNMAPPED_GVA;
+       int r;
+
+       r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
+
+       if (r) {
+               gpa = gfn_to_gpa(walker.gfn);
+               gpa |= vaddr & ~PAGE_MASK;
+       }
+
+       return gpa;
+}
+
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+                                struct kvm_mmu_page *sp)
+{
+       int i, offset = 0, r = 0;
+       pt_element_t pt;
+
+       if (sp->role.metaphysical
+           || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
+               nonpaging_prefetch_page(vcpu, sp);
+               return;
+       }
+
+       if (PTTYPE == 32)
+               offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+               gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
+               pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+               r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
+                                         sizeof(pt_element_t));
+               if (r || is_present_pte(pt))
+                       sp->spt[i] = shadow_trap_nonpresent_pte;
+               else
+                       sp->spt[i] = shadow_notrap_nonpresent_pte;
+       }
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef SHADOW_PT_INDEX
+#undef PT_LEVEL_MASK
+#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_pde
+#undef CMPXCHG
similarity index 53%
rename from drivers/kvm/segment_descriptor.h
rename to arch/x86/kvm/segment_descriptor.h
index 71fdf458619a001092c9bba6ddaf4338b482372a..56fc4c8733894db1554e6c81ac5ea321f1228e4b 100644 (file)
@@ -1,3 +1,6 @@
+#ifndef __SEGMENT_DESCRIPTOR_H
+#define __SEGMENT_DESCRIPTOR_H
+
 struct segment_descriptor {
        u16 limit_low;
        u16 base_low;
@@ -14,4 +17,13 @@ struct segment_descriptor {
        u8  base_high;
 } __attribute__((packed));
 
+#ifdef CONFIG_X86_64
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct segment_descriptor_64 {
+       struct segment_descriptor s;
+       u32 base_higher;
+       u32 pad_zero;
+};
 
+#endif
+#endif
similarity index 84%
rename from drivers/kvm/svm.c
rename to arch/x86/kvm/svm.c
index ced4ac1955db56630f63e37f7feacbba3ef83396..de755cb1431dcef84617b04e29eacb5a06fc59d0 100644 (file)
  * the COPYING file in the top-level directory.
  *
  */
+#include <linux/kvm_host.h>
 
 #include "kvm_svm.h"
-#include "x86_emulate.h"
 #include "irq.h"
+#include "mmu.h"
 
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -42,9 +43,6 @@ MODULE_LICENSE("GPL");
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
-#define KVM_EFER_LMA (1 << 10)
-#define KVM_EFER_LME (1 << 8)
-
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_DEATURE_SVML (1 << 2)
@@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat)
 
 static inline u8 pop_irq(struct kvm_vcpu *vcpu)
 {
-       int word_index = __ffs(vcpu->irq_summary);
-       int bit_index = __ffs(vcpu->irq_pending[word_index]);
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
        int irq = word_index * BITS_PER_LONG + bit_index;
 
-       clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-       if (!vcpu->irq_pending[word_index])
-               clear_bit(word_index, &vcpu->irq_summary);
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
        return irq;
 }
 
 static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
 {
-       set_bit(irq, vcpu->irq_pending);
-       set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+       set_bit(irq, vcpu->arch.irq_pending);
+       set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
 }
 
 static inline void clgi(void)
@@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-       if (!(efer & KVM_EFER_LMA))
-               efer &= ~KVM_EFER_LME;
+       if (!(efer & EFER_LMA))
+               efer &= ~EFER_LME;
 
        to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
-       vcpu->shadow_efer = efer;
+       vcpu->arch.shadow_efer = efer;
 }
 
-static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+                               bool has_error_code, u32 error_code)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->control.event_inj =          SVM_EVTINJ_VALID |
-                                               SVM_EVTINJ_VALID_ERR |
-                                               SVM_EVTINJ_TYPE_EXEPT |
-                                               GP_VECTOR;
+       svm->vmcb->control.event_inj = nr
+               | SVM_EVTINJ_VALID
+               | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+               | SVM_EVTINJ_TYPE_EXEPT;
        svm->vmcb->control.event_inj_err = error_code;
 }
 
-static void inject_ud(struct kvm_vcpu *vcpu)
+static bool svm_exception_injected(struct kvm_vcpu *vcpu)
 {
-       to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
-                                               SVM_EVTINJ_TYPE_EXEPT |
-                                               UD_VECTOR;
-}
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-static int is_page_fault(uint32_t info)
-{
-       info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
-       return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
+       return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
 }
 
 static int is_external_interrupt(u32 info)
@@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
                printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
                return;
        }
-       if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
+       if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
                printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
                       __FUNCTION__,
                       svm->vmcb->save.rip,
                       svm->next_rip);
-       }
 
-       vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
+       vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
        svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 
-       vcpu->interrupt_window_open = 1;
+       vcpu->arch.interrupt_window_open = 1;
 }
 
 static int has_svm(void)
@@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage)
        svm_data->next_asid = svm_data->max_asid + 1;
        svm_features = cpuid_edx(SVM_CPUID_FUNC);
 
-       asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
+       asm volatile ("sgdt %0" : "=m"(gdt_descr));
        gdt = (struct desc_struct *)gdt_descr.address;
        svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
@@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb)
 
        control->intercept_cr_read =    INTERCEPT_CR0_MASK |
                                        INTERCEPT_CR3_MASK |
-                                       INTERCEPT_CR4_MASK;
+                                       INTERCEPT_CR4_MASK |
+                                       INTERCEPT_CR8_MASK;
 
        control->intercept_cr_write =   INTERCEPT_CR0_MASK |
                                        INTERCEPT_CR3_MASK |
-                                       INTERCEPT_CR4_MASK;
+                                       INTERCEPT_CR4_MASK |
+                                       INTERCEPT_CR8_MASK;
 
        control->intercept_dr_read =    INTERCEPT_DR0_MASK |
                                        INTERCEPT_DR1_MASK |
@@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb)
                                        INTERCEPT_DR5_MASK |
                                        INTERCEPT_DR7_MASK;
 
-       control->intercept_exceptions = 1 << PF_VECTOR;
+       control->intercept_exceptions = (1 << PF_VECTOR) |
+                                       (1 << UD_VECTOR);
 
 
        control->intercept =    (1ULL << INTERCEPT_INTR) |
@@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb)
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
        save->efer = MSR_EFER_SVME_MASK;
-
-        save->dr6 = 0xffff0ff0;
+       save->dr6 = 0xffff0ff0;
        save->dr7 = 0x400;
        save->rflags = 2;
        save->rip = 0x0000fff0;
@@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb)
        /* rdx = ?? */
 }
 
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
 
        if (vcpu->vcpu_id != 0) {
                svm->vmcb->save.rip = 0;
-               svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
-               svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
+               svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
+               svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
        }
+
+       return 0;
 }
 
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
        if (err)
                goto free_svm;
 
-       if (irqchip_in_kernel(kvm)) {
-               err = kvm_create_lapic(&svm->vcpu);
-               if (err < 0)
-                       goto free_svm;
-       }
-
        page = alloc_page(GFP_KERNEL);
        if (!page) {
                err = -ENOMEM;
@@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
        fx_init(&svm->vcpu);
        svm->vcpu.fpu_active = 1;
-       svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+       svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
        if (svm->vcpu.vcpu_id == 0)
-               svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
+               svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
        return &svm->vcpu;
 
@@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 * increasing TSC.
                 */
                rdtscll(tsc_this);
-               delta = vcpu->host_tsc - tsc_this;
+               delta = vcpu->arch.host_tsc - tsc_this;
                svm->vmcb->control.tsc_offset += delta;
                vcpu->cpu = cpu;
                kvm_migrate_apic_timer(vcpu);
@@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        int i;
 
+       ++vcpu->stat.host_state_reload;
        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 
-       rdtscll(vcpu->host_tsc);
-       kvm_put_guest_fpu(vcpu);
+       rdtscll(vcpu->arch.host_tsc);
 }
 
 static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-       vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-       vcpu->rip = svm->vmcb->save.rip;
+       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+       vcpu->arch.rip = svm->vmcb->save.rip;
 }
 
 static void svm_decache_regs(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
-       svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
-       svm->vmcb->save.rip = vcpu->rip;
+       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+       svm->vmcb->save.rip = vcpu->arch.rip;
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        struct vcpu_svm *svm = to_svm(vcpu);
 
 #ifdef CONFIG_X86_64
-       if (vcpu->shadow_efer & KVM_EFER_LME) {
+       if (vcpu->arch.shadow_efer & EFER_LME) {
                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-                       vcpu->shadow_efer |= KVM_EFER_LMA;
-                       svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
+                       vcpu->arch.shadow_efer |= EFER_LMA;
+                       svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
                }
 
-               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
-                       vcpu->shadow_efer &= ~KVM_EFER_LMA;
-                       svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
+               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+                       vcpu->arch.shadow_efer &= ~EFER_LMA;
+                       svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
                }
        }
 #endif
-       if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
+       if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
                svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
                vcpu->fpu_active = 1;
        }
 
-       vcpu->cr0 = cr0;
+       vcpu->arch.cr0 = cr0;
        cr0 |= X86_CR0_PG | X86_CR0_WP;
        cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
        svm->vmcb->save.cr0 = cr0;
@@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       vcpu->cr4 = cr4;
+       vcpu->arch.cr4 = cr4;
        to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
 }
 
@@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
                svm->db_regs[dr] = value;
                return;
        case 4 ... 5:
-               if (vcpu->cr4 & X86_CR4_DE) {
+               if (vcpu->arch.cr4 & X86_CR4_DE) {
                        *exception = UD_VECTOR;
                        return;
                }
@@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        struct kvm *kvm = svm->vcpu.kvm;
        u64 fault_address;
        u32 error_code;
-       enum emulation_result er;
-       int r;
 
        if (!irqchip_in_kernel(kvm) &&
                is_external_interrupt(exit_int_info))
                push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
 
-       mutex_lock(&kvm->lock);
-
        fault_address  = svm->vmcb->control.exit_info_2;
        error_code = svm->vmcb->control.exit_info_1;
-       r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-       if (r < 0) {
-               mutex_unlock(&kvm->lock);
-               return r;
-       }
-       if (!r) {
-               mutex_unlock(&kvm->lock);
-               return 1;
-       }
-       er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
-                                error_code);
-       mutex_unlock(&kvm->lock);
+       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+}
 
-       switch (er) {
-       case EMULATE_DONE:
-               return 1;
-       case EMULATE_DO_MMIO:
-               ++svm->vcpu.stat.mmio_exits;
-               return 0;
-       case EMULATE_FAIL:
-               kvm_report_emulation_failure(&svm->vcpu, "pagetable");
-               break;
-       default:
-               BUG();
-       }
+static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       int er;
 
-       kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-       return 0;
+       er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+       if (er != EMULATE_DONE)
+               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       return 1;
 }
 
 static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
        svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
-       if (!(svm->vcpu.cr0 & X86_CR0_TS))
+       if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
                svm->vmcb->save.cr0 &= ~X86_CR0_TS;
        svm->vcpu.fpu_active = 1;
 
@@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
 static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       u32 io_info = svm->vmcb->control.exit_info_1; //address size bug?
+       u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, down, in, string, rep;
        unsigned port;
 
@@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
 
        if (string) {
-               if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+               if (emulate_instruction(&svm->vcpu,
+                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
                        return 0;
                return 1;
        }
@@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
        svm->next_rip = svm->vmcb->save.rip + 3;
        skip_emulated_instruction(&svm->vcpu);
-       return kvm_hypercall(&svm->vcpu, kvm_run);
+       kvm_emulate_hypercall(&svm->vcpu);
+       return 1;
 }
 
 static int invalid_op_interception(struct vcpu_svm *svm,
                                   struct kvm_run *kvm_run)
 {
-       inject_ud(&svm->vcpu);
+       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
        return 1;
 }
 
@@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 static int emulate_on_interception(struct vcpu_svm *svm,
                                   struct kvm_run *kvm_run)
 {
-       if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
+       if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
                pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
        return 1;
 }
 
+static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+       if (irqchip_in_kernel(svm->vcpu.kvm))
+               return 1;
+       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+       return 0;
+}
+
 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 
 static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
+       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
        u64 data;
 
        if (svm_get_msr(&svm->vcpu, ecx, &data))
-               svm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(&svm->vcpu, 0);
        else {
                svm->vmcb->save.rax = data & 0xffffffff;
-               svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
+               svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
                svm->next_rip = svm->vmcb->save.rip + 2;
                skip_emulated_instruction(&svm->vcpu);
        }
@@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
        case MSR_IA32_SYSENTER_ESP:
                svm->vmcb->save.sysenter_esp = data;
                break;
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               /*
+                * only support writing 0 to the performance counters for now
+                * to make Windows happy. Should be replaced by a real
+                * performance counter emulation later.
+                */
+               if (data != 0)
+                       goto unhandled;
+               break;
        default:
+       unhandled:
                return kvm_set_msr_common(vcpu, ecx, data);
        }
        return 0;
@@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 
 static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
+       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
        u64 data = (svm->vmcb->save.rax & -1u)
-               | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
+               | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
        svm->next_rip = svm->vmcb->save.rip + 2;
        if (svm_set_msr(&svm->vcpu, ecx, data))
-               svm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(&svm->vcpu, 0);
        else
                skip_emulated_instruction(&svm->vcpu);
        return 1;
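
wrmsr_interception() above reassembles the 64-bit MSR payload from the guest's EAX/EDX halves before handing it to svm_set_msr(); the "& -1u" merely truncates to 32 bits. Schematically (illustration only, hypothetical helper name):

/* Illustration only: WRMSR passes the value as EDX:EAX. */
static u64 wrmsr_value(u32 eax, u32 edx)
{
        return (u64)eax | ((u64)edx << 32);
}
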
@@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
         * possible
         */
        if (kvm_run->request_interrupt_window &&
-           !svm->vcpu.irq_summary) {
+           !svm->vcpu.arch.irq_summary) {
                ++svm->vcpu.stat.irq_window_exits;
                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
                return 0;
@@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
        [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
        [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
+       [SVM_EXIT_READ_CR8]                     = emulate_on_interception,
        /* for now: */
        [SVM_EXIT_WRITE_CR0]                    = emulate_on_interception,
        [SVM_EXIT_WRITE_CR3]                    = emulate_on_interception,
        [SVM_EXIT_WRITE_CR4]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
        [SVM_EXIT_READ_DR0]                     = emulate_on_interception,
        [SVM_EXIT_READ_DR1]                     = emulate_on_interception,
        [SVM_EXIT_READ_DR2]                     = emulate_on_interception,
@@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_WRITE_DR3]                    = emulate_on_interception,
        [SVM_EXIT_WRITE_DR5]                    = emulate_on_interception,
        [SVM_EXIT_WRITE_DR7]                    = emulate_on_interception,
+       [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
        [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
        [SVM_EXIT_INTR]                         = nop_on_interception,
@@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                       exit_code);
 
        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
-           || svm_exit_handlers[exit_code] == 0) {
+           || !svm_exit_handlers[exit_code]) {
                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
                kvm_run->hw.hardware_exit_reason = exit_code;
                return 0;
@@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
        int cpu = raw_smp_processor_id();
 
        struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-       svm_data->tss_desc->type = 9; //available 32/64-bit TSS
+       svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
        load_TR_desc();
 }
 
@@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
        struct vmcb *vmcb = svm->vmcb;
        int intr_vector = -1;
 
-       kvm_inject_pending_timer_irqs(vcpu);
        if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
            ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
                intr_vector = vmcb->control.exit_int_info &
@@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
                push_irq(&svm->vcpu, control->int_vector);
        }
 
-       svm->vcpu.interrupt_window_open =
+       svm->vcpu.arch.interrupt_window_open =
                !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
 }
 
 static void svm_do_inject_vector(struct vcpu_svm *svm)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
-       int word_index = __ffs(vcpu->irq_summary);
-       int bit_index = __ffs(vcpu->irq_pending[word_index]);
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
        int irq = word_index * BITS_PER_LONG + bit_index;
 
-       clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-       if (!vcpu->irq_pending[word_index])
-               clear_bit(word_index, &vcpu->irq_summary);
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
        svm_inject_irq(svm, irq);
 }
 
@@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
 
-       svm->vcpu.interrupt_window_open =
+       svm->vcpu.arch.interrupt_window_open =
                (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
                 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
 
-       if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
+       if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
                /*
                 * If interrupts enabled, and not blocked by sti or mov ss. Good.
                 */
@@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
        /*
         * Interrupts blocked.  Wait for unblock.
         */
-       if (!svm->vcpu.interrupt_window_open &&
-           (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
+       if (!svm->vcpu.arch.interrupt_window_open &&
+           (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
                control->intercept |= 1ULL << INTERCEPT_VINTR;
-       } else
+        else
                control->intercept &= ~(1ULL << INTERCEPT_VINTR);
 }
 
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+       return 0;
+}
+
 static void save_db_regs(unsigned long *db_regs)
 {
        asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
@@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        svm->host_cr2 = kvm_read_cr2();
        svm->host_dr6 = read_dr6();
        svm->host_dr7 = read_dr7();
-       svm->vmcb->save.cr2 = vcpu->cr2;
+       svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
        if (svm->vmcb->save.dr7 & 0xff) {
                write_dr7(0);
@@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        asm volatile (
 #ifdef CONFIG_X86_64
-               "push %%rbx; push %%rcx; push %%rdx;"
-               "push %%rsi; push %%rdi; push %%rbp;"
-               "push %%r8;  push %%r9;  push %%r10; push %%r11;"
-               "push %%r12; push %%r13; push %%r14; push %%r15;"
+               "push %%rbp; \n\t"
 #else
-               "push %%ebx; push %%ecx; push %%edx;"
-               "push %%esi; push %%edi; push %%ebp;"
+               "push %%ebp; \n\t"
 #endif
 
 #ifdef CONFIG_X86_64
@@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                "mov %%r14, %c[r14](%[svm]) \n\t"
                "mov %%r15, %c[r15](%[svm]) \n\t"
 
-               "pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
-               "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
-               "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
-               "pop  %%rdx; pop  %%rcx; pop  %%rbx; \n\t"
+               "pop  %%rbp; \n\t"
 #else
                "mov %%ebx, %c[rbx](%[svm]) \n\t"
                "mov %%ecx, %c[rcx](%[svm]) \n\t"
@@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                "mov %%edi, %c[rdi](%[svm]) \n\t"
                "mov %%ebp, %c[rbp](%[svm]) \n\t"
 
-               "pop  %%ebp; pop  %%edi; pop  %%esi;"
-               "pop  %%edx; pop  %%ecx; pop  %%ebx; \n\t"
+               "pop  %%ebp; \n\t"
 #endif
                :
                : [svm]"a"(svm),
                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
-                 [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
-                 [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
-                 [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
-                 [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
-                 [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
-                 [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
+                 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+                 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+                 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+                 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+                 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+                 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
 #ifdef CONFIG_X86_64
-                 ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
-                 [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
-                 [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
-                 [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
-                 [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
-                 [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
-                 [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
-                 [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
+                 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+                 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+                 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+                 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+                 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+                 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+                 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+                 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
 #endif
-               : "cc", "memory" );
+               : "cc", "memory"
+#ifdef CONFIG_X86_64
+               , "rbx", "rcx", "rdx", "rsi", "rdi"
+               , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+               , "ebx", "ecx", "edx" , "esi", "edi"
+#endif
+               );
 
        if ((svm->vmcb->save.dr7 & 0xff))
                load_db_regs(svm->host_db_regs);
 
-       vcpu->cr2 = svm->vmcb->save.cr2;
+       vcpu->arch.cr2 = svm->vmcb->save.cr2;
 
        write_dr6(svm->host_dr6);
        write_dr7(svm->host_dr7);
@@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
        }
 }
 
-static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
-                                 unsigned long  addr,
-                                 uint32_t err_code)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
-
-       ++vcpu->stat.pf_guest;
-
-       if (is_page_fault(exit_int_info)) {
-
-               svm->vmcb->control.event_inj_err = 0;
-               svm->vmcb->control.event_inj =  SVM_EVTINJ_VALID |
-                                               SVM_EVTINJ_VALID_ERR |
-                                               SVM_EVTINJ_TYPE_EXEPT |
-                                               DF_VECTOR;
-               return;
-       }
-       vcpu->cr2 = addr;
-       svm->vmcb->save.cr2 = addr;
-       svm->vmcb->control.event_inj =  SVM_EVTINJ_VALID |
-                                       SVM_EVTINJ_VALID_ERR |
-                                       SVM_EVTINJ_TYPE_EXEPT |
-                                       PF_VECTOR;
-       svm->vmcb->control.event_inj_err = err_code;
-}
-
-
 static int is_disabled(void)
 {
        u64 vm_cr;
@@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[0] = 0x0f;
        hypercall[1] = 0x01;
        hypercall[2] = 0xd9;
-       hypercall[3] = 0xc3;
 }
 
 static void svm_check_processor_compat(void *rtn)
@@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn)
        *(int *)rtn = 0;
 }
 
+static bool svm_cpu_has_accelerated_tpr(void)
+{
+       return false;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .check_processor_compatibility = svm_check_processor_compat,
        .hardware_enable = svm_hardware_enable,
        .hardware_disable = svm_hardware_disable,
+       .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
 
        .vcpu_create = svm_create_vcpu,
        .vcpu_free = svm_free_vcpu,
@@ -1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = {
        .set_rflags = svm_set_rflags,
 
        .tlb_flush = svm_flush_tlb,
-       .inject_page_fault = svm_inject_page_fault,
-
-       .inject_gp = svm_inject_gp,
 
        .run = svm_vcpu_run,
        .handle_exit = handle_exit,
@@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = {
        .patch_hypercall = svm_patch_hypercall,
        .get_irq = svm_get_irq,
        .set_irq = svm_set_irq,
+       .queue_exception = svm_queue_exception,
+       .exception_injected = svm_exception_injected,
        .inject_pending_irq = svm_intr_assist,
        .inject_pending_vectors = do_interrupt_requests,
+
+       .set_tss_addr = svm_set_tss_addr,
 };
 
 static int __init svm_init(void)
 {
-       return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
+       return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
                              THIS_MODULE);
 }
 
 static void __exit svm_exit(void)
 {
-       kvm_exit_x86();
+       kvm_exit();
 }
 
 module_init(svm_init)
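
With the per-exception inject_gp/inject_page_fault hooks removed, exception delivery on SVM now funnels through svm_queue_exception(), which packs the vector, the exception type and the error-code-valid flag into the VMCB's EVTINJ field. For a #GP with error code 0, the encoding works out to roughly (illustration only, using the masks from svm.h):

/* Illustration only: EVTINJ word as built by svm_queue_exception() for #GP(0). */
u32 evtinj = GP_VECTOR
           | SVM_EVTINJ_VALID
           | SVM_EVTINJ_VALID_ERR
           | SVM_EVTINJ_TYPE_EXEPT;
/* The error code itself, here 0, goes into event_inj_err. */
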
similarity index 98%
rename from drivers/kvm/svm.h
rename to arch/x86/kvm/svm.h
index 3b1b0f35b6cba172ac5fbb58cdceec93cdc7221c..5fd50491b55505fd1f02a033c3f244e134efdf5b 100644 (file)
@@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb {
 #define INTERCEPT_CR0_MASK 1
 #define INTERCEPT_CR3_MASK (1 << 3)
 #define INTERCEPT_CR4_MASK (1 << 4)
+#define INTERCEPT_CR8_MASK (1 << 8)
 
 #define INTERCEPT_DR0_MASK 1
 #define INTERCEPT_DR1_MASK (1 << 1)
@@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb {
 
 #define SVM_EXIT_ERR           -1
 
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP
+#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
 
 #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
 #define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8"
similarity index 75%
rename from drivers/kvm/vmx.c
rename to arch/x86/kvm/vmx.c
index 5b397b6c9f93d2d7fb646c410dd0eba6710c7989..ad36447e696e6c80bbf70ce53e0b88a7e2c2bbe8 100644 (file)
  *
  */
 
-#include "kvm.h"
-#include "x86_emulate.h"
 #include "irq.h"
 #include "vmx.h"
 #include "segment_descriptor.h"
+#include "mmu.h"
 
+#include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/sched.h>
+#include <linux/moduleparam.h>
 
 #include <asm/io.h>
 #include <asm/desc.h>
@@ -33,6 +34,9 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static int bypass_guest_pf = 1;
+module_param(bypass_guest_pf, bool, 0);
+
 struct vmcs {
        u32 revision_id;
        u32 abort;
@@ -43,6 +47,7 @@ struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        int                   launched;
        u8                    fail;
+       u32                   idt_vectoring_info;
        struct kvm_msr_entry *guest_msrs;
        struct kvm_msr_entry *host_msrs;
        int                   nmsrs;
@@ -57,8 +62,15 @@ struct vcpu_vmx {
                u16           fs_sel, gs_sel, ldt_sel;
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
-       }host_state;
-
+               int           guest_efer_loaded;
+       } host_state;
+       struct {
+               struct {
+                       bool pending;
+                       u8 vector;
+                       unsigned rip;
+               } irq;
+       } rmode;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static struct page *vmx_io_bitmap_a;
 static struct page *vmx_io_bitmap_b;
 
-#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
-
 static struct vmcs_config {
        int size;
        int order;
        u32 revision_id;
        u32 pin_based_exec_ctrl;
        u32 cpu_based_exec_ctrl;
+       u32 cpu_based_2nd_exec_ctrl;
        u32 vmexit_ctrl;
        u32 vmentry_ctrl;
 } vmcs_config;
@@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n)
                rdmsrl(e[i].index, e[i].data);
 }
 
-static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
-{
-       return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
-}
-
-static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
-{
-       int efer_offset = vmx->msr_offset_efer;
-       return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
-               msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-}
-
 static inline int is_page_fault(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info)
                (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 }
 
+static inline int is_invalid_opcode(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+                            INTR_INFO_VALID_MASK)) ==
+               (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+}
+
 static inline int is_external_interrupt(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm)
        return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
 }
 
+static inline int cpu_has_secondary_exec_ctrls(void)
+{
+       return (vmcs_config.cpu_based_exec_ctrl &
+               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+       return (vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+}
+
+static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+{
+       return ((cpu_has_vmx_virtualize_apic_accesses()) &&
+               (irqchip_in_kernel(kvm)));
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
@@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg)
                vmcs_clear(vmx->vmcs);
        if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
-       rdtscll(vmx->vcpu.host_tsc);
+       rdtscll(vmx->vcpu.arch.host_tsc);
 }
 
 static void vcpu_clear(struct vcpu_vmx *vmx)
 {
-       if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
-               smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
-                                        vmx, 0, 1);
-       else
-               __vcpu_clear(vmx);
+       if (vmx->vcpu.cpu == -1)
+               return;
+       smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
        vmx->launched = 0;
 }
 
@@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
        u8 error;
 
        asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
-                      : "=q"(error) : "a"(value), "d"(field) : "cc" );
+                      : "=q"(error) : "a"(value), "d"(field) : "cc");
        if (unlikely(error))
                vmwrite_error(field, value);
 }
@@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
        u32 eb;
 
-       eb = 1u << PF_VECTOR;
+       eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
        if (!vcpu->fpu_active)
                eb |= 1u << NM_VECTOR;
        if (vcpu->guest_debug.enabled)
                eb |= 1u << 1;
-       if (vcpu->rmode.active)
+       if (vcpu->arch.rmode.active)
                eb = ~0;
        vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -344,16 +366,42 @@ static void reload_tss(void)
 
 static void load_transition_efer(struct vcpu_vmx *vmx)
 {
-       u64 trans_efer;
        int efer_offset = vmx->msr_offset_efer;
+       u64 host_efer = vmx->host_msrs[efer_offset].data;
+       u64 guest_efer = vmx->guest_msrs[efer_offset].data;
+       u64 ignore_bits;
 
-       trans_efer = vmx->host_msrs[efer_offset].data;
-       trans_efer &= ~EFER_SAVE_RESTORE_BITS;
-       trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-       wrmsrl(MSR_EFER, trans_efer);
+       if (efer_offset < 0)
+               return;
+       /*
+        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
+        * outside long mode
+        */
+       ignore_bits = EFER_NX | EFER_SCE;
+#ifdef CONFIG_X86_64
+       ignore_bits |= EFER_LMA | EFER_LME;
+       /* SCE is meaningful only in long mode on Intel */
+       if (guest_efer & EFER_LMA)
+               ignore_bits &= ~(u64)EFER_SCE;
+#endif
+       if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
+               return;
+
+       vmx->host_state.guest_efer_loaded = 1;
+       guest_efer &= ~ignore_bits;
+       guest_efer |= host_efer & ignore_bits;
+       wrmsrl(MSR_EFER, guest_efer);
        vmx->vcpu.stat.efer_reload++;
 }
 
+static void reload_host_efer(struct vcpu_vmx *vmx)
+{
+       if (vmx->host_state.guest_efer_loaded) {
+               vmx->host_state.guest_efer_loaded = 0;
+               load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+       }
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
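
The rewritten load_transition_efer() touches MSR_EFER only when the guest and host values differ in bits that matter: NX is emulated, LMA/LME are handled by hardware, and SCE is irrelevant outside long mode, so those bits are masked out of the comparison. The decision reduces to something like (illustration only, hypothetical helper name):

/* Illustration only: an EFER switch is needed only if the values differ
 * outside the ignored bits. */
static int efer_switch_needed(u64 guest_efer, u64 host_efer, u64 ignore_bits)
{
        return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
}
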
@@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 #endif
 
 #ifdef CONFIG_X86_64
-       if (is_long_mode(&vmx->vcpu)) {
+       if (is_long_mode(&vmx->vcpu))
                save_msrs(vmx->host_msrs +
                          vmx->msr_offset_kernel_gs_base, 1);
-       }
+
 #endif
        load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-       if (msr_efer_need_save_restore(vmx))
-               load_transition_efer(vmx);
+       load_transition_efer(vmx);
 }
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
        if (!vmx->host_state.loaded)
                return;
 
+       ++vmx->vcpu.stat.host_state_reload;
        vmx->host_state.loaded = 0;
        if (vmx->host_state.fs_reload_needed)
                load_fs(vmx->host_state.fs_sel);
@@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
        reload_tss();
        save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
        load_msrs(vmx->host_msrs, vmx->save_nmsrs);
-       if (msr_efer_need_save_restore(vmx))
-               load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+       reload_host_efer(vmx);
 }
 
 /*
@@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 * Make sure the time stamp counter is monotonous.
                 */
                rdtscll(tsc_this);
-               delta = vcpu->host_tsc - tsc_this;
+               delta = vcpu->arch.host_tsc - tsc_this;
                vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
        }
 }
@@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
        vmx_load_host_state(to_vmx(vcpu));
-       kvm_put_guest_fpu(vcpu);
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
                return;
        vcpu->fpu_active = 1;
        vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
-       if (vcpu->cr0 & X86_CR0_TS)
+       if (vcpu->arch.cr0 & X86_CR0_TS)
                vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
        update_exception_bitmap(vcpu);
 }
@@ -523,7 +569,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-       if (vcpu->rmode.active)
+       if (vcpu->arch.rmode.active)
                rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
        vmcs_writel(GUEST_RFLAGS, rflags);
 }
@@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        if (interruptibility & 3)
                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
                             interruptibility & ~3);
-       vcpu->interrupt_window_open = 1;
+       vcpu->arch.interrupt_window_open = 1;
 }
 
-static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+                               bool has_error_code, u32 error_code)
 {
-       printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
-              vmcs_readl(GUEST_RIP));
-       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                    GP_VECTOR |
-                    INTR_TYPE_EXCEPTION |
-                    INTR_INFO_DELIEVER_CODE_MASK |
-                    INTR_INFO_VALID_MASK);
+                    nr | INTR_TYPE_EXCEPTION
+                    | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+                    | INTR_INFO_VALID_MASK);
+       if (has_error_code)
+               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+}
+
+static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
 }
 
 /*
@@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                 * if efer.sce is enabled.
                 */
                index = __find_msr_index(vmx, MSR_K6_STAR);
-               if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
+               if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
                        move_msr_up(vmx, index, save_nmsrs++);
        }
 #endif
@@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 #ifdef CONFIG_X86_64
        case MSR_EFER:
                ret = kvm_set_msr_common(vcpu, msr_index, data);
-               if (vmx->host_state.loaded)
+               if (vmx->host_state.loaded) {
+                       reload_host_efer(vmx);
                        load_transition_efer(vmx);
+               }
                break;
        case MSR_FS_BASE:
                vmcs_writel(GUEST_FS_BASE, data);
@@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 
 /*
  * Sync the rsp and rip registers into the vcpu structure.  This allows
- * registers to be accessed by indexing vcpu->regs.
+ * registers to be accessed by indexing vcpu->arch.regs.
  */
 static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
 {
-       vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-       vcpu->rip = vmcs_readl(GUEST_RIP);
+       vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+       vcpu->arch.rip = vmcs_readl(GUEST_RIP);
 }
 
 /*
@@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
  */
 static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
 {
-       vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
-       vmcs_writel(GUEST_RIP, vcpu->rip);
+       vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+       vmcs_writel(GUEST_RIP, vcpu->arch.rip);
 }
 
 static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 
 static int vmx_get_irq(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 idtv_info_field;
 
-       idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       idtv_info_field = vmx->idt_vectoring_info;
        if (idtv_info_field & INTR_INFO_VALID_MASK) {
                if (is_external_interrupt(idtv_info_field))
                        return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
                else
-                       printk("pending exception: not handled yet\n");
+                       printk(KERN_DEBUG "pending exception: not handled yet\n");
        }
        return -1;
 }
@@ -863,7 +918,7 @@ static void hardware_disable(void *garbage)
 }
 
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
-                                     u32 msr, u32* result)
+                                     u32 msr, u32 *result)
 {
        u32 vmx_msr_low, vmx_msr_high;
        u32 ctl = ctl_min | ctl_opt;
@@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
        u32 min, opt;
        u32 _pin_based_exec_control = 0;
        u32 _cpu_based_exec_control = 0;
+       u32 _cpu_based_2nd_exec_control = 0;
        u32 _vmexit_control = 0;
        u32 _vmentry_control = 0;
 
@@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
              CPU_BASED_USE_IO_BITMAPS |
              CPU_BASED_MOV_DR_EXITING |
              CPU_BASED_USE_TSC_OFFSETING;
-#ifdef CONFIG_X86_64
-       opt = CPU_BASED_TPR_SHADOW;
-#else
-       opt = 0;
-#endif
+       opt = CPU_BASED_TPR_SHADOW |
+             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
                                &_cpu_based_exec_control) < 0)
                return -EIO;
@@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
                                           ~CPU_BASED_CR8_STORE_EXITING;
 #endif
+       if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+               min = 0;
+               opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                       SECONDARY_EXEC_WBINVD_EXITING;
+               if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
+                                       &_cpu_based_2nd_exec_control) < 0)
+                       return -EIO;
+       }
+#ifndef CONFIG_X86_64
+       if (!(_cpu_based_2nd_exec_control &
+                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+               _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
 
        min = 0;
 #ifdef CONFIG_X86_64
@@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+       vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
        vmcs_conf->vmexit_ctrl         = _vmexit_control;
        vmcs_conf->vmentry_ctrl        = _vmentry_control;
 
@@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
 
-       vcpu->rmode.active = 0;
+       vcpu->arch.rmode.active = 0;
 
-       vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
-       vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
-       vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
+       vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
+       vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
+       vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
 
        flags = vmcs_readl(GUEST_RFLAGS);
        flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
-       flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
+       flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
        vmcs_writel(GUEST_RFLAGS, flags);
 
        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 
        update_exception_bitmap(vcpu);
 
-       fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
-       fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
-       fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
-       fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
+       fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+       fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+       fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+       fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
        vmcs_write16(GUEST_SS_SELECTOR, 0);
        vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
        vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
 }
 
-static gva_t rmode_tss_base(struct kvm* kvm)
+static gva_t rmode_tss_base(struct kvm *kvm)
 {
-       gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
-       return base_gfn << PAGE_SHIFT;
+       if (!kvm->arch.tss_addr) {
+               gfn_t base_gfn = kvm->memslots[0].base_gfn +
+                                kvm->memslots[0].npages - 3;
+               return base_gfn << PAGE_SHIFT;
+       }
+       return kvm->arch.tss_addr;
 }
 
 static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
@@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
        save->base = vmcs_readl(sf->base);
        save->limit = vmcs_read32(sf->limit);
        save->ar = vmcs_read32(sf->ar_bytes);
-       vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
+       vmcs_write16(sf->selector, save->base >> 4);
+       vmcs_write32(sf->base, save->base & 0xfffff);
        vmcs_write32(sf->limit, 0xffff);
        vmcs_write32(sf->ar_bytes, 0xf3);
 }
@@ -1095,19 +1167,20 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
 
-       vcpu->rmode.active = 1;
+       vcpu->arch.rmode.active = 1;
 
-       vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+       vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
        vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
 
-       vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+       vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
 
-       vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+       vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
        flags = vmcs_readl(GUEST_RFLAGS);
-       vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+       vcpu->arch.rmode.save_iopl
+               = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
 
        flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 
@@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
                vmcs_writel(GUEST_CS_BASE, 0xf0000);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
 
-       fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
-       fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
-       fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
-       fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
+       fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+       fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+       fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+       fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
        kvm_mmu_reset_context(vcpu);
        init_rmode_tss(vcpu->kvm);
@@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
                             | AR_TYPE_BUSY_64_TSS);
        }
 
-       vcpu->shadow_efer |= EFER_LMA;
+       vcpu->arch.shadow_efer |= EFER_LMA;
 
        find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
        vmcs_write32(VM_ENTRY_CONTROLS,
@@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 
 static void exit_lmode(struct kvm_vcpu *vcpu)
 {
-       vcpu->shadow_efer &= ~EFER_LMA;
+       vcpu->arch.shadow_efer &= ~EFER_LMA;
 
        vmcs_write32(VM_ENTRY_CONTROLS,
                     vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
-       vcpu->cr4 &= KVM_GUEST_CR4_MASK;
-       vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+       vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
+       vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
 }
 
 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        vmx_fpu_deactivate(vcpu);
 
-       if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
+       if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
                enter_pmode(vcpu);
 
-       if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
+       if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
                enter_rmode(vcpu);
 
 #ifdef CONFIG_X86_64
-       if (vcpu->shadow_efer & EFER_LME) {
+       if (vcpu->arch.shadow_efer & EFER_LME) {
                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
                        enter_lmode(vcpu);
                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vmcs_writel(CR0_READ_SHADOW, cr0);
        vmcs_writel(GUEST_CR0,
                    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
-       vcpu->cr0 = cr0;
+       vcpu->arch.cr0 = cr0;
 
        if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
                vmx_fpu_activate(vcpu);
@@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        vmcs_writel(GUEST_CR3, cr3);
-       if (vcpu->cr0 & X86_CR0_PE)
+       if (vcpu->arch.cr0 & X86_CR0_PE)
                vmx_fpu_deactivate(vcpu);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        vmcs_writel(CR4_READ_SHADOW, cr4);
-       vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
+       vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
                    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
-       vcpu->cr4 = cr4;
+       vcpu->arch.cr4 = cr4;
 }
 
 #ifdef CONFIG_X86_64
@@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
 
-       vcpu->shadow_efer = efer;
+       vcpu->arch.shadow_efer = efer;
        if (efer & EFER_LMA) {
                vmcs_write32(VM_ENTRY_CONTROLS,
                                     vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
        u32 ar;
 
-       if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
-               vcpu->rmode.tr.selector = var->selector;
-               vcpu->rmode.tr.base = var->base;
-               vcpu->rmode.tr.limit = var->limit;
-               vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
+       if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+               vcpu->arch.rmode.tr.selector = var->selector;
+               vcpu->arch.rmode.tr.base = var->base;
+               vcpu->arch.rmode.tr.limit = var->limit;
+               vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
                return;
        }
        vmcs_writel(sf->base, var->base);
        vmcs_write32(sf->limit, var->limit);
        vmcs_write16(sf->selector, var->selector);
-       if (vcpu->rmode.active && var->s) {
+       if (vcpu->arch.rmode.active && var->s) {
                /*
                 * Hack real-mode segments into vm86 compatibility.
                 */
@@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
        vmcs_writel(GUEST_GDTR_BASE, dt->base);
 }
 
-static int init_rmode_tss(struct kvmkvm)
+static int init_rmode_tss(struct kvm *kvm)
 {
-       struct page *p1, *p2, *p3;
        gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
-       char *page;
-
-       p1 = gfn_to_page(kvm, fn++);
-       p2 = gfn_to_page(kvm, fn++);
-       p3 = gfn_to_page(kvm, fn);
-
-       if (!p1 || !p2 || !p3) {
-               kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
-               return 0;
-       }
-
-       page = kmap_atomic(p1, KM_USER0);
-       clear_page(page);
-       *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-       kunmap_atomic(page, KM_USER0);
-
-       page = kmap_atomic(p2, KM_USER0);
-       clear_page(page);
-       kunmap_atomic(page, KM_USER0);
+       u16 data = 0;
+       int ret = 0;
+       int r;
 
-       page = kmap_atomic(p3, KM_USER0);
-       clear_page(page);
-       *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
-       kunmap_atomic(page, KM_USER0);
+       down_read(&current->mm->mmap_sem);
+       r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+       if (r < 0)
+               goto out;
+       data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+       r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+       if (r < 0)
+               goto out;
+       r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+       if (r < 0)
+               goto out;
+       r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+       if (r < 0)
+               goto out;
+       data = ~0;
+       r = kvm_write_guest_page(kvm, fn, &data,
+                                RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+                                sizeof(u8));
+       if (r < 0)
+               goto out;
 
-       return 1;
+       ret = 1;
+out:
+       up_read(&current->mm->mmap_sem);
+       return ret;
 }
 
 static void seg_setup(int seg)
@@ -1397,6 +1472,27 @@ static void seg_setup(int seg)
        vmcs_write32(sf->ar_bytes, 0x93);
 }
 
+static int alloc_apic_access_page(struct kvm *kvm)
+{
+       struct kvm_userspace_memory_region kvm_userspace_mem;
+       int r = 0;
+
+       down_write(&current->mm->mmap_sem);
+       if (kvm->arch.apic_access_page)
+               goto out;
+       kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+       kvm_userspace_mem.flags = 0;
+       kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
+       kvm_userspace_mem.memory_size = PAGE_SIZE;
+       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+       if (r)
+               goto out;
+       kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+out:
+       up_write(&current->mm->mmap_sem);
+       return r;
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        unsigned long a;
        struct descriptor_table dt;
        int i;
-       int ret = 0;
        unsigned long kvm_vmx_return;
-       u64 msr;
        u32 exec_control;
 
-       if (!init_rmode_tss(vmx->vcpu.kvm)) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       vmx->vcpu.rmode.active = 0;
-
-       vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-       set_cr8(&vmx->vcpu, 0);
-       msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-       if (vmx->vcpu.vcpu_id == 0)
-               msr |= MSR_IA32_APICBASE_BSP;
-       kvm_set_apic_base(&vmx->vcpu, msr);
-
-       fx_init(&vmx->vcpu);
-
-       /*
-        * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
-        * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
-        */
-       if (vmx->vcpu.vcpu_id == 0) {
-               vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-               vmcs_writel(GUEST_CS_BASE, 0x000f0000);
-       } else {
-               vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
-               vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
-       }
-       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-
-       seg_setup(VCPU_SREG_DS);
-       seg_setup(VCPU_SREG_ES);
-       seg_setup(VCPU_SREG_FS);
-       seg_setup(VCPU_SREG_GS);
-       seg_setup(VCPU_SREG_SS);
-
-       vmcs_write16(GUEST_TR_SELECTOR, 0);
-       vmcs_writel(GUEST_TR_BASE, 0);
-       vmcs_write32(GUEST_TR_LIMIT, 0xffff);
-       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
-       vmcs_write16(GUEST_LDTR_SELECTOR, 0);
-       vmcs_writel(GUEST_LDTR_BASE, 0);
-       vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
-       vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
-
-       vmcs_write32(GUEST_SYSENTER_CS, 0);
-       vmcs_writel(GUEST_SYSENTER_ESP, 0);
-       vmcs_writel(GUEST_SYSENTER_EIP, 0);
-
-       vmcs_writel(GUEST_RFLAGS, 0x02);
-       if (vmx->vcpu.vcpu_id == 0)
-               vmcs_writel(GUEST_RIP, 0xfff0);
-       else
-               vmcs_writel(GUEST_RIP, 0);
-       vmcs_writel(GUEST_RSP, 0);
-
-       //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
-       vmcs_writel(GUEST_DR7, 0x400);
-
-       vmcs_writel(GUEST_GDTR_BASE, 0);
-       vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
-
-       vmcs_writel(GUEST_IDTR_BASE, 0);
-       vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
-
-       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
-       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
-       vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
-
        /* I/O */
        vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
        vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
 
-       guest_write_tsc(0);
-
        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
-       /* Special registers */
-       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
                vmcs_config.pin_based_exec_ctrl);
@@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        }
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
 
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+       if (cpu_has_secondary_exec_ctrls()) {
+               exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+               if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+                       exec_control &=
+                               ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+       }
+
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 
        vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
@@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        get_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
 
-       asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+       asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
        vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
@@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                ++vmx->nmsrs;
        }
 
-       setup_msrs(vmx);
-
        vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 
        /* 22.2.1, 20.8.1 */
        vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
 
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
-
-#ifdef CONFIG_X86_64
-       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-       if (vm_need_tpr_shadow(vmx->vcpu.kvm))
-               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-                            page_to_phys(vmx->vcpu.apic->regs_page));
-       vmcs_write32(TPR_THRESHOLD, 0);
-#endif
-
        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
        vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
 
-       vmx->vcpu.cr0 = 0x60000010;
-       vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode
-       vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
-       vmx_set_efer(&vmx->vcpu, 0);
-#endif
-       vmx_fpu_activate(&vmx->vcpu);
-       update_exception_bitmap(&vmx->vcpu);
+       if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+               if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
+                       return -ENOMEM;
 
        return 0;
-
-out:
-       return ret;
 }
 
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 msr;
+       int ret;
 
-       vmx_vcpu_setup(vmx);
-}
-
-static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
-{
-       u16 ent[2];
-       u16 cs;
-       u16 ip;
-       unsigned long flags;
-       unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
-       u16 sp =  vmcs_readl(GUEST_RSP);
-       u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
-
-       if (sp > ss_limit || sp < 6 ) {
-               vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
-                           __FUNCTION__,
-                           vmcs_readl(GUEST_RSP),
-                           vmcs_readl(GUEST_SS_BASE),
-                           vmcs_read32(GUEST_SS_LIMIT));
-               return;
+       if (!init_rmode_tss(vmx->vcpu.kvm)) {
+               ret = -ENOMEM;
+               goto out;
        }
 
-       if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
-                                                       X86EMUL_CONTINUE) {
-               vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
-               return;
+       vmx->vcpu.arch.rmode.active = 0;
+
+       vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+       set_cr8(&vmx->vcpu, 0);
+       msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+       if (vmx->vcpu.vcpu_id == 0)
+               msr |= MSR_IA32_APICBASE_BSP;
+       kvm_set_apic_base(&vmx->vcpu, msr);
+
+       fx_init(&vmx->vcpu);
+
+       /*
+        * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
+        * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
+        */
+       if (vmx->vcpu.vcpu_id == 0) {
+               vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+               vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+       } else {
+               vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
+               vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
        }
+       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+
+       seg_setup(VCPU_SREG_DS);
+       seg_setup(VCPU_SREG_ES);
+       seg_setup(VCPU_SREG_FS);
+       seg_setup(VCPU_SREG_GS);
+       seg_setup(VCPU_SREG_SS);
+
+       vmcs_write16(GUEST_TR_SELECTOR, 0);
+       vmcs_writel(GUEST_TR_BASE, 0);
+       vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
-       flags =  vmcs_readl(GUEST_RFLAGS);
-       cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
-       ip =  vmcs_readl(GUEST_RIP);
+       vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+       vmcs_writel(GUEST_LDTR_BASE, 0);
+       vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+       vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
 
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
 
-       if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
-           emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
-           emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
-               vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
-               return;
+       vmcs_writel(GUEST_RFLAGS, 0x02);
+       if (vmx->vcpu.vcpu_id == 0)
+               vmcs_writel(GUEST_RIP, 0xfff0);
+       else
+               vmcs_writel(GUEST_RIP, 0);
+       vmcs_writel(GUEST_RSP, 0);
+
+       /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
+       vmcs_writel(GUEST_DR7, 0x400);
+
+       vmcs_writel(GUEST_GDTR_BASE, 0);
+       vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+       vmcs_writel(GUEST_IDTR_BASE, 0);
+       vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+       vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+
+       guest_write_tsc(0);
+
+       /* Special registers */
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+       setup_msrs(vmx);
+
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
+
+       if (cpu_has_vmx_tpr_shadow()) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+               if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+                       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+                               page_to_phys(vmx->vcpu.arch.apic->regs_page));
+               vmcs_write32(TPR_THRESHOLD, 0);
        }
 
-       vmcs_writel(GUEST_RFLAGS, flags &
-                   ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
-       vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
-       vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
-       vmcs_writel(GUEST_RIP, ent[0]);
-       vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
+       if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+               vmcs_write64(APIC_ACCESS_ADDR,
+                            page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+
+       vmx->vcpu.arch.cr0 = 0x60000010;
+       vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
+       vmx_set_cr4(&vmx->vcpu, 0);
+#ifdef CONFIG_X86_64
+       vmx_set_efer(&vmx->vcpu, 0);
+#endif
+       vmx_fpu_activate(&vmx->vcpu);
+       update_exception_bitmap(&vmx->vcpu);
+
+       return 0;
+
+out:
+       return ret;
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
-       if (vcpu->rmode.active) {
-               inject_rmode_irq(vcpu, irq);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vcpu->arch.rmode.active) {
+               vmx->rmode.irq.pending = true;
+               vmx->rmode.irq.vector = irq;
+               vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                            irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+               vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
                return;
        }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 {
-       int word_index = __ffs(vcpu->irq_summary);
-       int bit_index = __ffs(vcpu->irq_pending[word_index]);
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
        int irq = word_index * BITS_PER_LONG + bit_index;
 
-       clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-       if (!vcpu->irq_pending[word_index])
-               clear_bit(word_index, &vcpu->irq_summary);
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
        vmx_inject_irq(vcpu, irq);
 }
 
@@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 {
        u32 cpu_based_vm_exec_control;
 
-       vcpu->interrupt_window_open =
+       vcpu->arch.interrupt_window_open =
                ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
 
-       if (vcpu->interrupt_window_open &&
-           vcpu->irq_summary &&
+       if (vcpu->arch.interrupt_window_open &&
+           vcpu->arch.irq_summary &&
            !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
                /*
                 * If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
                kvm_do_inject_irq(vcpu);
 
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       if (!vcpu->interrupt_window_open &&
-           (vcpu->irq_summary || kvm_run->request_interrupt_window))
+       if (!vcpu->arch.interrupt_window_open &&
+           (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
                /*
                 * Interrupts blocked.  Wait for unblock.
                 */
@@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+       int ret;
+       struct kvm_userspace_memory_region tss_mem = {
+               .slot = 8,
+               .guest_phys_addr = addr,
+               .memory_size = PAGE_SIZE * 3,
+               .flags = 0,
+       };
+
+       ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+       if (ret)
+               return ret;
+       kvm->arch.tss_addr = addr;
+       return 0;
+}
+
 static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
 {
        struct kvm_guest_debug *dbg = &vcpu->guest_debug;
@@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
                                  int vec, u32 err_code)
 {
-       if (!vcpu->rmode.active)
+       if (!vcpu->arch.rmode.active)
                return 0;
 
        /*
@@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
         * Cause the #SS fault with 0 error code in VM86 mode.
         */
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-               if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
+               if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
                        return 1;
        return 0;
 }
 
 static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info, error_code;
        unsigned long cr2, rip;
        u32 vect_info;
        enum emulation_result er;
-       int r;
 
-       vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       vect_info = vmx->idt_vectoring_info;
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-                                               !is_page_fault(intr_info)) {
+                                               !is_page_fault(intr_info))
                printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
                       "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
-       }
 
        if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
                int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-               set_bit(irq, vcpu->irq_pending);
-               set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+               set_bit(irq, vcpu->arch.irq_pending);
+               set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
        }
 
        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
@@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                return 1;
        }
 
+       if (is_invalid_opcode(intr_info)) {
+               er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+               if (er != EMULATE_DONE)
+                       kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
        error_code = 0;
        rip = vmcs_readl(GUEST_RIP);
        if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
        if (is_page_fault(intr_info)) {
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
-
-               mutex_lock(&vcpu->kvm->lock);
-               r = kvm_mmu_page_fault(vcpu, cr2, error_code);
-               if (r < 0) {
-                       mutex_unlock(&vcpu->kvm->lock);
-                       return r;
-               }
-               if (!r) {
-                       mutex_unlock(&vcpu->kvm->lock);
-                       return 1;
-               }
-
-               er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
-               mutex_unlock(&vcpu->kvm->lock);
-
-               switch (er) {
-               case EMULATE_DONE:
-                       return 1;
-               case EMULATE_DO_MMIO:
-                       ++vcpu->stat.mmio_exits;
-                       return 0;
-                case EMULATE_FAIL:
-                       kvm_report_emulation_failure(vcpu, "pagetable");
-                       break;
-               default:
-                       BUG();
-               }
+               return kvm_mmu_page_fault(vcpu, cr2, error_code);
        }
 
-       if (vcpu->rmode.active &&
+       if (vcpu->arch.rmode.active &&
            handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
                                                                error_code)) {
-               if (vcpu->halt_request) {
-                       vcpu->halt_request = 0;
+               if (vcpu->arch.halt_request) {
+                       vcpu->arch.halt_request = 0;
                        return kvm_emulate_halt(vcpu);
                }
                return 1;
        }
 
-       if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
+       if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
+           (INTR_TYPE_EXCEPTION | 1)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                return 0;
        }
@@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        string = (exit_qualification & 16) != 0;
 
        if (string) {
-               if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+               if (emulate_instruction(vcpu,
+                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
                        return 0;
                return 1;
        }
@@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[0] = 0x0f;
        hypercall[1] = 0x01;
        hypercall[2] = 0xc1;
-       hypercall[3] = 0xc3;
 }
 
 static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                switch (cr) {
                case 0:
                        vcpu_load_rsp_rip(vcpu);
-                       set_cr0(vcpu, vcpu->regs[reg]);
+                       set_cr0(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 3:
                        vcpu_load_rsp_rip(vcpu);
-                       set_cr3(vcpu, vcpu->regs[reg]);
+                       set_cr3(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 4:
                        vcpu_load_rsp_rip(vcpu);
-                       set_cr4(vcpu, vcpu->regs[reg]);
+                       set_cr4(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
                        vcpu_load_rsp_rip(vcpu);
-                       set_cr8(vcpu, vcpu->regs[reg]);
+                       set_cr8(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
+                       if (irqchip_in_kernel(vcpu->kvm))
+                               return 1;
                        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
                        return 0;
                };
@@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        case 2: /* clts */
                vcpu_load_rsp_rip(vcpu);
                vmx_fpu_deactivate(vcpu);
-               vcpu->cr0 &= ~X86_CR0_TS;
-               vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
+               vcpu->arch.cr0 &= ~X86_CR0_TS;
+               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
                vmx_fpu_activate(vcpu);
                skip_emulated_instruction(vcpu);
                return 1;
@@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                switch (cr) {
                case 3:
                        vcpu_load_rsp_rip(vcpu);
-                       vcpu->regs[reg] = vcpu->cr3;
+                       vcpu->arch.regs[reg] = vcpu->arch.cr3;
                        vcpu_put_rsp_rip(vcpu);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
                        vcpu_load_rsp_rip(vcpu);
-                       vcpu->regs[reg] = get_cr8(vcpu);
+                       vcpu->arch.regs[reg] = get_cr8(vcpu);
                        vcpu_put_rsp_rip(vcpu);
                        skip_emulated_instruction(vcpu);
                        return 1;
@@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                default:
                        val = 0;
                }
-               vcpu->regs[reg] = val;
+               vcpu->arch.regs[reg] = val;
        } else {
                /* mov to dr */
        }
@@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-       u32 ecx = vcpu->regs[VCPU_REGS_RCX];
+       u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
        u64 data;
 
        if (vmx_get_msr(vcpu, ecx, &data)) {
-               vmx_inject_gp(vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
        /* FIXME: handling of bits 32:63 of rax, rdx */
-       vcpu->regs[VCPU_REGS_RAX] = data & -1u;
-       vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+       vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
+       vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
        skip_emulated_instruction(vcpu);
        return 1;
 }
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-       u32 ecx = vcpu->regs[VCPU_REGS_RCX];
-       u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
-               | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
+       u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+       u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+               | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
        if (vmx_set_msr(vcpu, ecx, data) != 0) {
-               vmx_inject_gp(vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
@@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
         * possible
         */
        if (kvm_run->request_interrupt_window &&
-           !vcpu->irq_summary) {
+           !vcpu->arch.irq_summary) {
                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
                ++vcpu->stat.irq_window_exits;
                return 0;
@@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        skip_emulated_instruction(vcpu);
-       return kvm_hypercall(vcpu, kvm_run);
+       kvm_emulate_hypercall(vcpu);
+       return 1;
+}
+
+static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       skip_emulated_instruction(vcpu);
+       /* TODO: Add support for VT-d/pass-through device */
+       return 1;
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       u64 exit_qualification;
+       enum emulation_result er;
+       unsigned long offset;
+
+       exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+       offset = exit_qualification & 0xffful;
+
+       er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+       if (er !=  EMULATE_DONE) {
+               printk(KERN_ERR
+                      "Fail to handle apic access vmexit! Offset is 0x%lx\n",
+                      offset);
+               return -ENOTSUPP;
+       }
+       return 1;
 }
 
 /*
@@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
        [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = handle_halt,
        [EXIT_REASON_VMCALL]                  = handle_vmcall,
-       [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold
+       [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
+       [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
+       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers =
  */
 static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-       u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 vectoring_info = vmx->idt_vectoring_info;
 
        if (unlikely(vmx->fail)) {
                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
-                               exit_reason != EXIT_REASON_EXCEPTION_NMI )
+       if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+                               exit_reason != EXIT_REASON_EXCEPTION_NMI)
                printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
                       "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
        if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 idtv_info_field, intr_info_field;
        int has_ext_irq, interrupt_window_open;
        int vector;
 
-       kvm_inject_pending_timer_irqs(vcpu);
        update_tpr_threshold(vcpu);
 
        has_ext_irq = kvm_cpu_has_interrupt(vcpu);
        intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-       idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       idtv_info_field = vmx->idt_vectoring_info;
        if (intr_info_field & INTR_INFO_VALID_MASK) {
                if (idtv_info_field & INTR_INFO_VALID_MASK) {
                        /* TODO: fault when IDT_Vectoring */
-                       printk(KERN_ERR "Fault when IDT_Vectoring\n");
+                       if (printk_ratelimit())
+                               printk(KERN_ERR "Fault when IDT_Vectoring\n");
                }
                if (has_ext_irq)
                        enable_irq_window(vcpu);
                return;
        }
        if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
+               if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
+                   == INTR_TYPE_EXT_INTR
+                   && vcpu->arch.rmode.active) {
+                       u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
+
+                       vmx_inject_irq(vcpu, vect);
+                       if (unlikely(has_ext_irq))
+                               enable_irq_window(vcpu);
+                       return;
+               }
+
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
@@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
                enable_irq_window(vcpu);
 }
 
+/*
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen.  So we do it ourselves.
+ */
+static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+{
+       vmx->rmode.irq.pending = 0;
+       if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+               return;
+       vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+       if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+               vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
+               vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+               return;
+       }
+       vmx->idt_vectoring_info =
+               VECTORING_INFO_VALID_MASK
+               | INTR_TYPE_EXT_INTR
+               | vmx->rmode.irq.vector;
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         */
        vmcs_writel(HOST_CR0, read_cr0());
 
-       asm (
+       asm(
                /* Store host registers */
 #ifdef CONFIG_X86_64
-               "push %%rax; push %%rbx; push %%rdx;"
-               "push %%rsi; push %%rdi; push %%rbp;"
-               "push %%r8;  push %%r9;  push %%r10; push %%r11;"
-               "push %%r12; push %%r13; push %%r14; push %%r15;"
+               "push %%rdx; push %%rbp;"
                "push %%rcx \n\t"
-               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
 #else
-               "pusha; push %%ecx \n\t"
-               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
+               "push %%edx; push %%ebp;"
+               "push %%ecx \n\t"
 #endif
+               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
                /* Check if vmlaunch or vmresume is needed */
-               "cmp $0, %1 \n\t"
+               "cmpl $0, %c[launched](%0) \n\t"
                /* Load guest registers.  Don't clobber flags. */
 #ifdef CONFIG_X86_64
-               "mov %c[cr2](%3), %%rax \n\t"
+               "mov %c[cr2](%0), %%rax \n\t"
                "mov %%rax, %%cr2 \n\t"
-               "mov %c[rax](%3), %%rax \n\t"
-               "mov %c[rbx](%3), %%rbx \n\t"
-               "mov %c[rdx](%3), %%rdx \n\t"
-               "mov %c[rsi](%3), %%rsi \n\t"
-               "mov %c[rdi](%3), %%rdi \n\t"
-               "mov %c[rbp](%3), %%rbp \n\t"
-               "mov %c[r8](%3),  %%r8  \n\t"
-               "mov %c[r9](%3),  %%r9  \n\t"
-               "mov %c[r10](%3), %%r10 \n\t"
-               "mov %c[r11](%3), %%r11 \n\t"
-               "mov %c[r12](%3), %%r12 \n\t"
-               "mov %c[r13](%3), %%r13 \n\t"
-               "mov %c[r14](%3), %%r14 \n\t"
-               "mov %c[r15](%3), %%r15 \n\t"
-               "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
+               "mov %c[rax](%0), %%rax \n\t"
+               "mov %c[rbx](%0), %%rbx \n\t"
+               "mov %c[rdx](%0), %%rdx \n\t"
+               "mov %c[rsi](%0), %%rsi \n\t"
+               "mov %c[rdi](%0), %%rdi \n\t"
+               "mov %c[rbp](%0), %%rbp \n\t"
+               "mov %c[r8](%0),  %%r8  \n\t"
+               "mov %c[r9](%0),  %%r9  \n\t"
+               "mov %c[r10](%0), %%r10 \n\t"
+               "mov %c[r11](%0), %%r11 \n\t"
+               "mov %c[r12](%0), %%r12 \n\t"
+               "mov %c[r13](%0), %%r13 \n\t"
+               "mov %c[r14](%0), %%r14 \n\t"
+               "mov %c[r15](%0), %%r15 \n\t"
+               "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
 #else
-               "mov %c[cr2](%3), %%eax \n\t"
+               "mov %c[cr2](%0), %%eax \n\t"
                "mov %%eax,   %%cr2 \n\t"
-               "mov %c[rax](%3), %%eax \n\t"
-               "mov %c[rbx](%3), %%ebx \n\t"
-               "mov %c[rdx](%3), %%edx \n\t"
-               "mov %c[rsi](%3), %%esi \n\t"
-               "mov %c[rdi](%3), %%edi \n\t"
-               "mov %c[rbp](%3), %%ebp \n\t"
-               "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
+               "mov %c[rax](%0), %%eax \n\t"
+               "mov %c[rbx](%0), %%ebx \n\t"
+               "mov %c[rdx](%0), %%edx \n\t"
+               "mov %c[rsi](%0), %%esi \n\t"
+               "mov %c[rdi](%0), %%edi \n\t"
+               "mov %c[rbp](%0), %%ebp \n\t"
+               "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
 #endif
                /* Enter guest mode */
                "jne .Llaunched \n\t"
@@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                ".Lkvm_vmx_return: "
                /* Save guest registers, load host registers, keep flags */
 #ifdef CONFIG_X86_64
-               "xchg %3,     (%%rsp) \n\t"
-               "mov %%rax, %c[rax](%3) \n\t"
-               "mov %%rbx, %c[rbx](%3) \n\t"
-               "pushq (%%rsp); popq %c[rcx](%3) \n\t"
-               "mov %%rdx, %c[rdx](%3) \n\t"
-               "mov %%rsi, %c[rsi](%3) \n\t"
-               "mov %%rdi, %c[rdi](%3) \n\t"
-               "mov %%rbp, %c[rbp](%3) \n\t"
-               "mov %%r8,  %c[r8](%3) \n\t"
-               "mov %%r9,  %c[r9](%3) \n\t"
-               "mov %%r10, %c[r10](%3) \n\t"
-               "mov %%r11, %c[r11](%3) \n\t"
-               "mov %%r12, %c[r12](%3) \n\t"
-               "mov %%r13, %c[r13](%3) \n\t"
-               "mov %%r14, %c[r14](%3) \n\t"
-               "mov %%r15, %c[r15](%3) \n\t"
+               "xchg %0,     (%%rsp) \n\t"
+               "mov %%rax, %c[rax](%0) \n\t"
+               "mov %%rbx, %c[rbx](%0) \n\t"
+               "pushq (%%rsp); popq %c[rcx](%0) \n\t"
+               "mov %%rdx, %c[rdx](%0) \n\t"
+               "mov %%rsi, %c[rsi](%0) \n\t"
+               "mov %%rdi, %c[rdi](%0) \n\t"
+               "mov %%rbp, %c[rbp](%0) \n\t"
+               "mov %%r8,  %c[r8](%0) \n\t"
+               "mov %%r9,  %c[r9](%0) \n\t"
+               "mov %%r10, %c[r10](%0) \n\t"
+               "mov %%r11, %c[r11](%0) \n\t"
+               "mov %%r12, %c[r12](%0) \n\t"
+               "mov %%r13, %c[r13](%0) \n\t"
+               "mov %%r14, %c[r14](%0) \n\t"
+               "mov %%r15, %c[r15](%0) \n\t"
                "mov %%cr2, %%rax   \n\t"
-               "mov %%rax, %c[cr2](%3) \n\t"
-               "mov (%%rsp), %3 \n\t"
+               "mov %%rax, %c[cr2](%0) \n\t"
 
-               "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
-               "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
-               "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
-               "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
+               "pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
 #else
-               "xchg %3, (%%esp) \n\t"
-               "mov %%eax, %c[rax](%3) \n\t"
-               "mov %%ebx, %c[rbx](%3) \n\t"
-               "pushl (%%esp); popl %c[rcx](%3) \n\t"
-               "mov %%edx, %c[rdx](%3) \n\t"
-               "mov %%esi, %c[rsi](%3) \n\t"
-               "mov %%edi, %c[rdi](%3) \n\t"
-               "mov %%ebp, %c[rbp](%3) \n\t"
+               "xchg %0, (%%esp) \n\t"
+               "mov %%eax, %c[rax](%0) \n\t"
+               "mov %%ebx, %c[rbx](%0) \n\t"
+               "pushl (%%esp); popl %c[rcx](%0) \n\t"
+               "mov %%edx, %c[rdx](%0) \n\t"
+               "mov %%esi, %c[rsi](%0) \n\t"
+               "mov %%edi, %c[rdi](%0) \n\t"
+               "mov %%ebp, %c[rbp](%0) \n\t"
                "mov %%cr2, %%eax  \n\t"
-               "mov %%eax, %c[cr2](%3) \n\t"
-               "mov (%%esp), %3 \n\t"
+               "mov %%eax, %c[cr2](%0) \n\t"
 
-               "pop %%ecx; popa \n\t"
+               "pop %%ebp; pop %%ebp; pop %%edx \n\t"
+#endif
+               "setbe %c[fail](%0) \n\t"
+             : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+               [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+               [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+               [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+               [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+               [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+               [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+               [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+               [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+               [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
+#ifdef CONFIG_X86_64
+               [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+               [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+               [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+               [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+               [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+               [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+               [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+               [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
 #endif
-               "setbe %0 \n\t"
-             : "=q" (vmx->fail)
-             : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
-               "c"(vcpu),
-               [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
-               [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
-               [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
-               [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
-               [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
-               [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
-               [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
+               [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+             : "cc", "memory"
 #ifdef CONFIG_X86_64
-               [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
-               [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
-               [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
-               [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
-               [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
-               [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
-               [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
-               [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
+               , "rbx", "rdi", "rsi"
+               , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+               , "ebx", "edi", "esi"
 #endif
-               [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
-             : "cc", "memory" );
+             );
+
+       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       if (vmx->rmode.irq.pending)
+               fixup_rmode_irq(vmx);
 
-       vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+       vcpu->arch.interrupt_window_open =
+               (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
 
-       asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+       asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
        vmx->launched = 1;
 
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                asm("int $2");
 }
 
-static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
-                                 unsigned long addr,
-                                 u32 err_code)
-{
-       u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
-       ++vcpu->stat.pf_guest;
-
-       if (is_page_fault(vect_info)) {
-               printk(KERN_DEBUG "inject_page_fault: "
-                      "double fault 0x%lx @ 0x%lx\n",
-                      addr, vmcs_readl(GUEST_RIP));
-               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                            DF_VECTOR |
-                            INTR_TYPE_EXCEPTION |
-                            INTR_INFO_DELIEVER_CODE_MASK |
-                            INTR_INFO_VALID_MASK);
-               return;
-       }
-       vcpu->cr2 = addr;
-       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                    PF_VECTOR |
-                    INTR_TYPE_EXCEPTION |
-                    INTR_INFO_DELIEVER_CODE_MASK |
-                    INTR_INFO_VALID_MASK);
-
-}
-
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        if (err)
                goto free_vcpu;
 
-       if (irqchip_in_kernel(kvm)) {
-               err = kvm_create_lapic(&vmx->vcpu);
-               if (err < 0)
-                       goto free_vcpu;
-       }
-
        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!vmx->guest_msrs) {
                err = -ENOMEM;
@@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .check_processor_compatibility = vmx_check_processor_compat,
        .hardware_enable = hardware_enable,
        .hardware_disable = hardware_disable,
+       .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
 
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
@@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .set_rflags = vmx_set_rflags,
 
        .tlb_flush = vmx_flush_tlb,
-       .inject_page_fault = vmx_inject_page_fault,
-
-       .inject_gp = vmx_inject_gp,
 
        .run = vmx_vcpu_run,
        .handle_exit = kvm_handle_exit,
@@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .patch_hypercall = vmx_patch_hypercall,
        .get_irq = vmx_get_irq,
        .set_irq = vmx_inject_irq,
+       .queue_exception = vmx_queue_exception,
+       .exception_injected = vmx_exception_injected,
        .inject_pending_irq = vmx_intr_assist,
        .inject_pending_vectors = do_interrupt_requests,
+
+       .set_tss_addr = vmx_set_tss_addr,
 };
 
 static int __init vmx_init(void)
@@ -2541,10 +2651,13 @@ static int __init vmx_init(void)
        memset(iova, 0xff, PAGE_SIZE);
        kunmap(vmx_io_bitmap_b);
 
-       r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
        if (r)
                goto out1;
 
+       if (bypass_guest_pf)
+               kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
+
        return 0;
 
 out1:
@@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void)
        __free_page(vmx_io_bitmap_b);
        __free_page(vmx_io_bitmap_a);
 
-       kvm_exit_x86();
+       kvm_exit();
 }
 
 module_init(vmx_init)
similarity index 96%
rename from drivers/kvm/vmx.h
rename to arch/x86/kvm/vmx.h
index fd4e14666088098f7af7d337bc5bc66bbd5e177c..d52ae8d7303de2586351d80662fd385366331ec5 100644 (file)
@@ -25,6 +25,9 @@
  *
  */
 
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
 #define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
 #define CPU_BASED_USE_TSC_OFFSETING             0x00000008
 #define CPU_BASED_HLT_EXITING                   0x00000080
 #define CPU_BASED_MONITOR_EXITING               0x20000000
 #define CPU_BASED_PAUSE_EXITING                 0x40000000
 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
+
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
@@ -54,8 +63,6 @@
 #define VM_ENTRY_SMM                            0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
 
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-
 /* VMCS Encodings */
 enum vmcs_field {
        GUEST_ES_SELECTOR               = 0x00000800,
@@ -89,6 +96,8 @@ enum vmcs_field {
        TSC_OFFSET_HIGH                 = 0x00002011,
        VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
        VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
+       APIC_ACCESS_ADDR                = 0x00002014,
+       APIC_ACCESS_ADDR_HIGH           = 0x00002015,
        VMCS_LINK_POINTER               = 0x00002800,
        VMCS_LINK_POINTER_HIGH          = 0x00002801,
        GUEST_IA32_DEBUGCTL             = 0x00002802,
@@ -214,6 +223,8 @@ enum vmcs_field {
 #define EXIT_REASON_MSR_WRITE           32
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_WBINVD             54
 
 /*
  * Interruption-information format
@@ -230,13 +241,14 @@ enum vmcs_field {
 
 #define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
 #define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
 
 /*
  * Exit Qualifications for MOV for Control Register Access
  */
-#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control register */
+#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control reg.*/
 #define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
-#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose register */
+#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose reg. */
 #define LMSW_SOURCE_DATA_SHIFT 16
 #define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
 #define REG_EAX                         (0 << 8)
@@ -259,11 +271,11 @@ enum vmcs_field {
 /*
  * Exit Qualifications for MOV for Debug Register Access
  */
-#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug register */
+#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug reg. */
 #define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
 #define TYPE_MOV_TO_DR                  (0 << 4)
 #define TYPE_MOV_FROM_DR                (1 << 4)
-#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose register */
+#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose reg. */
 
 
 /* segment AR */
@@ -307,4 +319,6 @@ enum vmcs_field {
 #define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
 
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       9
+
 #endif
similarity index 52%
rename from drivers/kvm/kvm_main.c
rename to arch/x86/kvm/x86.c
index c0f372f1d761312bc648d9deccd8e74609fbc513..8f94a0b89dffd51e5c5387be1b0bb6b1653762bc 100644 (file)
@@ -1,8 +1,7 @@
 /*
  * Kernel-based Virtual Machine driver for Linux
  *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
+ * derived from drivers/kvm/kvm_main.c
  *
  * Copyright (C) 2006 Qumranet, Inc.
  *
  *
  */
 
-#include "kvm.h"
-#include "x86_emulate.h"
+#include <linux/kvm_host.h>
 #include "segment_descriptor.h"
 #include "irq.h"
+#include "mmu.h"
 
 #include <linux/kvm.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/percpu.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
+#include <linux/fs.h>
 #include <linux/vmalloc.h>
-#include <linux/reboot.h>
-#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/mman.h>
 #include <linux/highmem.h>
-#include <linux/file.h>
-#include <linux/sysdev.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/smp.h>
-#include <linux/anon_inodes.h>
-#include <linux/profile.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static DEFINE_SPINLOCK(kvm_lock);
-static LIST_HEAD(vm_list);
-
-static cpumask_t cpus_hardware_enabled;
-
-struct kvm_x86_ops *kvm_x86_ops;
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
-
-static __read_mostly struct preempt_ops kvm_preempt_ops;
-
-#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
-
-static struct kvm_stats_debugfs_item {
-       const char *name;
-       int offset;
-       struct dentry *dentry;
-} debugfs_entries[] = {
-       { "pf_fixed", STAT_OFFSET(pf_fixed) },
-       { "pf_guest", STAT_OFFSET(pf_guest) },
-       { "tlb_flush", STAT_OFFSET(tlb_flush) },
-       { "invlpg", STAT_OFFSET(invlpg) },
-       { "exits", STAT_OFFSET(exits) },
-       { "io_exits", STAT_OFFSET(io_exits) },
-       { "mmio_exits", STAT_OFFSET(mmio_exits) },
-       { "signal_exits", STAT_OFFSET(signal_exits) },
-       { "irq_window", STAT_OFFSET(irq_window_exits) },
-       { "halt_exits", STAT_OFFSET(halt_exits) },
-       { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
-       { "request_irq", STAT_OFFSET(request_irq_exits) },
-       { "irq_exits", STAT_OFFSET(irq_exits) },
-       { "light_exits", STAT_OFFSET(light_exits) },
-       { "efer_reload", STAT_OFFSET(efer_reload) },
-       { NULL }
-};
 
-static struct dentry *debugfs_dir;
+#include <asm/uaccess.h>
+#include <asm/msr.h>
 
 #define MAX_IO_MSRS 256
-
 #define CR0_RESERVED_BITS                                              \
        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -102,317 +43,151 @@ static struct dentry *debugfs_dir;
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
 
-#ifdef CONFIG_X86_64
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct segment_descriptor_64 {
-       struct segment_descriptor s;
-       u32 base_higher;
-       u32 pad_zero;
-};
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
-#endif
+struct kvm_x86_ops *kvm_x86_ops;
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+       { "pf_fixed", VCPU_STAT(pf_fixed) },
+       { "pf_guest", VCPU_STAT(pf_guest) },
+       { "tlb_flush", VCPU_STAT(tlb_flush) },
+       { "invlpg", VCPU_STAT(invlpg) },
+       { "exits", VCPU_STAT(exits) },
+       { "io_exits", VCPU_STAT(io_exits) },
+       { "mmio_exits", VCPU_STAT(mmio_exits) },
+       { "signal_exits", VCPU_STAT(signal_exits) },
+       { "irq_window", VCPU_STAT(irq_window_exits) },
+       { "halt_exits", VCPU_STAT(halt_exits) },
+       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+       { "request_irq", VCPU_STAT(request_irq_exits) },
+       { "irq_exits", VCPU_STAT(irq_exits) },
+       { "host_state_reload", VCPU_STAT(host_state_reload) },
+       { "efer_reload", VCPU_STAT(efer_reload) },
+       { "fpu_reload", VCPU_STAT(fpu_reload) },
+       { "insn_emulation", VCPU_STAT(insn_emulation) },
+       { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+       { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+       { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+       { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+       { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+       { "mmu_flooded", VM_STAT(mmu_flooded) },
+       { "mmu_recycled", VM_STAT(mmu_recycled) },
+       { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+       { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+       { NULL }
+};
 
-static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
-                          unsigned long arg);
 
 unsigned long segment_base(u16 selector)
 {
        struct descriptor_table gdt;
        struct segment_descriptor *d;
        unsigned long table_base;
-       typedef unsigned long ul;
        unsigned long v;
 
        if (selector == 0)
                return 0;
 
-       asm ("sgdt %0" : "=m"(gdt));
+       asm("sgdt %0" : "=m"(gdt));
        table_base = gdt.base;
 
        if (selector & 4) {           /* from ldt */
                u16 ldt_selector;
 
-               asm ("sldt %0" : "=g"(ldt_selector));
+               asm("sldt %0" : "=g"(ldt_selector));
                table_base = segment_base(ldt_selector);
        }
        d = (struct segment_descriptor *)(table_base + (selector & ~7));
-       v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
+       v = d->base_low | ((unsigned long)d->base_mid << 16) |
+               ((unsigned long)d->base_high << 24);
 #ifdef CONFIG_X86_64
-       if (d->system == 0
-           && (d->type == 2 || d->type == 9 || d->type == 11))
-               v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
+       if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+               v |= ((unsigned long) \
+                     ((struct segment_descriptor_64 *)d)->base_higher) << 32;
 #endif
        return v;
 }
 EXPORT_SYMBOL_GPL(segment_base);
 
-static inline int valid_vcpu(int n)
-{
-       return likely(n >= 0 && n < KVM_MAX_VCPUS);
-}
-
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
-               return;
-
-       vcpu->guest_fpu_loaded = 1;
-       fx_save(&vcpu->host_fx_image);
-       fx_restore(&vcpu->guest_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->guest_fpu_loaded)
-               return;
-
-       vcpu->guest_fpu_loaded = 0;
-       fx_save(&vcpu->guest_fx_image);
-       fx_restore(&vcpu->host_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put()
- */
-static void vcpu_load(struct kvm_vcpu *vcpu)
-{
-       int cpu;
-
-       mutex_lock(&vcpu->mutex);
-       cpu = get_cpu();
-       preempt_notifier_register(&vcpu->preempt_notifier);
-       kvm_x86_ops->vcpu_load(vcpu, cpu);
-       put_cpu();
-}
-
-static void vcpu_put(struct kvm_vcpu *vcpu)
-{
-       preempt_disable();
-       kvm_x86_ops->vcpu_put(vcpu);
-       preempt_notifier_unregister(&vcpu->preempt_notifier);
-       preempt_enable();
-       mutex_unlock(&vcpu->mutex);
-}
-
-static void ack_flush(void *_completed)
-{
-}
-
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
-       int i, cpu;
-       cpumask_t cpus;
-       struct kvm_vcpu *vcpu;
-
-       cpus_clear(cpus);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               vcpu = kvm->vcpus[i];
-               if (!vcpu)
-                       continue;
-               if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
-                       continue;
-               cpu = vcpu->cpu;
-               if (cpu != -1 && cpu != raw_smp_processor_id())
-                       cpu_set(cpu, cpus);
-       }
-       smp_call_function_mask(cpus, ack_flush, NULL, 1);
-}
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
-       struct page *page;
-       int r;
-
-       mutex_init(&vcpu->mutex);
-       vcpu->cpu = -1;
-       vcpu->mmu.root_hpa = INVALID_PAGE;
-       vcpu->kvm = kvm;
-       vcpu->vcpu_id = id;
-       if (!irqchip_in_kernel(kvm) || id == 0)
-               vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+       if (irqchip_in_kernel(vcpu->kvm))
+               return vcpu->arch.apic_base;
        else
-               vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
-       init_waitqueue_head(&vcpu->wq);
-
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-       if (!page) {
-               r = -ENOMEM;
-               goto fail;
-       }
-       vcpu->run = page_address(page);
-
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-       if (!page) {
-               r = -ENOMEM;
-               goto fail_free_run;
-       }
-       vcpu->pio_data = page_address(page);
-
-       r = kvm_mmu_create(vcpu);
-       if (r < 0)
-               goto fail_free_pio_data;
-
-       return 0;
-
-fail_free_pio_data:
-       free_page((unsigned long)vcpu->pio_data);
-fail_free_run:
-       free_page((unsigned long)vcpu->run);
-fail:
-       return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
-
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
-       kvm_mmu_destroy(vcpu);
-       if (vcpu->apic)
-               hrtimer_cancel(&vcpu->apic->timer.dev);
-       kvm_free_apic(vcpu->apic);
-       free_page((unsigned long)vcpu->pio_data);
-       free_page((unsigned long)vcpu->run);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
-
-static struct kvm *kvm_create_vm(void)
-{
-       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-       if (!kvm)
-               return ERR_PTR(-ENOMEM);
-
-       kvm_io_bus_init(&kvm->pio_bus);
-       mutex_init(&kvm->lock);
-       INIT_LIST_HEAD(&kvm->active_mmu_pages);
-       kvm_io_bus_init(&kvm->mmio_bus);
-       spin_lock(&kvm_lock);
-       list_add(&kvm->vm_list, &vm_list);
-       spin_unlock(&kvm_lock);
-       return kvm;
-}
-
-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
-                                 struct kvm_memory_slot *dont)
-{
-       int i;
-
-       if (!dont || free->phys_mem != dont->phys_mem)
-               if (free->phys_mem) {
-                       for (i = 0; i < free->npages; ++i)
-                               if (free->phys_mem[i])
-                                       __free_page(free->phys_mem[i]);
-                       vfree(free->phys_mem);
-               }
-
-       if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-               vfree(free->dirty_bitmap);
-
-       free->phys_mem = NULL;
-       free->npages = 0;
-       free->dirty_bitmap = NULL;
-}
-
-static void kvm_free_physmem(struct kvm *kvm)
-{
-       int i;
-
-       for (i = 0; i < kvm->nmemslots; ++i)
-               kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+               return vcpu->arch.apic_base;
 }
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 {
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
-               if (vcpu->pio.guest_pages[i]) {
-                       __free_page(vcpu->pio.guest_pages[i]);
-                       vcpu->pio.guest_pages[i] = NULL;
-               }
+       /* TODO: reserve bits check */
+       if (irqchip_in_kernel(vcpu->kvm))
+               kvm_lapic_set_base(vcpu, data);
+       else
+               vcpu->arch.apic_base = data;
 }
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-       vcpu_load(vcpu);
-       kvm_mmu_unload(vcpu);
-       vcpu_put(vcpu);
+       WARN_ON(vcpu->arch.exception.pending);
+       vcpu->arch.exception.pending = true;
+       vcpu->arch.exception.has_error_code = false;
+       vcpu->arch.exception.nr = nr;
 }
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
-static void kvm_free_vcpus(struct kvm *kvm)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+                          u32 error_code)
 {
-       unsigned int i;
-
-       /*
-        * Unpin any mmu pages first.
-        */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
+       ++vcpu->stat.pf_guest;
+       if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
+               printk(KERN_DEBUG "kvm: inject_page_fault:"
+                      " double fault 0x%lx\n", addr);
+               vcpu->arch.exception.nr = DF_VECTOR;
+               vcpu->arch.exception.error_code = 0;
+               return;
        }
-
-}
-
-static void kvm_destroy_vm(struct kvm *kvm)
-{
-       spin_lock(&kvm_lock);
-       list_del(&kvm->vm_list);
-       spin_unlock(&kvm_lock);
-       kvm_io_bus_destroy(&kvm->pio_bus);
-       kvm_io_bus_destroy(&kvm->mmio_bus);
-       kfree(kvm->vpic);
-       kfree(kvm->vioapic);
-       kvm_free_vcpus(kvm);
-       kvm_free_physmem(kvm);
-       kfree(kvm);
+       vcpu->arch.cr2 = addr;
+       kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
-static int kvm_vm_release(struct inode *inode, struct file *filp)
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-       struct kvm *kvm = filp->private_data;
-
-       kvm_destroy_vm(kvm);
-       return 0;
+       WARN_ON(vcpu->arch.exception.pending);
+       vcpu->arch.exception.pending = true;
+       vcpu->arch.exception.has_error_code = true;
+       vcpu->arch.exception.nr = nr;
+       vcpu->arch.exception.error_code = error_code;
 }
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
-static void inject_gp(struct kvm_vcpu *vcpu)
+static void __queue_exception(struct kvm_vcpu *vcpu)
 {
-       kvm_x86_ops->inject_gp(vcpu, 0);
+       kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+                                    vcpu->arch.exception.has_error_code,
+                                    vcpu->arch.exception.error_code);
 }
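The kvm_inject_gp() calls used throughout the rewritten control-register paths below rest on the queueing helpers above; a minimal sketch of such a wrapper (the actual definition lives in a header elsewhere in this series), assuming GP_VECTOR is the #GP vector number:

/* Sketch only: inject a #GP with the given error code. */
static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
{
	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}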
 
 /*
  * Load the pae pdptrs.  Return true if they are all valid.
  */
-static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
        int i;
-       u64 *pdpt;
        int ret;
-       struct page *page;
-       u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
+       u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-       mutex_lock(&vcpu->kvm->lock);
-       page = gfn_to_page(vcpu->kvm, pdpt_gfn);
-       if (!page) {
+       down_read(&current->mm->mmap_sem);
+       ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+                                 offset * sizeof(u64), sizeof(pdpte));
+       if (ret < 0) {
                ret = 0;
                goto out;
        }
-
-       pdpt = kmap_atomic(page, KM_USER0);
-       memcpy(pdpte, pdpt+offset, sizeof(pdpte));
-       kunmap_atomic(pdpt, KM_USER0);
-
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
                if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
                        ret = 0;
@@ -421,78 +196,96 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
        }
        ret = 1;
 
-       memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
+       memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-       mutex_unlock(&vcpu->kvm->lock);
+       up_read(&current->mm->mmap_sem);
 
        return ret;
 }
 
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+       u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+       bool changed = true;
+       int r;
+
+       if (is_long_mode(vcpu) || !is_pae(vcpu))
+               return false;
+
+       down_read(&current->mm->mmap_sem);
+       r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+       if (r < 0)
+               goto out;
+       changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+out:
+       up_read(&current->mm->mmap_sem);
+
+       return changed;
+}
+
 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        if (cr0 & CR0_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
-                      cr0, vcpu->cr0);
-               inject_gp(vcpu);
+                      cr0, vcpu->arch.cr0);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
                printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
                printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
                       "and a clear PE flag\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 #ifdef CONFIG_X86_64
-               if ((vcpu->shadow_efer & EFER_LME)) {
+               if ((vcpu->arch.shadow_efer & EFER_LME)) {
                        int cs_db, cs_l;
 
                        if (!is_pae(vcpu)) {
                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
                                       "in long mode while PAE is disabled\n");
-                               inject_gp(vcpu);
+                               kvm_inject_gp(vcpu, 0);
                                return;
                        }
                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
                        if (cs_l) {
                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
                                       "in long mode while CS.L == 1\n");
-                               inject_gp(vcpu);
+                               kvm_inject_gp(vcpu, 0);
                                return;
 
                        }
                } else
 #endif
-               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
+               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
                        printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
                               "reserved bits\n");
-                       inject_gp(vcpu);
+                       kvm_inject_gp(vcpu, 0);
                        return;
                }
 
        }
 
        kvm_x86_ops->set_cr0(vcpu, cr0);
-       vcpu->cr0 = cr0;
+       vcpu->arch.cr0 = cr0;
 
-       mutex_lock(&vcpu->kvm->lock);
        kvm_mmu_reset_context(vcpu);
-       mutex_unlock(&vcpu->kvm->lock);
        return;
 }
 EXPORT_SYMBOL_GPL(set_cr0);
 
 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-       set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
+       set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(lmsw);
 
@@ -500,7 +293,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        if (cr4 & CR4_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
@@ -508,35 +301,38 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                if (!(cr4 & X86_CR4_PAE)) {
                        printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
                               "in long mode\n");
-                       inject_gp(vcpu);
+                       kvm_inject_gp(vcpu, 0);
                        return;
                }
        } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
-                  && !load_pdptrs(vcpu, vcpu->cr3)) {
+                  && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
                printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if (cr4 & X86_CR4_VMXE) {
                printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
        kvm_x86_ops->set_cr4(vcpu, cr4);
-       vcpu->cr4 = cr4;
-       mutex_lock(&vcpu->kvm->lock);
+       vcpu->arch.cr4 = cr4;
        kvm_mmu_reset_context(vcpu);
-       mutex_unlock(&vcpu->kvm->lock);
 }
 EXPORT_SYMBOL_GPL(set_cr4);
 
 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+       if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+               kvm_mmu_flush_tlb(vcpu);
+               return;
+       }
+
        if (is_long_mode(vcpu)) {
                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
                        printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
-                       inject_gp(vcpu);
+                       kvm_inject_gp(vcpu, 0);
                        return;
                }
        } else {
@@ -544,26 +340,23 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                        if (cr3 & CR3_PAE_RESERVED_BITS) {
                                printk(KERN_DEBUG
                                       "set_cr3: #GP, reserved bits\n");
-                               inject_gp(vcpu);
+                               kvm_inject_gp(vcpu, 0);
                                return;
                        }
                        if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
                                printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
                                       "reserved bits\n");
-                               inject_gp(vcpu);
-                               return;
-                       }
-               } else {
-                       if (cr3 & CR3_NONPAE_RESERVED_BITS) {
-                               printk(KERN_DEBUG
-                                      "set_cr3: #GP, reserved bits\n");
-                               inject_gp(vcpu);
+                               kvm_inject_gp(vcpu, 0);
                                return;
                        }
                }
+               /*
+                * We don't check reserved bits in nonpae mode, because
+                * this isn't enforced, and VMware depends on this.
+                */
        }
 
-       mutex_lock(&vcpu->kvm->lock);
+       down_read(&current->mm->mmap_sem);
        /*
         * Does the new cr3 value map to physical memory? (Note, we
         * catch an invalid cr3 even in real-mode, because it would
@@ -574,12 +367,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
         * to debug) behavior on the guest side.
         */
        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
        else {
-               vcpu->cr3 = cr3;
-               vcpu->mmu.new_cr3(vcpu);
+               vcpu->arch.cr3 = cr3;
+               vcpu->arch.mmu.new_cr3(vcpu);
        }
-       mutex_unlock(&vcpu->kvm->lock);
+       up_read(&current->mm->mmap_sem);
 }
 EXPORT_SYMBOL_GPL(set_cr3);
 
@@ -587,13 +380,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
        if (cr8 & CR8_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
        if (irqchip_in_kernel(vcpu->kvm))
                kvm_lapic_set_tpr(vcpu, cr8);
        else
-               vcpu->cr8 = cr8;
+               vcpu->arch.cr8 = cr8;
 }
 EXPORT_SYMBOL_GPL(set_cr8);
 
@@ -602,1157 +395,1589 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu)
        if (irqchip_in_kernel(vcpu->kvm))
                return kvm_lapic_get_cr8(vcpu);
        else
-               return vcpu->cr8;
+               return vcpu->arch.cr8;
 }
 EXPORT_SYMBOL_GPL(get_cr8);
 
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
-{
-       if (irqchip_in_kernel(vcpu->kvm))
-               return vcpu->apic_base;
-       else
-               return vcpu->apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
-       /* TODO: reserve bits check */
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_lapic_set_base(vcpu, data);
-       else
-               vcpu->apic_base = data;
-}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-
-void fx_init(struct kvm_vcpu *vcpu)
-{
-       unsigned after_mxcsr_mask;
-
-       /* Initialize guest FPU by resetting ours and saving into guest's */
-       preempt_disable();
-       fx_save(&vcpu->host_fx_image);
-       fpu_init();
-       fx_save(&vcpu->guest_fx_image);
-       fx_restore(&vcpu->host_fx_image);
-       preempt_enable();
-
-       vcpu->cr0 |= X86_CR0_ET;
-       after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-       vcpu->guest_fx_image.mxcsr = 0x1f80;
-       memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
-              0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
 /*
- * Allocate some memory and give it an address in the guest physical address
- * space.
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
  *
- * Discontiguous memory is allowed, mostly for framebuffers.
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu.
  */
-static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-                                         struct kvm_memory_region *mem)
-{
-       int r;
-       gfn_t base_gfn;
-       unsigned long npages;
-       unsigned long i;
-       struct kvm_memory_slot *memslot;
-       struct kvm_memory_slot old, new;
-
-       r = -EINVAL;
-       /* General sanity checks */
-       if (mem->memory_size & (PAGE_SIZE - 1))
-               goto out;
-       if (mem->guest_phys_addr & (PAGE_SIZE - 1))
-               goto out;
-       if (mem->slot >= KVM_MEMORY_SLOTS)
-               goto out;
-       if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
-               goto out;
-
-       memslot = &kvm->memslots[mem->slot];
-       base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
-       npages = mem->memory_size >> PAGE_SHIFT;
-
-       if (!npages)
-               mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+static u32 msrs_to_save[] = {
+       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+       MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+       MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+       MSR_IA32_TIME_STAMP_COUNTER,
+};
 
-       mutex_lock(&kvm->lock);
+static unsigned num_msrs_to_save;
 
-       new = old = *memslot;
+static u32 emulated_msrs[] = {
+       MSR_IA32_MISC_ENABLE,
+};
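As the comment above notes, msrs_to_save is trimmed at module load time to what the host can actually access. A hedged sketch of that filtering, assuming rdmsr_safe() probing; the function name here is illustrative only:

static void filter_host_msrs(void)
{
	u32 dummy[2];
	unsigned i, j;

	/* Keep only the MSRs the host CPU can read without faulting. */
	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
			continue;
		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
	}
	num_msrs_to_save = j;
}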
 
-       new.base_gfn = base_gfn;
-       new.npages = npages;
-       new.flags = mem->flags;
+#ifdef CONFIG_X86_64
 
-       /* Disallow changing a memory slot's size. */
-       r = -EINVAL;
-       if (npages && old.npages && npages != old.npages)
-               goto out_unlock;
-
-       /* Check for overlaps */
-       r = -EEXIST;
-       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-               struct kvm_memory_slot *s = &kvm->memslots[i];
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+       if (efer & EFER_RESERVED_BITS) {
+               printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
+                      efer);
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
 
-               if (s == memslot)
-                       continue;
-               if (!((base_gfn + npages <= s->base_gfn) ||
-                     (base_gfn >= s->base_gfn + s->npages)))
-                       goto out_unlock;
+       if (is_paging(vcpu)
+           && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+               printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+               kvm_inject_gp(vcpu, 0);
+               return;
        }
 
-       /* Deallocate if slot is being removed */
-       if (!npages)
-               new.phys_mem = NULL;
+       kvm_x86_ops->set_efer(vcpu, efer);
 
-       /* Free page dirty bitmap if unneeded */
-       if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
-               new.dirty_bitmap = NULL;
+       efer &= ~EFER_LMA;
+       efer |= vcpu->arch.shadow_efer & EFER_LMA;
 
-       r = -ENOMEM;
+       vcpu->arch.shadow_efer = efer;
+}
 
-       /* Allocate if a slot is being created */
-       if (npages && !new.phys_mem) {
-               new.phys_mem = vmalloc(npages * sizeof(struct page *));
+#endif
 
-               if (!new.phys_mem)
-                       goto out_unlock;
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+       return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+}
 
-               memset(new.phys_mem, 0, npages * sizeof(struct page *));
-               for (i = 0; i < npages; ++i) {
-                       new.phys_mem[i] = alloc_page(GFP_HIGHUSER
-                                                    | __GFP_ZERO);
-                       if (!new.phys_mem[i])
-                               goto out_unlock;
-                       set_page_private(new.phys_mem[i],0);
-               }
-       }
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+       return kvm_set_msr(vcpu, index, *data);
+}
 
-       /* Allocate page dirty bitmap if needed */
-       if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-               unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
 
-               new.dirty_bitmap = vmalloc(dirty_bytes);
-               if (!new.dirty_bitmap)
-                       goto out_unlock;
-               memset(new.dirty_bitmap, 0, dirty_bytes);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       switch (msr) {
+#ifdef CONFIG_X86_64
+       case MSR_EFER:
+               set_efer(vcpu, data);
+               break;
+#endif
+       case MSR_IA32_MC0_STATUS:
+               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
+                      __FUNCTION__, data);
+               break;
+       case MSR_IA32_MCG_STATUS:
+               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
+                       __FUNCTION__, data);
+               break;
+       case MSR_IA32_UCODE_REV:
+       case MSR_IA32_UCODE_WRITE:
+       case 0x200 ... 0x2ff: /* MTRRs */
+               break;
+       case MSR_IA32_APICBASE:
+               kvm_set_apic_base(vcpu, data);
+               break;
+       case MSR_IA32_MISC_ENABLE:
+               vcpu->arch.ia32_misc_enable_msr = data;
+               break;
+       default:
+               pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
+               return 1;
        }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 
-       if (mem->slot >= kvm->nmemslots)
-               kvm->nmemslots = mem->slot + 1;
-
-       *memslot = new;
 
-       kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-       kvm_flush_remote_tlbs(kvm);
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+       return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+}
 
-       mutex_unlock(&kvm->lock);
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+       u64 data;
 
-       kvm_free_physmem_slot(&old, &new);
+       switch (msr) {
+       case 0xc0010010: /* SYSCFG */
+       case 0xc0010015: /* HWCR */
+       case MSR_IA32_PLATFORM_ID:
+       case MSR_IA32_P5_MC_ADDR:
+       case MSR_IA32_P5_MC_TYPE:
+       case MSR_IA32_MC0_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MCG_CAP:
+       case MSR_IA32_MC0_MISC:
+       case MSR_IA32_MC0_MISC+4:
+       case MSR_IA32_MC0_MISC+8:
+       case MSR_IA32_MC0_MISC+12:
+       case MSR_IA32_MC0_MISC+16:
+       case MSR_IA32_UCODE_REV:
+       case MSR_IA32_PERF_STATUS:
+       case MSR_IA32_EBL_CR_POWERON:
+               /* MTRR registers */
+       case 0xfe:
+       case 0x200 ... 0x2ff:
+               data = 0;
+               break;
+       case 0xcd: /* fsb frequency */
+               data = 3;
+               break;
+       case MSR_IA32_APICBASE:
+               data = kvm_get_apic_base(vcpu);
+               break;
+       case MSR_IA32_MISC_ENABLE:
+               data = vcpu->arch.ia32_misc_enable_msr;
+               break;
+#ifdef CONFIG_X86_64
+       case MSR_EFER:
+               data = vcpu->arch.shadow_efer;
+               break;
+#endif
+       default:
+               pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+               return 1;
+       }
+       *pdata = data;
        return 0;
-
-out_unlock:
-       mutex_unlock(&kvm->lock);
-       kvm_free_physmem_slot(&new, &old);
-out:
-       return r;
 }
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
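kvm_set_msr_common()/kvm_get_msr_common() are intended as fallbacks for the vendor modules; a sketch, not part of this patch, of how an svm/vmx set_msr handler might defer to them (the function name is illustrative):

static int example_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	switch (msr) {
	/* vendor-specific MSRs would get their own cases here */
	default:
		/* everything else goes through the common code above */
		return kvm_set_msr_common(vcpu, msr, data);
	}
}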
 
 /*
- * Get (and clear) the dirty memory log for a memory slot.
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
  */
-static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-                                     struct kvm_dirty_log *log)
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+                   struct kvm_msr_entry *entries,
+                   int (*do_msr)(struct kvm_vcpu *vcpu,
+                                 unsigned index, u64 *data))
 {
-       struct kvm_memory_slot *memslot;
-       int r, i;
-       int n;
-       unsigned long any = 0;
-
-       mutex_lock(&kvm->lock);
-
-       r = -EINVAL;
-       if (log->slot >= KVM_MEMORY_SLOTS)
-               goto out;
-
-       memslot = &kvm->memslots[log->slot];
-       r = -ENOENT;
-       if (!memslot->dirty_bitmap)
-               goto out;
-
-       n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
-
-       for (i = 0; !any && i < n/sizeof(long); ++i)
-               any = memslot->dirty_bitmap[i];
+       int i;
 
-       r = -EFAULT;
-       if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
-               goto out;
+       vcpu_load(vcpu);
 
-       /* If nothing is dirty, don't bother messing with page tables. */
-       if (any) {
-               kvm_mmu_slot_remove_write_access(kvm, log->slot);
-               kvm_flush_remote_tlbs(kvm);
-               memset(memslot->dirty_bitmap, 0, n);
-       }
+       for (i = 0; i < msrs->nmsrs; ++i)
+               if (do_msr(vcpu, entries[i].index, &entries[i].data))
+                       break;
 
-       r = 0;
+       vcpu_put(vcpu);
 
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
+       return i;
 }
 
 /*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
  */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-                                        struct kvm_memory_alias *alias)
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+                 int (*do_msr)(struct kvm_vcpu *vcpu,
+                               unsigned index, u64 *data),
+                 int writeback)
 {
+       struct kvm_msrs msrs;
+       struct kvm_msr_entry *entries;
        int r, n;
-       struct kvm_mem_alias *p;
+       unsigned size;
 
-       r = -EINVAL;
-       /* General sanity checks */
-       if (alias->memory_size & (PAGE_SIZE - 1))
-               goto out;
-       if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-               goto out;
-       if (alias->slot >= KVM_ALIAS_SLOTS)
-               goto out;
-       if (alias->guest_phys_addr + alias->memory_size
-           < alias->guest_phys_addr)
-               goto out;
-       if (alias->target_phys_addr + alias->memory_size
-           < alias->target_phys_addr)
+       r = -EFAULT;
+       if (copy_from_user(&msrs, user_msrs, sizeof msrs))
                goto out;
 
-       mutex_lock(&kvm->lock);
+       r = -E2BIG;
+       if (msrs.nmsrs >= MAX_IO_MSRS)
+               goto out;
 
-       p = &kvm->aliases[alias->slot];
-       p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-       p->npages = alias->memory_size >> PAGE_SHIFT;
-       p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+       r = -ENOMEM;
+       size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+       entries = vmalloc(size);
+       if (!entries)
+               goto out;
 
-       for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-               if (kvm->aliases[n - 1].npages)
-                       break;
-       kvm->naliases = n;
+       r = -EFAULT;
+       if (copy_from_user(entries, user_msrs->entries, size))
+               goto out_free;
 
-       kvm_mmu_zap_all(kvm);
+       r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+       if (r < 0)
+               goto out_free;
 
-       mutex_unlock(&kvm->lock);
+       r = -EFAULT;
+       if (writeback && copy_to_user(user_msrs->entries, entries, size))
+               goto out_free;
 
-       return 0;
+       r = n;
 
+out_free:
+       vfree(entries);
 out:
        return r;
 }
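msr_io() serves both directions; a sketch of how the vcpu ioctl path can dispatch to it, with writeback requested only for reads so the results reach userspace (the wrapper name is illustrative):

static long example_msr_ioctl(struct kvm_vcpu *vcpu, unsigned int ioctl,
			      struct kvm_msrs __user *argp)
{
	switch (ioctl) {
	case KVM_GET_MSRS:
		return msr_io(vcpu, argp, kvm_get_msr, 1);   /* read, write back */
	case KVM_SET_MSRS:
		return msr_io(vcpu, argp, do_set_msr, 0);    /* write, no writeback */
	default:
		return -EINVAL;
	}
}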
 
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+/*
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+ * cached on it.
+ */
+void decache_vcpus_on_cpu(int cpu)
+{
+       struct kvm *vm;
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       spin_lock(&kvm_lock);
+       list_for_each_entry(vm, &vm_list, vm_list)
+               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       vcpu = vm->vcpus[i];
+                       if (!vcpu)
+                               continue;
+                       /*
+                        * If the vcpu is locked, then it is running on some
+                        * other cpu and therefore it is not cached on the
+                        * cpu in question.
+                        *
+                        * If it's not locked, check the last cpu it executed
+                        * on.
+                        */
+                       if (mutex_trylock(&vcpu->mutex)) {
+                               if (vcpu->cpu == cpu) {
+                                       kvm_x86_ops->vcpu_decache(vcpu);
+                                       vcpu->cpu = -1;
+                               }
+                               mutex_unlock(&vcpu->mutex);
+                       }
+               }
+       spin_unlock(&kvm_lock);
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
 {
        int r;
 
-       r = 0;
-       switch (chip->chip_id) {
-       case KVM_IRQCHIP_PIC_MASTER:
-               memcpy (&chip->chip.pic,
-                       &pic_irqchip(kvm)->pics[0],
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_PIC_SLAVE:
-               memcpy (&chip->chip.pic,
-                       &pic_irqchip(kvm)->pics[1],
-                       sizeof(struct kvm_pic_state));
+       switch (ext) {
+       case KVM_CAP_IRQCHIP:
+       case KVM_CAP_HLT:
+       case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+       case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_SET_TSS_ADDR:
+       case KVM_CAP_EXT_CPUID:
+               r = 1;
                break;
-       case KVM_IRQCHIP_IOAPIC:
-               memcpy (&chip->chip.ioapic,
-                       ioapic_irqchip(kvm),
-                       sizeof(struct kvm_ioapic_state));
+       case KVM_CAP_VAPIC:
+               r = !kvm_x86_ops->cpu_has_accelerated_tpr();
                break;
        default:
-               r = -EINVAL;
+               r = 0;
                break;
        }
        return r;
+
 }
 
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+long kvm_arch_dev_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg)
 {
-       int r;
+       void __user *argp = (void __user *)arg;
+       long r;
 
-       r = 0;
-       switch (chip->chip_id) {
-       case KVM_IRQCHIP_PIC_MASTER:
-               memcpy (&pic_irqchip(kvm)->pics[0],
-                       &chip->chip.pic,
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_PIC_SLAVE:
-               memcpy (&pic_irqchip(kvm)->pics[1],
-                       &chip->chip.pic,
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_IOAPIC:
-               memcpy (ioapic_irqchip(kvm),
-                       &chip->chip.ioapic,
-                       sizeof(struct kvm_ioapic_state));
+       switch (ioctl) {
+       case KVM_GET_MSR_INDEX_LIST: {
+               struct kvm_msr_list __user *user_msr_list = argp;
+               struct kvm_msr_list msr_list;
+               unsigned n;
+
+               r = -EFAULT;
+               if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+                       goto out;
+               n = msr_list.nmsrs;
+               msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+               if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+                       goto out;
+               r = -E2BIG;
+               if (n < num_msrs_to_save)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+                                num_msrs_to_save * sizeof(u32)))
+                       goto out;
+               if (copy_to_user(user_msr_list->indices
+                                + num_msrs_to_save * sizeof(u32),
+                                &emulated_msrs,
+                                ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+                       goto out;
+               r = 0;
                break;
+       }
        default:
                r = -EINVAL;
-               break;
        }
-       kvm_pic_update_irq(pic_irqchip(kvm));
+out:
        return r;
 }
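Because the handler above copies the required count back to userspace before returning -E2BIG, callers can probe with nmsrs = 0 and retry with a buffer of the reported size. A hedged userspace sketch of that pattern (error handling trimmed, helper name illustrative):

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Sketch only: enumerate the MSR indices exposed by /dev/kvm. */
static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe = { .nmsrs = 0 };
	struct kvm_msr_list *list;

	/* First call fails with E2BIG but fills in the required nmsrs. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
	if (!list)
		return NULL;
	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;
}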
 
-static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-       int i;
-       struct kvm_mem_alias *alias;
+       kvm_x86_ops->vcpu_load(vcpu, cpu);
+}
 
-       for (i = 0; i < kvm->naliases; ++i) {
-               alias = &kvm->aliases[i];
-               if (gfn >= alias->base_gfn
-                   && gfn < alias->base_gfn + alias->npages)
-                       return alias->target_gfn + gfn - alias->base_gfn;
-       }
-       return gfn;
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->vcpu_put(vcpu);
+       kvm_put_guest_fpu(vcpu);
 }
 
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+static int is_efer_nx(void)
 {
-       int i;
+       u64 efer;
+
+       rdmsrl(MSR_EFER, efer);
+       return efer & EFER_NX;
+}
 
-       for (i = 0; i < kvm->nmemslots; ++i) {
-               struct kvm_memory_slot *memslot = &kvm->memslots[i];
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_cpuid_entry2 *e, *entry;
 
-               if (gfn >= memslot->base_gfn
-                   && gfn < memslot->base_gfn + memslot->npages)
-                       return memslot;
+       entry = NULL;
+       for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+               e = &vcpu->arch.cpuid_entries[i];
+               if (e->function == 0x80000001) {
+                       entry = e;
+                       break;
+               }
+       }
+       if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+               entry->edx &= ~(1 << 20);
+               printk(KERN_INFO "kvm: guest NX capability removed\n");
        }
-       return NULL;
 }
 
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+/* when an old userspace process (using the legacy struct kvm_cpuid) fills a new kernel module */
+static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid *cpuid,
+                                   struct kvm_cpuid_entry __user *entries)
 {
-       gfn = unalias_gfn(kvm, gfn);
-       return __gfn_to_memslot(kvm, gfn);
+       int r, i;
+       struct kvm_cpuid_entry *cpuid_entries;
+
+       r = -E2BIG;
+       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+               goto out;
+       r = -ENOMEM;
+       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+       if (!cpuid_entries)
+               goto out;
+       r = -EFAULT;
+       if (copy_from_user(cpuid_entries, entries,
+                          cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+               goto out_free;
+       for (i = 0; i < cpuid->nent; i++) {
+               vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+               vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+               vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+               vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+               vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+               vcpu->arch.cpuid_entries[i].index = 0;
+               vcpu->arch.cpuid_entries[i].flags = 0;
+               vcpu->arch.cpuid_entries[i].padding[0] = 0;
+               vcpu->arch.cpuid_entries[i].padding[1] = 0;
+               vcpu->arch.cpuid_entries[i].padding[2] = 0;
+       }
+       vcpu->arch.cpuid_nent = cpuid->nent;
+       cpuid_fix_nx_cap(vcpu);
+       r = 0;
+
+out_free:
+       vfree(cpuid_entries);
+out:
+       return r;
 }
 
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
 {
-       struct kvm_memory_slot *slot;
+       int r;
+
+       r = -E2BIG;
+       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+               goto out;
+       r = -EFAULT;
+       if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+                          cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out;
+       vcpu->arch.cpuid_nent = cpuid->nent;
+       return 0;
 
-       gfn = unalias_gfn(kvm, gfn);
-       slot = __gfn_to_memslot(kvm, gfn);
-       if (!slot)
-               return NULL;
-       return slot->phys_mem[gfn - slot->base_gfn];
+out:
+       return r;
 }
-EXPORT_SYMBOL_GPL(gfn_to_page);
 
-/* WARNING: Does not work on aliased pages. */
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
 {
-       struct kvm_memory_slot *memslot;
+       int r;
+
+       r = -E2BIG;
+       if (cpuid->nent < vcpu->arch.cpuid_nent)
+               goto out;
+       r = -EFAULT;
+       if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out;
+       return 0;
 
-       memslot = __gfn_to_memslot(kvm, gfn);
-       if (memslot && memslot->dirty_bitmap) {
-               unsigned long rel_gfn = gfn - memslot->base_gfn;
+out:
+       cpuid->nent = vcpu->arch.cpuid_nent;
+       return r;
+}
 
-               /* avoid RMW */
-               if (!test_bit(rel_gfn, memslot->dirty_bitmap))
-                       set_bit(rel_gfn, memslot->dirty_bitmap);
+static inline u32 bit(int bitno)
+{
+       return 1 << (bitno & 31);
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                         u32 index)
+{
+       entry->function = function;
+       entry->index = index;
+       cpuid_count(entry->function, entry->index,
+               &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+       entry->flags = 0;
+}
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                        u32 index, int *nent, int maxnent)
+{
+       const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
+               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+               bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
+               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+               bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
+               bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
+               bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
+       const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
+               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+               bit(X86_FEATURE_PGE) |
+               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+               bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
+               bit(X86_FEATURE_SYSCALL) |
+               (bit(X86_FEATURE_NX) && is_efer_nx()) |
+#ifdef CONFIG_X86_64
+               bit(X86_FEATURE_LM) |
+#endif
+               bit(X86_FEATURE_MMXEXT) |
+               bit(X86_FEATURE_3DNOWEXT) |
+               bit(X86_FEATURE_3DNOW);
+       const u32 kvm_supported_word3_x86_features =
+               bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+       const u32 kvm_supported_word6_x86_features =
+               bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+
+       /* all cpuid_count() calls for function 2 should be made on the same cpu */
+       get_cpu();
+       do_cpuid_1_ent(entry, function, index);
+       ++*nent;
+
+       switch (function) {
+       case 0:
+               entry->eax = min(entry->eax, (u32)0xb);
+               break;
+       case 1:
+               entry->edx &= kvm_supported_word0_x86_features;
+               entry->ecx &= kvm_supported_word3_x86_features;
+   &