Merge branch 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 4 Aug 2010 17:43:01 +0000 (10:43 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 4 Aug 2010 17:43:01 +0000 (10:43 -0700)
* 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (198 commits)
  KVM: VMX: Fix host GDT.LIMIT corruption
  KVM: MMU: using __xchg_spte more smarter
  KVM: MMU: cleanup spte set and accssed/dirty tracking
  KVM: MMU: don't atomicly set spte if it's not present
  KVM: MMU: fix page dirty tracking lost while sync page
  KVM: MMU: fix broken page accessed tracking with ept enabled
  KVM: MMU: add missing reserved bits check in speculative path
  KVM: MMU: fix mmu notifier invalidate handler for huge spte
  KVM: x86 emulator: fix xchg instruction emulation
  KVM: x86: Call mask notifiers from pic
  KVM: x86: never re-execute instruction with enabled tdp
  KVM: Document KVM_GET_SUPPORTED_CPUID2 ioctl
  KVM: x86: emulator: inc/dec can have lock prefix
  KVM: MMU: Eliminate redundant temporaries in FNAME(fetch)
  KVM: MMU: Validate all gptes during fetch, not just those used for new pages
  KVM: MMU: Simplify spte fetch() function
  KVM: MMU: Add gpte_valid() helper
  KVM: MMU: Add validate_direct_spte() helper
  KVM: MMU: Add drop_large_spte() helper
  KVM: MMU: Use __set_spte to link shadow pages
  ...

63 files changed:
Documentation/feature-removal-schedule.txt
Documentation/kvm/api.txt
Documentation/kvm/mmu.txt
Documentation/kvm/msr.txt [new file with mode: 0644]
Documentation/kvm/review-checklist.txt [new file with mode: 0644]
arch/ia64/include/asm/kvm_host.h
arch/ia64/kvm/kvm-ia64.c
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_fpu.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kernel/ppc_ksyms.c
arch/powerpc/kvm/44x_tlb.c
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_32_mmu.c
arch/powerpc/kvm/book3s_32_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_host.c
arch/powerpc/kvm/book3s_mmu_hpte.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_paired_singles.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/fpu.S
arch/powerpc/kvm/powerpc.c
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/intercept.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/x86/include/asm/i387.h
arch/x86/include/asm/kvm.h
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/vmx.h
arch/x86/include/asm/xsave.h
arch/x86/kernel/i387.c
arch/x86/kernel/process.c
arch/x86/kvm/emulate.c
arch/x86/kvm/i8254.c
arch/x86/kvm/i8254.h
arch/x86/kvm/i8259.c
arch/x86/kvm/irq.c
arch/x86/kvm/irq.h
arch/x86/kvm/kvm_cache_regs.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmutrace.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/timer.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm.h
include/linux/kvm_host.h
include/linux/kvm_types.h
include/linux/mm.h
mm/memory-failure.c
virt/kvm/assigned-dev.c
virt/kvm/coalesced_mmio.c
virt/kvm/eventfd.c
virt/kvm/ioapic.c
virt/kvm/iommu.c
virt/kvm/irq_comm.c
virt/kvm/kvm_main.c

index 79cb554761afb18fab2b2d0023b246a1324e46f4..b273d35039eda59555e43485a68ea282aaf377af 100644 (file)
@@ -487,17 +487,6 @@ Who:       Jan Kiszka <jan.kiszka@web.de>
 
 ----------------------------
 
-What:  KVM memory aliases support
-When:  July 2010
-Why:   Memory aliasing support is used for speeding up guest vga access
-       through the vga windows.
-
-       Modern userspace no longer uses this feature, so it's just bitrotted
-       code and can be removed with no impact.
-Who:   Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:  xtime, wall_to_monotonic
 When:  2.6.36+
 Files: kernel/time/timekeeping.c include/linux/time.h
@@ -508,16 +497,6 @@ Who:       John Stultz <johnstul@us.ibm.com>
 
 ----------------------------
 
-What:  KVM kernel-allocated memory slots
-When:  July 2010
-Why:   Since 2.6.25, kvm supports user-allocated memory slots, which are
-       much more flexible than kernel-allocated slots.  All current userspace
-       supports the newer interface and this code can be removed with no
-       impact.
-Who:   Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:  KVM paravirt mmu host support
 When:  January 2011
 Why:   The paravirt mmu host support is slower than non-paravirt mmu, both
index a237518e51b971e9beafab8209e96ffeff7f8242..5f5b64982b1a5c709e4a579c48d8700d522a0b92 100644 (file)
@@ -126,6 +126,10 @@ user fills in the size of the indices array in nmsrs, and in return
 kvm adjusts nmsrs to reflect the actual number of msrs and fills in
 the indices array with their numbers.
 
+Note: if kvm indicates support for MCE (KVM_CAP_MCE), then the MCE bank MSRs are
+not returned in the MSR list, as different vcpus can have a different number
+of banks, as set via the KVM_X86_SETUP_MCE ioctl.
+
 4.4 KVM_CHECK_EXTENSION
 
 Capability: basic
@@ -160,29 +164,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_region (in)
 Returns: 0 on success, -1 on error
 
-struct kvm_memory_region {
-       __u32 slot;
-       __u32 flags;
-       __u64 guest_phys_addr;
-       __u64 memory_size; /* bytes */
-};
-
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
-
-This ioctl allows the user to create or modify a guest physical memory
-slot.  When changing an existing slot, it may be moved in the guest
-physical memory space, or its flags may be modified.  It may not be
-resized.  Slots may not overlap.
-
-The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
-instructs kvm to keep track of writes to memory within the slot.  See
-the KVM_GET_DIRTY_LOG ioctl.
-
-It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead
-of this API, if available.  This newer API allows placing guest memory
-at specified locations in the host address space, yielding better
-control and easy access.
+This ioctl is obsolete and has been removed.
 
 4.6 KVM_CREATE_VCPU
 
@@ -226,17 +208,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_alias (in)
 Returns: 0 (success), -1 (error)
 
-struct kvm_memory_alias {
-       __u32 slot;  /* this has a different namespace than memory slots */
-       __u32 flags;
-       __u64 guest_phys_addr;
-       __u64 memory_size;
-       __u64 target_phys_addr;
-};
-
-Defines a guest physical address space region as an alias to another
-region.  Useful for aliased address, for example the VGA low memory
-window. Should not be used with userspace memory.
+This ioctl is obsolete and has been removed.
 
 4.9 KVM_RUN
 
@@ -892,6 +864,174 @@ arguments.
 This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 irqchip, the multiprocessing state must be maintained by userspace.
 
+4.39 KVM_SET_IDENTITY_MAP_ADDR
+
+Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR
+Architectures: x86
+Type: vm ioctl
+Parameters: unsigned long identity (in)
+Returns: 0 on success, -1 on error
+
+This ioctl defines the physical address of a one-page region in the guest
+physical address space.  The region must be within the first 4GB of the
+guest physical address space and must not conflict with any memory slot
+or any mmio address.  The guest may malfunction if it accesses this memory
+region.
+
+This ioctl is required on Intel-based hosts.  This is needed on Intel hardware
+because of a quirk in the virtualization implementation (see the internals
+documentation when it pops into existence).
+
+4.40 KVM_SET_BOOT_CPU_ID
+
+Capability: KVM_CAP_SET_BOOT_CPU_ID
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: unsigned long vcpu_id
+Returns: 0 on success, -1 on error
+
+Define which vcpu is the Bootstrap Processor (BSP).  Values are the same
+as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
+is vcpu 0.
+
+4.41 KVM_GET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+       __u32 region[1024];
+};
+
+This ioctl would copy current vcpu's xsave struct to the userspace.
+
+4.42 KVM_SET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+       __u32 region[1024];
+};
+
+This ioctl would copy userspace's xsave struct to the kernel.
+
+4.43 KVM_GET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+       __u32 xcr;
+       __u32 reserved;
+       __u64 value;
+};
+
+struct kvm_xcrs {
+       __u32 nr_xcrs;
+       __u32 flags;
+       struct kvm_xcr xcrs[KVM_MAX_XCRS];
+       __u64 padding[16];
+};
+
+This ioctl would copy current vcpu's xcrs to the userspace.
+
+4.44 KVM_SET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+       __u32 xcr;
+       __u32 reserved;
+       __u64 value;
+};
+
+struct kvm_xcrs {
+       __u32 nr_xcrs;
+       __u32 flags;
+       struct kvm_xcr xcrs[KVM_MAX_XCRS];
+       __u64 padding[16];
+};
+
+This ioctl would set vcpu's xcr to the value userspace specified.
+
+4.45 KVM_GET_SUPPORTED_CPUID
+
+Capability: KVM_CAP_EXT_CPUID
+Architectures: x86
+Type: system ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+       __u32 nent;
+       __u32 padding;
+       struct kvm_cpuid_entry2 entries[0];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+
+struct kvm_cpuid_entry2 {
+       __u32 function;
+       __u32 index;
+       __u32 flags;
+       __u32 eax;
+       __u32 ebx;
+       __u32 ecx;
+       __u32 edx;
+       __u32 padding[3];
+};
+
+This ioctl returns x86 cpuid features which are supported by both the hardware
+and kvm.  Userspace can use the information returned by this ioctl to
+construct cpuid information (for KVM_SET_CPUID2) that is consistent with
+hardware, kernel, and userspace capabilities, and with user requirements (for
+example, the user may wish to constrain cpuid to emulate older hardware,
+or for feature consistency across a cluster).
+
+Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
+with the 'nent' field indicating the number of entries in the variable-size
+array 'entries'.  If the number of entries is too low to describe the cpu
+capabilities, an error (E2BIG) is returned.  If the number is too high,
+the 'nent' field is adjusted and an error (ENOMEM) is returned.  If the
+number is just right, the 'nent' field is adjusted to the number of valid
+entries in the 'entries' array, which is then filled.
+
+The entries returned are the host cpuid as returned by the cpuid instruction,
+with unknown or unsupported features masked out.  The fields in each entry
+are defined as follows:
+
+  function: the eax value used to obtain the entry
+  index: the ecx value used to obtain the entry (for entries that are
+         affected by ecx)
+  flags: an OR of zero or more of the following:
+        KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
+           if the index field is valid
+        KVM_CPUID_FLAG_STATEFUL_FUNC:
+           if cpuid for this function returns different values for successive
+           invocations; there will be several entries with the same function,
+           all with this flag set
+        KVM_CPUID_FLAG_STATE_READ_NEXT:
+           for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
+           the first entry to be read by a cpu
+   eax, ebx, ecx, edx: the values returned by the cpuid instruction for
+         this function/index combination
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
index aaed6ab9d7abd606da3ab55779f5ae3408d78c94..142cc5136650be89a8cbfe545bb50804b41ecf71 100644 (file)
@@ -77,10 +77,10 @@ Memory
 
 Guest memory (gpa) is part of the user address space of the process that is
 using kvm.  Userspace defines the translation between guest addresses and user
-addresses (gpa->hva); note that two gpas may alias to the same gva, but not
+addresses (gpa->hva); note that two gpas may alias to the same hva, but not
 vice versa.
 
-These gvas may be backed using any method available to the host: anonymous
+These hvas may be backed using any method available to the host: anonymous
 memory, file backed memory, and device memory.  Memory might be paged by the
 host at any time.
 
@@ -161,7 +161,7 @@ Shadow pages contain the following information:
   role.cr4_pae:
     Contains the value of cr4.pae for which the page is valid (e.g. whether
     32-bit or 64-bit gptes are in use).
-  role.cr4_nxe:
+  role.nxe:
     Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
     Contains the value of cr0.wp for which the page is valid.
@@ -180,7 +180,9 @@ Shadow pages contain the following information:
     guest pages as leaves.
   gfns:
     An array of 512 guest frame numbers, one for each present pte.  Used to
-    perform a reverse map from a pte to a gfn.
+    perform a reverse map from a pte to a gfn. When role.direct is set, any
+    element of this array can be calculated from the gfn field when used, in
+    this case, the array of gfns is not allocated. See role.direct and gfn.
   slot_bitmap:
     A bitmap containing one bit per memory slot.  If the page contains a pte
     mapping a page from memory slot n, then bit n of slot_bitmap will be set
@@ -296,6 +298,48 @@ Host translation updates:
   - look up affected sptes through reverse map
   - drop (or update) translations
 
+Emulating cr0.wp
+================
+
+If tdp is not enabled, the host must keep cr0.wp=1 so page write protection
+works for the guest kernel, not guest userspace.  When the guest
+cr0.wp=1, this does not present a problem.  However when the guest cr0.wp=0,
+we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the
+semantics require allowing any guest kernel access plus user read access).
+
+We handle this by mapping the permissions to two possible sptes, depending
+on fault type:
+
+- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access,
+  disallows user access)
+- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel
+  write access)
+
+(user write faults generate a #PF)
+
+Large pages
+===========
+
+The mmu supports all combinations of large and small guest and host pages.
+Supported page sizes include 4k, 2M, 4M, and 1G.  4M pages are treated as
+two separate 2M pages, on both guest and host, since the mmu always uses PAE
+paging.
+
+To instantiate a large spte, four constraints must be satisfied:
+
+- the spte must point to a large host page
+- the guest pte must be a large pte of at least equivalent size (if tdp is
+  enabled, there is no guest pte and this condition is satisfied)
+- if the spte will be writeable, the large page frame may not overlap any
+  write-protected pages
+- the guest page must be wholly contained by a single memory slot
+
+To check the last two conditions, the mmu maintains a ->write_count set of
+arrays for each memory slot and large page size.  Every write protected page
+causes its write_count to be incremented, thus preventing instantiation of
+a large spte.  The frames at the end of an unaligned memory slot have
+artificially inflated ->write_counts so they can never be instantiated.
+
 Further reading
 ===============
 
diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt
new file mode 100644 (file)
index 0000000..8ddcfe8
--- /dev/null
@@ -0,0 +1,153 @@
+KVM-specific MSRs.
+Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
+=====================================================
+
+KVM makes use of some custom MSRs to service some requests.
+At present, this facility is only used by kvmclock.
+
+Custom MSRs have a range reserved for them, that goes from
+0x4b564d00 to 0x4b564dff. There are MSRs outside this area,
+but they are deprecated and their use is discouraged.
+
+Custom MSR list
+--------
+
+The current supported Custom MSR list is:
+
+MSR_KVM_WALL_CLOCK_NEW:   0x4b564d00
+
+       data: 4-byte alignment physical address of a memory area which must be
+       in guest RAM. This memory is expected to hold a copy of the following
+       structure:
+
+       struct pvclock_wall_clock {
+               u32   version;
+               u32   sec;
+               u32   nsec;
+       } __attribute__((__packed__));
+
+       whose data will be filled in by the hypervisor. The hypervisor is only
+       guaranteed to update this data at the moment of MSR write.
+       Users that want to reliably query this information more than once have
+       to write more than once to this MSR. Fields have the following meanings:
+
+               version: guest has to check version before and after grabbing
+               time information and check that they are both equal and even.
+               An odd version indicates an in-progress update.
+
+               sec: number of seconds for wallclock.
+
+               nsec: number of nanoseconds for wallclock.
+
+       Note that although MSRs are per-CPU entities, the effect of this
+       particular MSR is global.
+
+       Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid
+       leaf prior to usage.
+
+MSR_KVM_SYSTEM_TIME_NEW:  0x4b564d01
+
+       data: 4-byte aligned physical address of a memory area which must be in
+       guest RAM, plus an enable bit in bit 0. This memory is expected to hold
+       a copy of the following structure:
+
+       struct pvclock_vcpu_time_info {
+               u32   version;
+               u32   pad0;
+               u64   tsc_timestamp;
+               u64   system_time;
+               u32   tsc_to_system_mul;
+               s8    tsc_shift;
+               u8    flags;
+               u8    pad[2];
+       } __attribute__((__packed__)); /* 32 bytes */
+
+       whose data will be filled in by the hypervisor periodically. Only one
+       write, or registration, is needed for each VCPU. The interval between
+       updates of this structure is arbitrary and implementation-dependent.
+       The hypervisor may update this structure at any time it sees fit until
+       anything with bit0 == 0 is written to it.
+
+       Fields have the following meanings:
+
+               version: guest has to check version before and after grabbing
+               time information and check that they are both equal and even.
+               An odd version indicates an in-progress update.
+
+               tsc_timestamp: the tsc value at the current VCPU at the time
+               of the update of this structure. Guests can subtract this value
+               from current tsc to derive a notion of elapsed time since the
+               structure update.
+
+               system_time: a host notion of monotonic time, including sleep
+               time at the time this structure was last updated. Unit is
+               nanoseconds.
+
+               tsc_to_system_mul: a function of the tsc frequency. One has
+               to multiply any tsc-related quantity by this value to get
+               a value in nanoseconds, besides dividing by 2^tsc_shift
+
+               tsc_shift: cycle to nanosecond divider, as a power of two, to
+               allow for shift rights. One has to shift right any tsc-related
+               quantity by this value to get a value in nanoseconds, besides
+               multiplying by tsc_to_system_mul.
+
+               With this information, guests can derive per-CPU time by
+               doing:
+
+                       time = (current_tsc - tsc_timestamp)
+                       time = (time * tsc_to_system_mul) >> tsc_shift
+                       time = time + system_time
+
+               flags: bits in this field indicate extended capabilities
+               coordinated between the guest and the hypervisor. Availability
+               of specific flags has to be checked in 0x40000001 cpuid leaf.
+               Current flags are:
+
+                flag bit   | cpuid bit    | meaning
+               -------------------------------------------------------------
+                           |              | time measures taken across
+                    0      |      24      | multiple cpus are guaranteed to
+                           |              | be monotonic
+               -------------------------------------------------------------
+
+       Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid
+       leaf prior to usage.
+
+
+MSR_KVM_WALL_CLOCK:  0x11
+
+       data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead.
+
+       This MSR falls outside the reserved KVM range and may be removed in the
+       future. Its usage is deprecated.
+
+       Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid
+       leaf prior to usage.
+
+MSR_KVM_SYSTEM_TIME: 0x12
+
+       data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead.
+
+       This MSR falls outside the reserved KVM range and may be removed in the
+       future. Its usage is deprecated.
+
+       Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid
+       leaf prior to usage.
+
+       The suggested algorithm for detecting kvmclock presence is then:
+
+               if (!kvm_para_available())    /* refer to cpuid.txt */
+                       return NON_PRESENT;
+
+               flags = cpuid_eax(0x40000001);
+               if (flags & (1 << 3)) {
+                       msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+                       msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+                       return PRESENT;
+               } else if (flags & (1 << 0)) {
+                       msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+                       msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+                       return PRESENT;
+               } else
+                       return NON_PRESENT;
diff --git a/Documentation/kvm/review-checklist.txt b/Documentation/kvm/review-checklist.txt
new file mode 100644 (file)
index 0000000..730475a
--- /dev/null
@@ -0,0 +1,38 @@
+Review checklist for kvm patches
+================================
+
+1.  The patch must follow Documentation/CodingStyle and
+    Documentation/SubmittingPatches.
+
+2.  Patches should be against kvm.git master branch.
+
+3.  If the patch introduces or modifies a new userspace API:
+    - the API must be documented in Documentation/kvm/api.txt
+    - the API must be discoverable using KVM_CHECK_EXTENSION
+
+4.  New state must include support for save/restore.
+
+5.  New features must default to off (userspace should explicitly request them).
+    Performance improvements can and should default to on.
+
+6.  New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2
+
+7.  Emulator changes should be accompanied by unit tests for qemu-kvm.git
+    kvm/test directory.
+
+8.  Changes should be vendor neutral when possible.  Changes to common code
+    are better than duplicating changes to vendor code.
+
+9.  Similarly, prefer changes to arch independent code than to arch dependent
+    code.
+
+10. User/kernel interfaces and guest/host interfaces must be 64-bit clean
+    (all variables and sizes naturally aligned on 64-bit; use specific types
+    only - u64 rather than ulong).
+
+11. New guest visible features must either be documented in a hardware manual
+    or be accompanied by documentation.
+
+12. Features must be robust against reset and kexec - for example, shared
+    host/guest memory must be unshared to prevent the host from writing to
+    guest memory that the guest has not reserved for this purpose.
index a362e67e0ca6edec83923e42a655d61d639001c8..2f229e5de4980885bbdab5dcd614495bf8e9abd3 100644 (file)
@@ -235,6 +235,7 @@ struct kvm_vm_data {
 #define KVM_REQ_PTC_G          32
 #define KVM_REQ_RESUME         33
 
+#define KVM_HPAGE_GFN_SHIFT(x) 0
 #define KVM_NR_PAGE_SIZES      1
 #define KVM_PAGES_PER_HPAGE(x) 1
 
index 21b701374f72335905ee9bfde866975e60800229..5cb58655cd5f98d482895b51e9251852a70c1236 100644 (file)
@@ -725,8 +725,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int r;
        sigset_t sigsaved;
 
-       vcpu_load(vcpu);
-
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -748,7 +746,6 @@ out:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-       vcpu_put(vcpu);
        return r;
 }
 
@@ -883,8 +880,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
        int i;
 
-       vcpu_load(vcpu);
-
        for (i = 0; i < 16; i++) {
                vpd->vgr[i] = regs->vpd.vgr[i];
                vpd->vbgr[i] = regs->vpd.vbgr[i];
@@ -931,8 +926,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
        set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1802,35 +1795,24 @@ void kvm_arch_exit(void)
        kvm_vmm_info = NULL;
 }
 
-static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
-               struct kvm_dirty_log *log)
+static void kvm_ia64_sync_dirty_log(struct kvm *kvm,
+                                   struct kvm_memory_slot *memslot)
 {
-       struct kvm_memory_slot *memslot;
-       int r, i;
+       int i;
        long base;
        unsigned long n;
        unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
                        offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
 
-       r = -EINVAL;
-       if (log->slot >= KVM_MEMORY_SLOTS)
-               goto out;
-
-       memslot = &kvm->memslots->memslots[log->slot];
-       r = -ENOENT;
-       if (!memslot->dirty_bitmap)
-               goto out;
-
        n = kvm_dirty_bitmap_bytes(memslot);
        base = memslot->base_gfn / BITS_PER_LONG;
 
+       spin_lock(&kvm->arch.dirty_log_lock);
        for (i = 0; i < n/sizeof(long); ++i) {
                memslot->dirty_bitmap[i] = dirty_bitmap[base + i];
                dirty_bitmap[base + i] = 0;
        }
-       r = 0;
-out:
-       return r;
+       spin_unlock(&kvm->arch.dirty_log_lock);
 }
 
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
@@ -1842,12 +1824,17 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        int is_dirty = 0;
 
        mutex_lock(&kvm->slots_lock);
-       spin_lock(&kvm->arch.dirty_log_lock);
 
-       r = kvm_ia64_sync_dirty_log(kvm, log);
-       if (r)
+       r = -EINVAL;
+       if (log->slot >= KVM_MEMORY_SLOTS)
+               goto out;
+
+       memslot = &kvm->memslots->memslots[log->slot];
+       r = -ENOENT;
+       if (!memslot->dirty_bitmap)
                goto out;
 
+       kvm_ia64_sync_dirty_log(kvm, memslot);
        r = kvm_get_dirty_log(kvm, log, &is_dirty);
        if (r)
                goto out;
@@ -1855,14 +1842,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        /* If nothing is dirty, don't bother messing with page tables. */
        if (is_dirty) {
                kvm_flush_remote_tlbs(kvm);
-               memslot = &kvm->memslots->memslots[log->slot];
                n = kvm_dirty_bitmap_bytes(memslot);
                memset(memslot->dirty_bitmap, 0, n);
        }
        r = 0;
 out:
        mutex_unlock(&kvm->slots_lock);
-       spin_unlock(&kvm->arch.dirty_log_lock);
        return r;
 }
 
@@ -1953,11 +1938,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
        return vcpu->arch.timer_fired;
 }
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-       return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
@@ -1967,9 +1947,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       vcpu_load(vcpu);
        mp_state->mp_state = vcpu->arch.mp_state;
-       vcpu_put(vcpu);
        return 0;
 }
 
@@ -2000,10 +1978,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 {
        int r = 0;
 
-       vcpu_load(vcpu);
        vcpu->arch.mp_state = mp_state->mp_state;
        if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
                r = vcpu_reset(vcpu);
-       vcpu_put(vcpu);
        return r;
 }
index 6f74d93725a0f3327e4d3c6088828113b62079f4..8274a2d4392525ebe9f58a83dbb426686da7c224 100644 (file)
@@ -115,7 +115,15 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
-extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data);
+
+extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
+extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
+extern int kvmppc_mmu_hpte_sysinit(void);
+extern void kvmppc_mmu_hpte_sysexit(void);
+
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
index 94f05de9ad04dd3f60498f034a1cbcf043cb2abe..c3d4f0518a67c2f89b4c42e3d1b81e20dea0c059 100644 (file)
 
 #include <linux/types.h>
 
-extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_fres(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_frsqrte(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_fsqrts(u64 *fpscr, u32 *dst, u32 *src1);
 
-extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fdivs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fmuls(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
 
-extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                       u32 *src3);
-extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                       u32 *src3);
-extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fnmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                        u32 *src3);
-extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fnmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                        u32 *src3);
-extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fsel(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
                     u32 *src3);
 
 #define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
@@ -82,4 +82,7 @@ FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
 
+extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr);
+extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr);
+
 #endif
index 0c9ad869decd36cb95988d61a10fe993c8bc3b1c..b0b23c007d6e7ecba2501b7eb8d35a5ccb7b1e70 100644 (file)
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
 /* We don't currently support large pages. */
+#define KVM_HPAGE_GFN_SHIFT(x) 0
 #define KVM_NR_PAGE_SIZES      1
 #define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
 
-#define HPTEG_CACHE_NUM 1024
+#define HPTEG_CACHE_NUM                        (1 << 15)
+#define HPTEG_HASH_BITS_PTE            13
+#define HPTEG_HASH_BITS_VPTE           13
+#define HPTEG_HASH_BITS_VPTE_LONG      5
+#define HPTEG_HASH_NUM_PTE             (1 << HPTEG_HASH_BITS_PTE)
+#define HPTEG_HASH_NUM_VPTE            (1 << HPTEG_HASH_BITS_VPTE)
+#define HPTEG_HASH_NUM_VPTE_LONG       (1 << HPTEG_HASH_BITS_VPTE_LONG)
 
 struct kvm;
 struct kvm_run;
@@ -151,6 +158,9 @@ struct kvmppc_mmu {
 };
 
 struct hpte_cache {
+       struct hlist_node list_pte;
+       struct hlist_node list_vpte;
+       struct hlist_node list_vpte_long;
        u64 host_va;
        u64 pfn;
        ulong slot;
@@ -282,8 +292,10 @@ struct kvm_vcpu_arch {
        unsigned long pending_exceptions;
 
 #ifdef CONFIG_PPC_BOOK3S
-       struct hpte_cache hpte_cache[HPTEG_CACHE_NUM];
-       int hpte_cache_offset;
+       struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+       struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
+       struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+       int hpte_cache_count;
 #endif
 };
 
index 3b4dcc82a4c1f43ab1f26a29506591e0d4aa4369..ab3e392ac63c41fda474ce4857660bbd38e459bd 100644 (file)
@@ -101,10 +101,6 @@ EXPORT_SYMBOL(pci_dram_offset);
 EXPORT_SYMBOL(start_thread);
 EXPORT_SYMBOL(kernel_thread);
 
-#ifdef CONFIG_PPC_FPU
-EXPORT_SYMBOL_GPL(cvt_df);
-EXPORT_SYMBOL_GPL(cvt_fd);
-#endif
 EXPORT_SYMBOL(giveup_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL(giveup_altivec);
index 812312542e50cd9b42d73df0b02168d5a678ac7e..9b9b5cdea840bcdfb87128687dab6f0d41726446 100644 (file)
@@ -316,7 +316,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
        gfn = gpaddr >> PAGE_SHIFT;
        new_page = gfn_to_page(vcpu->kvm, gfn);
        if (is_error_page(new_page)) {
-               printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
+               printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
+                       (unsigned long long)gfn);
                kvm_release_page_clean(new_page);
                return;
        }
index ff436066bf776f4677d7f853ab158631f753ea4f..d45c818a384c2281f0c8689272bcb130c18de98a 100644 (file)
@@ -45,6 +45,7 @@ kvm-book3s_64-objs := \
        book3s.o \
        book3s_emulate.o \
        book3s_interrupts.o \
+       book3s_mmu_hpte.o \
        book3s_64_mmu_host.o \
        book3s_64_mmu.o \
        book3s_32_mmu.o
@@ -57,6 +58,7 @@ kvm-book3s_32-objs := \
        book3s.o \
        book3s_emulate.o \
        book3s_interrupts.o \
+       book3s_mmu_hpte.o \
        book3s_32_mmu_host.o \
        book3s_32_mmu.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
index b998abf1a63d13f8f05132037560b8e466d3b955..a3cef30d1d4224ded85ceb9b5acd2b4df33e3ee7 100644 (file)
@@ -1047,8 +1047,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
-       vcpu_load(vcpu);
-
        regs->pc = kvmppc_get_pc(vcpu);
        regs->cr = kvmppc_get_cr(vcpu);
        regs->ctr = kvmppc_get_ctr(vcpu);
@@ -1069,8 +1067,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1078,8 +1074,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
-       vcpu_load(vcpu);
-
        kvmppc_set_pc(vcpu, regs->pc);
        kvmppc_set_cr(vcpu, regs->cr);
        kvmppc_set_ctr(vcpu, regs->ctr);
@@ -1099,8 +1093,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1110,8 +1102,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
        struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        int i;
 
-       vcpu_load(vcpu);
-
        sregs->pvr = vcpu->arch.pvr;
 
        sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
@@ -1131,8 +1121,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                }
        }
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1142,8 +1130,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        int i;
 
-       vcpu_load(vcpu);
-
        kvmppc_set_pvr(vcpu, sregs->pvr);
 
        vcpu3s->sdr1 = sregs->u.s.sdr1;
@@ -1171,8 +1157,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        /* Flush the MMU after messing with the segments */
        kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -1309,12 +1293,17 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
        int ret;
-       struct thread_struct ext_bkp;
+       double fpr[32][TS_FPRWIDTH];
+       unsigned int fpscr;
+       int fpexc_mode;
 #ifdef CONFIG_ALTIVEC
-       bool save_vec = current->thread.used_vr;
+       vector128 vr[32];
+       vector128 vscr;
+       unsigned long uninitialized_var(vrsave);
+       int used_vr;
 #endif
 #ifdef CONFIG_VSX
-       bool save_vsx = current->thread.used_vsr;
+       int used_vsr;
 #endif
        ulong ext_msr;
 
@@ -1327,27 +1316,27 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        /* Save FPU state in stack */
        if (current->thread.regs->msr & MSR_FP)
                giveup_fpu(current);
-       memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr));
-       ext_bkp.fpscr = current->thread.fpscr;
-       ext_bkp.fpexc_mode = current->thread.fpexc_mode;
+       memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
+       fpscr = current->thread.fpscr.val;
+       fpexc_mode = current->thread.fpexc_mode;
 
 #ifdef CONFIG_ALTIVEC
        /* Save Altivec state in stack */
-       if (save_vec) {
+       used_vr = current->thread.used_vr;
+       if (used_vr) {
                if (current->thread.regs->msr & MSR_VEC)
                        giveup_altivec(current);
-               memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr));
-               ext_bkp.vscr = current->thread.vscr;
-               ext_bkp.vrsave = current->thread.vrsave;
+               memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
+               vscr = current->thread.vscr;
+               vrsave = current->thread.vrsave;
        }
-       ext_bkp.used_vr = current->thread.used_vr;
 #endif
 
 #ifdef CONFIG_VSX
        /* Save VSX state in stack */
-       if (save_vsx && (current->thread.regs->msr & MSR_VSX))
+       used_vsr = current->thread.used_vsr;
+       if (used_vsr && (current->thread.regs->msr & MSR_VSX))
                        __giveup_vsx(current);
-       ext_bkp.used_vsr = current->thread.used_vsr;
 #endif
 
        /* Remember the MSR with disabled extensions */
@@ -1372,22 +1361,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        kvmppc_giveup_ext(vcpu, MSR_VSX);
 
        /* Restore FPU state from stack */
-       memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr));
-       current->thread.fpscr = ext_bkp.fpscr;
-       current->thread.fpexc_mode = ext_bkp.fpexc_mode;
+       memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
+       current->thread.fpscr.val = fpscr;
+       current->thread.fpexc_mode = fpexc_mode;
 
 #ifdef CONFIG_ALTIVEC
        /* Restore Altivec state from stack */
-       if (save_vec && current->thread.used_vr) {
-               memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr));
-               current->thread.vscr = ext_bkp.vscr;
-               current->thread.vrsave= ext_bkp.vrsave;
+       if (used_vr && current->thread.used_vr) {
+               memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
+               current->thread.vscr = vscr;
+               current->thread.vrsave = vrsave;
        }
-       current->thread.used_vr = ext_bkp.used_vr;
+       current->thread.used_vr = used_vr;
 #endif
 
 #ifdef CONFIG_VSX
-       current->thread.used_vsr = ext_bkp.used_vsr;
+       current->thread.used_vsr = used_vsr;
 #endif
 
        return ret;
@@ -1395,12 +1384,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 static int kvmppc_book3s_init(void)
 {
-       return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-                       THIS_MODULE);
+       int r;
+
+       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+                    THIS_MODULE);
+
+       if (r)
+               return r;
+
+       r = kvmppc_mmu_hpte_sysinit();
+
+       return r;
 }
 
 static void kvmppc_book3s_exit(void)
 {
+       kvmppc_mmu_hpte_sysexit();
        kvm_exit();
 }
 
index 0b10503c8a4aac4dfce46c7bf50c4fa017aa0db3..3292d76101d2eefa6808b0bf72dd81c3fed1eba5 100644 (file)
@@ -354,10 +354,10 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
                *vsid = VSID_REAL_DR | gvsid;
                break;
        case MSR_DR|MSR_IR:
-               if (!sr->valid)
-                       return -1;
-
-               *vsid = sr->vsid;
+               if (sr->valid)
+                       *vsid = sr->vsid;
+               else
+                       *vsid = VSID_BAT | gvsid;
                break;
        default:
                BUG();
index 0bb66005338f0799efb8039790bea3a45de27091..0b51ef872c1e629d7c9697694c029ca6524df4df 100644 (file)
@@ -19,6 +19,7 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 static ulong htab;
 static u32 htabmask;
 
-static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
        volatile u32 *pteg;
 
-       dprintk_mmu("KVM: Flushing SPTE: 0x%llx (0x%llx) -> 0x%llx\n",
-                   pte->pte.eaddr, pte->pte.vpage, pte->host_va);
-
+       /* Remove from host HTAB */
        pteg = (u32*)pte->slot;
-
        pteg[0] = 0;
+
+       /* And make sure it's gone from the TLB too */
        asm volatile ("sync");
        asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
        asm volatile ("sync");
        asm volatile ("tlbsync");
-
-       pte->host_va = 0;
-
-       if (pte->pte.may_write)
-               kvm_release_pfn_dirty(pte->pfn);
-       else
-               kvm_release_pfn_clean(pte->pfn);
-}
-
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%x & 0x%x\n",
-                   vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       guest_ea &= ea_mask;
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.eaddr & ea_mask) == guest_ea) {
-                       invalidate_pte(vcpu, pte);
-               }
-       }
-
-       /* Doing a complete flush -> start from scratch */
-       if (!ea_mask)
-               vcpu->arch.hpte_cache_offset = 0;
-}
-
-void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-                   vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       guest_vp &= vp_mask;
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.vpage & vp_mask) == guest_vp) {
-                       invalidate_pte(vcpu, pte);
-               }
-       }
-}
-
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
-                   vcpu->arch.hpte_cache_offset, pa_start, pa_end);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.raddr >= pa_start) &&
-                   (pte->pte.raddr < pa_end)) {
-                       invalidate_pte(vcpu, pte);
-               }
-       }
-}
-
-struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
-{
-       int i;
-       u64 guest_vp;
-
-       guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
-       for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if (pte->pte.vpage == guest_vp)
-                       return &pte->pte;
-       }
-
-       return NULL;
-}
-
-static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
-               kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-       return vcpu->arch.hpte_cache_offset++;
 }
 
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-       return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+       return hash_64(gvsid, SID_MAP_BITS);
 }
 
 
@@ -256,7 +144,6 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
        register int rr = 0;
        bool primary = false;
        bool evict = false;
-       int hpte_id;
        struct hpte_cache *pte;
 
        /* Get host physical address for gpa */
@@ -341,8 +228,7 @@ next_pteg:
 
        /* Now tell our Shadow PTE code about the new page */
 
-       hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
-       pte = &vcpu->arch.hpte_cache[hpte_id];
+       pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
        dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
                    orig_pte->may_write ? 'w' : '-',
@@ -355,6 +241,8 @@ next_pteg:
        pte->pte = *orig_pte;
        pte->pfn = hpaddr >> PAGE_SHIFT;
 
+       kvmppc_mmu_hpte_cache_map(vcpu, pte);
+
        return 0;
 }
 
@@ -439,7 +327,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-       kvmppc_mmu_pte_flush(vcpu, 0, 0);
+       kvmppc_mmu_hpte_destroy(vcpu);
        preempt_disable();
        __destroy_context(to_book3s(vcpu)->context_id);
        preempt_enable();
@@ -479,5 +367,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
        htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
        htab = (ulong)__va(sdr1 & 0xffff0000);
 
+       kvmppc_mmu_hpte_init(vcpu);
+
        return 0;
 }
index e4b5744977f6a4ad89c2a20653cef70b1286bbfb..384179a5002b9f2bdad7c188651689550edfde5b 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #define dprintk_slb(a, ...) do { } while(0)
 #endif
 
-static void invalidate_pte(struct hpte_cache *pte)
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
-       dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
-                   pte->pte.eaddr, pte->pte.vpage, pte->host_va);
-
        ppc_md.hpte_invalidate(pte->slot, pte->host_va,
                               MMU_PAGE_4K, MMU_SEGSIZE_256M,
                               false);
-       pte->host_va = 0;
-
-       if (pte->pte.may_write)
-               kvm_release_pfn_dirty(pte->pfn);
-       else
-               kvm_release_pfn_clean(pte->pfn);
-}
-
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
-                   vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       guest_ea &= ea_mask;
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.eaddr & ea_mask) == guest_ea) {
-                       invalidate_pte(pte);
-               }
-       }
-
-       /* Doing a complete flush -> start from scratch */
-       if (!ea_mask)
-               vcpu->arch.hpte_cache_offset = 0;
-}
-
-void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-                   vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       guest_vp &= vp_mask;
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.vpage & vp_mask) == guest_vp) {
-                       invalidate_pte(pte);
-               }
-       }
-}
-
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
-{
-       int i;
-
-       dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx & 0x%lx\n",
-                   vcpu->arch.hpte_cache_offset, pa_start, pa_end);
-       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if ((pte->pte.raddr >= pa_start) &&
-                   (pte->pte.raddr < pa_end)) {
-                       invalidate_pte(pte);
-               }
-       }
-}
-
-struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
-{
-       int i;
-       u64 guest_vp;
-
-       guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
-       for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
-               struct hpte_cache *pte;
-
-               pte = &vcpu->arch.hpte_cache[i];
-               if (!pte->host_va)
-                       continue;
-
-               if (pte->pte.vpage == guest_vp)
-                       return &pte->pte;
-       }
-
-       return NULL;
-}
-
-static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
-               kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-       return vcpu->arch.hpte_cache_offset++;
 }
 
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-       return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
-                    ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+       return hash_64(gvsid, SID_MAP_BITS);
 }
 
-
 static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 {
        struct kvmppc_sid_map *map;
@@ -273,8 +159,7 @@ map_again:
                attempt++;
                goto map_again;
        } else {
-               int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
-               struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id];
+               struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
                dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n",
                            ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
@@ -292,6 +177,8 @@ map_again:
                pte->host_va = va;
                pte->pte = *orig_pte;
                pte->pfn = hpaddr >> PAGE_SHIFT;
+
+               kvmppc_mmu_hpte_cache_map(vcpu, pte);
        }
 
        return 0;
@@ -418,7 +305,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-       kvmppc_mmu_pte_flush(vcpu, 0, 0);
+       kvmppc_mmu_hpte_destroy(vcpu);
        __destroy_context(to_book3s(vcpu)->context_id);
 }
 
@@ -436,5 +323,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
        vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS;
        vcpu3s->vsid_next = vcpu3s->vsid_first;
 
+       kvmppc_mmu_hpte_init(vcpu);
+
        return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
new file mode 100644 (file)
index 0000000..4868d4a
--- /dev/null
@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+#define PTE_SIZE       12
+
+/* #define DEBUG_MMU */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while(0)
+#endif
+
+static struct kmem_cache *hpte_cache;
+
+static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
+{
+       return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
+{
+       return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
+{
+       return hash_64((vpage & 0xffffff000ULL) >> 12,
+                      HPTEG_HASH_BITS_VPTE_LONG);
+}
+
+void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+       u64 index;
+
+       /* Add to ePTE list */
+       index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
+       hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+
+       /* Add to vPTE list */
+       index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
+       hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+
+       /* Add to vPTE_long list */
+       index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
+       hlist_add_head(&pte->list_vpte_long,
+                      &vcpu->arch.hpte_hash_vpte_long[index]);
+}
+
+static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+       dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
+                   pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+
+       /* Different for 32 and 64 bit */
+       kvmppc_mmu_invalidate_pte(vcpu, pte);
+
+       if (pte->pte.may_write)
+               kvm_release_pfn_dirty(pte->pfn);
+       else
+               kvm_release_pfn_clean(pte->pfn);
+
+       hlist_del(&pte->list_pte);
+       hlist_del(&pte->list_vpte);
+       hlist_del(&pte->list_vpte_long);
+
+       vcpu->arch.hpte_cache_count--;
+       kmem_cache_free(hpte_cache, pte);
+}
+
+static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
+{
+       struct hpte_cache *pte;
+       struct hlist_node *node, *tmp;
+       int i;
+
+       for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+               struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+
+               hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+                       invalidate_pte(vcpu, pte);
+       }
+}
+
+static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
+{
+       struct hlist_head *list;
+       struct hlist_node *node, *tmp;
+       struct hpte_cache *pte;
+
+       /* Find the list of entries in the map */
+       list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+
+       /* Check the list for matching entries and invalidate */
+       hlist_for_each_entry_safe(pte, node, tmp, list, list_pte)
+               if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
+                       invalidate_pte(vcpu, pte);
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+       u64 i;
+
+       dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
+                   vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
+
+       guest_ea &= ea_mask;
+
+       switch (ea_mask) {
+       case ~0xfffUL:
+               kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
+               break;
+       case 0x0ffff000:
+               /* 32-bit flush w/o segment, go through all possible segments */
+               for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
+                       kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
+               break;
+       case 0:
+               /* Doing a complete flush -> start from scratch */
+               kvmppc_mmu_pte_flush_all(vcpu);
+               break;
+       default:
+               WARN_ON(1);
+               break;
+       }
+}
+
+/* Flush with mask 0xfffffffff */
+static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+       struct hlist_head *list;
+       struct hlist_node *node, *tmp;
+       struct hpte_cache *pte;
+       u64 vp_mask = 0xfffffffffULL;
+
+       list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+
+       /* Check the list for matching entries and invalidate */
+       hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte)
+               if ((pte->pte.vpage & vp_mask) == guest_vp)
+                       invalidate_pte(vcpu, pte);
+}
+
+/* Flush with mask 0xffffff000 */
+static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+       struct hlist_head *list;
+       struct hlist_node *node, *tmp;
+       struct hpte_cache *pte;
+       u64 vp_mask = 0xffffff000ULL;
+
+       list = &vcpu->arch.hpte_hash_vpte_long[
+               kvmppc_mmu_hash_vpte_long(guest_vp)];
+
+       /* Check the list for matching entries and invalidate */
+       hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+               if ((pte->pte.vpage & vp_mask) == guest_vp)
+                       invalidate_pte(vcpu, pte);
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+       dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
+                   vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
+       guest_vp &= vp_mask;
+
+       switch(vp_mask) {
+       case 0xfffffffffULL:
+               kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
+               break;
+       case 0xffffff000ULL:
+               kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
+               break;
+       default:
+               WARN_ON(1);
+               return;
+       }
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+       struct hlist_node *node, *tmp;
+       struct hpte_cache *pte;
+       int i;
+
+       dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
+                   vcpu->arch.hpte_cache_count, pa_start, pa_end);
+
+       for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+               struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+
+               hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+                       if ((pte->pte.raddr >= pa_start) &&
+                           (pte->pte.raddr < pa_end))
+                               invalidate_pte(vcpu, pte);
+       }
+}
+
+struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+       struct hpte_cache *pte;
+
+       pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
+       vcpu->arch.hpte_cache_count++;
+
+       if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
+               kvmppc_mmu_pte_flush_all(vcpu);
+
+       return pte;
+}
+
+void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
+{
+       kvmppc_mmu_pte_flush(vcpu, 0, 0);
+}
+
+static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
+{
+       int i;
+
+       for (i = 0; i < len; i++)
+               INIT_HLIST_HEAD(&hash_list[i]);
+}
+
+int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
+{
+       /* init hpte lookup hashes */
+       kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
+                                 ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
+       kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
+                                 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
+       kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
+                                 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
+
+       return 0;
+}
+
+int kvmppc_mmu_hpte_sysinit(void)
+{
+       /* init hpte slab cache */
+       hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache),
+                                      sizeof(struct hpte_cache), 0, NULL);
+
+       return 0;
+}
+
+void kvmppc_mmu_hpte_sysexit(void)
+{
+       kmem_cache_destroy(hpte_cache);
+}
index a9f66abafcb3e3a1a510b44b5f47fabeae886ee6..474f2e24050a03a3d89ca85a7b34993d687a61f3 100644 (file)
 
 static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
 {
-       struct thread_struct t;
-
-       t.fpscr.val = vcpu->arch.fpscr;
-       cvt_df((double*)&vcpu->arch.fpr[rt], (float*)&vcpu->arch.qpr[rt], &t);
+       kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt], &vcpu->arch.fpscr);
 }
 
 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
@@ -183,7 +180,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   int rs, ulong addr, int ls_type)
 {
        int emulated = EMULATE_FAIL;
-       struct thread_struct t;
        int r;
        char tmp[8];
        int len = sizeof(u32);
@@ -191,8 +187,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
        if (ls_type == FPU_LS_DOUBLE)
                len = sizeof(u64);
 
-       t.fpscr.val = vcpu->arch.fpscr;
-
        /* read from memory */
        r = kvmppc_ld(vcpu, &addr, len, tmp, true);
        vcpu->arch.paddr_accessed = addr;
@@ -210,7 +204,7 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
        /* put in registers */
        switch (ls_type) {
        case FPU_LS_SINGLE:
-               cvt_fd((float*)tmp, (double*)&vcpu->arch.fpr[rs], &t);
+               kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
                vcpu->arch.qpr[rs] = *((u32*)tmp);
                break;
        case FPU_LS_DOUBLE:
@@ -229,17 +223,14 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                    int rs, ulong addr, int ls_type)
 {
        int emulated = EMULATE_FAIL;
-       struct thread_struct t;
        int r;
        char tmp[8];
        u64 val;
        int len;
 
-       t.fpscr.val = vcpu->arch.fpscr;
-
        switch (ls_type) {
        case FPU_LS_SINGLE:
-               cvt_df((double*)&vcpu->arch.fpr[rs], (float*)tmp, &t);
+               kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp, &vcpu->arch.fpscr);
                val = *((u32*)tmp);
                len = sizeof(u32);
                break;
@@ -278,13 +269,10 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   int rs, ulong addr, bool w, int i)
 {
        int emulated = EMULATE_FAIL;
-       struct thread_struct t;
        int r;
        float one = 1.0;
        u32 tmp[2];
 
-       t.fpscr.val = vcpu->arch.fpscr;
-
        /* read from memory */
        if (w) {
                r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
@@ -308,7 +296,7 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
        emulated = EMULATE_DONE;
 
        /* put in registers */
-       cvt_fd((float*)&tmp[0], (double*)&vcpu->arch.fpr[rs], &t);
+       kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
        vcpu->arch.qpr[rs] = tmp[1];
 
        dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
@@ -322,14 +310,11 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                    int rs, ulong addr, bool w, int i)
 {
        int emulated = EMULATE_FAIL;
-       struct thread_struct t;
        int r;
        u32 tmp[2];
        int len = w ? sizeof(u32) : sizeof(u64);
 
-       t.fpscr.val = vcpu->arch.fpscr;
-
-       cvt_df((double*)&vcpu->arch.fpr[rs], (float*)&tmp[0], &t);
+       kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0], &vcpu->arch.fpscr);
        tmp[1] = vcpu->arch.qpr[rs];
 
        r = kvmppc_st(vcpu, &addr, len, tmp, true);
@@ -517,7 +502,7 @@ static int get_d_signext(u32 inst)
 static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
                                      int reg_out, int reg_in1, int reg_in2,
                                      int reg_in3, int scalar,
-                                     void (*func)(struct thread_struct *t,
+                                     void (*func)(u64 *fpscr,
                                                 u32 *dst, u32 *src1,
                                                 u32 *src2, u32 *src3))
 {
@@ -526,27 +511,25 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
        u32 ps0_out;
        u32 ps0_in1, ps0_in2, ps0_in3;
        u32 ps1_in1, ps1_in2, ps1_in3;
-       struct thread_struct t;
-       t.fpscr.val = vcpu->arch.fpscr;
 
        /* RC */
        WARN_ON(rc);
 
        /* PS0 */
-       cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t);
-       cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t);
-       cvt_df((double*)&fpr[reg_in3], (float*)&ps0_in3, &t);
+       kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
+       kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
+       kvm_cvt_df(&fpr[reg_in3], &ps0_in3, &vcpu->arch.fpscr);
 
        if (scalar & SCALAR_LOW)
                ps0_in2 = qpr[reg_in2];
 
-       func(&t, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
+       func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
 
        dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
                          ps0_in1, ps0_in2, ps0_in3, ps0_out);
 
        if (!(scalar & SCALAR_NO_PS0))
-               cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+               kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
 
        /* PS1 */
        ps1_in1 = qpr[reg_in1];
@@ -557,7 +540,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
                ps1_in2 = ps0_in2;
 
        if (!(scalar & SCALAR_NO_PS1))
-               func(&t, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
+               func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
 
        dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
                          ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
@@ -568,7 +551,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
                                    int reg_out, int reg_in1, int reg_in2,
                                    int scalar,
-                                   void (*func)(struct thread_struct *t,
+                                   void (*func)(u64 *fpscr,
                                                 u32 *dst, u32 *src1,
                                                 u32 *src2))
 {
@@ -578,27 +561,25 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
        u32 ps0_in1, ps0_in2;
        u32 ps1_out;
        u32 ps1_in1, ps1_in2;
-       struct thread_struct t;
-       t.fpscr.val = vcpu->arch.fpscr;
 
        /* RC */
        WARN_ON(rc);
 
        /* PS0 */
-       cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t);
+       kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
 
        if (scalar & SCALAR_LOW)
                ps0_in2 = qpr[reg_in2];
        else
-               cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t);
+               kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
 
-       func(&t, &ps0_out, &ps0_in1, &ps0_in2);
+       func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
 
        if (!(scalar & SCALAR_NO_PS0)) {
                dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
                                  ps0_in1, ps0_in2, ps0_out);
 
-               cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+               kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
        }
 
        /* PS1 */
@@ -608,7 +589,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
        if (scalar & SCALAR_HIGH)
                ps1_in2 = ps0_in2;
 
-       func(&t, &ps1_out, &ps1_in1, &ps1_in2);
+       func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
 
        if (!(scalar & SCALAR_NO_PS1)) {
                qpr[reg_out] = ps1_out;
@@ -622,31 +603,29 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 
 static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
                                    int reg_out, int reg_in,
-                                   void (*func)(struct thread_struct *t,
+                                   void (*func)(u64 *t,
                                                 u32 *dst, u32 *src1))
 {
        u32 *qpr = vcpu->arch.qpr;
        u64 *fpr = vcpu->arch.fpr;
        u32 ps0_out, ps0_in;
        u32 ps1_in;
-       struct thread_struct t;
-       t.fpscr.val = vcpu->arch.fpscr;
 
        /* RC */
        WARN_ON(rc);
 
        /* PS0 */
-       cvt_df((double*)&fpr[reg_in], (float*)&ps0_in, &t);
-       func(&t, &ps0_out, &ps0_in);
+       kvm_cvt_df(&fpr[reg_in], &ps0_in, &vcpu->arch.fpscr);
+       func(&vcpu->arch.fpscr, &ps0_out, &ps0_in);
 
        dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
                          ps0_in, ps0_out);
 
-       cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+       kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
 
        /* PS1 */
        ps1_in = qpr[reg_in];
-       func(&t, &qpr[reg_out], &ps1_in);
+       func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in);
 
        dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
                          ps1_in, qpr[reg_out]);
@@ -672,13 +651,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
        bool rcomp = (inst & 1) ? true : false;
        u32 cr = kvmppc_get_cr(vcpu);
-       struct thread_struct t;
 #ifdef DEBUG
        int i;
 #endif
 
-       t.fpscr.val = vcpu->arch.fpscr;
-
        if (!kvmppc_inst_is_paired_single(vcpu, inst))
                return EMULATE_FAIL;
 
@@ -695,7 +671,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 #ifdef DEBUG
        for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
                u32 f;
-               cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t);
+               kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
                dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx    QPR[%d] = 0x%x\n",
                        i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]);
        }
@@ -819,8 +795,9 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        WARN_ON(rcomp);
                        vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
                        /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
-                       cvt_df((double*)&vcpu->arch.fpr[ax_rb],
-                              (float*)&vcpu->arch.qpr[ax_rd], &t);
+                       kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
+                                  &vcpu->arch.qpr[ax_rd],
+                                  &vcpu->arch.fpscr);
                        break;
                case OP_4X_PS_MERGE01:
                        WARN_ON(rcomp);
@@ -830,17 +807,20 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
                case OP_4X_PS_MERGE10:
                        WARN_ON(rcomp);
                        /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
-                       cvt_fd((float*)&vcpu->arch.qpr[ax_ra],
-                              (double*)&vcpu->arch.fpr[ax_rd], &t);
+                       kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+                                  &vcpu->arch.fpr[ax_rd],
+                                  &vcpu->arch.fpscr);
                        /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
-                       cvt_df((double*)&vcpu->arch.fpr[ax_rb],
-                              (float*)&vcpu->arch.qpr[ax_rd], &t);
+                       kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
+                                  &vcpu->arch.qpr[ax_rd],
+                                  &vcpu->arch.fpscr);
                        break;
                case OP_4X_PS_MERGE11:
                        WARN_ON(rcomp);
                        /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
-                       cvt_fd((float*)&vcpu->arch.qpr[ax_ra],
-                              (double*)&vcpu->arch.fpr[ax_rd], &t);
+                       kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+                                  &vcpu->arch.fpr[ax_rd],
+                                  &vcpu->arch.fpscr);
                        vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
                        break;
                }
@@ -1275,7 +1255,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 #ifdef DEBUG
        for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
                u32 f;
-               cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t);
+               kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
                dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
        }
 #endif
index a33ab8cc2ccc09a805a1b9991d2e9245de652a5a..8d4e35f5372c85d942d373b89a638f0189093c69 100644 (file)
@@ -144,7 +144,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
                                         unsigned int priority)
 {
        int allowed = 0;
-       ulong msr_mask;
+       ulong uninitialized_var(msr_mask);
        bool update_esr = false, update_dear = false;
 
        switch (priority) {
@@ -485,8 +485,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
-       vcpu_load(vcpu);
-
        regs->pc = vcpu->arch.pc;
        regs->cr = kvmppc_get_cr(vcpu);
        regs->ctr = vcpu->arch.ctr;
@@ -507,8 +505,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -516,8 +512,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
 
-       vcpu_load(vcpu);
-
        vcpu->arch.pc = regs->pc;
        kvmppc_set_cr(vcpu, regs->cr);
        vcpu->arch.ctr = regs->ctr;
@@ -537,8 +531,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
                kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -569,9 +561,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
        int r;
 
-       vcpu_load(vcpu);
        r = kvmppc_core_vcpu_translate(vcpu, tr);
-       vcpu_put(vcpu);
        return r;
 }
 
index 2b340a3eee903d5b2a26ab7c80746235f4f7397b..cb34bbe1611365c4761ebd1d20c58e8f2997ae12 100644 (file)
@@ -271,3 +271,21 @@ FPD_THREE_IN(fmsub)
 FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
+
+_GLOBAL(kvm_cvt_fd)
+       lfd     0,0(r5)                 /* load up fpscr value */
+       MTFSF_L(0)
+       lfs     0,0(r3)
+       stfd    0,0(r4)
+       mffs    0
+       stfd    0,0(r5)                 /* save new fpscr value */
+       blr
+
+_GLOBAL(kvm_cvt_df)
+       lfd     0,0(r5)                 /* load up fpscr value */
+       MTFSF_L(0)
+       lfd     0,0(r3)
+       stfs    0,0(r4)
+       mffs    0
+       stfd    0,0(r5)                 /* save new fpscr value */
+       blr
index 9b8683f39e0533576440892bed1ba16577dd6421..72a4ad86ee91f4f8900149f925df0eac755bcd10 100644 (file)
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-       return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
        return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
@@ -287,7 +282,7 @@ static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
-       u64 gpr;
+       u64 uninitialized_var(gpr);
 
        if (run->mmio.len > sizeof(gpr)) {
                printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
@@ -423,8 +418,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        int r;
        sigset_t sigsaved;
 
-       vcpu_load(vcpu);
-
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -456,8 +449,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-       vcpu_put(vcpu);
-
        return r;
 }
 
@@ -523,8 +514,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (copy_from_user(&irq, argp, sizeof(irq)))
                        goto out;
                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-               break;
+               goto out;
        }
+
        case KVM_ENABLE_CAP:
        {
                struct kvm_enable_cap cap;
index 27605b62b980d1901067c40faa05b7c9f2102c78..cef7dbf69dfcea3dd28f70485cb2aa7d65bc5ce2 100644 (file)
@@ -26,7 +26,7 @@
 
 struct sca_entry {
        atomic_t scn;
-       __u64   reserved;
+       __u32   reserved;
        __u64   sda;
        __u64   reserved2[2];
 } __attribute__((packed));
@@ -41,7 +41,8 @@ struct sca_block {
 } __attribute__((packed));
 
 #define KVM_NR_PAGE_SIZES 2
-#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8)
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8)
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
 #define KVM_HPAGE_MASK(x)      (~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
index 3ddc30895e31f8bb9a3d87ff40fd357c671edb21..f7b6df45d8befb3ea19eb18045eceaf25ba10c27 100644 (file)
@@ -135,7 +135,7 @@ static int handle_stop(struct kvm_vcpu *vcpu)
        spin_lock_bh(&vcpu->arch.local_int.lock);
        if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
                vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
-               rc = __kvm_s390_vcpu_store_status(vcpu,
+               rc = kvm_s390_vcpu_store_status(vcpu,
                                                  KVM_S390_STORE_STATUS_NOADDR);
                if (rc >= 0)
                        rc = -EOPNOTSUPP;
index ae3705816878f7a7eb4195ff4001ec136a567d2f..4fe68650535cf2acc4d3e968b839a1e750a00529 100644 (file)
@@ -207,6 +207,7 @@ out_nokvm:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+       clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn);
        if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
                (__u64) vcpu->arch.sie_block)
                vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
@@ -296,7 +297,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
        set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
-       vcpu->arch.sie_block->ecb   = 2;
+       vcpu->arch.sie_block->ecb   = 6;
        vcpu->arch.sie_block->eca   = 0xC1002001U;
        vcpu->arch.sie_block->fac   = (int) (long) facilities;
        hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -329,6 +330,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
        vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
        vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
+       set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
 
        spin_lock_init(&vcpu->arch.local_int.lock);
        INIT_LIST_HEAD(&vcpu->arch.local_int.list);
@@ -363,63 +365,49 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 {
-       vcpu_load(vcpu);
        kvm_s390_vcpu_initial_reset(vcpu);
-       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-       vcpu_load(vcpu);
        memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs));
-       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-       vcpu_load(vcpu);
        memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs));
-       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
-       vcpu_load(vcpu);
        memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
        memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
-       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
-       vcpu_load(vcpu);
        memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs));
        memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
-       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       vcpu_load(vcpu);
        memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
        vcpu->arch.guest_fpregs.fpc = fpu->fpc;
-       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       vcpu_load(vcpu);
        memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
        fpu->fpc = vcpu->arch.guest_fpregs.fpc;
-       vcpu_put(vcpu);
        return 0;
 }
 
@@ -427,14 +415,12 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
 {
        int rc = 0;
 
-       vcpu_load(vcpu);
        if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
                rc = -EBUSY;
        else {
                vcpu->run->psw_mask = psw.mask;
                vcpu->run->psw_addr = psw.addr;
        }
-       vcpu_put(vcpu);
        return rc;
 }
 
@@ -498,8 +484,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int rc;
        sigset_t sigsaved;
 
-       vcpu_load(vcpu);
-
 rerun_vcpu:
        if (vcpu->requests)
                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -568,8 +552,6 @@ rerun_vcpu:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-       vcpu_put(vcpu);
-
        vcpu->stat.exit_userspace++;
        return rc;
 }
@@ -589,7 +571,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, const void *from,
  * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
  * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
  */
-int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
+int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 {
        const unsigned char archmode = 1;
        int prefix;
@@ -651,45 +633,42 @@ int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
        return 0;
 }
 
-static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-       int rc;
-
-       vcpu_load(vcpu);
-       rc = __kvm_s390_vcpu_store_status(vcpu, addr);
-       vcpu_put(vcpu);
-       return rc;
-}
-
 long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
+       long r;
 
        switch (ioctl) {
        case KVM_S390_INTERRUPT: {
                struct kvm_s390_interrupt s390int;
 
+               r = -EFAULT;
                if (copy_from_user(&s390int, argp, sizeof(s390int)))
-                       return -EFAULT;
-               return kvm_s390_inject_vcpu(vcpu, &s390int);
+                       break;
+               r = kvm_s390_inject_vcpu(vcpu, &s390int);
+               break;
        }
        case KVM_S390_STORE_STATUS:
-               return kvm_s390_vcpu_store_status(vcpu, arg);
+               r = kvm_s390_vcpu_store_status(vcpu, arg);
+               break;
        case KVM_S390_SET_INITIAL_PSW: {
                psw_t psw;
 
+               r = -EFAULT;
                if (copy_from_user(&psw, argp, sizeof(psw)))
-                       return -EFAULT;
-               return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
+                       break;
+               r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
+               break;
        }
        case KVM_S390_INITIAL_RESET:
-               return kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+               r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+               break;
        default:
-               ;
+               r = -EINVAL;
        }
-       return -EINVAL;
+       return r;
 }
 
 /* Section: memory related */
@@ -744,11 +723,6 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 {
 }
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-       return gfn;
-}
-
 static int __init kvm_s390_init(void)
 {
        int ret;
index cfa9d1777457894ab96679cfe571084e24ba790f..a7b7586626dbc75403635fe7ed0db17672d073df 100644 (file)
@@ -92,7 +92,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 
 /* implemented in kvm-s390.c */
-int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
+int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
                                 unsigned long addr);
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
index c991b3a7b904bbfea99c5b74e423808eb55b1c56..815c5b2b9f57de3011c4cbfa1aba663a2c0033f1 100644 (file)
@@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
        memcpy(dst->state, src->state, xstate_size);
 }
 
+extern void fpu_finit(struct fpu *fpu);
+
 #endif /* __ASSEMBLY__ */
 
 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
index ff90055c7f0bfe9b38f901496877c4c6c31008c1..4d8dcbdfc1205956c3b90b6f13d6bff021b63860 100644 (file)
@@ -22,6 +22,8 @@
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
 #define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
        __u64 reserved[9];
 };
 
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+       __u32 region[1024];
+};
+
+#define KVM_MAX_XCRS   16
+
+struct kvm_xcr {
+       __u32 xcr;
+       __u32 reserved;
+       __u64 value;
+};
+
+struct kvm_xcrs {
+       __u32 nr_xcrs;
+       __u32 flags;
+       struct kvm_xcr xcrs[KVM_MAX_XCRS];
+       __u64 padding[16];
+};
+
 #endif /* _ASM_X86_KVM_H */
index 0b2729bf207055ce1621bd8a92872f5b9909694b..51cfd730ac5d145ed9f184f643c254d441dbeaab 100644 (file)
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
 #define X86EMUL_UNHANDLEABLE    1
 /* Terminate emulation but return success to the caller. */
 #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
+#define X86EMUL_RETRY_INSTR     3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED  4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED       5 /* IO is needed to complete emulation */
+
 struct x86_emulate_ops {
        /*
         * read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
        int (*read_emulated)(unsigned long addr,
                             void *val,
                             unsigned int bytes,
+                            unsigned int *error,
                             struct kvm_vcpu *vcpu);
 
        /*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
        int (*write_emulated)(unsigned long addr,
                              const void *val,
                              unsigned int bytes,
+                             unsigned int *error,
                              struct kvm_vcpu *vcpu);
 
        /*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
                                const void *old,
                                const void *new,
                                unsigned int bytes,
+                               unsigned int *error,
                                struct kvm_vcpu *vcpu);
 
        int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,26 @@ struct x86_emulate_ops {
                                      int seg, struct kvm_vcpu *vcpu);
        u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
        void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+       unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
        void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
        ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
-       void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+       int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
        int (*cpl)(struct kvm_vcpu *vcpu);
-       void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+       int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
+       int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
+       int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+       int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
        enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
        unsigned int bytes;
-       unsigned long val, orig_val, *ptr;
+       unsigned long orig_val, *ptr;
+       union {
+               unsigned long val;
+               char valptr[sizeof(unsigned long) + 2];
+       };
 };
 
 struct fetch_cache {
@@ -186,6 +199,7 @@ struct decode_cache {
        unsigned long modrm_val;
        struct fetch_cache fetch;
        struct read_cache io_read;
+       struct read_cache mem_read;
 };
 
 struct x86_emulate_ctxt {
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt {
        int interruptibility;
 
        bool restart; /* restart string instruction after writeback */
+
+       int exception; /* exception that happens during emulation or -1 */
+       u32 error_code; /* error code for exception */
+       bool error_code_valid;
+       unsigned long cr2; /* faulted address in case of #PF */
+
        /* decode cache */
        struct decode_cache decode;
 };
index 76f5483cffecb884b1054b90312bec176253d3b7..502e53f999cf28a25cc2b00f1766bd1ffb2f9ed2 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/tracepoint.h>
+#include <linux/cpumask.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
                                  0xFFFFFF0000000000ULL)
 
 #define INVALID_PAGE (~(hpa_t)0)
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
 #define UNMAPPED_GVA (~(gpa_t)0)
 
 /* KVM Hugepage definitions for x86 */
 #define KVM_NR_PAGE_SIZES      3
-#define KVM_HPAGE_SHIFT(x)     (PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
+#define KVM_HPAGE_SHIFT(x)     (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)      (1UL << KVM_HPAGE_SHIFT(x))
 #define KVM_HPAGE_MASK(x)      (~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -69,8 +73,6 @@
 
 #define IOPL_SHIFT 12
 
-#define KVM_ALIAS_SLOTS 4
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
@@ -241,7 +243,7 @@ struct kvm_mmu {
        void (*prefetch_page)(struct kvm_vcpu *vcpu,
                              struct kvm_mmu_page *page);
        int (*sync_page)(struct kvm_vcpu *vcpu,
-                        struct kvm_mmu_page *sp);
+                        struct kvm_mmu_page *sp, bool clear_unsync);
        void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
        hpa_t root_hpa;
        int root_level;
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch {
                unsigned long mmu_seq;
        } update_pte;
 
-       struct i387_fxsave_struct host_fx_image;
-       struct i387_fxsave_struct guest_fx_image;
+       struct fpu guest_fpu;
+       u64 xcr0;
 
        gva_t mmio_fault_cr2;
        struct kvm_pio_request pio;
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch {
 
        /* fields used by HYPER-V emulation */
        u64 hv_vapic;
-};
-
-struct kvm_mem_alias {
-       gfn_t base_gfn;
-       unsigned long npages;
-       gfn_t target_gfn;
-#define KVM_ALIAS_INVALID     1UL
-       unsigned long flags;
-};
 
-#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-
-struct kvm_mem_aliases {
-       struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
-       int naliases;
+       cpumask_var_t wbinvd_dirty_mask;
 };
 
 struct kvm_arch {
-       struct kvm_mem_aliases *aliases;
-
        unsigned int n_free_mmu_pages;
        unsigned int n_requested_mmu_pages;
        unsigned int n_alloc_mmu_pages;
@@ -533,6 +520,8 @@ struct kvm_x86_ops {
 
        void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 
+       bool (*has_wbinvd_exit)(void);
+
        const struct trace_print_flags *exit_reasons_str;
 };
 
@@ -576,7 +565,6 @@ enum emulation_result {
 #define EMULTYPE_SKIP              (1 << 2)
 int emulate_instruction(struct kvm_vcpu *vcpu,
                        unsigned long cr2, u16 error_code, int emulation_type);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
 int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
-                   unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
-                   unsigned long value);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
                    bool has_error_code, u32 error_code);
 
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
-void fx_init(struct kvm_vcpu *vcpu);
-
-int emulator_write_emulated(unsigned long addr,
-                           const void *val,
-                           unsigned int bytes,
-                           struct kvm_vcpu *vcpu);
+int fx_init(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
-
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
        struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr)
 }
 #endif
 
-static inline void kvm_fx_save(struct i387_fxsave_struct *image)
-{
-       asm("fxsave (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
-{
-       asm("fxrstor (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_finit(void)
-{
-       asm("finit");
-}
-
 static inline u32 get_rdx_init_val(void)
 {
        return 0x600; /* P6 family */
index 8c7ae4318629445f16b588e9535b0dc2a105836c..509a42187dc25db56aaabd5bcb5652235ec27e8c 100644 (file)
@@ -20,6 +20,7 @@
 #define _EFER_LMA              10 /* Long mode active (read-only) */
 #define _EFER_NX               11 /* No execute enable */
 #define _EFER_SVME             12 /* Enable virtualization */
+#define _EFER_LMSLE            13 /* Long Mode Segment Limit Enable */
 #define _EFER_FFXSR            14 /* Enable Fast FXSAVE/FXRSTOR */
 
 #define EFER_SCE               (1<<_EFER_SCE)
@@ -27,6 +28,7 @@
 #define EFER_LMA               (1<<_EFER_LMA)
 #define EFER_NX                        (1<<_EFER_NX)
 #define EFER_SVME              (1<<_EFER_SVME)
+#define EFER_LMSLE             (1<<_EFER_LMSLE)
 #define EFER_FFXSR             (1<<_EFER_FFXSR)
 
 /* Intel MSRs. Some also available on other CPUs */
index 9e6779f7cf2d12461ddbb2164f101ae8a3c15674..9f0cbd987d5046ca9272d2c1b8b6287864ca2541 100644 (file)
@@ -257,6 +257,7 @@ enum vmcs_field {
 #define EXIT_REASON_IO_INSTRUCTION      30
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_INVALID_STATE      33
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_MONITOR_INSTRUCTION 39
 #define EXIT_REASON_PAUSE_INSTRUCTION   40
@@ -266,6 +267,7 @@ enum vmcs_field {
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD             54
+#define EXIT_REASON_XSETBV             55
 
 /*
  * Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_CONTEXT_BIT             (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT              (1ull << 26)
 
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT      (1ull << 10) /* (42 - 32) */
+
 #define VMX_EPT_DEFAULT_GAW                    3
 #define VMX_EPT_MAX_GAW                                0x4
 #define VMX_EPT_MT_EPTE_SHIFT                  3
index 2c4390cae22883014816647319fb6569d6867c2c..32c36668fa7bc1b6b3cc5fd31b1ddbdf3d0a9fa5 100644 (file)
 
 #define FXSAVE_SIZE    512
 
+#define XSAVE_HDR_SIZE     64
+#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE     256
+#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
 /*
  * These are the features that the OS can handle currently.
  */
index 86cef6b322530ffecaa657ecda41b6cf2619fb47..c4444bce8469876b2a4a43089cad8a97c3198b73 100644 (file)
@@ -107,7 +107,7 @@ void __cpuinit fpu_init(void)
 }
 #endif /* CONFIG_X86_64 */
 
-static void fpu_finit(struct fpu *fpu)
+void fpu_finit(struct fpu *fpu)
 {
 #ifdef CONFIG_X86_32
        if (!HAVE_HWFP) {
@@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu)
                fp->fos = 0xffff0000u;
        }
 }
+EXPORT_SYMBOL_GPL(fpu_finit);
 
 /*
  * The _current_ task is using the FPU for the first time
index e7e35219b32f23e115c23846d06c07b0128e26ae..ebcfcceccc725f61bb8d0b17cfcc67c78dcd0df6 100644 (file)
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
 EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
+EXPORT_SYMBOL_GPL(task_xstate_cachep);
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
index 5ac0bb465ed67fd725881ccacfeb22e44597f4cf..b38bd8b92aa6c84ed9a00295eeaac73bade26458 100644 (file)
@@ -9,6 +9,7 @@
  * privileged instructions:
  *
  * Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  *   Avi Kivity <avi@qumranet.com>
  *   Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
 #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
 #define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
 #define SrcSI       (0xa<<4)   /* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<4)   /* Source is immediate far address */
+#define SrcMemFAddr (0xc<<4)   /* Source is far address in memory */
+#define SrcAcc      (0xd<<4)   /* Source Accumulator */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
 #define Src2CL      (1<<29)
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
-#define Src2Imm16   (4<<29)
-#define Src2Mem16   (5<<29) /* Used for Ep encoding. First argument has to be
-                              in memory and second argument is located
-                              immediately after the first one in memory. */
 #define Src2Mask    (7<<29)
 
 enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
        /* 0x20 - 0x27 */
        ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+       ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
        /* 0x28 - 0x2F */
        ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
+       ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
        /* 0x30 - 0x37 */
        ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
+       ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
        /* 0x38 - 0x3F */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
        /* 0x88 - 0x8F */
        ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
        ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
-       DstReg | SrcMem | ModRM | Mov, Group | Group1A,
+       DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
+       ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
        /* 0x90 - 0x97 */
        DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
        /* 0x98 - 0x9F */
-       0, 0, SrcImm | Src2Imm16 | No64, 0,
+       0, 0, SrcImmFAddr | No64, 0,
        ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
        /* 0xA0 - 0xA7 */
-       ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
-       ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+       ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
+       ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
        ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
        ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
        /* 0xA8 - 0xAF */
-       0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+       DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
        ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
        ByteOp | DstDI | String, DstDI | String,
        /* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
        ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
        /* 0xE8 - 0xEF */
        SrcImm | Stack, SrcImm | ImplicitOps,
-       SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
+       SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
        SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
        SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
        /* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
        [Group1A*8] =
        DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
        [Group3_Byte*8] =
-       ByteOp | SrcImm | DstMem | ModRM, 0,
+       ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
        ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
        0, 0, 0, 0,
        [Group3*8] =
-       DstMem | SrcImm | ModRM, 0,
+       DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
        DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
        0, 0, 0, 0,
        [Group4*8] =
-       ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+       ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
        0, 0, 0, 0, 0, 0,
        [Group5*8] =
-       DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+       DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
        SrcMem | ModRM | Stack, 0,
-       SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+       SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
        SrcMem | ModRM | Stack, 0,
        [Group7*8] =
        0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
        (_type)_x;                                                      \
 })
 
+#define insn_fetch_arr(_arr, _size, _eip)                                \
+({     rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));           \
+       if (rc != X86EMUL_CONTINUE)                                     \
+               goto done;                                              \
+       (_eip) += (_size);                                              \
+})
+
 static inline unsigned long ad_mask(struct decode_cache *c)
 {
        return (1UL << (c->ad_bytes << 3)) - 1;
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
        c->seg_override = seg;
 }
 
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
+                             struct x86_emulate_ops *ops, int seg)
 {
        if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
                return 0;
 
-       return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
+       return ops->get_cached_segment_base(seg, ctxt->vcpu);
 }
 
 static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+                                      struct x86_emulate_ops *ops,
                                       struct decode_cache *c)
 {
        if (!c->has_seg_override)
                return 0;
 
-       return seg_base(ctxt, c->seg_override);
+       return seg_base(ctxt, ops, c->seg_override);
+}
+
+static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
+                            struct x86_emulate_ops *ops)
+{
+       return seg_base(ctxt, ops, VCPU_SREG_ES);
+}
+
+static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
+                            struct x86_emulate_ops *ops)
+{
+       return seg_base(ctxt, ops, VCPU_SREG_SS);
+}
+
+static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+                                     u32 error, bool valid)
+{
+       ctxt->exception = vec;
+       ctxt->error_code = error;
+       ctxt->error_code_valid = valid;
+       ctxt->restart = false;
+}
+
+static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+       emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+                      int err)
 {
-       return seg_base(ctxt, VCPU_SREG_ES);
+       ctxt->cr2 = addr;
+       emulate_exception(ctxt, PF_VECTOR, err, true);
 }
 
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_ud(struct x86_emulate_ctxt *ctxt)
 {
-       return seg_base(ctxt, VCPU_SREG_SS);
+       emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+       emulate_exception(ctxt, TS_VECTOR, err, true);
 }
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        /* we cannot decode insn before we complete previous rep insn */
        WARN_ON(ctxt->restart);
 
-       /* Shadow copy of register state. Committed on successful emulation. */
-       memset(c, 0, sizeof(struct decode_cache));
        c->eip = ctxt->eip;
        c->fetch.start = c->fetch.end = c->eip;
-       ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
-       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+       ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
 
        switch (mode) {
        case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
                set_seg_override(c, VCPU_SREG_DS);
 
        if (!(!c->twobyte && c->b == 0x8d))
-               c->modrm_ea += seg_override_base(ctxt, c);
+               c->modrm_ea += seg_override_base(ctxt, ops, c);
 
        if (c->ad_bytes != 8)
                c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
                else
                        c->src.val = insn_fetch(u8, 1, c->eip);
                break;
+       case SrcAcc:
+               c->src.type = OP_REG;
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->src.ptr = &c->regs[VCPU_REGS_RAX];
+               switch (c->src.bytes) {
+                       case 1:
+                               c->src.val = *(u8 *)c->src.ptr;
+                               break;
+                       case 2:
+                               c->src.val = *(u16 *)c->src.ptr;
+                               break;
+                       case 4:
+                               c->src.val = *(u32 *)c->src.ptr;
+                               break;
+                       case 8:
+                               c->src.val = *(u64 *)c->src.ptr;
+                               break;
+               }
+               break;
        case SrcOne:
                c->src.bytes = 1;
                c->src.val = 1;
@@ -1156,10 +1214,21 @@ done_prefixes:
                c->src.type = OP_MEM;
                c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
                c->src.ptr = (unsigned long *)
-                       register_address(c,  seg_override_base(ctxt, c),
+                       register_address(c,  seg_override_base(ctxt, ops, c),
                                         c->regs[VCPU_REGS_RSI]);
                c->src.val = 0;
                break;
+       case SrcImmFAddr:
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = c->op_bytes + 2;
+               insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+               break;
+       case SrcMemFAddr:
+               c->src.type = OP_MEM;
+               c->src.ptr = (unsigned long *)c->modrm_ea;
+               c->src.bytes = c->op_bytes + 2;
+               break;
        }
 
        /*
@@ -1179,22 +1248,10 @@ done_prefixes:
                c->src2.bytes = 1;
                c->src2.val = insn_fetch(u8, 1, c->eip);
                break;
-       case Src2Imm16:
-               c->src2.type = OP_IMM;
-               c->src2.ptr = (unsigned long *)c->eip;
-               c->src2.bytes = 2;
-               c->src2.val = insn_fetch(u16, 2, c->eip);
-               break;
        case Src2One:
                c->src2.bytes = 1;
                c->src2.val = 1;
                break;
-       case Src2Mem16:
-               c->src2.type = OP_MEM;
-               c->src2.bytes = 2;
-               c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
-               c->src2.val = 0;
-               break;
        }
 
        /* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
                c->dst.type = OP_MEM;
                c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
                c->dst.ptr = (unsigned long *)
-                       register_address(c, es_base(ctxt),
+                       register_address(c, es_base(ctxt, ops),
                                         c->regs[VCPU_REGS_RDI]);
                c->dst.val = 0;
                break;
@@ -1263,6 +1320,37 @@ done:
        return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+                        struct x86_emulate_ops *ops,
+                        unsigned long addr, void *dest, unsigned size)
+{
+       int rc;
+       struct read_cache *mc = &ctxt->decode.mem_read;
+       u32 err;
+
+       while (size) {
+               int n = min(size, 8u);
+               size -= n;
+               if (mc->pos < mc->end)
+                       goto read_cached;
+
+               rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
+                                       ctxt->vcpu);
+               if (rc == X86EMUL_PROPAGATE_FAULT)
+                       emulate_pf(ctxt, addr, err);
+               if (rc != X86EMUL_CONTINUE)
+                       return rc;
+               mc->end += n;
+
+       read_cached:
+               memcpy(dest, mc->data + mc->pos, n);
+               mc->pos += n;
+               dest += n;
+               addr += n;
+       }
+       return X86EMUL_CONTINUE;
+}
+
 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
                           struct x86_emulate_ops *ops,
                           unsigned int size, unsigned short port,
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
        if (dt.size < index * 8 + 7) {
-               kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+               emulate_gp(ctxt, selector & 0xfffc);
                return X86EMUL_PROPAGATE_FAULT;
        }
        addr = dt.address + index * 8;
        ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
        if (ret == X86EMUL_PROPAGATE_FAULT)
-               kvm_inject_page_fault(ctxt->vcpu, addr, err);
+               emulate_pf(ctxt, addr, err);
 
        return ret;
 }
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
        if (dt.size < index * 8 + 7) {
-               kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+               emulate_gp(ctxt, selector & 0xfffc);
                return X86EMUL_PROPAGATE_FAULT;
        }
 
        addr = dt.address + index * 8;
        ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
        if (ret == X86EMUL_PROPAGATE_FAULT)
-               kvm_inject_page_fault(ctxt->vcpu, addr, err);
+               emulate_pf(ctxt, addr, err);
 
        return ret;
 }
@@ -1481,11 +1569,70 @@ load:
        ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
        return X86EMUL_CONTINUE;
 exception:
-       kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+       emulate_exception(ctxt, err_vec, err_code, true);
        return X86EMUL_PROPAGATE_FAULT;
 }
 
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+                           struct x86_emulate_ops *ops)
+{
+       int rc;
+       struct decode_cache *c = &ctxt->decode;
+       u32 err;
+
+       switch (c->dst.type) {
+       case OP_REG:
+               /* The 4-byte case *is* correct:
+                * in 64-bit mode we zero-extend.
+                */
+               switch (c->dst.bytes) {
+               case 1:
+                       *(u8 *)c->dst.ptr = (u8)c->dst.val;
+                       break;
+               case 2:
+                       *(u16 *)c->dst.ptr = (u16)c->dst.val;
+                       break;
+               case 4:
+                       *c->dst.ptr = (u32)c->dst.val;
+                       break;  /* 64b: zero-ext */
+               case 8:
+                       *c->dst.ptr = c->dst.val;
+                       break;
+               }
+               break;
+       case OP_MEM:
+               if (c->lock_prefix)
+                       rc = ops->cmpxchg_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.orig_val,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       &err,
+                                       ctxt->vcpu);
+               else
+                       rc = ops->write_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       &err,
+                                       ctxt->vcpu);
+               if (rc == X86EMUL_PROPAGATE_FAULT)
+                       emulate_pf(ctxt,
+                                             (unsigned long)c->dst.ptr, err);
+               if (rc != X86EMUL_CONTINUE)
+                       return rc;
+               break;
+       case OP_NONE:
+               /* no writeback */
+               break;
+       default:
+               break;
+       }
+       return X86EMUL_CONTINUE;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+                               struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
 
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
        c->dst.bytes = c->op_bytes;
        c->dst.val = c->src.val;
        register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-       c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
+       c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
                                               c->regs[VCPU_REGS_RSP]);
 }
 
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
        struct decode_cache *c = &ctxt->decode;
        int rc;
 
-       rc = ops->read_emulated(register_address(c, ss_base(ctxt),
-                                                c->regs[VCPU_REGS_RSP]),
-                               dest, len, ctxt->vcpu);
+       rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
+                                                      c->regs[VCPU_REGS_RSP]),
+                          dest, len);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
                break;
        case X86EMUL_MODE_VM86:
                if (iopl < 3) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        return X86EMUL_PROPAGATE_FAULT;
                }
                change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
        return rc;
 }
 
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+                             struct x86_emulate_ops *ops, int seg)
 {
        struct decode_cache *c = &ctxt->decode;
-       struct kvm_segment segment;
 
-       kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+       c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
 
-       c->src.val = segment.selector;
-       emulate_push(ctxt);
+       emulate_push(ctxt, ops);
 }
 
 static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
        return rc;
 }
 
-static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
+                         struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
        unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+       int rc = X86EMUL_CONTINUE;
        int reg = VCPU_REGS_RAX;
 
        while (reg <= VCPU_REGS_RDI) {
                (reg == VCPU_REGS_RSP) ?
                (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
 
-               emulate_push(ctxt);
+               emulate_push(ctxt, ops);
+
+               rc = writeback(ctxt, ops);
+               if (rc != X86EMUL_CONTINUE)
+                       return rc;
+
                ++reg;
        }
+
+       /* Disable writeback. */
+       c->dst.type = OP_NONE;
+
+       return rc;
 }
 
 static int emulate_popa(struct x86_emulate_ctxt *ctxt,
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
                old_eip = c->eip;
                c->eip = c->src.val;
                c->src.val = old_eip;
-               emulate_push(ctxt);
+               emulate_push(ctxt, ops);
                break;
        }
        case 4: /* jmp abs */
                c->eip = c->src.val;
                break;
        case 6: /* push */
-               emulate_push(ctxt);
+               emulate_push(ctxt, ops);
                break;
        }
        return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
        return rc;
 }
 
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
-                           struct x86_emulate_ops *ops)
-{
-       int rc;
-       struct decode_cache *c = &ctxt->decode;
-
-       switch (c->dst.type) {
-       case OP_REG:
-               /* The 4-byte case *is* correct:
-                * in 64-bit mode we zero-extend.
-                */
-               switch (c->dst.bytes) {
-               case 1:
-                       *(u8 *)c->dst.ptr = (u8)c->dst.val;
-                       break;
-               case 2:
-                       *(u16 *)c->dst.ptr = (u16)c->dst.val;
-                       break;
-               case 4:
-                       *c->dst.ptr = (u32)c->dst.val;
-                       break;  /* 64b: zero-ext */
-               case 8:
-                       *c->dst.ptr = c->dst.val;
-                       break;
-               }
-               break;
-       case OP_MEM:
-               if (c->lock_prefix)
-                       rc = ops->cmpxchg_emulated(
-                                       (unsigned long)c->dst.ptr,
-                                       &c->dst.orig_val,
-                                       &c->dst.val,
-                                       c->dst.bytes,
-                                       ctxt->vcpu);
-               else
-                       rc = ops->write_emulated(
-                                       (unsigned long)c->dst.ptr,
-                                       &c->dst.val,
-                                       c->dst.bytes,
-                                       ctxt->vcpu);
-               if (rc != X86EMUL_CONTINUE)
-                       return rc;
-               break;
-       case OP_NONE:
-               /* no writeback */
-               break;
-       default:
-               break;
-       }
-       return X86EMUL_CONTINUE;
-}
-
-static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
-{
-       u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
-       /*
-        * an sti; sti; sequence only disable interrupts for the first
-        * instruction. So, if the last instruction, be it emulated or
-        * not, left the system with the INT_STI flag enabled, it
-        * means that the last instruction is an sti. We should not
-        * leave the flag on in this case. The same goes for mov ss
-        */
-       if (!(int_shadow & mask))
-               ctxt->interruptibility = mask;
-}
-
 static inline void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
-       struct kvm_segment *cs, struct kvm_segment *ss)
+                       struct x86_emulate_ops *ops, struct desc_struct *cs,
+                       struct desc_struct *ss)
 {
-       memset(cs, 0, sizeof(struct kvm_segment));
-       kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
-       memset(ss, 0, sizeof(struct kvm_segment));
+       memset(cs, 0, sizeof(struct desc_struct));
+       ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+       memset(ss, 0, sizeof(struct desc_struct));
 
        cs->l = 0;              /* will be adjusted later */
-       cs->base = 0;           /* flat segment */
+       set_desc_base(cs, 0);   /* flat segment */
        cs->g = 1;              /* 4kb granularity */
-       cs->limit = 0xffffffff; /* 4GB limit */
+       set_desc_limit(cs, 0xfffff);    /* 4GB limit */
        cs->type = 0x0b;        /* Read, Execute, Accessed */
        cs->s = 1;
        cs->dpl = 0;            /* will be adjusted later */
-       cs->present = 1;
-       cs->db = 1;
+       cs->p = 1;
+       cs->d = 1;
 
-       ss->unusable = 0;
-       ss->base = 0;           /* flat segment */
-       ss->limit = 0xffffffff; /* 4GB limit */
+       set_desc_base(ss, 0);   /* flat segment */
+       set_desc_limit(ss, 0xfffff);    /* 4GB limit */
        ss->g = 1;              /* 4kb granularity */
        ss->s = 1;
        ss->type = 0x03;        /* Read/Write, Accessed */
-       ss->db = 1;             /* 32bit stack segment */
+       ss->d = 1;              /* 32bit stack segment */
        ss->dpl = 0;
-       ss->present = 1;
+       ss->p = 1;
 }
 
 static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt)
+emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
-       struct kvm_segment cs, ss;
+       struct desc_struct cs, ss;
        u64 msr_data;
+       u16 cs_sel, ss_sel;
 
        /* syscall is not available in real mode */
        if (ctxt->mode == X86EMUL_MODE_REAL ||
            ctxt->mode == X86EMUL_MODE_VM86) {
-               kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+               emulate_ud(ctxt);
                return X86EMUL_PROPAGATE_FAULT;
        }
 
-       setup_syscalls_segments(ctxt, &cs, &ss);
+       setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
-       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+       ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
        msr_data >>= 32;
-       cs.selector = (u16)(msr_data & 0xfffc);
-       ss.selector = (u16)(msr_data + 8);
+       cs_sel = (u16)(msr_data & 0xfffc);
+       ss_sel = (u16)(msr_data + 8);
 
        if (is_long_mode(ctxt->vcpu)) {
-               cs.db = 0;
+               cs.d = 0;
                cs.l = 1;
        }
-       kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-       kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+       ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+       ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+       ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+       ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
        c->regs[VCPU_REGS_RCX] = c->eip;
        if (is_long_mode(ctxt->vcpu)) {
 #ifdef CONFIG_X86_64
                c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
 
-               kvm_x86_ops->get_msr(ctxt->vcpu,
-                       ctxt->mode == X86EMUL_MODE_PROT64 ?
-                       MSR_LSTAR : MSR_CSTAR, &msr_data);
+               ops->get_msr(ctxt->vcpu,
+                            ctxt->mode == X86EMUL_MODE_PROT64 ?
+                            MSR_LSTAR : MSR_CSTAR, &msr_data);
                c->eip = msr_data;
 
-               kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+               ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
                ctxt->eflags &= ~(msr_data | EFLG_RF);
 #endif
        } else {
                /* legacy mode */
-               kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+               ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
                c->eip = (u32)msr_data;
 
                ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
 }
 
 static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
-       struct kvm_segment cs, ss;
+       struct desc_struct cs, ss;
        u64 msr_data;
+       u16 cs_sel, ss_sel;
 
        /* inject #GP if in real mode */
        if (ctxt->mode == X86EMUL_MODE_REAL) {
-               kvm_inject_gp(ctxt->vcpu, 0);
+               emulate_gp(ctxt, 0);
                return X86EMUL_PROPAGATE_FAULT;
        }
 
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
        * Therefore, we inject an #UD.
        */
        if (ctxt->mode == X86EMUL_MODE_PROT64) {
-               kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+               emulate_ud(ctxt);
                return X86EMUL_PROPAGATE_FAULT;
        }
 
-       setup_syscalls_segments(ctxt, &cs, &ss);
+       setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
-       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+       ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
        switch (ctxt->mode) {
        case X86EMUL_MODE_PROT32:
                if ((msr_data & 0xfffc) == 0x0) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        return X86EMUL_PROPAGATE_FAULT;
                }
                break;
        case X86EMUL_MODE_PROT64:
                if (msr_data == 0x0) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        return X86EMUL_PROPAGATE_FAULT;
                }
                break;
        }
 
        ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
-       cs.selector = (u16)msr_data;
-       cs.selector &= ~SELECTOR_RPL_MASK;
-       ss.selector = cs.selector + 8;
-       ss.selector &= ~SELECTOR_RPL_MASK;
+       cs_sel = (u16)msr_data;
+       cs_sel &= ~SELECTOR_RPL_MASK;
+       ss_sel = cs_sel + 8;
+       ss_sel &= ~SELECTOR_RPL_MASK;
        if (ctxt->mode == X86EMUL_MODE_PROT64
                || is_long_mode(ctxt->vcpu)) {
-               cs.db = 0;
+               cs.d = 0;
                cs.l = 1;
        }
 
-       kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-       kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+       ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+       ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+       ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+       ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
-       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+       ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
        c->eip = msr_data;
 
-       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+       ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
        c->regs[VCPU_REGS_RSP] = msr_data;
 
        return X86EMUL_CONTINUE;
 }
 
 static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
-       struct kvm_segment cs, ss;
+       struct desc_struct cs, ss;
        u64 msr_data;
        int usermode;
+       u16 cs_sel, ss_sel;
 
        /* inject #GP if in real mode or Virtual 8086 mode */
        if (ctxt->mode == X86EMUL_MODE_REAL ||
            ctxt->mode == X86EMUL_MODE_VM86) {
-               kvm_inject_gp(ctxt->vcpu, 0);
+               emulate_gp(ctxt, 0);
                return X86EMUL_PROPAGATE_FAULT;
        }
 
-       setup_syscalls_segments(ctxt, &cs, &ss);
+       setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
        if ((c->rex_prefix & 0x8) != 0x0)
                usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 
        cs.dpl = 3;
        ss.dpl = 3;
-       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+       ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
        switch (usermode) {
        case X86EMUL_MODE_PROT32:
-               cs.selector = (u16)(msr_data + 16);
+               cs_sel = (u16)(msr_data + 16);
                if ((msr_data & 0xfffc) == 0x0) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        return X86EMUL_PROPAGATE_FAULT;
                }
-               ss.selector = (u16)(msr_data + 24);
+               ss_sel = (u16)(msr_data + 24);
                break;
        case X86EMUL_MODE_PROT64:
-               cs.selector = (u16)(msr_data + 32);
+               cs_sel = (u16)(msr_data + 32);
                if (msr_data == 0x0) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        return X86EMUL_PROPAGATE_FAULT;
                }
-               ss.selector = cs.selector + 8;
-               cs.db = 0;
+               ss_sel = cs_sel + 8;
+               cs.d = 0;
                cs.l = 1;
                break;
        }
-       cs.selector |= SELECTOR_RPL_MASK;
-       ss.selector |= SELECTOR_RPL_MASK;
+       cs_sel |= SELECTOR_RPL_MASK;
+       ss_sel |= SELECTOR_RPL_MASK;
 
-       kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-       kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+       ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+       ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+       ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+       ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
-       c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
-       c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+       c->eip = c->regs[VCPU_REGS_RDX];
+       c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
 
        return X86EMUL_CONTINUE;
 }
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
                                            struct x86_emulate_ops *ops,
                                            u16 port, u16 len)
 {
-       struct kvm_segment tr_seg;
+       struct desc_struct tr_seg;
        int r;
        u16 io_bitmap_ptr;
        u8 perm, bit_idx = port & 0x7;
        unsigned mask = (1 << len) - 1;
 
-       kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
-       if (tr_seg.unusable)
+       ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+       if (!tr_seg.p)
                return false;
-       if (tr_seg.limit < 103)
+       if (desc_limit_scaled(&tr_seg) < 103)
                return false;
-       r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
-                         NULL);
+       r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
+                         ctxt->vcpu, NULL);
        if (r != X86EMUL_CONTINUE)
                return false;
-       if (io_bitmap_ptr + port/8 > tr_seg.limit)
+       if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
                return false;
-       r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
-                         ctxt->vcpu, NULL);
+       r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
+                         &perm, 1, ctxt->vcpu, NULL);
        if (r != X86EMUL_CONTINUE)
                return false;
        if ((perm >> bit_idx) & mask)
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
        return true;
 }
 
-static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
-                                     struct x86_emulate_ops *ops,
-                                     int seg)
-{
-       struct desc_struct desc;
-       if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
-               return get_desc_base(&desc);
-       else
-               return ~0;
-}
-
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
                                struct x86_emulate_ops *ops,
                                struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
                            &err);
        if (ret == X86EMUL_PROPAGATE_FAULT) {
                /* FIXME: need to provide precise fault address */
-               kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+               emulate_pf(ctxt, old_tss_base, err);
                return ret;
        }
 
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
                             &err);
        if (ret == X86EMUL_PROPAGATE_FAULT) {
                /* FIXME: need to provide precise fault address */
-               kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+               emulate_pf(ctxt, old_tss_base, err);
                return ret;
        }
 
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
                            &err);
        if (ret == X86EMUL_PROPAGATE_FAULT) {
                /* FIXME: need to provide precise fault address */
-               kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+               emulate_pf(ctxt, new_tss_base, err);
                return ret;
        }
 
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
                                     ctxt->vcpu, &err);
                if (ret == X86EMUL_PROPAGATE_FAULT) {
                        /* FIXME: need to provide precise fault address */
-                       kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+                       emulate_pf(ctxt, new_tss_base, err);
                        return ret;
                }
        }
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
        struct decode_cache *c = &ctxt->decode;
        int ret;
 
-       ops->set_cr(3, tss->cr3, ctxt->vcpu);
+       if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
+               emulate_gp(ctxt, 0);
+               return X86EMUL_PROPAGATE_FAULT;
+       }
        c->eip = tss->eip;
        ctxt->eflags = tss->eflags | 2;
        c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
                            &err);
        if (ret == X86EMUL_PROPAGATE_FAULT) {
                /* FIXME: need to provide precise fault address */
-               kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+               emulate_pf(ctxt, old_tss_base, err);
                return ret;
        }
 
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
                             &err);
        if (ret == X86EMUL_PROPAGATE_FAULT) {
                /* FIXME: need to provide precise fault address */
-               kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+               emulate_pf(ctxt, old_tss_base, err);
                return ret;
        }
 
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
                            &err);
        if (ret == X86EMUL_PROPAGATE_FAULT) {
                /* FIXME: need to provide precise fault address */
-               kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+               emulate_pf(ctxt, new_tss_base, err);
                return ret;
        }
 
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
                                     ctxt->vcpu, &err);
                if (ret == X86EMUL_PROPAGATE_FAULT) {
                        /* FIXME: need to provide precise fault address */
-                       kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+                       emulate_pf(ctxt, new_tss_base, err);
                        return ret;
                }
        }
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
        int ret;
        u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
        ulong old_tss_base =
-               get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+               ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
        u32 desc_limit;
 
        /* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
        if (reason != TASK_SWITCH_IRET) {
                if ((tss_selector & 3) > next_tss_desc.dpl ||
                    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        return X86EMUL_PROPAGATE_FAULT;
                }
        }
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
        if (!next_tss_desc.p ||
            ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
             desc_limit < 0x2b)) {
-               kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
-                                     tss_selector & 0xfffc);
+               emulate_ts(ctxt, tss_selector & 0xfffc);
                return X86EMUL_PROPAGATE_FAULT;
        }
 
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
                c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
                c->lock_prefix = 0;
                c->src.val = (unsigned long) error_code;
-               emulate_push(ctxt);
+               emulate_push(ctxt, ops);
        }
 
        return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
        struct decode_cache *c = &ctxt->decode;
        int rc;
 
-       memset(c, 0, sizeof(struct decode_cache));
        c->eip = ctxt->eip;
-       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
        c->dst.type = OP_NONE;
 
        rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
                                     has_error_code, error_code);
 
        if (rc == X86EMUL_CONTINUE) {
-               memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-               kvm_rip_write(ctxt->vcpu, c->eip);
                rc = writeback(ctxt, ops);
+               if (rc == X86EMUL_CONTINUE)
+                       ctxt->eip = c->eip;
        }
 
        return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        int rc = X86EMUL_CONTINUE;
        int saved_dst_type = c->dst.type;
 
-       ctxt->interruptibility = 0;
-
-       /* Shadow copy of register state. Committed on successful emulation.
-        * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
-        * modify them.
-        */
-
-       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+       ctxt->decode.mem_read.pos = 0;
 
        if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
-               kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+               emulate_ud(ctxt);
                goto done;
        }
 
        /* LOCK prefix is allowed only with some instructions */
        if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
-               kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+               emulate_ud(ctxt);
                goto done;
        }
 
        /* Privileged instruction can be executed only in CPL=0 */
        if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
-               kvm_inject_gp(ctxt->vcpu, 0);
+               emulate_gp(ctxt, 0);
                goto done;
        }
 
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
                if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
                string_done:
                        ctxt->restart = false;
-                       kvm_rip_write(ctxt->vcpu, c->eip);
+                       ctxt->eip = c->eip;
                        goto done;
                }
                /* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        }
 
        if (c->src.type == OP_MEM) {
-               rc = ops->read_emulated((unsigned long)c->src.ptr,
-                                       &c->src.val,
-                                       c->src.bytes,
-                                       ctxt->vcpu);
+               rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
+                                       c->src.valptr, c->src.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
                c->src.orig_val = c->src.val;
        }
 
        if (c->src2.type == OP_MEM) {
-               rc = ops->read_emulated((unsigned long)c->src2.ptr,
-                                       &c->src2.val,
-                                       c->src2.bytes,
-                                       ctxt->vcpu);
+               rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
+                                       &c->src2.val, c->src2.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
        if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
                /* optimisation - avoid slow emulated read if Mov */
-               rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
-                                       c->dst.bytes, ctxt->vcpu);
+               rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
+                                  &c->dst.val, c->dst.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
@@ -2571,7 +2650,7 @@ special_insn:
                emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
                break;
        case 0x06:              /* push es */
-               emulate_push_sreg(ctxt, VCPU_SREG_ES);
+               emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
                break;
        case 0x07:              /* pop es */
                rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
                emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
                break;
        case 0x0e:              /* push cs */
-               emulate_push_sreg(ctxt, VCPU_SREG_CS);
+               emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
                break;
        case 0x10 ... 0x15:
              adc:              /* adc */
                emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
                break;
        case 0x16:              /* push ss */
-               emulate_push_sreg(ctxt, VCPU_SREG_SS);
+               emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
                break;
        case 0x17:              /* pop ss */
                rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
                emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
                break;
        case 0x1e:              /* push ds */
-               emulate_push_sreg(ctxt, VCPU_SREG_DS);
+               emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
                break;
        case 0x1f:              /* pop ds */
                rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
                emulate_1op("dec", c->dst, ctxt->eflags);
                break;
        case 0x50 ... 0x57:  /* push reg */
-               emulate_push(ctxt);
+               emulate_push(ctxt, ops);
                break;
        case 0x58 ... 0x5f: /* pop reg */
        pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
                        goto done;
                break;
        case 0x60:      /* pusha */
-               emulate_pusha(ctxt);
+               rc = emulate_pusha(ctxt, ops);
+               if (rc != X86EMUL_CONTINUE)
+                       goto done;
                break;
        case 0x61:      /* popa */
                rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
                break;
        case 0x68: /* push imm */
        case 0x6a: /* push imm8 */
-               emulate_push(ctxt);
+               emulate_push(ctxt, ops);
                break;
        case 0x6c:              /* insb */
        case 0x6d:              /* insw/insd */
                c->dst.bytes = min(c->dst.bytes, 4u);
                if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
                                          c->dst.bytes)) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        goto done;
                }
                if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
                c->src.bytes = min(c->src.bytes, 4u);
                if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
                                          c->src.bytes)) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        goto done;
                }
                ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
                }
                break;
        case 0x84 ... 0x85:
+       test:
                emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
                break;
        case 0x86 ... 0x87:     /* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
                break;
        case 0x88 ... 0x8b:     /* mov */
                goto mov;
-       case 0x8c: { /* mov r/m, sreg */
-               struct kvm_segment segreg;
-
-               if (c->modrm_reg <= VCPU_SREG_GS)
-                       kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
-               else {
-                       kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+       case 0x8c:  /* mov r/m, sreg */
+               if (c->modrm_reg > VCPU_SREG_GS) {
+                       emulate_ud(ctxt);
                        goto done;
                }
-               c->dst.val = segreg.selector;
+               c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
                break;
-       }
        case 0x8d: /* lea r16/r32, m */
                c->dst.val = c->modrm_ea;
                break;
@@ -2757,12 +2834,12 @@ special_insn:
 
                if (c->modrm_reg == VCPU_SREG_CS ||
                    c->modrm_reg > VCPU_SREG_GS) {
-                       kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+                       emulate_ud(ctxt);
                        goto done;
                }
 
                if (c->modrm_reg == VCPU_SREG_SS)
-                       toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
+                       ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
 
                rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
 
@@ -2775,19 +2852,19 @@ special_insn:
                        goto done;
                break;
        case 0x90: /* nop / xchg r8,rax */
-               if (!(c->rex_prefix & 1)) { /* nop */
-                       c->dst.type = OP_NONE;
+               if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
+                       c->dst.type = OP_NONE;  /* nop */
                        break;
                }
        case 0x91 ... 0x97: /* xchg reg,rax */
-               c->src.type = c->dst.type = OP_REG;
-               c->src.bytes = c->dst.bytes = c->op_bytes;
+               c->src.type = OP_REG;
+               c->src.bytes = c->op_bytes;
                c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
                c->src.val = *(c->src.ptr);
                goto xchg;
        case 0x9c: /* pushf */
                c->src.val =  (unsigned long) ctxt->eflags;
-               emulate_push(ctxt);
+               emulate_push(ctxt, ops);
                break;
        case 0x9d: /* popf */
                c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
                if (rc != X86EMUL_CONTINUE)
                        goto done;
                break;
-       case 0xa0 ... 0xa1:     /* mov */
-               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-               c->dst.val = c->src.val;
-               break;
-       case 0xa2 ... 0xa3:     /* mov */
-               c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
-               break;
+       case 0xa0 ... 0xa3:     /* mov */
        case 0xa4 ... 0xa5:     /* movs */
                goto mov;
        case 0xa6 ... 0xa7:     /* cmps */
                c->dst.type = OP_NONE; /* Disable writeback. */
                DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
                goto cmp;
+       case 0xa8 ... 0xa9:     /* test ax, imm */
+               goto test;
        case 0xaa ... 0xab:     /* stos */
                c->dst.val = c->regs[VCPU_REGS_RAX];
                break;
@@ -2855,19 +2928,23 @@ special_insn:
                long int rel = c->src.val;
                c->src.val = (unsigned long) c->eip;
                jmp_rel(c, rel);
-               emulate_push(ctxt);
+               emulate_push(ctxt, ops);
                break;
        }
        case 0xe9: /* jmp rel */
                goto jmp;
-       case 0xea: /* jmp far */
+       case 0xea: { /* jmp far */
+               unsigned short sel;
        jump_far:
-               if (load_segment_descriptor(ctxt, ops, c->src2.val,
-                                           VCPU_SREG_CS))
+               memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+               if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
                        goto done;
 
-               c->eip = c->src.val;
+               c->eip = 0;
+               memcpy(&c->eip, c->src.valptr, c->op_bytes);
                break;
+       }
        case 0xeb:
              jmp:              /* jmp rel short */
                jmp_rel(c, c->src.val);
@@ -2879,20 +2956,20 @@ special_insn:
        do_io_in:
                c->dst.bytes = min(c->dst.bytes, 4u);
                if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        goto done;
                }
                if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
                                     &c->dst.val))
                        goto done; /* IO is needed */
                break;
-       case 0xee: /* out al,dx */
-       case 0xef: /* out (e/r)ax,dx */
+       case 0xee: /* out dx,al */
+       case 0xef: /* out dx,(e/r)ax */
                c->src.val = c->regs[VCPU_REGS_RDX];
        do_io_out:
                c->dst.bytes = min(c->dst.bytes, 4u);
                if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+                       emulate_gp(ctxt, 0);
                        goto done;
                }
                ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
                c->dst.type = OP_NONE;  /* Disable writeback. */
                break;
        case 0xfa: /* cli */
-               if (emulator_bad_iopl(ctxt, ops))
-                       kvm_inject_gp(ctxt->vcpu, 0);
-               else {
+               if (emulator_bad_iopl(ctxt, ops)) {
+                       emulate_gp(ctxt, 0);
+                       goto done;
+               } else {
                        ctxt->eflags &= ~X86_EFLAGS_IF;
                        c->dst.type = OP_NONE;  /* Disable writeback. */
                }
                break;
        case 0xfb: /* sti */
-               if (emulator_bad_iopl(ctxt, ops))
-                       kvm_inject_gp(ctxt->vcpu, 0);
-               else {
-                       toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
+               if (emulator_bad_iopl(ctxt, ops)) {
+                       emulate_gp(ctxt, 0);
+                       goto done;
+               } else {
+                       ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
                        ctxt->eflags |= X86_EFLAGS_IF;
                        c->dst.type = OP_NONE;  /* Disable writeback. */
                }
@@ -2964,11 +3043,12 @@ writeback:
        c->dst.type = saved_dst_type;
 
        if ((c->d & SrcMask) == SrcSI)
-               string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
-                               &c->src);
+               string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+                               VCPU_REGS_RSI, &c->src);
 
        if ((c->d & DstMask) == DstDI)
-               string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+               string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+                               &c->dst);
 
        if (c->rep_prefix && (c->d & String)) {
                struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
                    (rc->end != 0 && rc->end == rc->pos))
                        ctxt->restart = false;
        }
-
-       /* Commit shadow register state. */
-       memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-       kvm_rip_write(ctxt->vcpu, c->eip);
-       ops->set_rflags(ctxt->vcpu, ctxt->eflags);
+       /*
+        * reset read cache here in case string instruction is restarted
+        * without decoding
+        */
+       ctxt->decode.mem_read.end = 0;
+       ctxt->eip = c->eip;
 
 done:
        return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -3051,7 +3132,7 @@ twobyte_insn:
                        c->dst.type = OP_NONE;
                        break;
                case 5: /* not defined */
-                       kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+                       emulate_ud(ctxt);
                        goto done;
                case 7: /* invlpg*/
                        emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
                }
                break;
        case 0x05:              /* syscall */
-               rc = emulate_syscall(ctxt);
+               rc = emulate_syscall(ctxt, ops);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
                else
@@ -3073,8 +3154,11 @@ twobyte_insn:
                emulate_clts(ctxt->vcpu);
                c->dst.type = OP_NONE;
                break;
-       case 0x08:              /* invd */
        case 0x09:              /* wbinvd */
+               kvm_emulate_wbinvd(ctxt->vcpu);
+               c->dst.type = OP_NONE;
+               break;
+       case 0x08:              /* invd */
        case 0x0d:              /* GrpP (prefetch) */
        case 0x18:              /* Grp16 (prefetch/nop) */
                c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
                case 1:
                case 5 ... 7:
                case 9 ... 15:
-                       kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+                       emulate_ud(ctxt);
                        goto done;
                }
                c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
        case 0x21: /* mov from dr to reg */
                if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
                    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
-                       kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+                       emulate_ud(ctxt);
                        goto done;
                }
-               emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+               ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
                c->dst.type = OP_NONE;  /* no writeback */
                break;
        case 0x22: /* mov reg, cr */
-               ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
+               if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
+                       emulate_gp(ctxt, 0);
+                       goto done;
+               }
                c->dst.type = OP_NONE;
                break;
        case 0x23: /* mov from reg to dr */
                if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
                    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
-                       kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+                       emulate_ud(ctxt);
+                       goto done;
+               }
+
+               if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
+                               ((ctxt->mode == X86EMUL_MODE_PROT64) ?
+                                ~0ULL : ~0U), ctxt->vcpu) < 0) {
+                       /* #UD condition is already handled by the code above */
+                       emulate_gp(ctxt, 0);
                        goto done;
                }
-               emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
+
                c->dst.type = OP_NONE;  /* no writeback */
                break;
        case 0x30:
                /* wrmsr */
                msr_data = (u32)c->regs[VCPU_REGS_RAX]
                        | ((u64)c->regs[VCPU_REGS_RDX] << 32);
-               if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+               if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+                       emulate_gp(ctxt, 0);
                        goto done;
                }
                rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
                break;
        case 0x32:
                /* rdmsr */
-               if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
+               if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+                       emulate_gp(ctxt, 0);
                        goto done;
                } else {
                        c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
                c->dst.type = OP_NONE;
                break;
        case 0x34:              /* sysenter */
-               rc = emulate_sysenter(ctxt);
+               rc = emulate_sysenter(ctxt, ops);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
                else
                        goto writeback;
                break;
        case 0x35:              /* sysexit */
-               rc = emulate_sysexit(ctxt);
+               rc = emulate_sysexit(ctxt, ops);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
                else
@@ -3160,7 +3255,7 @@ twobyte_insn:
                c->dst.type = OP_NONE;
                break;
        case 0xa0:        /* push fs */
-               emulate_push_sreg(ctxt, VCPU_SREG_FS);
+               emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
                break;
        case 0xa1:       /* pop fs */
                rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
                emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
                break;
        case 0xa8:      /* push gs */
-               emulate_push_sreg(ctxt, VCPU_SREG_GS);
+               emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
                break;
        case 0xa9:      /* pop gs */
                rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
index 0150affad25d082c4dd8eea9546b30ee12491731..0fd6378981f4a6c7ba7cadbb139bc811760c0a43 100644 (file)
@@ -5,6 +5,7 @@
  * Copyright (c) 2006 Intel Corporation
  * Copyright (c) 2007 Keir Fraser, XenSource Inc
  * Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
+#include <linux/workqueue.h>
 
 #include "irq.h"
 #include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
        struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
                                                 irq_ack_notifier);
-       raw_spin_lock(&ps->inject_lock);
-       if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+       int value;
+
+       spin_lock(&ps->inject_lock);
+       value = atomic_dec_return(&ps->pit_timer.pending);
+       if (value < 0)
+               /* spurious acks can be generated if, for example, the
+                * PIC is being reset.  Handle it gracefully here
+                */
                atomic_inc(&ps->pit_timer.pending);
+       else if (value > 0)
+               /* in this case, we had multiple outstanding pit interrupts
+                * that we needed to inject.  Reinject
+                */
+               queue_work(ps->pit->wq, &ps->pit->expired);
        ps->irq_ack = 1;
-       raw_spin_unlock(&ps->inject_lock);
+       spin_unlock(&ps->inject_lock);
 }
 
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
-static void destroy_pit_timer(struct kvm_timer *pt)
+static void destroy_pit_timer(struct kvm_pit *pit)
 {
-       pr_debug("execute del timer!\n");
-       hrtimer_cancel(&pt->timer);
+       hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+       cancel_work_sync(&pit->expired);
 }
 
 static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
        .is_periodic = kpit_is_periodic,
 };
 
+static void pit_do_work(struct work_struct *work)
+{
+       struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+       struct kvm *kvm = pit->kvm;
+       struct kvm_vcpu *vcpu;
+       int i;
+       struct kvm_kpit_state *ps = &pit->pit_state;
+       int inject = 0;
+
+       /* Try to inject pending interrupts when
+        * last one has been acked.
+        */
+       spin_lock(&ps->inject_lock);
+       if (ps->irq_ack) {
+               ps->irq_ack = 0;
+               inject = 1;
+       }
+       spin_unlock(&ps->inject_lock);
+       if (inject) {
+               kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+               kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+
+               /*
+                * Provides NMI watchdog support via Virtual Wire mode.
+                * The route is: PIT -> PIC -> LVT0 in NMI mode.
+                *
+                * Note: Our Virtual Wire implementation is simplified, only
+                * propagating PIT interrupts to all VCPUs when they have set
+                * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+                * VCPU0, and only if its LVT0 is in EXTINT mode.
+                */
+               if (kvm->arch.vapics_in_nmi_mode > 0)
+                       kvm_for_each_vcpu(i, vcpu, kvm)
+                               kvm_apic_nmi_wd_deliver(vcpu);
+       }
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+       struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+       struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+
+       if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+               atomic_inc(&ktimer->pending);
+               queue_work(pt->wq, &pt->expired);
+       }
+
+       if (ktimer->t_ops->is_periodic(ktimer)) {
+               hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+               return HRTIMER_RESTART;
+       } else
+               return HRTIMER_NORESTART;
+}
+
 static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
        struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 
        /* TODO The new value only affected after the retriggered */
        hrtimer_cancel(&pt->timer);
+       cancel_work_sync(&ps->pit->expired);
        pt->period = interval;
        ps->is_periodic = is_period;
 
-       pt->timer.function = kvm_timer_fn;
+       pt->timer.function = pit_timer_fn;
        pt->t_ops = &kpit_ops;
        pt->kvm = ps->pit->kvm;
-       pt->vcpu = pt->kvm->bsp_vcpu;
 
        atomic_set(&pt->pending, 0);
        ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
                }
                break;
        default:
-               destroy_pit_timer(&ps->pit_timer);
+               destroy_pit_timer(kvm->arch.vpit);
        }
 }
 
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
        mutex_init(&pit->pit_state.lock);
        mutex_lock(&pit->pit_state.lock);
-       raw_spin_lock_init(&pit->pit_state.inject_lock);
+       spin_lock_init(&pit->pit_state.inject_lock);
+
+       pit->wq = create_singlethread_workqueue("kvm-pit-wq");
+       if (!pit->wq) {
+               mutex_unlock(&pit->pit_state.lock);
+               kfree(pit);
+               return NULL;
+       }
+       INIT_WORK(&pit->expired, pit_do_work);
 
        kvm->arch.vpit = pit;
        pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
        struct hrtimer *timer;
 
        if (kvm->arch.vpit) {
+               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
+               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+                                             &kvm->arch.vpit->speaker_dev);
                kvm_unregister_irq_mask_notifier(kvm, 0,
                                               &kvm->arch.vpit->mask_notifier);
                kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
                mutex_lock(&kvm->arch.vpit->pit_state.lock);
                timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
                hrtimer_cancel(timer);
+               cancel_work_sync(&kvm->arch.vpit->expired);
                kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
                mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+               destroy_workqueue(kvm->arch.vpit->wq);
                kfree(kvm->arch.vpit);
        }
 }
-
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
-       kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
-       /*
-        * Provides NMI watchdog support via Virtual Wire mode.
-        * The route is: PIT -> PIC -> LVT0 in NMI mode.
-        *
-        * Note: Our Virtual Wire implementation is simplified, only
-        * propagating PIT interrupts to all VCPUs when they have set
-        * LVT0 to NMI delivery. Other PIC interrupts are just sent to
-        * VCPU0, and only if its LVT0 is in EXTINT mode.
-        */
-       if (kvm->arch.vapics_in_nmi_mode > 0)
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       kvm_apic_nmi_wd_deliver(vcpu);
-}
-
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
-{
-       struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-       struct kvm *kvm = vcpu->kvm;
-       struct kvm_kpit_state *ps;
-
-       if (pit) {
-               int inject = 0;
-               ps = &pit->pit_state;
-
-               /* Try to inject pending interrupts when
-                * last one has been acked.
-                */
-               raw_spin_lock(&ps->inject_lock);
-               if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
-                       ps->irq_ack = 0;
-                       inject = 1;
-               }
-               raw_spin_unlock(&ps->inject_lock);
-               if (inject)
-                       __inject_pit_timer_intr(kvm);
-       }
-}
index 900d6b0ba7c2347ed4fb487a92dc3a2d30c83a9f..46d08ca0b48f2a2c3bd04896c1feeb87c3cdef57 100644 (file)
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
        u32    speaker_data_on;
        struct mutex lock;
        struct kvm_pit *pit;
-       raw_spinlock_t inject_lock;
+       spinlock_t inject_lock;
        unsigned long irq_ack;
        struct kvm_irq_ack_notifier irq_ack_notifier;
 };
@@ -40,6 +40,8 @@ struct kvm_pit {
        struct kvm_kpit_state pit_state;
        int irq_source_id;
        struct kvm_irq_mask_notifier mask_notifier;
+       struct workqueue_struct *wq;
+       struct work_struct expired;
 };
 
 #define KVM_PIT_BASE_ADDRESS       0x40
index 93825ff3338fae9940b612c0814f9a3864cbd213..8d10c063d7f207451b087a11a8d7bf0c888f3695 100644 (file)
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2003-2004 Fabrice Bellard
  * Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
+static void pic_irq_request(struct kvm *kvm, int level);
+
 static void pic_lock(struct kvm_pic *s)
        __acquires(&s->lock)
 {
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
        __releases(&s->lock)
 {
        bool wakeup = s->wakeup_needed;
-       struct kvm_vcpu *vcpu;
+       struct kvm_vcpu *vcpu, *found = NULL;
+       int i;
 
        s->wakeup_needed = false;
 
        raw_spin_unlock(&s->lock);
 
        if (wakeup) {
-               vcpu = s->kvm->bsp_vcpu;
-               if (vcpu)
-                       kvm_vcpu_kick(vcpu);
+               kvm_for_each_vcpu(i, vcpu, s->kvm) {
+                       if (kvm_apic_accept_pic_intr(vcpu)) {
+                               found = vcpu;
+                               break;
+                       }
+               }
+
+               if (!found)
+                       found = s->kvm->bsp_vcpu;
+
+               kvm_vcpu_kick(found);
        }
 }
 
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
                pic_set_irq1(&s->pics[0], 2, 0);
        }
        irq = pic_get_irq(&s->pics[0]);
-       if (irq >= 0)
-               s->irq_request(s->irq_request_opaque, 1);
-       else
-               s->irq_request(s->irq_request_opaque, 0);
+       pic_irq_request(s->kvm, irq >= 0);
 }
 
 void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
        int irq;
-       struct kvm *kvm = s->pics_state->irq_request_opaque;
-       struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+       struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
        u8 irr = s->irr, isr = s->imr;
 
        s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
                        /*
                         * deassert a pending interrupt
                         */
-                       s->pics_state->irq_request(s->pics_state->
-                                                  irq_request_opaque, 0);
+                       pic_irq_request(s->pics_state->kvm, 0);
                        s->init_state = 1;
                        s->init4 = val & 1;
                        if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
                }
        } else
                switch (s->init_state) {
-               case 0:         /* normal mode */
+               case 0: { /* normal mode */
+                       u8 imr_diff = s->imr ^ val,
+                               off = (s == &s->pics_state->pics[0]) ? 0 : 8;
                        s->imr = val;
+                       for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+                               if (imr_diff & (1 << irq))
+                                       kvm_fire_mask_notifiers(
+                                               s->pics_state->kvm,
+                                               SELECT_PIC(irq + off),
+                                               irq + off,
+                                               !!(s->imr & (1 << irq)));
                        pic_update_irq(s->pics_state);
                        break;
+               }
                case 1:
                        s->irq_base = val & 0xf8;
                        s->init_state = 2;
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
 /*
  * callback when PIC0 irq status changed
  */
-static void pic_irq_request(void *opaque, int level)
+static void pic_irq_request(struct kvm *kvm, int level)
 {
-       struct kvm *kvm = opaque;
        struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
        struct kvm_pic *s = pic_irqchip(kvm);
        int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
        s->kvm = kvm;
        s->pics[0].elcr_mask = 0xf8;
        s->pics[1].elcr_mask = 0xde;
-       s->irq_request = pic_irq_request;
-       s->irq_request_opaque = kvm;
        s->pics[0].pics_state = s;
        s->pics[1].pics_state = s;
 
index 96dfbb6ad2a9d2b1db365ef0c007f9d7056d9dda..2095a049835e4916b4bd504688e82a2a6cc3796a 100644 (file)
@@ -1,6 +1,7 @@
 /*
  * irq.c: API for in kernel interrupt controller
  * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
        kvm_inject_apic_timer_irqs(vcpu);
-       kvm_inject_pit_timer_irqs(vcpu);
        /* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
index cd1f362f413d7bcee70920360388d944867a2ed9..ffed06871c5cf389594244086ecb3685496b4258 100644 (file)
@@ -38,8 +38,6 @@
 struct kvm;
 struct kvm_vcpu;
 
-typedef void irq_request_func(void *opaque, int level);
-
 struct kvm_kpic_state {
        u8 last_irr;    /* edge detection */
        u8 irr;         /* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
        unsigned pending_acks;
        struct kvm *kvm;
        struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-       irq_request_func *irq_request;
-       void *irq_request_opaque;
        int output;             /* intr from master PIC */
        struct kvm_io_device dev;
        void (*ack_notifier)(void *opaque, int irq);
index cff851cf5322f59db69fc2b69d3ea306480e96d6..6491ac8e755b82f486cabb4cd00247a84ad5f503 100644 (file)
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
 
 static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 {
+       might_sleep();  /* on svm */
+
        if (!test_bit(VCPU_EXREG_PDPTR,
                      (unsigned long *)&vcpu->arch.regs_avail))
                kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
        return kvm_read_cr4_bits(vcpu, ~0UL);
 }
 
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+       return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+               | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
 #endif
index 1eb7a4ae0c9c0382ac64b44040f02e3accc4d005..77d8c0f4817d5f10f88e725ad49b22642c1172c5 100644 (file)
@@ -5,6 +5,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2007 Novell
  * Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                   "dest_mode 0x%x, short_hand 0x%x\n",
                   target, source, dest, dest_mode, short_hand);
 
-       ASSERT(!target);
+       ASSERT(target);
        switch (short_hand) {
        case APIC_DEST_NOSHORT:
                if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
        struct kvm_vcpu *vcpu = apic->vcpu;
        struct kvm_run *run = vcpu->run;
 
-       set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+       kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
        run->tpr_access.rip = kvm_rip_read(vcpu);
        run->tpr_access.is_write = write;
 }
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
        u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
        int r = 0;
 
-       if (kvm_vcpu_is_bsp(vcpu)) {
-               if (!apic_hw_enabled(vcpu->arch.apic))
-                       r = 1;
-               if ((lvt0 & APIC_LVT_MASKED) == 0 &&
-                   GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
-                       r = 1;
-       }
+       if (!apic_hw_enabled(vcpu->arch.apic))
+               r = 1;
+       if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+           GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+               r = 1;
        return r;
 }
 
index b1ed0a1a591338c801d49268c2f784477fd4a0a1..0dcc95e09876fc7094bc9b05bf3acf34ede51439 100644 (file)
@@ -7,6 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
 #include <linux/compiler.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
 #define PT64_LEVEL_BITS 9
 
 #define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
             shadow_walk_okay(&(_walker));                      \
             shadow_walk_next(&(_walker)))
 
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
@@ -288,6 +288,35 @@ static void __set_spte(u64 *sptep, u64 spte)
 #endif
 }
 
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+       return xchg(sptep, new_spte);
+#else
+       u64 old_spte;
+
+       do {
+               old_spte = *sptep;
+       } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+       return old_spte;
+#endif
+}
+
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+       u64 old_spte;
+
+       if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
+             !is_rmap_spte(*sptep))
+               __set_spte(sptep, new_spte);
+       else {
+               old_spte = __xchg_spte(sptep, new_spte);
+               if (old_spte & shadow_accessed_mask)
+                       mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+       }
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
                                  struct kmem_cache *base_cache, int min)
 {
@@ -304,10 +333,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
        return 0;
 }
 
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+                                 struct kmem_cache *cache)
 {
        while (mc->nobjs)
-               kfree(mc->objects[--mc->nobjs]);
+               kmem_cache_free(cache, mc->objects[--mc->nobjs]);
 }
 
 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +385,11 @@ out:
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
-       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
        mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+                               mmu_page_header_cache);
 }
 
 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +410,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
 
 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 {
-       kfree(pc);
+       kmem_cache_free(pte_chain_cache, pc);
 }
 
 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +421,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
 
 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 {
-       kfree(rd);
+       kmem_cache_free(rmap_desc_cache, rd);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+       if (!sp->role.direct)
+               return sp->gfns[index];
+
+       return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+       if (sp->role.direct)
+               BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
+       else
+               sp->gfns[index] = gfn;
 }
 
 /*
@@ -403,8 +450,8 @@ static int *slot_largepage_idx(gfn_t gfn,
 {
        unsigned long idx;
 
-       idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
-             (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+       idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+             (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
        return &slot->lpage_info[level - 2][idx].write_count;
 }
 
@@ -414,9 +461,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
        int *write_count;
        int i;
 
-       gfn = unalias_gfn(kvm, gfn);
-
-       slot = gfn_to_memslot_unaliased(kvm, gfn);
+       slot = gfn_to_memslot(kvm, gfn);
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                write_count   = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +475,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
        int *write_count;
        int i;
 
-       gfn = unalias_gfn(kvm, gfn);
-       slot = gfn_to_memslot_unaliased(kvm, gfn);
+       slot = gfn_to_memslot(kvm, gfn);
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                write_count   = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +491,7 @@ static int has_wrprotected_page(struct kvm *kvm,
        struct kvm_memory_slot *slot;
        int *largepage_idx;
 
-       gfn = unalias_gfn(kvm, gfn);
-       slot = gfn_to_memslot_unaliased(kvm, gfn);
+       slot = gfn_to_memslot(kvm, gfn);
        if (slot) {
                largepage_idx = slot_largepage_idx(gfn, slot, level);
                return *largepage_idx;
@@ -501,7 +544,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 
 /*
  * Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
  */
 
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +555,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
        if (likely(level == PT_PAGE_TABLE_LEVEL))
                return &slot->rmap[gfn - slot->base_gfn];
 
-       idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
-               (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+       idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+               (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 
        return &slot->lpage_info[level - 2][idx].rmap_pde;
 }
@@ -541,9 +583,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
        if (!is_rmap_spte(*spte))
                return count;
-       gfn = unalias_gfn(vcpu->kvm, gfn);
        sp = page_header(__pa(spte));
-       sp->gfns[spte - sp->spt] = gfn;
+       kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
        rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
        if (!*rmapp) {
                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +641,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
        struct kvm_rmap_desc *desc;
        struct kvm_rmap_desc *prev_desc;
        struct kvm_mmu_page *sp;
-       pfn_t pfn;
+       gfn_t gfn;
        unsigned long *rmapp;
        int i;
 
-       if (!is_rmap_spte(*spte))
-               return;
        sp = page_header(__pa(spte));
-       pfn = spte_to_pfn(*spte);
-       if (*spte & shadow_accessed_mask)
-               kvm_set_pfn_accessed(pfn);
-       if (is_writable_pte(*spte))
-               kvm_set_pfn_dirty(pfn);
-       rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
+       gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+       rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
        if (!*rmapp) {
                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
                BUG();
@@ -644,6 +679,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
        }
 }
 
+static void set_spte_track_bits(u64 *sptep, u64 new_spte)
+{
+       pfn_t pfn;
+       u64 old_spte = *sptep;
+
+       if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
+             old_spte & shadow_accessed_mask) {
+               __set_spte(sptep, new_spte);
+       } else
+               old_spte = __xchg_spte(sptep, new_spte);
+
+       if (!is_rmap_spte(old_spte))
+               return;
+       pfn = spte_to_pfn(old_spte);
+       if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+               kvm_set_pfn_accessed(pfn);
+       if (is_writable_pte(old_spte))
+               kvm_set_pfn_dirty(pfn);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+       set_spte_track_bits(sptep, new_spte);
+       rmap_remove(kvm, sptep);
+}
+
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
        struct kvm_rmap_desc *desc;
@@ -676,7 +737,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
        u64 *spte;
        int i, write_protected = 0;
 
-       gfn = unalias_gfn(kvm, gfn);
        rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 
        spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +745,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
                if (is_writable_pte(*spte)) {
-                       __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+                       update_spte(spte, *spte & ~PT_WRITABLE_MASK);
                        write_protected = 1;
                }
                spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +769,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                        BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
                        pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
                        if (is_writable_pte(*spte)) {
-                               rmap_remove(kvm, spte);
+                               drop_spte(kvm, spte,
+                                         shadow_trap_nonpresent_pte);
                                --kvm->stat.lpages;
-                               __set_spte(spte, shadow_trap_nonpresent_pte);
                                spte = NULL;
                                write_protected = 1;
                        }
@@ -731,8 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
        while ((spte = rmap_next(kvm, rmapp, NULL))) {
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-               rmap_remove(kvm, spte);
-               __set_spte(spte, shadow_trap_nonpresent_pte);
+               drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
                need_tlb_flush = 1;
        }
        return need_tlb_flush;
@@ -754,8 +813,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
                rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
                need_flush = 1;
                if (pte_write(*ptep)) {
-                       rmap_remove(kvm, spte);
-                       __set_spte(spte, shadow_trap_nonpresent_pte);
+                       drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
                        spte = rmap_next(kvm, rmapp, NULL);
                } else {
                        new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +821,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
                        new_spte &= ~PT_WRITABLE_MASK;
                        new_spte &= ~SPTE_HOST_WRITEABLE;
-                       if (is_writable_pte(*spte))
-                               kvm_set_pfn_dirty(spte_to_pfn(*spte));
-                       __set_spte(spte, new_spte);
+                       new_spte &= ~shadow_accessed_mask;
+                       set_spte_track_bits(spte, new_spte);
                        spte = rmap_next(kvm, rmapp, spte);
                }
        }
@@ -799,8 +856,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                        ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 
                        for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-                               int idx = gfn_offset;
-                               idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+                               unsigned long idx;
+                               int sh;
+
+                               sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+                               idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+                                       (memslot->base_gfn >> sh);
                                ret |= handler(kvm,
                                        &memslot->lpage_info[j][idx].rmap_pde,
                                        data);
@@ -863,7 +924,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
        sp = page_header(__pa(spte));
 
-       gfn = unalias_gfn(vcpu->kvm, gfn);
        rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
        kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +954,12 @@ static int is_empty_shadow_page(u64 *spt)
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        ASSERT(is_empty_shadow_page(sp->spt));
+       hlist_del(&sp->hash_link);
        list_del(&sp->link);
        __free_page(virt_to_page(sp->spt));
-       __free_page(virt_to_page(sp->gfns));
-       kfree(sp);
+       if (!sp->role.direct)
+               __free_page(virt_to_page(sp->gfns));
+       kmem_cache_free(mmu_page_header_cache, sp);
        ++kvm->arch.n_free_mmu_pages;
 }
 
@@ -907,13 +969,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
 }
 
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-                                              u64 *parent_pte)
+                                              u64 *parent_pte, int direct)
 {
        struct kvm_mmu_page *sp;
 
        sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
        sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-       sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       if (!direct)
+               sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+                                                 PAGE_SIZE);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
        bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1062,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
        BUG();
 }
 
-
 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 {
        struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1071,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 
        if (!sp->multimapped && sp->parent_pte) {
                parent_sp = page_header(__pa(sp->parent_pte));
-               fn(parent_sp);
-               mmu_parent_walk(parent_sp, fn);
+               fn(parent_sp, sp->parent_pte);
                return;
        }
+
        hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-                       if (!pte_chain->parent_ptes[i])
+                       u64 *spte = pte_chain->parent_ptes[i];
+
+                       if (!spte)
                                break;
-                       parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
-                       fn(parent_sp);
-                       mmu_parent_walk(parent_sp, fn);
+                       parent_sp = page_header(__pa(spte));
+                       fn(parent_sp, spte);
                }
 }
 
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-       unsigned int index;
-       struct kvm_mmu_page *sp = page_header(__pa(spte));
-
-       index = spte - sp->spt;
-       if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
-               sp->unsync_children++;
-       WARN_ON(!sp->unsync_children);
+       mmu_parent_walk(sp, mark_unsync);
 }
 
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
 {
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
+       unsigned int index;
 
-       if (!sp->parent_pte)
+       index = spte - sp->spt;
+       if (__test_and_set_bit(index, sp->unsync_child_bitmap))
                return;
-
-       if (!sp->multimapped) {
-               kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+       if (sp->unsync_children++)
                return;
-       }
-
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-                       if (!pte_chain->parent_ptes[i])
-                               break;
-                       kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
-               }
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
-       kvm_mmu_update_parents_unsync(sp);
-       return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
-       mmu_parent_walk(sp, unsync_walk_fn);
-       kvm_mmu_update_parents_unsync(sp);
+       kvm_mmu_mark_parents_unsync(sp);
 }
 
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1114,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 }
 
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
-                              struct kvm_mmu_page *sp)
+                              struct kvm_mmu_page *sp, bool clear_unsync)
 {
        return 1;
 }
@@ -1123,35 +1160,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
        int i, ret, nr_unsync_leaf = 0;
 
        for_each_unsync_children(sp->unsync_child_bitmap, i) {
+               struct kvm_mmu_page *child;
                u64 ent = sp->spt[i];
 
-               if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
-                       struct kvm_mmu_page *child;
-                       child = page_header(ent & PT64_BASE_ADDR_MASK);
-
-                       if (child->unsync_children) {
-                               if (mmu_pages_add(pvec, child, i))
-                                       return -ENOSPC;
-
-                               ret = __mmu_unsync_walk(child, pvec);
-                               if (!ret)
-                                       __clear_bit(i, sp->unsync_child_bitmap);
-                               else if (ret > 0)
-                                       nr_unsync_leaf += ret;
-                               else
-                                       return ret;
-                       }
+               if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+                       goto clear_child_bitmap;
+
+               child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+               if (child->unsync_children) {
+                       if (mmu_pages_add(pvec, child, i))
+                               return -ENOSPC;
+
+                       ret = __mmu_unsync_walk(child, pvec);
+                       if (!ret)
+                               goto clear_child_bitmap;
+                       else if (ret > 0)
+                               nr_unsync_leaf += ret;
+                       else
+                               return ret;
+               } else if (child->unsync) {
+                       nr_unsync_leaf++;
+                       if (mmu_pages_add(pvec, child, i))
+                               return -ENOSPC;
+               } else
+                        goto clear_child_bitmap;
 
-                       if (child->unsync) {
-                               nr_unsync_leaf++;
-                               if (mmu_pages_add(pvec, child, i))
-                                       return -ENOSPC;
-                       }
-               }
+               continue;
+
+clear_child_bitmap:
+               __clear_bit(i, sp->unsync_child_bitmap);
+               sp->unsync_children--;
+               WARN_ON((int)sp->unsync_children < 0);
        }
 
-       if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
-               sp->unsync_children = 0;
 
        return nr_unsync_leaf;
 }
@@ -1166,26 +1208,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
        return __mmu_unsync_walk(sp, pvec);
 }
 
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
-       unsigned index;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *sp;
-       struct hlist_node *node;
-
-       pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
-       index = kvm_page_table_hashfn(gfn);
-       bucket = &kvm->arch.mmu_page_hash[index];
-       hlist_for_each_entry(sp, node, bucket, hash_link)
-               if (sp->gfn == gfn && !sp->role.direct
-                   && !sp->role.invalid) {
-                       pgprintk("%s: found role %x\n",
-                                __func__, sp->role.word);
-                       return sp;
-               }
-       return NULL;
-}
-
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        WARN_ON(!sp->unsync);
@@ -1194,20 +1216,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
        --kvm->stat.mmu_unsync;
 }
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+                                   struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+                                   struct list_head *invalid_list);
+
+#define for_each_gfn_sp(kvm, sp, gfn, pos)                             \
+  hlist_for_each_entry(sp, pos,                                                \
+   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)  \
+       if ((sp)->gfn != (gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)              \
+  hlist_for_each_entry(sp, pos,                                                \
+   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)  \
+               if ((sp)->gfn != (gfn) || (sp)->role.direct ||          \
+                       (sp)->role.invalid) {} else
 
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+/* @sp->gfn should be write-protected at the call site */
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                          struct list_head *invalid_list, bool clear_unsync)
 {
        if (sp->role.cr4_pae != !!is_pae(vcpu)) {
-               kvm_mmu_zap_page(vcpu->kvm, sp);
+               kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
                return 1;
        }
 
-       if (rmap_write_protect(vcpu->kvm, sp->gfn))
-               kvm_flush_remote_tlbs(vcpu->kvm);
-       kvm_unlink_unsync_page(vcpu->kvm, sp);
-       if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
-               kvm_mmu_zap_page(vcpu->kvm, sp);
+       if (clear_unsync)
+               kvm_unlink_unsync_page(vcpu->kvm, sp);
+
+       if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+               kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
                return 1;
        }
 
@@ -1215,6 +1253,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
        return 0;
 }
 
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+                                  struct kvm_mmu_page *sp)
+{
+       LIST_HEAD(invalid_list);
+       int ret;
+
+       ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
+       if (ret)
+               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+       return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                        struct list_head *invalid_list)
+{
+       return __kvm_sync_page(vcpu, sp, invalid_list, true);
+}
+
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+{
+       struct kvm_mmu_page *s;
+       struct hlist_node *node;
+       LIST_HEAD(invalid_list);
+       bool flush = false;
+
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+               if (!s->unsync)
+                       continue;
+
+               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+               if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+                       (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+                       kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+                       continue;
+               }
+               kvm_unlink_unsync_page(vcpu->kvm, s);
+               flush = true;
+       }
+
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       if (flush)
+               kvm_mmu_flush_tlb(vcpu);
+}
+
 struct mmu_page_path {
        struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
        unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1365,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
        struct kvm_mmu_page *sp;
        struct mmu_page_path parents;
        struct kvm_mmu_pages pages;
+       LIST_HEAD(invalid_list);
 
        kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1378,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
                        kvm_flush_remote_tlbs(vcpu->kvm);
 
                for_each_sp(pages, sp, parents, i) {
-                       kvm_sync_page(vcpu, sp);
+                       kvm_sync_page(vcpu, sp, &invalid_list);
                        mmu_pages_clear_parents(&parents);
                }
+               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                cond_resched_lock(&vcpu->kvm->mmu_lock);
                kvm_mmu_pages_init(parent, &parents, &pages);
        }
@@ -1310,11 +1396,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                             u64 *parent_pte)
 {
        union kvm_mmu_page_role role;
-       unsigned index;
        unsigned quadrant;
-       struct hlist_head *bucket;
        struct kvm_mmu_page *sp;
-       struct hlist_node *node, *tmp;
+       struct hlist_node *node;
+       bool need_sync = false;
 
        role = vcpu->arch.mmu.base_role;
        role.level = level;
@@ -1322,40 +1407,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        if (role.direct)
                role.cr4_pae = 0;
        role.access = access;
-       if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+       if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
        }
-       index = kvm_page_table_hashfn(gfn);
-       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-       hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
-               if (sp->gfn == gfn) {
-                       if (sp->unsync)
-                               if (kvm_sync_page(vcpu, sp))
-                                       continue;
+       for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+               if (!need_sync && sp->unsync)
+                       need_sync = true;
 
-                       if (sp->role.word != role.word)
-                               continue;
+               if (sp->role.word != role.word)
+                       continue;
 
-                       mmu_page_add_parent_pte(vcpu, sp, parent_pte);
-                       if (sp->unsync_children) {
-                               set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-                               kvm_mmu_mark_parents_unsync(sp);
-                       }
-                       trace_kvm_mmu_get_page(sp, false);
-                       return sp;
-               }
+               if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+                       break;
+
+               mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+               if (sp->unsync_children) {
+                       kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+                       kvm_mmu_mark_parents_unsync(sp);
+               } else if (sp->unsync)
+                       kvm_mmu_mark_parents_unsync(sp);
+
+               trace_kvm_mmu_get_page(sp, false);
+               return sp;
+       }
        ++vcpu->kvm->stat.mmu_cache_miss;
-       sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+       sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
        if (!sp)
                return sp;
        sp->gfn = gfn;
        sp->role = role;
-       hlist_add_head(&sp->hash_link, bucket);
+       hlist_add_head(&sp->hash_link,
+               &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
        if (!direct) {
                if (rmap_write_protect(vcpu->kvm, gfn))
                        kvm_flush_remote_tlbs(vcpu->kvm);
+               if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+                       kvm_sync_pages(vcpu, gfn);
+
                account_shadowed(vcpu->kvm, gfn);
        }
        if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1492,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
        --iterator->level;
 }
 
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+       u64 spte;
+
+       spte = __pa(sp->spt)
+               | PT_PRESENT_MASK | PT_ACCESSED_MASK
+               | PT_WRITABLE_MASK | PT_USER_MASK;
+       __set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+       if (is_large_pte(*sptep)) {
+               drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+               kvm_flush_remote_tlbs(vcpu->kvm);
+       }
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+                                  unsigned direct_access)
+{
+       if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+               struct kvm_mmu_page *child;
+
+               /*
+                * For the direct sp, if the guest pte's dirty bit
+                * changed form clean to dirty, it will corrupt the
+                * sp's access: allow writable in the read-only sp,
+                * so we should update the spte at this point to get
+                * a new sp with the correct access.
+                */
+               child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+               if (child->role.access == direct_access)
+                       return;
+
+               mmu_page_remove_parent_pte(child, sptep);
+               __set_spte(sptep, shadow_trap_nonpresent_pte);
+               kvm_flush_remote_tlbs(vcpu->kvm);
+       }
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
                                         struct kvm_mmu_page *sp)
 {
@@ -1422,7 +1553,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
                        } else {
                                if (is_large_pte(ent))
                                        --kvm->stat.lpages;
-                               rmap_remove(kvm, &pt[i]);
+                               drop_spte(kvm, &pt[i],
+                                         shadow_trap_nonpresent_pte);
                        }
                }
                pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1596,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
-                                  struct kvm_mmu_page *parent)
+                                  struct kvm_mmu_page *parent,
+                                  struct list_head *invalid_list)
 {
        int i, zapped = 0;
        struct mmu_page_path parents;
@@ -1478,7 +1611,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
                struct kvm_mmu_page *sp;
 
                for_each_sp(pages, sp, parents, i) {
-                       kvm_mmu_zap_page(kvm, sp);
+                       kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
                        mmu_pages_clear_parents(&parents);
                        zapped++;
                }
@@ -1488,32 +1621,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
        return zapped;
 }
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+                                   struct list_head *invalid_list)
 {
        int ret;
 
-       trace_kvm_mmu_zap_page(sp);
+       trace_kvm_mmu_prepare_zap_page(sp);
        ++kvm->stat.mmu_shadow_zapped;
-       ret = mmu_zap_unsync_children(kvm, sp);
+       ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
        kvm_mmu_page_unlink_children(kvm, sp);
        kvm_mmu_unlink_parents(kvm, sp);
-       kvm_flush_remote_tlbs(kvm);
        if (!sp->role.invalid && !sp->role.direct)
                unaccount_shadowed(kvm, sp->gfn);
        if (sp->unsync)
                kvm_unlink_unsync_page(kvm, sp);
        if (!sp->root_count) {
-               hlist_del(&sp->hash_link);
-               kvm_mmu_free_page(kvm, sp);
+               /* Count self */
+               ret++;
+               list_move(&sp->link, invalid_list);
        } else {
-               sp->role.invalid = 1;
                list_move(&sp->link, &kvm->arch.active_mmu_pages);
                kvm_reload_remote_mmus(kvm);
        }
+
+       sp->role.invalid = 1;
        kvm_mmu_reset_last_pte_updated(kvm);
        return ret;
 }
 
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+                                   struct list_head *invalid_list)
+{
+       struct kvm_mmu_page *sp;
+
+       if (list_empty(invalid_list))
+               return;
+
+       kvm_flush_remote_tlbs(kvm);
+
+       do {
+               sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+               WARN_ON(!sp->role.invalid || sp->root_count);
+               kvm_mmu_free_page(kvm, sp);
+       } while (!list_empty(invalid_list));
+
+}
+
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1521,6 +1674,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 {
        int used_pages;
+       LIST_HEAD(invalid_list);
 
        used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
        used_pages = max(0, used_pages);
@@ -1538,9 +1692,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 
                        page = container_of(kvm->arch.active_mmu_pages.prev,
                                            struct kvm_mmu_page, link);
-                       used_pages -= kvm_mmu_zap_page(kvm, page);
-                       used_pages--;
+                       used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+                                                              &invalid_list);
                }
+               kvm_mmu_commit_zap_page(kvm, &invalid_list);
                kvm_nr_mmu_pages = used_pages;
                kvm->arch.n_free_mmu_pages = 0;
        }
@@ -1553,47 +1708,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
-       unsigned index;
-       struct hlist_head *bucket;
        struct kvm_mmu_page *sp;
-       struct hlist_node *node, *n;
+       struct hlist_node *node;
+       LIST_HEAD(invalid_list);
        int r;
 
        pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
        r = 0;
-       index = kvm_page_table_hashfn(gfn);
-       bucket = &kvm->arch.mmu_page_hash[index];
-restart:
-       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
-               if (sp->gfn == gfn && !sp->role.direct) {
-                       pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
-                                sp->role.word);
-                       r = 1;
-                       if (kvm_mmu_zap_page(kvm, sp))
-                               goto restart;
-               }
+
+       for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+               pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+                        sp->role.word);
+               r = 1;
+               kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+       }
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
        return r;
 }
 
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
-       unsigned index;
-       struct hlist_head *bucket;
        struct kvm_mmu_page *sp;
-       struct hlist_node *node, *nn;
+       struct hlist_node *node;
+       LIST_HEAD(invalid_list);
 
-       index = kvm_page_table_hashfn(gfn);
-       bucket = &kvm->arch.mmu_page_hash[index];
-restart:
-       hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
-               if (sp->gfn == gfn && !sp->role.direct
-                   && !sp->role.invalid) {
-                       pgprintk("%s: zap %lx %x\n",
-                                __func__, gfn, sp->role.word);
-                       if (kvm_mmu_zap_page(kvm, sp))
-                               goto restart;
-               }
+       for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+               pgprintk("%s: zap %lx %x\n",
+                        __func__, gfn, sp->role.word);
+               kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
        }
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1867,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
 
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-       unsigned index;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *s;
-       struct hlist_node *node, *n;
-
-       index = kvm_page_table_hashfn(sp->gfn);
-       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-       /* don't unsync if pagetable is shadowed with multiple roles */
-       hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-               if (s->gfn != sp->gfn || s->role.direct)
-                       continue;
-               if (s->role.word != sp->role.word)
-                       return 1;
-       }
        trace_kvm_mmu_unsync_page(sp);
        ++vcpu->kvm->stat.mmu_unsync;
        sp->unsync = 1;
 
        kvm_mmu_mark_parents_unsync(sp);
-
        mmu_convert_notrap(sp);
-       return 0;
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+{
+       struct kvm_mmu_page *s;
+       struct hlist_node *node;
+
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+               if (s->unsync)
+                       continue;
+               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+               __kvm_unsync_page(vcpu, s);
+       }
 }
 
 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
                                  bool can_unsync)
 {
-       struct kvm_mmu_page *shadow;
+       struct kvm_mmu_page *s;
+       struct hlist_node *node;
+       bool need_unsync = false;
 
-       shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-       if (shadow) {
-               if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+               if (!can_unsync)
                        return 1;
-               if (shadow->unsync)
-                       return 0;
-               if (can_unsync && oos_shadow)
-                       return kvm_unsync_page(vcpu, shadow);
-               return 1;
+
+               if (s->role.level != PT_PAGE_TABLE_LEVEL)
+                       return 1;
+
+               if (!need_unsync && !s->unsync) {
+                       if (!oos_shadow)
+                               return 1;
+                       need_unsync = true;
+               }
        }
+       if (need_unsync)
+               kvm_unsync_pages(vcpu, gfn);
        return 0;
 }
 
@@ -1804,13 +1952,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        spte |= (u64)pfn << PAGE_SHIFT;
 
        if ((pte_access & ACC_WRITE_MASK)
-           || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+           || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
+               && !user_fault)) {
 
                if (level > PT_PAGE_TABLE_LEVEL &&
                    has_wrprotected_page(vcpu->kvm, gfn, level)) {
                        ret = 1;
-                       spte = shadow_trap_nonpresent_pte;
-                       goto set_pte;
+                       drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+                       goto done;
                }
 
                spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1990,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-       __set_spte(sptep, spte);
+       if (is_writable_pte(*sptep) && !is_writable_pte(spte))
+               kvm_set_pfn_dirty(pfn);
+       update_spte(sptep, spte);
+done:
        return ret;
 }
 
@@ -1853,7 +2005,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         bool reset_host_protection)
 {
        int was_rmapped = 0;
-       int was_writable = is_writable_pte(*sptep);
        int rmap_count;
 
        pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2029,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                } else if (pfn != spte_to_pfn(*sptep)) {
                        pgprintk("hfn old %lx new %lx\n",
                                 spte_to_pfn(*sptep), pfn);
-                       rmap_remove(vcpu->kvm, sptep);
-                       __set_spte(sptep, shadow_trap_nonpresent_pte);
+                       drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                } else
                        was_rmapped = 1;
@@ -1890,7 +2040,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                      reset_host_protection)) {
                if (write_fault)
                        *ptwrite = 1;
-               kvm_x86_ops->tlb_flush(vcpu);
+               kvm_mmu_flush_tlb(vcpu);
        }
 
        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2054,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        page_header_update_slot(vcpu->kvm, sptep, gfn);
        if (!was_rmapped) {
                rmap_count = rmap_add(vcpu, sptep, gfn);
-               kvm_release_pfn_clean(pfn);
                if (rmap_count > RMAP_RECYCLE_THRESHOLD)
                        rmap_recycle(vcpu, sptep, gfn);
-       } else {
-               if (was_writable)
-                       kvm_release_pfn_dirty(pfn);
-               else
-                       kvm_release_pfn_clean(pfn);
        }
+       kvm_release_pfn_clean(pfn);
        if (speculative) {
                vcpu->arch.last_pte_updated = sptep;
                vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2086,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                }
 
                if (*iterator.sptep == shadow_trap_nonpresent_pte) {
-                       pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+                       u64 base_addr = iterator.addr;
+
+                       base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+                       pseudo_gfn = base_addr >> PAGE_SHIFT;
                        sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
                                              iterator.level - 1,
                                              1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2108,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
        return pt_write;
 }
 
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+       char buf[1];
+       void __user *hva;
+       int r;
+
+       /* Touch the page, so send SIGBUS */
+       hva = (void __user *)gfn_to_hva(kvm, gfn);
+       r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+       kvm_release_pfn_clean(pfn);
+       if (is_hwpoison_pfn(pfn)) {
+               kvm_send_hwpoison_signal(kvm, gfn);
+               return 0;
+       } else if (is_fault_pfn(pfn))
+               return -EFAULT;
+
+       return 1;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
        int r;
@@ -1983,10 +2154,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
 
        /* mmio */
-       if (is_error_pfn(pfn)) {
-               kvm_release_pfn_clean(pfn);
-               return 1;
-       }
+       if (is_error_pfn(pfn))
+               return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2178,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
        int i;
        struct kvm_mmu_page *sp;
+       LIST_HEAD(invalid_list);
 
        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                return;
@@ -2018,8 +2188,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
                sp = page_header(root);
                --sp->root_count;
-               if (!sp->root_count && sp->role.invalid)
-                       kvm_mmu_zap_page(vcpu->kvm, sp);
+               if (!sp->root_count && sp->role.invalid) {
+                       kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+                       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+               }
                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
                spin_unlock(&vcpu->kvm->mmu_lock);
                return;
@@ -2032,10 +2204,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
                        sp = page_header(root);
                        --sp->root_count;
                        if (!sp->root_count && sp->role.invalid)
-                               kvm_mmu_zap_page(vcpu->kvm, sp);
+                               kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+                                                        &invalid_list);
                }
                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
        }
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
        spin_unlock(&vcpu->kvm->mmu_lock);
        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
@@ -2045,7 +2219,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
        int ret = 0;
 
        if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
-               set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                ret = 1;
        }
 
@@ -2073,6 +2247,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
                        root_gfn = 0;
                }
                spin_lock(&vcpu->kvm->mmu_lock);
+               kvm_mmu_free_some_pages(vcpu);
                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
                                      PT64_ROOT_LEVEL, direct,
                                      ACC_ALL, NULL);
@@ -2103,6 +2278,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
                        root_gfn = i << 30;
                }
                spin_lock(&vcpu->kvm->mmu_lock);
+               kvm_mmu_free_some_pages(vcpu);
                sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
                                      PT32_ROOT_LEVEL, direct,
                                      ACC_ALL, NULL);
@@ -2198,10 +2374,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
-       if (is_error_pfn(pfn)) {
-               kvm_release_pfn_clean(pfn);
-               return 1;
-       }
+       if (is_error_pfn(pfn))
+               return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
@@ -2243,7 +2417,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.tlb_flush;
-       kvm_x86_ops->tlb_flush(vcpu);
+       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2631,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
-       if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+       if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               /* mmu.free() should set root_hpa = INVALID_PAGE */
                vcpu->arch.mmu.free(vcpu);
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       }
 }
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2650,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                goto out;
-       spin_lock(&vcpu->kvm->mmu_lock);
-       kvm_mmu_free_some_pages(vcpu);
-       spin_unlock(&vcpu->kvm->mmu_lock);
        r = mmu_alloc_roots(vcpu);
        spin_lock(&vcpu->kvm->mmu_lock);
        mmu_sync_roots(vcpu);
@@ -2508,7 +2678,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
        pte = *spte;
        if (is_shadow_present_pte(pte)) {
                if (is_last_spte(pte, sp->role.level))
-                       rmap_remove(vcpu->kvm, spte);
+                       drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
                else {
                        child = page_header(pte & PT64_BASE_ADDR_MASK);
                        mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2699,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                return;
         }
 
+       if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+               return;
+
        ++vcpu->kvm->stat.mmu_pte_updated;
        if (!sp->role.cr4_pae)
                paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2722,15 @@ static bool need_remote_flush(u64 old, u64 new)
        return (old & ~new & PT64_PERM_MASK) != 0;
 }
 
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+                                   bool remote_flush, bool local_flush)
 {
-       if (need_remote_flush(old, new))
+       if (zap_page)
+               return;
+
+       if (remote_flush)
                kvm_flush_remote_tlbs(vcpu->kvm);
-       else
+       else if (local_flush)
                kvm_mmu_flush_tlb(vcpu);
 }
 
@@ -2603,10 +2780,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                       bool guest_initiated)
 {
        gfn_t gfn = gpa >> PAGE_SHIFT;
+       union kvm_mmu_page_role mask = { .word = 0 };
        struct kvm_mmu_page *sp;
-       struct hlist_node *node, *n;
-       struct hlist_head *bucket;
-       unsigned index;
+       struct hlist_node *node;
+       LIST_HEAD(invalid_list);
        u64 entry, gentry;
        u64 *spte;
        unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2796,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        int npte;
        int r;
        int invlpg_counter;
+       bool remote_flush, local_flush, zap_page;
+
+       zap_page = remote_flush = local_flush = false;
 
        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
@@ -2674,13 +2854,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                        vcpu->arch.last_pte_updated = NULL;
                }
        }
-       index = kvm_page_table_hashfn(gfn);
-       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 
-restart:
-       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-               if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
-                       continue;
+       mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
                pte_size = sp->role.cr4_pae ? 8 : 4;
                misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
                misaligned |= bytes < 4;
@@ -2697,8 +2873,8 @@ restart:
                         */
                        pgprintk("misaligned: gpa %llx bytes %d role %x\n",
                                 gpa, bytes, sp->role.word);
-                       if (kvm_mmu_zap_page(vcpu->kvm, sp))
-                               goto restart;
+                       zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+                                                    &invalid_list);
                        ++vcpu->kvm->stat.mmu_flooded;
                        continue;
                }
@@ -2722,16 +2898,22 @@ restart:
                        if (quadrant != sp->role.quadrant)
                                continue;
                }
+               local_flush = true;
                spte = &sp->spt[page_offset / sizeof(*spte)];
                while (npte--) {
                        entry = *spte;
                        mmu_pte_write_zap_pte(vcpu, sp, spte);
-                       if (gentry)
+                       if (gentry &&
+                             !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+                             & mask.word))
                                mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-                       mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+                       if (!remote_flush && need_remote_flush(entry, *spte))
+                               remote_flush = true;
                        ++spte;
                }
        }
+       mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
        kvm_mmu_audit(vcpu, "post pte write");
        spin_unlock(&vcpu->kvm->mmu_lock);
        if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2941,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-       while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+       int free_pages;
+       LIST_HEAD(invalid_list);
+
+       free_pages = vcpu->kvm->arch.n_free_mmu_pages;
+       while (free_pages < KVM_REFILL_PAGES &&
               !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
                struct kvm_mmu_page *sp;
 
                sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
                                  struct kvm_mmu_page, link);
-               kvm_mmu_zap_page(vcpu->kvm, sp);
+               free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+                                                      &invalid_list);
                ++vcpu->kvm->stat.mmu_recycled;
        }
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2983,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
                return 1;
        case EMULATE_DO_MMIO:
                ++vcpu->stat.mmio_exits;
-               return 0;
+               /* fall through */
        case EMULATE_FAIL:
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
                return 0;
        default:
                BUG();
@@ -2896,7 +3081,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                pt = sp->spt;
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                        /* avoid RMW */
-                       if (pt[i] & PT_WRITABLE_MASK)
+                       if (is_writable_pte(pt[i]))
                                pt[i] &= ~PT_WRITABLE_MASK;
        }
        kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3090,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
        struct kvm_mmu_page *sp, *node;
+       LIST_HEAD(invalid_list);
 
        spin_lock(&kvm->mmu_lock);
 restart:
        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-               if (kvm_mmu_zap_page(kvm, sp))
+               if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
                        goto restart;
 
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
        spin_unlock(&kvm->mmu_lock);
-
-       kvm_flush_remote_tlbs(kvm);
 }
 
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+                                              struct list_head *invalid_list)
 {
        struct kvm_mmu_page *page;
 
        page = container_of(kvm->arch.active_mmu_pages.prev,
                            struct kvm_mmu_page, link);
-       return kvm_mmu_zap_page(kvm, page) + 1;
+       return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
 }
 
 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3122,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
        list_for_each_entry(kvm, &vm_list, vm_list) {
                int npages, idx, freed_pages;
+               LIST_HEAD(invalid_list);
 
                idx = srcu_read_lock(&kvm->srcu);
                spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3130,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
                         kvm->arch.n_free_mmu_pages;
                cache_count += npages;
                if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
-                       freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+                       freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+                                                         &invalid_list);
                        cache_count -= freed_pages;
                        kvm_freed = kvm;
                }
                nr_to_scan--;
 
+               kvm_mmu_commit_zap_page(kvm, &invalid_list);
                spin_unlock(&kvm->mmu_lock);
                srcu_read_unlock(&kvm->srcu, idx);
        }
@@ -3074,7 +3263,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
 
 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
-       kvm_set_cr3(vcpu, vcpu->arch.cr3);
+       (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
        return 1;
 }
 
@@ -3331,9 +3520,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
        struct kvm_mmu_page *rev_sp;
        gfn_t gfn;
 
-       if (*sptep & PT_WRITABLE_MASK) {
+       if (is_writable_pte(*sptep)) {
                rev_sp = page_header(__pa(sptep));
-               gfn = rev_sp->gfns[sptep - rev_sp->spt];
+               gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
                if (!gfn_to_memslot(kvm, gfn)) {
                        if (!printk_ratelimit())
@@ -3347,8 +3536,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
                        return;
                }
 
-               rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
-                                   rev_sp->role.level);
+               rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
                if (!*rmapp) {
                        if (!printk_ratelimit())
                                return;
@@