KVM: Portability: Split mmu-related static inline functions to mmu.h
drivers/kvm/x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  *
8  * Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This work is licensed under the terms of the GNU GPL, version 2.  See
13  * the COPYING file in the top-level directory.
14  *
15  */
16
17 #include "kvm.h"
18 #include "x86.h"
19 #include "x86_emulate.h"
20 #include "segment_descriptor.h"
21 #include "irq.h"
22 #include "mmu.h"
23
24 #include <linux/kvm.h>
25 #include <linux/fs.h>
26 #include <linux/vmalloc.h>
27 #include <linux/module.h>
28 #include <linux/mman.h>
29 #include <linux/highmem.h>
30
31 #include <asm/uaccess.h>
32 #include <asm/msr.h>
33
34 #define MAX_IO_MSRS 256
35 #define CR0_RESERVED_BITS                                               \
36         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
37                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
38                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
39 #define CR4_RESERVED_BITS                                               \
40         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
41                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
42                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
43                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
44
45 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
46 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
47
48 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
49 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
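/*
 * Note (editorial, describing the consumer of these macros, which lives
 * outside this file): VM_STAT()/VCPU_STAT() expand to the (offset, kind)
 * pair stored in debugfs_entries[] below; the common debugfs statistics
 * code is expected to use the kind to decide whether a counter is read
 * from struct kvm or summed across each vcpu.
 */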
50
51 struct kvm_x86_ops *kvm_x86_ops;
52
53 struct kvm_stats_debugfs_item debugfs_entries[] = {
54         { "pf_fixed", VCPU_STAT(pf_fixed) },
55         { "pf_guest", VCPU_STAT(pf_guest) },
56         { "tlb_flush", VCPU_STAT(tlb_flush) },
57         { "invlpg", VCPU_STAT(invlpg) },
58         { "exits", VCPU_STAT(exits) },
59         { "io_exits", VCPU_STAT(io_exits) },
60         { "mmio_exits", VCPU_STAT(mmio_exits) },
61         { "signal_exits", VCPU_STAT(signal_exits) },
62         { "irq_window", VCPU_STAT(irq_window_exits) },
63         { "halt_exits", VCPU_STAT(halt_exits) },
64         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
65         { "request_irq", VCPU_STAT(request_irq_exits) },
66         { "irq_exits", VCPU_STAT(irq_exits) },
67         { "host_state_reload", VCPU_STAT(host_state_reload) },
68         { "efer_reload", VCPU_STAT(efer_reload) },
69         { "fpu_reload", VCPU_STAT(fpu_reload) },
70         { "insn_emulation", VCPU_STAT(insn_emulation) },
71         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
72         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
73         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
74         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
75         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
76         { "mmu_flooded", VM_STAT(mmu_flooded) },
77         { "mmu_recycled", VM_STAT(mmu_recycled) },
78         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
79         { NULL }
80 };
81
82
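/*
 * Decode the base address from the host descriptor referenced by
 * @selector: read the GDT (or, for an LDT selector, the LDT) on the
 * current cpu and assemble base_low/base_mid/base_high, including the
 * upper 32 bits for 64-bit system descriptors.
 */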
83 unsigned long segment_base(u16 selector)
84 {
85         struct descriptor_table gdt;
86         struct segment_descriptor *d;
87         unsigned long table_base;
88         unsigned long v;
89
90         if (selector == 0)
91                 return 0;
92
93         asm("sgdt %0" : "=m"(gdt));
94         table_base = gdt.base;
95
96         if (selector & 4) {           /* from ldt */
97                 u16 ldt_selector;
98
99                 asm("sldt %0" : "=g"(ldt_selector));
100                 table_base = segment_base(ldt_selector);
101         }
102         d = (struct segment_descriptor *)(table_base + (selector & ~7));
103         v = d->base_low | ((unsigned long)d->base_mid << 16) |
104                 ((unsigned long)d->base_high << 24);
105 #ifdef CONFIG_X86_64
106         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
107                 v |= ((unsigned long)
108                       ((struct segment_descriptor_64 *)d)->base_higher) << 32;
109 #endif
110         return v;
111 }
112 EXPORT_SYMBOL_GPL(segment_base);
113
114 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
115 {
116         /* The apic base is cached in vcpu->arch.apic_base whether the
117          * irqchip is emulated in the kernel or in userspace, so both
118          * cases read the same field. */
119         return vcpu->arch.apic_base;
120 }
121 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
122
123 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
124 {
125         /* TODO: reserve bits check */
126         if (irqchip_in_kernel(vcpu->kvm))
127                 kvm_lapic_set_base(vcpu, data);
128         else
129                 vcpu->arch.apic_base = data;
130 }
131 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
132
133 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
134 {
135         WARN_ON(vcpu->arch.exception.pending);
136         vcpu->arch.exception.pending = true;
137         vcpu->arch.exception.has_error_code = false;
138         vcpu->arch.exception.nr = nr;
139 }
140 EXPORT_SYMBOL_GPL(kvm_queue_exception);
141
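/*
 * Only a single exception can be queued at a time.  If a page fault is
 * injected while another page fault is still pending, it is promoted to
 * a double fault (#DF) below instead of being queued on top.
 */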
142 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
143                            u32 error_code)
144 {
145         ++vcpu->stat.pf_guest;
146         if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
147                 printk(KERN_DEBUG "kvm: inject_page_fault:"
148                        " double fault 0x%lx\n", addr);
149                 vcpu->arch.exception.nr = DF_VECTOR;
150                 vcpu->arch.exception.error_code = 0;
151                 return;
152         }
153         vcpu->arch.cr2 = addr;
154         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
155 }
156
157 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
158 {
159         WARN_ON(vcpu->arch.exception.pending);
160         vcpu->arch.exception.pending = true;
161         vcpu->arch.exception.has_error_code = true;
162         vcpu->arch.exception.nr = nr;
163         vcpu->arch.exception.error_code = error_code;
164 }
165 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
166
167 static void __queue_exception(struct kvm_vcpu *vcpu)
168 {
169         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
170                                      vcpu->arch.exception.has_error_code,
171                                      vcpu->arch.exception.error_code);
172 }
173
174 /*
175  * Load the pae pdptrs.  Return true if they are all valid.
176  */
177 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
178 {
179         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
180         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
181         int i;
182         int ret;
183         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
184
185         mutex_lock(&vcpu->kvm->lock);
186         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
187                                   offset * sizeof(u64), sizeof(pdpte));
188         if (ret < 0) {
189                 ret = 0;
190                 goto out;
191         }
192         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
193                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
194                         ret = 0;
195                         goto out;
196                 }
197         }
198         ret = 1;
199
200         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
201 out:
202         mutex_unlock(&vcpu->kvm->lock);
203
204         return ret;
205 }
206
207 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
208 {
209         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
210         bool changed = true;
211         int r;
212
213         if (is_long_mode(vcpu) || !is_pae(vcpu))
214                 return false;
215
216         mutex_lock(&vcpu->kvm->lock);
217         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
218         if (r < 0)
219                 goto out;
220         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
221 out:
222         mutex_unlock(&vcpu->kvm->lock);
223
224         return changed;
225 }
226
227 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
228 {
229         if (cr0 & CR0_RESERVED_BITS) {
230                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
231                        cr0, vcpu->arch.cr0);
232                 kvm_inject_gp(vcpu, 0);
233                 return;
234         }
235
236         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
237                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
238                 kvm_inject_gp(vcpu, 0);
239                 return;
240         }
241
242         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
243                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
244                        "and a clear PE flag\n");
245                 kvm_inject_gp(vcpu, 0);
246                 return;
247         }
248
249         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
250 #ifdef CONFIG_X86_64
251                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
252                         int cs_db, cs_l;
253
254                         if (!is_pae(vcpu)) {
255                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
256                                        "in long mode while PAE is disabled\n");
257                                 kvm_inject_gp(vcpu, 0);
258                                 return;
259                         }
260                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
261                         if (cs_l) {
262                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
263                                        "in long mode while CS.L == 1\n");
264                                 kvm_inject_gp(vcpu, 0);
265                                 return;
266
267                         }
268                 } else
269 #endif
270                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
271                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
272                                "reserved bits\n");
273                         kvm_inject_gp(vcpu, 0);
274                         return;
275                 }
276
277         }
278
279         kvm_x86_ops->set_cr0(vcpu, cr0);
280         vcpu->arch.cr0 = cr0;
281
282         mutex_lock(&vcpu->kvm->lock);
283         kvm_mmu_reset_context(vcpu);
284         mutex_unlock(&vcpu->kvm->lock);
285         return;
286 }
287 EXPORT_SYMBOL_GPL(set_cr0);
288
289 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
290 {
291         set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
292 }
293 EXPORT_SYMBOL_GPL(lmsw);
294
295 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
296 {
297         if (cr4 & CR4_RESERVED_BITS) {
298                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
299                 kvm_inject_gp(vcpu, 0);
300                 return;
301         }
302
303         if (is_long_mode(vcpu)) {
304                 if (!(cr4 & X86_CR4_PAE)) {
305                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
306                                "in long mode\n");
307                         kvm_inject_gp(vcpu, 0);
308                         return;
309                 }
310         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
311                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
312                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
313                 kvm_inject_gp(vcpu, 0);
314                 return;
315         }
316
317         if (cr4 & X86_CR4_VMXE) {
318                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
319                 kvm_inject_gp(vcpu, 0);
320                 return;
321         }
322         kvm_x86_ops->set_cr4(vcpu, cr4);
323         vcpu->arch.cr4 = cr4;
324         mutex_lock(&vcpu->kvm->lock);
325         kvm_mmu_reset_context(vcpu);
326         mutex_unlock(&vcpu->kvm->lock);
327 }
328 EXPORT_SYMBOL_GPL(set_cr4);
329
330 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
331 {
332         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
333                 kvm_mmu_flush_tlb(vcpu);
334                 return;
335         }
336
337         if (is_long_mode(vcpu)) {
338                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
339                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
340                         kvm_inject_gp(vcpu, 0);
341                         return;
342                 }
343         } else {
344                 if (is_pae(vcpu)) {
345                         if (cr3 & CR3_PAE_RESERVED_BITS) {
346                                 printk(KERN_DEBUG
347                                        "set_cr3: #GP, reserved bits\n");
348                                 kvm_inject_gp(vcpu, 0);
349                                 return;
350                         }
351                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
352                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
353                                        "reserved bits\n");
354                                 kvm_inject_gp(vcpu, 0);
355                                 return;
356                         }
357                 }
358                 /*
359                  * We don't check reserved bits in nonpae mode, because
360                  * this isn't enforced, and VMware depends on this.
361                  */
362         }
363
364         mutex_lock(&vcpu->kvm->lock);
365         /*
366          * Does the new cr3 value map to physical memory? (Note, we
367          * catch an invalid cr3 even in real-mode, because it would
368          * cause trouble later on when we turn on paging anyway.)
369          *
370          * A real CPU would silently accept an invalid cr3 and would
371          * attempt to use it - with largely undefined (and often hard
372          * to debug) behavior on the guest side.
373          */
374         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
375                 kvm_inject_gp(vcpu, 0);
376         else {
377                 vcpu->arch.cr3 = cr3;
378                 vcpu->arch.mmu.new_cr3(vcpu);
379         }
380         mutex_unlock(&vcpu->kvm->lock);
381 }
382 EXPORT_SYMBOL_GPL(set_cr3);
383
384 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
385 {
386         if (cr8 & CR8_RESERVED_BITS) {
387                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
388                 kvm_inject_gp(vcpu, 0);
389                 return;
390         }
391         if (irqchip_in_kernel(vcpu->kvm))
392                 kvm_lapic_set_tpr(vcpu, cr8);
393         else
394                 vcpu->arch.cr8 = cr8;
395 }
396 EXPORT_SYMBOL_GPL(set_cr8);
397
398 unsigned long get_cr8(struct kvm_vcpu *vcpu)
399 {
400         if (irqchip_in_kernel(vcpu->kvm))
401                 return kvm_lapic_get_cr8(vcpu);
402         else
403                 return vcpu->arch.cr8;
404 }
405 EXPORT_SYMBOL_GPL(get_cr8);
406
407 /*
408  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
409  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
410  *
411  * This list is modified at module load time to reflect the
412  * capabilities of the host cpu.
413  */
414 static u32 msrs_to_save[] = {
415         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
416         MSR_K6_STAR,
417 #ifdef CONFIG_X86_64
418         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
419 #endif
420         MSR_IA32_TIME_STAMP_COUNTER,
421 };
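/*
 * Illustrative sketch, not part of this file: userspace is expected to
 * discover the final list through the KVM_GET_MSR_INDEX_LIST ioctl on
 * the /dev/kvm fd, roughly as follows (the buffer size is an arbitrary
 * assumption):
 *
 *	struct kvm_msr_list *list;
 *
 *	list = malloc(sizeof(*list) + 64 * sizeof(__u32));
 *	list->nmsrs = 64;
 *	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
 *
 * On success, list->indices[] then holds list->nmsrs msr numbers.  See
 * kvm_arch_dev_ioctl() below for the kernel side.
 */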
422
423 static unsigned num_msrs_to_save;
424
425 static u32 emulated_msrs[] = {
426         MSR_IA32_MISC_ENABLE,
427 };
428
429 #ifdef CONFIG_X86_64
430
431 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
432 {
433         if (efer & EFER_RESERVED_BITS) {
434                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
435                        efer);
436                 kvm_inject_gp(vcpu, 0);
437                 return;
438         }
439
440         if (is_paging(vcpu)
441             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
442                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
443                 kvm_inject_gp(vcpu, 0);
444                 return;
445         }
446
447         kvm_x86_ops->set_efer(vcpu, efer);
448
449         efer &= ~EFER_LMA;
450         efer |= vcpu->arch.shadow_efer & EFER_LMA;
451
452         vcpu->arch.shadow_efer = efer;
453 }
454
455 #endif
456
457 /*
458  * Writes msr value into the appropriate "register".
459  * Returns 0 on success, non-0 otherwise.
460  * Assumes vcpu_load() was already called.
461  */
462 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
463 {
464         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
465 }
466
467 /*
468  * Adapt set_msr() to msr_io()'s calling convention
469  */
470 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
471 {
472         return kvm_set_msr(vcpu, index, *data);
473 }
474
475
476 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
477 {
478         switch (msr) {
479 #ifdef CONFIG_X86_64
480         case MSR_EFER:
481                 set_efer(vcpu, data);
482                 break;
483 #endif
484         case MSR_IA32_MC0_STATUS:
485                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
486                        __FUNCTION__, data);
487                 break;
488         case MSR_IA32_MCG_STATUS:
489                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
490                         __FUNCTION__, data);
491                 break;
492         case MSR_IA32_UCODE_REV:
493         case MSR_IA32_UCODE_WRITE:
494         case 0x200 ... 0x2ff: /* MTRRs */
495                 break;
496         case MSR_IA32_APICBASE:
497                 kvm_set_apic_base(vcpu, data);
498                 break;
499         case MSR_IA32_MISC_ENABLE:
500                 vcpu->arch.ia32_misc_enable_msr = data;
501                 break;
502         default:
503                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
504                 return 1;
505         }
506         return 0;
507 }
508 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
509
510
511 /*
512  * Reads an msr value (of 'msr_index') into 'pdata'.
513  * Returns 0 on success, non-0 otherwise.
514  * Assumes vcpu_load() was already called.
515  */
516 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
517 {
518         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
519 }
520
521 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
522 {
523         u64 data;
524
525         switch (msr) {
526         case 0xc0010010: /* SYSCFG */
527         case 0xc0010015: /* HWCR */
528         case MSR_IA32_PLATFORM_ID:
529         case MSR_IA32_P5_MC_ADDR:
530         case MSR_IA32_P5_MC_TYPE:
531         case MSR_IA32_MC0_CTL:
532         case MSR_IA32_MCG_STATUS:
533         case MSR_IA32_MCG_CAP:
534         case MSR_IA32_MC0_MISC:
535         case MSR_IA32_MC0_MISC+4:
536         case MSR_IA32_MC0_MISC+8:
537         case MSR_IA32_MC0_MISC+12:
538         case MSR_IA32_MC0_MISC+16:
539         case MSR_IA32_UCODE_REV:
540         case MSR_IA32_PERF_STATUS:
541         case MSR_IA32_EBL_CR_POWERON:
542                 /* MTRR registers */
543         case 0xfe:
544         case 0x200 ... 0x2ff:
545                 data = 0;
546                 break;
547         case 0xcd: /* fsb frequency */
548                 data = 3;
549                 break;
550         case MSR_IA32_APICBASE:
551                 data = kvm_get_apic_base(vcpu);
552                 break;
553         case MSR_IA32_MISC_ENABLE:
554                 data = vcpu->arch.ia32_misc_enable_msr;
555                 break;
556 #ifdef CONFIG_X86_64
557         case MSR_EFER:
558                 data = vcpu->arch.shadow_efer;
559                 break;
560 #endif
561         default:
562                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
563                 return 1;
564         }
565         *pdata = data;
566         return 0;
567 }
568 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
569
570 /*
571  * Read or write a bunch of msrs. All parameters are kernel addresses.
572  *
573  * @return number of msrs set successfully.
574  */
575 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
576                     struct kvm_msr_entry *entries,
577                     int (*do_msr)(struct kvm_vcpu *vcpu,
578                                   unsigned index, u64 *data))
579 {
580         int i;
581
582         vcpu_load(vcpu);
583
584         for (i = 0; i < msrs->nmsrs; ++i)
585                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
586                         break;
587
588         vcpu_put(vcpu);
589
590         return i;
591 }
592
593 /*
594  * Read or write a bunch of msrs. Parameters are user addresses.
595  *
596  * @return number of msrs set successfully.
597  */
598 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
599                   int (*do_msr)(struct kvm_vcpu *vcpu,
600                                 unsigned index, u64 *data),
601                   int writeback)
602 {
603         struct kvm_msrs msrs;
604         struct kvm_msr_entry *entries;
605         int r, n;
606         unsigned size;
607
608         r = -EFAULT;
609         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
610                 goto out;
611
612         r = -E2BIG;
613         if (msrs.nmsrs >= MAX_IO_MSRS)
614                 goto out;
615
616         r = -ENOMEM;
617         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
618         entries = vmalloc(size);
619         if (!entries)
620                 goto out;
621
622         r = -EFAULT;
623         if (copy_from_user(entries, user_msrs->entries, size))
624                 goto out_free;
625
626         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
627         if (r < 0)
628                 goto out_free;
629
630         r = -EFAULT;
631         if (writeback && copy_to_user(user_msrs->entries, entries, size))
632                 goto out_free;
633
634         r = n;
635
636 out_free:
637         vfree(entries);
638 out:
639         return r;
640 }
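/*
 * Illustrative sketch, not part of this file: a typical caller of the
 * KVM_SET_MSRS vcpu ioctl serviced by msr_io() above (the value written
 * and the struct layout of the local buffer are assumptions):
 *
 *	struct {
 *		struct kvm_msrs header;
 *		struct kvm_msr_entry entries[1];
 *	} msrs = {
 *		.header.nmsrs = 1,
 *		.entries[0] = { .index = MSR_IA32_MISC_ENABLE, .data = 1 },
 *	};
 *
 *	ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
 *
 * The ioctl returns the number of msrs actually processed.
 */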
641
642 /*
643  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
644  * cached on it.
645  */
646 void decache_vcpus_on_cpu(int cpu)
647 {
648         struct kvm *vm;
649         struct kvm_vcpu *vcpu;
650         int i;
651
652         spin_lock(&kvm_lock);
653         list_for_each_entry(vm, &vm_list, vm_list)
654                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
655                         vcpu = vm->vcpus[i];
656                         if (!vcpu)
657                                 continue;
658                         /*
659                          * If the vcpu is locked, then it is running on some
660                          * other cpu and therefore it is not cached on the
661                          * cpu in question.
662                          *
663                          * If it's not locked, check the last cpu it executed
664                          * on.
665                          */
666                         if (mutex_trylock(&vcpu->mutex)) {
667                                 if (vcpu->cpu == cpu) {
668                                         kvm_x86_ops->vcpu_decache(vcpu);
669                                         vcpu->cpu = -1;
670                                 }
671                                 mutex_unlock(&vcpu->mutex);
672                         }
673                 }
674         spin_unlock(&kvm_lock);
675 }
676
677 int kvm_dev_ioctl_check_extension(long ext)
678 {
679         int r;
680
681         switch (ext) {
682         case KVM_CAP_IRQCHIP:
683         case KVM_CAP_HLT:
684         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
685         case KVM_CAP_USER_MEMORY:
686         case KVM_CAP_SET_TSS_ADDR:
687         case KVM_CAP_EXT_CPUID:
688                 r = 1;
689                 break;
690         default:
691                 r = 0;
692                 break;
693         }
694         return r;
695
696 }
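/*
 * Illustrative sketch, not part of this file: userspace probes the
 * capabilities above with the KVM_CHECK_EXTENSION ioctl on /dev/kvm:
 *
 *	ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
 *
 * A return value greater than zero means the capability is available.
 */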
697
698 long kvm_arch_dev_ioctl(struct file *filp,
699                         unsigned int ioctl, unsigned long arg)
700 {
701         void __user *argp = (void __user *)arg;
702         long r;
703
704         switch (ioctl) {
705         case KVM_GET_MSR_INDEX_LIST: {
706                 struct kvm_msr_list __user *user_msr_list = argp;
707                 struct kvm_msr_list msr_list;
708                 unsigned n;
709
710                 r = -EFAULT;
711                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
712                         goto out;
713                 n = msr_list.nmsrs;
714                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
715                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
716                         goto out;
717                 r = -E2BIG;
718                 if (n < num_msrs_to_save)
719                         goto out;
720                 r = -EFAULT;
721                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
722                                  num_msrs_to_save * sizeof(u32)))
723                         goto out;
724                 if (copy_to_user(user_msr_list->indices
725                                  + num_msrs_to_save * sizeof(u32),
726                                  &emulated_msrs,
727                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
728                         goto out;
729                 r = 0;
730                 break;
731         }
732         default:
733                 r = -EINVAL;
734         }
735 out:
736         return r;
737 }
738
739 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
740 {
741         kvm_x86_ops->vcpu_load(vcpu, cpu);
742 }
743
744 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
745 {
746         kvm_x86_ops->vcpu_put(vcpu);
747         kvm_put_guest_fpu(vcpu);
748 }
749
750 static int is_efer_nx(void)
751 {
752         u64 efer;
753
754         rdmsrl(MSR_EFER, efer);
755         return efer & EFER_NX;
756 }
757
758 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
759 {
760         int i;
761         struct kvm_cpuid_entry2 *e, *entry;
762
763         entry = NULL;
764         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
765                 e = &vcpu->arch.cpuid_entries[i];
766                 if (e->function == 0x80000001) {
767                         entry = e;
768                         break;
769                 }
770         }
771         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
772                 entry->edx &= ~(1 << 20); /* bit 20 of 0x80000001.EDX is NX */
773                 printk(KERN_INFO "kvm: guest NX capability removed\n");
774         }
775 }
776
777 /* legacy KVM_SET_CPUID: old userspace filling the new cpuid_entry2 table */
778 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
779                                     struct kvm_cpuid *cpuid,
780                                     struct kvm_cpuid_entry __user *entries)
781 {
782         int r, i;
783         struct kvm_cpuid_entry *cpuid_entries;
784
785         r = -E2BIG;
786         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
787                 goto out;
788         r = -ENOMEM;
789         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
790         if (!cpuid_entries)
791                 goto out;
792         r = -EFAULT;
793         if (copy_from_user(cpuid_entries, entries,
794                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
795                 goto out_free;
796         for (i = 0; i < cpuid->nent; i++) {
797                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
798                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
799                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
800                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
801                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
802                 vcpu->arch.cpuid_entries[i].index = 0;
803                 vcpu->arch.cpuid_entries[i].flags = 0;
804                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
805                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
806                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
807         }
808         vcpu->arch.cpuid_nent = cpuid->nent;
809         cpuid_fix_nx_cap(vcpu);
810         r = 0;
811
812 out_free:
813         vfree(cpuid_entries);
814 out:
815         return r;
816 }
817
818 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
819                                     struct kvm_cpuid2 *cpuid,
820                                     struct kvm_cpuid_entry2 __user *entries)
821 {
822         int r;
823
824         r = -E2BIG;
825         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
826                 goto out;
827         r = -EFAULT;
828         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
829                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
830                 goto out;
831         vcpu->arch.cpuid_nent = cpuid->nent;
832         return 0;
833
834 out:
835         return r;
836 }
837
838 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
839                                     struct kvm_cpuid2 *cpuid,
840                                     struct kvm_cpuid_entry2 __user *entries)
841 {
842         int r;
843
844         r = -E2BIG;
845         if (cpuid->nent < vcpu->arch.cpuid_nent)
846                 goto out;
847         r = -EFAULT;
848         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
849                            vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
850                 goto out;
851         return 0;
852
853 out:
854         cpuid->nent = vcpu->arch.cpuid_nent;
855         return r;
856 }
857
858 static inline u32 bit(int bitno)
859 {
860         return 1 << (bitno & 31);
861 }
862
863 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
864                           u32 index)
865 {
866         entry->function = function;
867         entry->index = index;
868         cpuid_count(entry->function, entry->index,
869                 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
870         entry->flags = 0;
871 }
872
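/*
 * do_cpuid_ent() fills one (or, for indexed and stateful leaves, several)
 * kvm_cpuid_entry2 slots for @function, masking the feature words with
 * what kvm can virtualize: word0/word3 are applied to leaf 1 edx/ecx,
 * word1/word6 to leaf 0x80000001 edx/ecx.
 */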
873 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
874                          u32 index, int *nent, int maxnent)
875 {
876         const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
877                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
878                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
879                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
880                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
881                 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
882                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
883                 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
884                 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
885                 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
886         const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
887                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
888                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
889                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
890                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
891                 bit(X86_FEATURE_PGE) |
892                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
893                 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
894                 bit(X86_FEATURE_SYSCALL) |
895                 (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
896 #ifdef CONFIG_X86_64
897                 bit(X86_FEATURE_LM) |
898 #endif
899                 bit(X86_FEATURE_MMXEXT) |
900                 bit(X86_FEATURE_3DNOWEXT) |
901                 bit(X86_FEATURE_3DNOW);
902         const u32 kvm_supported_word3_x86_features =
903                 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
904         const u32 kvm_supported_word6_x86_features =
905                 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
906
907         /* all cpuid_count() calls for function 2 must run on the same cpu */
908         get_cpu();
909         do_cpuid_1_ent(entry, function, index);
910         ++*nent;
911
912         switch (function) {
913         case 0:
914                 entry->eax = min(entry->eax, (u32)0xb);
915                 break;
916         case 1:
917                 entry->edx &= kvm_supported_word0_x86_features;
918                 entry->ecx &= kvm_supported_word3_x86_features;
919                 break;
920         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
921          * may return different values. This forces us to get_cpu() before
922          * issuing the first command, and also to emulate this annoying behavior
923          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
924         case 2: {
925                 int t, times = entry->eax & 0xff;
926
927                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
928                 for (t = 1; t < times && *nent < maxnent; ++t) {
929                         do_cpuid_1_ent(&entry[t], function, 0);
930                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
931                         ++*nent;
932                 }
933                 break;
934         }
935         /* function 4 and 0xb have additional index. */
936         case 4: {
937                 int index, cache_type;
938
939                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
940                 /* read more entries until cache_type is zero */
941                 for (index = 1; *nent < maxnent; ++index) {
942                         cache_type = entry[index - 1].eax & 0x1f;
943                         if (!cache_type)
944                                 break;
945                         do_cpuid_1_ent(&entry[index], function, index);
946                         entry[index].flags |=
947                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
948                         ++*nent;
949                 }
950                 break;
951         }
952         case 0xb: {
953                 int index, level_type;
954
955                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
956                 /* read more entries until level_type is zero */
957                 for (index = 1; *nent < maxnent; ++index) {
958                         level_type = entry[index - 1].ecx & 0xff;
959                         if (!level_type)
960                                 break;
961                         do_cpuid_1_ent(&entry[index], function, index);
962                         entry[index].flags |=
963                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
964                         ++*nent;
965                 }
966                 break;
967         }
968         case 0x80000000:
969                 entry->eax = min(entry->eax, 0x8000001a);
970                 break;
971         case 0x80000001:
972                 entry->edx &= kvm_supported_word1_x86_features;
973                 entry->ecx &= kvm_supported_word6_x86_features;
974                 break;
975         }
976         put_cpu();
977 }
978
979 static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
980                                     struct kvm_cpuid2 *cpuid,
981                                     struct kvm_cpuid_entry2 __user *entries)
982 {
983         struct kvm_cpuid_entry2 *cpuid_entries;
984         int limit, nent = 0, r = -E2BIG;
985         u32 func;
986
987         if (cpuid->nent < 1)
988                 goto out;
989         r = -ENOMEM;
990         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
991         if (!cpuid_entries)
992                 goto out;
993
994         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
995         limit = cpuid_entries[0].eax;
996         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
997                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
998                                 &nent, cpuid->nent);
999         r = -E2BIG;
1000         if (nent >= cpuid->nent)
1001                 goto out_free;
1002
1003         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1004         limit = cpuid_entries[nent - 1].eax;
1005         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1006                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1007                                &nent, cpuid->nent);
1008         r = -EFAULT;
1009         if (copy_to_user(entries, cpuid_entries,
1010                         nent * sizeof(struct kvm_cpuid_entry2)))
1011                 goto out_free;
1012         cpuid->nent = nent;
1013         r = 0;
1014
1015 out_free:
1016         vfree(cpuid_entries);
1017 out:
1018         return r;
1019 }
1020
1021 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1022                                     struct kvm_lapic_state *s)
1023 {
1024         vcpu_load(vcpu);
1025         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1026         vcpu_put(vcpu);
1027
1028         return 0;
1029 }
1030
1031 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1032                                     struct kvm_lapic_state *s)
1033 {
1034         vcpu_load(vcpu);
1035         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1036         kvm_apic_post_state_restore(vcpu);
1037         vcpu_put(vcpu);
1038
1039         return 0;
1040 }
1041
1042 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1043                                     struct kvm_interrupt *irq)
1044 {
1045         if (irq->irq < 0 || irq->irq >= 256)
1046                 return -EINVAL;
1047         if (irqchip_in_kernel(vcpu->kvm))
1048                 return -ENXIO;
1049         vcpu_load(vcpu);
1050
1051         set_bit(irq->irq, vcpu->arch.irq_pending);
1052         set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1053
1054         vcpu_put(vcpu);
1055
1056         return 0;
1057 }
1058
1059 long kvm_arch_vcpu_ioctl(struct file *filp,
1060                          unsigned int ioctl, unsigned long arg)
1061 {
1062         struct kvm_vcpu *vcpu = filp->private_data;
1063         void __user *argp = (void __user *)arg;
1064         int r;
1065
1066         switch (ioctl) {
1067         case KVM_GET_LAPIC: {
1068                 struct kvm_lapic_state lapic;
1069
1070                 memset(&lapic, 0, sizeof lapic);
1071                 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1072                 if (r)
1073                         goto out;
1074                 r = -EFAULT;
1075                 if (copy_to_user(argp, &lapic, sizeof lapic))
1076                         goto out;
1077                 r = 0;
1078                 break;
1079         }
1080         case KVM_SET_LAPIC: {
1081                 struct kvm_lapic_state lapic;
1082
1083                 r = -EFAULT;
1084                 if (copy_from_user(&lapic, argp, sizeof lapic))
1085                         goto out;
1086                 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
1087                 if (r)
1088                         goto out;
1089                 r = 0;
1090                 break;
1091         }
1092         case KVM_INTERRUPT: {
1093                 struct kvm_interrupt irq;
1094
1095                 r = -EFAULT;
1096                 if (copy_from_user(&irq, argp, sizeof irq))
1097                         goto out;
1098                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1099                 if (r)
1100                         goto out;
1101                 r = 0;
1102                 break;
1103         }
1104         case KVM_SET_CPUID: {
1105                 struct kvm_cpuid __user *cpuid_arg = argp;
1106                 struct kvm_cpuid cpuid;
1107
1108                 r = -EFAULT;
1109                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1110                         goto out;
1111                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1112                 if (r)
1113                         goto out;
1114                 break;
1115         }
1116         case KVM_SET_CPUID2: {
1117                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1118                 struct kvm_cpuid2 cpuid;
1119
1120                 r = -EFAULT;
1121                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1122                         goto out;
1123                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1124                                 cpuid_arg->entries);
1125                 if (r)
1126                         goto out;
1127                 break;
1128         }
1129         case KVM_GET_CPUID2: {
1130                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1131                 struct kvm_cpuid2 cpuid;
1132
1133                 r = -EFAULT;
1134                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1135                         goto out;
1136                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1137                                 cpuid_arg->entries);
1138                 if (r)
1139                         goto out;
1140                 r = -EFAULT;
1141                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1142                         goto out;
1143                 r = 0;
1144                 break;
1145         }
1146         case KVM_GET_MSRS:
1147                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1148                 break;
1149         case KVM_SET_MSRS:
1150                 r = msr_io(vcpu, argp, do_set_msr, 0);
1151                 break;
1152         default:
1153                 r = -EINVAL;
1154         }
1155 out:
1156         return r;
1157 }
1158
1159 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1160 {
1161         int ret;
1162
1163         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1164                 return -1;      /* the 3-page TSS must fit below 4G */
1165         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1166         return ret;
1167 }
1168
1169 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1170                                           u32 kvm_nr_mmu_pages)
1171 {
1172         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1173                 return -EINVAL;
1174
1175         mutex_lock(&kvm->lock);
1176
1177         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1178         kvm->n_requested_mmu_pages = kvm_nr_mmu_pages;
1179
1180         mutex_unlock(&kvm->lock);
1181         return 0;
1182 }
1183
1184 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1185 {
1186         return kvm->n_alloc_mmu_pages;
1187 }
1188
1189 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1190 {
1191         int i;
1192         struct kvm_mem_alias *alias;
1193
1194         for (i = 0; i < kvm->naliases; ++i) {
1195                 alias = &kvm->aliases[i];
1196                 if (gfn >= alias->base_gfn
1197                     && gfn < alias->base_gfn + alias->npages)
1198                         return alias->target_gfn + gfn - alias->base_gfn;
1199         }
1200         return gfn;
1201 }
1202
1203 /*
1204  * Set a new alias region.  Aliases map a portion of physical memory into
1205  * another portion.  This is useful for memory windows, for example the PC
1206  * VGA region.
1207  */
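/*
 * Illustrative example, not taken from this file (all values are
 * assumptions): to alias the 128K VGA window at 0xa0000 onto RAM at
 * 0x10000000, userspace would issue the KVM_SET_MEMORY_ALIAS vm ioctl
 * with something like:
 *
 *	struct kvm_memory_alias alias = {
 *		.slot             = 0,
 *		.guest_phys_addr  = 0xa0000,
 *		.memory_size      = 0x20000,
 *		.target_phys_addr = 0x10000000,
 *	};
 */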
1208 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1209                                          struct kvm_memory_alias *alias)
1210 {
1211         int r, n;
1212         struct kvm_mem_alias *p;
1213
1214         r = -EINVAL;
1215         /* General sanity checks */
1216         if (alias->memory_size & (PAGE_SIZE - 1))
1217                 goto out;
1218         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1219                 goto out;
1220         if (alias->slot >= KVM_ALIAS_SLOTS)
1221                 goto out;
1222         if (alias->guest_phys_addr + alias->memory_size
1223             < alias->guest_phys_addr)
1224                 goto out;
1225         if (alias->target_phys_addr + alias->memory_size
1226             < alias->target_phys_addr)
1227                 goto out;
1228
1229         mutex_lock(&kvm->lock);
1230
1231         p = &kvm->aliases[alias->slot];
1232         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1233         p->npages = alias->memory_size >> PAGE_SHIFT;
1234         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1235
1236         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1237                 if (kvm->aliases[n - 1].npages)
1238                         break;
1239         kvm->naliases = n;
1240
1241         kvm_mmu_zap_all(kvm);
1242
1243         mutex_unlock(&kvm->lock);
1244
1245         return 0;
1246
1247 out:
1248         return r;
1249 }
1250
1251 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1252 {
1253         int r;
1254
1255         r = 0;
1256         switch (chip->chip_id) {
1257         case KVM_IRQCHIP_PIC_MASTER:
1258                 memcpy(&chip->chip.pic,
1259                         &pic_irqchip(kvm)->pics[0],
1260                         sizeof(struct kvm_pic_state));
1261                 break;
1262         case KVM_IRQCHIP_PIC_SLAVE:
1263                 memcpy(&chip->chip.pic,
1264                         &pic_irqchip(kvm)->pics[1],
1265                         sizeof(struct kvm_pic_state));
1266                 break;
1267         case KVM_IRQCHIP_IOAPIC:
1268                 memcpy(&chip->chip.ioapic,
1269                         ioapic_irqchip(kvm),
1270                         sizeof(struct kvm_ioapic_state));
1271                 break;
1272         default:
1273                 r = -EINVAL;
1274                 break;
1275         }
1276         return r;
1277 }
1278
1279 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1280 {
1281         int r;
1282
1283         r = 0;
1284         switch (chip->chip_id) {
1285         case KVM_IRQCHIP_PIC_MASTER:
1286                 memcpy(&pic_irqchip(kvm)->pics[0],
1287                         &chip->chip.pic,
1288                         sizeof(struct kvm_pic_state));
1289                 break;
1290         case KVM_IRQCHIP_PIC_SLAVE:
1291                 memcpy(&pic_irqchip(kvm)->pics[1],
1292                         &chip->chip.pic,
1293                         sizeof(struct kvm_pic_state));
1294                 break;
1295         case KVM_IRQCHIP_IOAPIC:
1296                 memcpy(ioapic_irqchip(kvm),
1297                         &chip->chip.ioapic,
1298                         sizeof(struct kvm_ioapic_state));
1299                 break;
1300         default:
1301                 r = -EINVAL;
1302                 break;
1303         }
1304         kvm_pic_update_irq(pic_irqchip(kvm));
1305         return r;
1306 }
1307
1308 /*
1309  * Get (and clear) the dirty memory log for a memory slot.
1310  */
1311 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1312                                       struct kvm_dirty_log *log)
1313 {
1314         int r;
1315         int n;
1316         struct kvm_memory_slot *memslot;
1317         int is_dirty = 0;
1318
1319         mutex_lock(&kvm->lock);
1320
1321         r = kvm_get_dirty_log(kvm, log, &is_dirty);
1322         if (r)
1323                 goto out;
1324
1325         /* If nothing is dirty, don't bother messing with page tables. */
1326         if (is_dirty) {
1327                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1328                 kvm_flush_remote_tlbs(kvm);
1329                 memslot = &kvm->memslots[log->slot];
1330                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1331                 memset(memslot->dirty_bitmap, 0, n);
1332         }
1333         r = 0;
1334 out:
1335         mutex_unlock(&kvm->lock);
1336         return r;
1337 }
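/*
 * Illustrative sketch, not part of this file: userspace retrieves the
 * log per memory slot with the KVM_GET_DIRTY_LOG vm ioctl, passing a
 * bitmap sized to at least one bit per page in the slot:
 *
 *	struct kvm_dirty_log log = {
 *		.slot         = slot,
 *		.dirty_bitmap = bitmap,
 *	};
 *
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 */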
1338
1339 long kvm_arch_vm_ioctl(struct file *filp,
1340                        unsigned int ioctl, unsigned long arg)
1341 {
1342         struct kvm *kvm = filp->private_data;
1343         void __user *argp = (void __user *)arg;
1344         int r = -EINVAL;
1345
1346         switch (ioctl) {
1347         case KVM_SET_TSS_ADDR:
1348                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1349                 if (r < 0)
1350                         goto out;
1351                 break;
1352         case KVM_SET_MEMORY_REGION: {
1353                 struct kvm_memory_region kvm_mem;
1354                 struct kvm_userspace_memory_region kvm_userspace_mem;
1355
1356                 r = -EFAULT;
1357                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1358                         goto out;
1359                 kvm_userspace_mem.slot = kvm_mem.slot;
1360                 kvm_userspace_mem.flags = kvm_mem.flags;
1361                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1362                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1363                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1364                 if (r)
1365                         goto out;
1366                 break;
1367         }
1368         case KVM_SET_NR_MMU_PAGES:
1369                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1370                 if (r)
1371                         goto out;
1372                 break;
1373         case KVM_GET_NR_MMU_PAGES:
1374                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1375                 break;
1376         case KVM_SET_MEMORY_ALIAS: {
1377                 struct kvm_memory_alias alias;
1378
1379                 r = -EFAULT;
1380                 if (copy_from_user(&alias, argp, sizeof alias))
1381                         goto out;
1382                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1383                 if (r)
1384                         goto out;
1385                 break;
1386         }
1387         case KVM_CREATE_IRQCHIP:
1388                 r = -ENOMEM;
1389                 kvm->vpic = kvm_create_pic(kvm);
1390                 if (kvm->vpic) {
1391                         r = kvm_ioapic_init(kvm);
1392                         if (r) {
1393                                 kfree(kvm->vpic);
1394                                 kvm->vpic = NULL;
1395                                 goto out;
1396                         }
1397                 } else
1398                         goto out;
1399                 break;
1400         case KVM_IRQ_LINE: {
1401                 struct kvm_irq_level irq_event;
1402
1403                 r = -EFAULT;
1404                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1405                         goto out;
1406                 if (irqchip_in_kernel(kvm)) {
1407                         mutex_lock(&kvm->lock);
1408                         if (irq_event.irq < 16)
1409                                 kvm_pic_set_irq(pic_irqchip(kvm),
1410                                         irq_event.irq,
1411                                         irq_event.level);
1412                         kvm_ioapic_set_irq(kvm->vioapic,
1413                                         irq_event.irq,
1414                                         irq_event.level);
1415                         mutex_unlock(&kvm->lock);
1416                         r = 0;
1417                 }
1418                 break;
1419         }
1420         case KVM_GET_IRQCHIP: {
1421                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1422                 struct kvm_irqchip chip;
1423
1424                 r = -EFAULT;
1425                 if (copy_from_user(&chip, argp, sizeof chip))
1426                         goto out;
1427                 r = -ENXIO;
1428                 if (!irqchip_in_kernel(kvm))
1429                         goto out;
1430                 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1431                 if (r)
1432                         goto out;
1433                 r = -EFAULT;
1434                 if (copy_to_user(argp, &chip, sizeof chip))
1435                         goto out;
1436                 r = 0;
1437                 break;
1438         }
1439         case KVM_SET_IRQCHIP: {
1440                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1441                 struct kvm_irqchip chip;
1442
1443                 r = -EFAULT;
1444                 if (copy_from_user(&chip, argp, sizeof chip))
1445                         goto out;
1446                 r = -ENXIO;
1447                 if (!irqchip_in_kernel(kvm))
1448                         goto out;
1449                 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1450                 if (r)
1451                         goto out;
1452                 r = 0;
1453                 break;
1454         }
1455         case KVM_GET_SUPPORTED_CPUID: {
1456                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1457                 struct kvm_cpuid2 cpuid;
1458
1459                 r = -EFAULT;
1460                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1461                         goto out;
1462                 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1463                         cpuid_arg->entries);
1464                 if (r)
1465                         goto out;
1466
1467                 r = -EFAULT;
1468                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1469                         goto out;
1470                 r = 0;
1471                 break;
1472         }
1473         default:
1474                 ;
1475         }
1476 out:
1477         return r;
1478 }
1479
1480 static void kvm_init_msr_list(void)
1481 {
1482         u32 dummy[2];
1483         unsigned i, j;
1484
1485         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1486                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1487                         continue;
1488                 if (j < i)
1489                         msrs_to_save[j] = msrs_to_save[i];
1490                 j++;
1491         }
1492         num_msrs_to_save = j;
1493 }
1494
1495 /*
1496  * Only the local apic needs an MMIO device hook, so shortcut it here.
1497  */
1498 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1499                                                 gpa_t addr)
1500 {
1501         struct kvm_io_device *dev;
1502
1503         if (vcpu->arch.apic) {
1504                 dev = &vcpu->arch.apic->dev;
1505                 if (dev->in_range(dev, addr))
1506                         return dev;
1507         }
1508         return NULL;
1509 }
1510
1511
1512 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1513                                                 gpa_t addr)
1514 {
1515         struct kvm_io_device *dev;
1516
1517         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1518         if (dev == NULL)
1519                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1520         return dev;
1521 }
1522
1523 int emulator_read_std(unsigned long addr,
1524                              void *val,
1525                              unsigned int bytes,
1526                              struct kvm_vcpu *vcpu)
1527 {
1528         void *data = val;
1529
1530         while (bytes) {
1531                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1532                 unsigned offset = addr & (PAGE_SIZE-1);
1533                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1534                 int ret;
1535
1536                 if (gpa == UNMAPPED_GVA)
1537                         return X86EMUL_PROPAGATE_FAULT;
1538                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1539                 if (ret < 0)
1540                         return X86EMUL_UNHANDLEABLE;
1541
1542                 bytes -= tocopy;
1543                 data += tocopy;
1544                 addr += tocopy;
1545         }
1546
1547         return X86EMUL_CONTINUE;
1548 }
1549 EXPORT_SYMBOL_GPL(emulator_read_std);
1550
1551 static int emulator_read_emulated(unsigned long addr,
1552                                   void *val,
1553                                   unsigned int bytes,
1554                                   struct kvm_vcpu *vcpu)
1555 {
1556         struct kvm_io_device *mmio_dev;
1557         gpa_t                 gpa;
1558
1559         if (vcpu->mmio_read_completed) {
1560                 memcpy(val, vcpu->mmio_data, bytes);
1561                 vcpu->mmio_read_completed = 0;
1562                 return X86EMUL_CONTINUE;
1563         }
1564
1565         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1566
1567         /* For APIC access vmexit */
1568         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1569                 goto mmio;
1570
1571         if (emulator_read_std(addr, val, bytes, vcpu)
1572                         == X86EMUL_CONTINUE)
1573                 return X86EMUL_CONTINUE;
1574         if (gpa == UNMAPPED_GVA)
1575                 return X86EMUL_PROPAGATE_FAULT;
1576
1577 mmio:
1578         /*
1579          * Is this MMIO handled locally?
1580          */
1581         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1582         if (mmio_dev) {
1583                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1584                 return X86EMUL_CONTINUE;
1585         }
1586
1587         vcpu->mmio_needed = 1;
1588         vcpu->mmio_phys_addr = gpa;
1589         vcpu->mmio_size = bytes;
1590         vcpu->mmio_is_write = 0;
1591
1592         return X86EMUL_UNHANDLEABLE;
1593 }
1594
1595 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1596                                const void *val, int bytes)
1597 {
1598         int ret;
1599
1600         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1601         if (ret < 0)
1602                 return 0;
1603         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1604         return 1;
1605 }
1606
1607 static int emulator_write_emulated_onepage(unsigned long addr,
1608                                            const void *val,
1609                                            unsigned int bytes,
1610                                            struct kvm_vcpu *vcpu)
1611 {
1612         struct kvm_io_device *mmio_dev;
1613         gpa_t                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1614
1615         if (gpa == UNMAPPED_GVA) {
1616                 kvm_inject_page_fault(vcpu, addr, 2);
1617                 return X86EMUL_PROPAGATE_FAULT;
1618         }
1619
1620         /* For APIC access vmexit */
1621         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1622                 goto mmio;
1623
1624         if (emulator_write_phys(vcpu, gpa, val, bytes))
1625                 return X86EMUL_CONTINUE;
1626
1627 mmio:
1628         /*
1629          * Is this MMIO handled locally?
1630          */
1631         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1632         if (mmio_dev) {
1633                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1634                 return X86EMUL_CONTINUE;
1635         }
1636
1637         vcpu->mmio_needed = 1;
1638         vcpu->mmio_phys_addr = gpa;
1639         vcpu->mmio_size = bytes;
1640         vcpu->mmio_is_write = 1;
1641         memcpy(vcpu->mmio_data, val, bytes);
1642
1643         return X86EMUL_CONTINUE;
1644 }
1645
1646 int emulator_write_emulated(unsigned long addr,
1647                                    const void *val,
1648                                    unsigned int bytes,
1649                                    struct kvm_vcpu *vcpu)
1650 {
1651         /* Crossing a page boundary? */
1652         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1653                 int rc, now;
1654
1655                 now = -addr & ~PAGE_MASK;
1656                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1657                 if (rc != X86EMUL_CONTINUE)
1658                         return rc;
1659                 addr += now;
1660                 val += now;
1661                 bytes -= now;
1662         }
1663         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1664 }
1665 EXPORT_SYMBOL_GPL(emulator_write_emulated);
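/*
 * Illustrative split in emulator_write_emulated(), assuming
 * PAGE_SIZE == 4096: a 6-byte write at addr == 0x1ffd crosses a page
 * boundary; now = -addr & ~PAGE_MASK == 3, so the first onepage call
 * covers 0x1ffd..0x1fff and the second covers the remaining 3 bytes
 * starting at 0x2000.
 */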
1666
1667 static int emulator_cmpxchg_emulated(unsigned long addr,
1668                                      const void *old,
1669                                      const void *new,
1670                                      unsigned int bytes,
1671                                      struct kvm_vcpu *vcpu)
1672 {
1673         static int reported;
1674
1675         if (!reported) {
1676                 reported = 1;
1677                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1678         }
1679 #ifndef CONFIG_X86_64
1680         /* the guest's cmpxchg8b has to be emulated atomically */
1681         if (bytes == 8) {
1682                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1683                 struct page *page;
1684                 char *addr;
1685                 u64 val;
1686
1687                 if (gpa == UNMAPPED_GVA ||
1688                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1689                         goto emul_write;
1690
1691                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1692                         goto emul_write;
1693
1694                 val = *(u64 *)new;
1695                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1696                 addr = kmap_atomic(page, KM_USER0);
1697                 set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
1698                 kunmap_atomic(addr, KM_USER0);
1699                 kvm_release_page_dirty(page);
1700         }
1701 emul_write:
1702 #endif
1703
1704         return emulator_write_emulated(addr, new, bytes, vcpu);
1705 }
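/*
 * Note on emulator_cmpxchg_emulated() above: on 32-bit hosts an 8-byte
 * cmpxchg8b that targets ordinary guest RAM is first made atomic by
 * writing the new value with set_64bit() on a kmap of the guest page;
 * in all cases the function then falls through to a plain emulated
 * write (no compare is performed), as the one-time warning says.
 */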
1706
1707 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1708 {
1709         return kvm_x86_ops->get_segment_base(vcpu, seg);
1710 }
1711
1712 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1713 {
1714         return X86EMUL_CONTINUE;
1715 }
1716
1717 int emulate_clts(struct kvm_vcpu *vcpu)
1718 {
1719         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1720         return X86EMUL_CONTINUE;
1721 }
1722
1723 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1724 {
1725         struct kvm_vcpu *vcpu = ctxt->vcpu;
1726
1727         switch (dr) {
1728         case 0 ... 3:
1729                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1730                 return X86EMUL_CONTINUE;
1731         default:
1732                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1733                 return X86EMUL_UNHANDLEABLE;
1734         }
1735 }
1736
1737 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1738 {
1739         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1740         int exception;
1741
1742         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1743         if (exception) {
1744                 /* FIXME: better handling */
1745                 return X86EMUL_UNHANDLEABLE;
1746         }
1747         return X86EMUL_CONTINUE;
1748 }
1749
1750 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1751 {
1752         static int reported;
1753         u8 opcodes[4];
1754         unsigned long rip = vcpu->arch.rip;
1755         unsigned long rip_linear;
1756
1757         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1758
1759         if (reported)
1760                 return;
1761
1762         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1763
1764         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1765                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1766         reported = 1;
1767 }
1768 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1769
1770 struct x86_emulate_ops emulate_ops = {
1771         .read_std            = emulator_read_std,
1772         .read_emulated       = emulator_read_emulated,
1773         .write_emulated      = emulator_write_emulated,
1774         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1775 };
1776
1777 int emulate_instruction(struct kvm_vcpu *vcpu,
1778                         struct kvm_run *run,
1779                         unsigned long cr2,
1780                         u16 error_code,
1781                         int no_decode)
1782 {
1783         int r;
1784
1785         vcpu->arch.mmio_fault_cr2 = cr2;
1786         kvm_x86_ops->cache_regs(vcpu);
1787
1788         vcpu->mmio_is_write = 0;
1789         vcpu->arch.pio.string = 0;
1790
1791         if (!no_decode) {
1792                 int cs_db, cs_l;
1793                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1794
1795                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
1796                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1797                 vcpu->arch.emulate_ctxt.mode =
1798                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
1799                         ? X86EMUL_MODE_REAL : cs_l
1800                         ? X86EMUL_MODE_PROT64 : cs_db
1801                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1802
1803                 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1804                         vcpu->arch.emulate_ctxt.cs_base = 0;
1805                         vcpu->arch.emulate_ctxt.ds_base = 0;
1806                         vcpu->arch.emulate_ctxt.es_base = 0;
1807                         vcpu->arch.emulate_ctxt.ss_base = 0;
1808                 } else {
1809                         vcpu->arch.emulate_ctxt.cs_base =
1810                                         get_segment_base(vcpu, VCPU_SREG_CS);
1811                         vcpu->arch.emulate_ctxt.ds_base =
1812                                         get_segment_base(vcpu, VCPU_SREG_DS);
1813                         vcpu->arch.emulate_ctxt.es_base =
1814                                         get_segment_base(vcpu, VCPU_SREG_ES);
1815                         vcpu->arch.emulate_ctxt.ss_base =
1816                                         get_segment_base(vcpu, VCPU_SREG_SS);
1817                 }
1818
1819                 vcpu->arch.emulate_ctxt.gs_base =
1820                                         get_segment_base(vcpu, VCPU_SREG_GS);
1821                 vcpu->arch.emulate_ctxt.fs_base =
1822                                         get_segment_base(vcpu, VCPU_SREG_FS);
1823
1824                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1825                 ++vcpu->stat.insn_emulation;
1826                 if (r) {
1827                         ++vcpu->stat.insn_emulation_fail;
1828                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1829                                 return EMULATE_DONE;
1830                         return EMULATE_FAIL;
1831                 }
1832         }
1833
1834         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1835
1836         if (vcpu->arch.pio.string)
1837                 return EMULATE_DO_MMIO;
1838
1839         if ((r || vcpu->mmio_is_write) && run) {
1840                 run->exit_reason = KVM_EXIT_MMIO;
1841                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1842                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1843                 run->mmio.len = vcpu->mmio_size;
1844                 run->mmio.is_write = vcpu->mmio_is_write;
1845         }
1846
1847         if (r) {
1848                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1849                         return EMULATE_DONE;
1850                 if (!vcpu->mmio_needed) {
1851                         kvm_report_emulation_failure(vcpu, "mmio");
1852                         return EMULATE_FAIL;
1853                 }
1854                 return EMULATE_DO_MMIO;
1855         }
1856
1857         kvm_x86_ops->decache_regs(vcpu);
1858         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
1859
1860         if (vcpu->mmio_is_write) {
1861                 vcpu->mmio_needed = 0;
1862                 return EMULATE_DO_MMIO;
1863         }
1864
1865         return EMULATE_DONE;
1866 }
1867 EXPORT_SYMBOL_GPL(emulate_instruction);
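/*
 * Sketch of the mode selection in emulate_instruction() above:
 * EFLAGS.VM selects X86EMUL_MODE_REAL (virtual-8086), otherwise CS.L
 * selects PROT64, otherwise CS.DB selects PROT32, with PROT16 as the
 * fallback.  In 64-bit mode the CS/DS/ES/SS bases are forced to zero;
 * in the other modes they are taken from the segment registers.
 */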
1868
1869 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1870 {
1871         int i;
1872
1873         for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
1874                 if (vcpu->arch.pio.guest_pages[i]) {
1875                         kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
1876                         vcpu->arch.pio.guest_pages[i] = NULL;
1877                 }
1878 }
1879
1880 static int pio_copy_data(struct kvm_vcpu *vcpu)
1881 {
1882         void *p = vcpu->arch.pio_data;
1883         void *q;
1884         unsigned bytes;
1885         int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
1886
1887         q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1888                  PAGE_KERNEL);
1889         if (!q) {
1890                 free_pio_guest_pages(vcpu);
1891                 return -ENOMEM;
1892         }
1893         q += vcpu->arch.pio.guest_page_offset;
1894         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
1895         if (vcpu->arch.pio.in)
1896                 memcpy(q, p, bytes);
1897         else
1898                 memcpy(p, q, bytes);
1899         q -= vcpu->arch.pio.guest_page_offset;
1900         vunmap(q);
1901         free_pio_guest_pages(vcpu);
1902         return 0;
1903 }
1904
1905 int complete_pio(struct kvm_vcpu *vcpu)
1906 {
1907         struct kvm_pio_request *io = &vcpu->arch.pio;
1908         long delta;
1909         int r;
1910
1911         kvm_x86_ops->cache_regs(vcpu);
1912
1913         if (!io->string) {
1914                 if (io->in)
1915                         memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
1916                                io->size);
1917         } else {
1918                 if (io->in) {
1919                         r = pio_copy_data(vcpu);
1920                         if (r) {
1921                                 kvm_x86_ops->cache_regs(vcpu);
1922                                 return r;
1923                         }
1924                 }
1925
1926                 delta = 1;
1927                 if (io->rep) {
1928                         delta *= io->cur_count;
1929                         /*
1930                          * The size of the register should really depend on
1931                          * current address size.
1932                          */
1933                         vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
1934                 }
1935                 if (io->down)
1936                         delta = -delta;
1937                 delta *= io->size;
1938                 if (io->in)
1939                         vcpu->arch.regs[VCPU_REGS_RDI] += delta;
1940                 else
1941                         vcpu->arch.regs[VCPU_REGS_RSI] += delta;
1942         }
1943
1944         kvm_x86_ops->decache_regs(vcpu);
1945
1946         io->count -= io->cur_count;
1947         io->cur_count = 0;
1948
1949         return 0;
1950 }
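/*
 * Illustrative register update in complete_pio() for string I/O: a
 * "rep insb" with cur_count == 10 and size == 1 decrements RCX by 10
 * and advances RDI by 10 bytes (moved backwards instead when io->down
 * is set).
 */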
1951
1952 static void kernel_pio(struct kvm_io_device *pio_dev,
1953                        struct kvm_vcpu *vcpu,
1954                        void *pd)
1955 {
1956         /* TODO: String I/O for in-kernel devices */
1957
1958         mutex_lock(&vcpu->kvm->lock);
1959         if (vcpu->arch.pio.in)
1960                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
1961                                   vcpu->arch.pio.size,
1962                                   pd);
1963         else
1964                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
1965                                    vcpu->arch.pio.size,
1966                                    pd);
1967         mutex_unlock(&vcpu->kvm->lock);
1968 }
1969
1970 static void pio_string_write(struct kvm_io_device *pio_dev,
1971                              struct kvm_vcpu *vcpu)
1972 {
1973         struct kvm_pio_request *io = &vcpu->arch.pio;
1974         void *pd = vcpu->arch.pio_data;
1975         int i;
1976
1977         mutex_lock(&vcpu->kvm->lock);
1978         for (i = 0; i < io->cur_count; i++) {
1979                 kvm_iodevice_write(pio_dev, io->port,
1980                                    io->size,
1981                                    pd);
1982                 pd += io->size;
1983         }
1984         mutex_unlock(&vcpu->kvm->lock);
1985 }
1986
1987 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1988                                                gpa_t addr)
1989 {
1990         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1991 }
1992
1993 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1994                   int size, unsigned port)
1995 {
1996         struct kvm_io_device *pio_dev;
1997
1998         vcpu->run->exit_reason = KVM_EXIT_IO;
1999         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2000         vcpu->run->io.size = vcpu->arch.pio.size = size;
2001         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2002         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2003         vcpu->run->io.port = vcpu->arch.pio.port = port;
2004         vcpu->arch.pio.in = in;
2005         vcpu->arch.pio.string = 0;
2006         vcpu->arch.pio.down = 0;
2007         vcpu->arch.pio.guest_page_offset = 0;
2008         vcpu->arch.pio.rep = 0;
2009
2010         kvm_x86_ops->cache_regs(vcpu);
2011         memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2012         kvm_x86_ops->decache_regs(vcpu);
2013
2014         kvm_x86_ops->skip_emulated_instruction(vcpu);
2015
2016         pio_dev = vcpu_find_pio_dev(vcpu, port);
2017         if (pio_dev) {
2018                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2019                 complete_pio(vcpu);
2020                 return 1;
2021         }
2022         return 0;
2023 }
2024 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2025
2026 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2027                   int size, unsigned long count, int down,
2028                   gva_t address, int rep, unsigned port)
2029 {
2030         unsigned now, in_page;
2031         int i, ret = 0;
2032         int nr_pages = 1;
2033         struct page *page;
2034         struct kvm_io_device *pio_dev;
2035
2036         vcpu->run->exit_reason = KVM_EXIT_IO;
2037         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2038         vcpu->run->io.size = vcpu->arch.pio.size = size;
2039         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2040         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2041         vcpu->run->io.port = vcpu->arch.pio.port = port;
2042         vcpu->arch.pio.in = in;
2043         vcpu->arch.pio.string = 1;
2044         vcpu->arch.pio.down = down;
2045         vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2046         vcpu->arch.pio.rep = rep;
2047
2048         if (!count) {
2049                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2050                 return 1;
2051         }
2052
2053         if (!down)
2054                 in_page = PAGE_SIZE - offset_in_page(address);
2055         else
2056                 in_page = offset_in_page(address) + size;
2057         now = min(count, (unsigned long)in_page / size);
2058         if (!now) {
2059                 /*
2060                  * String I/O straddles page boundary.  Pin two guest pages
2061                  * so that we satisfy atomicity constraints.  Do just one
2062                  * transaction to avoid complexity.
2063                  */
2064                 nr_pages = 2;
2065                 now = 1;
2066         }
2067         if (down) {
2068                 /*
2069                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2070                  */
2071                 pr_unimpl(vcpu, "guest string pio down\n");
2072                 kvm_inject_gp(vcpu, 0);
2073                 return 1;
2074         }
2075         vcpu->run->io.count = now;
2076         vcpu->arch.pio.cur_count = now;
2077
2078         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2079                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2080
2081         for (i = 0; i < nr_pages; ++i) {
2082                 mutex_lock(&vcpu->kvm->lock);
2083                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2084                 vcpu->arch.pio.guest_pages[i] = page;
2085                 mutex_unlock(&vcpu->kvm->lock);
2086                 if (!page) {
2087                         kvm_inject_gp(vcpu, 0);
2088                         free_pio_guest_pages(vcpu);
2089                         return 1;
2090                 }
2091         }
2092
2093         pio_dev = vcpu_find_pio_dev(vcpu, port);
2094         if (!vcpu->arch.pio.in) {
2095                 /* string PIO write */
2096                 ret = pio_copy_data(vcpu);
2097                 if (ret >= 0 && pio_dev) {
2098                         pio_string_write(pio_dev, vcpu);
2099                         complete_pio(vcpu);
2100                         if (vcpu->arch.pio.count == 0)
2101                                 ret = 1;
2102                 }
2103         } else if (pio_dev)
2104                 pr_unimpl(vcpu, "no string pio read support yet, "
2105                        "port %x size %d count %ld\n",
2106                         port, size, count);
2107
2108         return ret;
2109 }
2110 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
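/*
 * Illustrative page-straddle case in kvm_emulate_pio_string(), assuming
 * PAGE_SIZE == 4096: a transfer at page offset 0xffe with size == 4
 * gives in_page == 2 and now == 0, so two guest pages are pinned and a
 * single one-element transaction is done instead of the full rep count.
 */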
2111
2112 int kvm_arch_init(void *opaque)
2113 {
2114         int r;
2115         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2116
2117         r = kvm_mmu_module_init();
2118         if (r)
2119                 goto out_fail;
2120
2121         kvm_init_msr_list();
2122
2123         if (kvm_x86_ops) {
2124                 printk(KERN_ERR "kvm: already loaded the other module\n");
2125                 r = -EEXIST;
2126                 goto out;
2127         }
2128
2129         if (!ops->cpu_has_kvm_support()) {
2130                 printk(KERN_ERR "kvm: no hardware support\n");
2131                 r = -EOPNOTSUPP;
2132                 goto out;
2133         }
2134         if (ops->disabled_by_bios()) {
2135                 printk(KERN_ERR "kvm: disabled by bios\n");
2136                 r = -EOPNOTSUPP;
2137                 goto out;
2138         }
2139
2140         kvm_x86_ops = ops;
2141         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2142         return 0;
2143
2144 out:
2145         kvm_mmu_module_exit();
2146 out_fail:
2147         return r;
2148 }
2149
2150 void kvm_arch_exit(void)
2151 {
2152         kvm_x86_ops = NULL;
2153         kvm_mmu_module_exit();
2154 }
2155
2156 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2157 {
2158         ++vcpu->stat.halt_exits;
2159         if (irqchip_in_kernel(vcpu->kvm)) {
2160                 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2161                 kvm_vcpu_block(vcpu);
2162                 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2163                         return -EINTR;
2164                 return 1;
2165         } else {
2166                 vcpu->run->exit_reason = KVM_EXIT_HLT;
2167                 return 0;
2168         }
2169 }
2170 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2171
2172 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2173 {
2174         unsigned long nr, a0, a1, a2, a3, ret;
2175
2176         kvm_x86_ops->cache_regs(vcpu);
2177
2178         nr = vcpu->arch.regs[VCPU_REGS_RAX];
2179         a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2180         a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2181         a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2182         a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2183
2184         if (!is_long_mode(vcpu)) {
2185                 nr &= 0xFFFFFFFF;
2186                 a0 &= 0xFFFFFFFF;
2187                 a1 &= 0xFFFFFFFF;
2188                 a2 &= 0xFFFFFFFF;
2189                 a3 &= 0xFFFFFFFF;
2190         }
2191
2192         switch (nr) {
2193         default:
2194                 ret = -KVM_ENOSYS;
2195                 break;
2196         }
2197         vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2198         kvm_x86_ops->decache_regs(vcpu);
2199         return 0;
2200 }
2201 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2202
2203 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2204 {
2205         char instruction[3];
2206         int ret = 0;
2207
2208         mutex_lock(&vcpu->kvm->lock);
2209
2210         /*
2211          * Blow out the MMU so that no other VCPU has an active mapping;
2212          * this ensures that the updated hypercall appears atomically
2213          * across all VCPUs.
2214          */
2215         kvm_mmu_zap_all(vcpu->kvm);
2216
2217         kvm_x86_ops->cache_regs(vcpu);
2218         kvm_x86_ops->patch_hypercall(vcpu, instruction);
2219         if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2220             != X86EMUL_CONTINUE)
2221                 ret = -EFAULT;
2222
2223         mutex_unlock(&vcpu->kvm->lock);
2224
2225         return ret;
2226 }
2227
2228 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2229 {
2230         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2231 }
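/*
 * mk_cr_64() keeps the upper 32 bits of the current value and replaces
 * the lower 32, e.g. (illustrative values) curr_cr == 0x100000011 and
 * new_val == 0x80000033 yield 0x180000033.
 */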
2232
2233 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2234 {
2235         struct descriptor_table dt = { limit, base };
2236
2237         kvm_x86_ops->set_gdt(vcpu, &dt);
2238 }
2239
2240 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2241 {
2242         struct descriptor_table dt = { limit, base };
2243
2244         kvm_x86_ops->set_idt(vcpu, &dt);
2245 }
2246
2247 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2248                    unsigned long *rflags)
2249 {
2250         lmsw(vcpu, msw);
2251         *rflags = kvm_x86_ops->get_rflags(vcpu);
2252 }
2253
2254 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2255 {
2256         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2257         switch (cr) {
2258         case 0:
2259                 return vcpu->arch.cr0;
2260         case 2:
2261                 return vcpu->arch.cr2;
2262         case 3:
2263                 return vcpu->arch.cr3;
2264         case 4:
2265                 return vcpu->arch.cr4;
2266         case 8:
2267                 return get_cr8(vcpu);
2268         default:
2269                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2270                 return 0;
2271         }
2272 }
2273
2274 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2275                      unsigned long *rflags)
2276 {
2277         switch (cr) {
2278         case 0:
2279                 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2280                 *rflags = kvm_x86_ops->get_rflags(vcpu);
2281                 break;
2282         case 2:
2283                 vcpu->arch.cr2 = val;
2284                 break;
2285         case 3:
2286                 set_cr3(vcpu, val);
2287                 break;
2288         case 4:
2289                 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2290                 break;
2291         case 8:
2292                 set_cr8(vcpu, val & 0xfUL);
2293                 break;
2294         default:
2295                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2296         }
2297 }
2298
2299 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2300 {
2301         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2302         int j, nent = vcpu->arch.cpuid_nent;
2303
2304         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2305         /* when no next entry is found, the current entry[i] is reselected */
2306         for (j = (i + 1) % nent; ; j = (j + 1) % nent) {
2307                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2308                 if (ej->function == e->function) {
2309                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2310                         return j;
2311                 }
2312         }
2313         return 0; /* silence gcc, even though control never reaches here */
2314 }
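/*
 * Illustrative walk of move_to_next_stateful_cpuid_entry(): with three
 * entries for the same stateful function at indices 4, 5 and 6, a call
 * with i == 6 wraps around and marks index 4 as the next one to read;
 * if the function has only a single entry, that entry is reselected.
 */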
2315
2316 /* find an entry with matching function, matching index (if needed), and that
2317  * should be read next (if it's stateful) */
2318 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2319         u32 function, u32 index)
2320 {
2321         if (e->function != function)
2322                 return 0;
2323         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2324                 return 0;
2325         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2326                 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2327                 return 0;
2328         return 1;
2329 }
2330
2331 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2332 {
2333         int i;
2334         u32 function, index;
2335         struct kvm_cpuid_entry2 *e, *best;
2336
2337         kvm_x86_ops->cache_regs(vcpu);
2338         function = vcpu->arch.regs[VCPU_REGS_RAX];
2339         index = vcpu->arch.regs[VCPU_REGS_RCX];
2340         vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2341         vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2342         vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2343         vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2344         best = NULL;
2345         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2346                 e = &vcpu->arch.cpuid_entries[i];
2347                 if (is_matching_cpuid_entry(e, function, index)) {
2348                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2349                                 move_to_next_stateful_cpuid_entry(vcpu, i);
2350                         best = e;
2351                         break;
2352                 }
2353                 /*
2354                  * Both in the same range (basic or extended)?
2355                  */
2356                 if (((e->function ^ function) & 0x80000000) == 0)
2357                         if (!best || e->function > best->function)
2358                                 best = e;
2359         }
2360         if (best) {
2361                 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2362                 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2363                 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2364                 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2365         }
2366         kvm_x86_ops->decache_regs(vcpu);
2367         kvm_x86_ops->skip_emulated_instruction(vcpu);
2368 }
2369 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2370
2371 /*
2372  * Check if userspace requested an interrupt window, and that the
2373  * interrupt window is open.
2374  *
2375  * No need to exit to userspace if we already have an interrupt queued.
2376  */
2377 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2378                                           struct kvm_run *kvm_run)
2379 {
2380         return (!vcpu->arch.irq_summary &&
2381                 kvm_run->request_interrupt_window &&
2382                 vcpu->arch.interrupt_window_open &&
2383                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2384 }
2385
2386 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2387                               struct kvm_run *kvm_run)
2388 {
2389         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2390         kvm_run->cr8 = get_cr8(vcpu);
2391         kvm_run->apic_base = kvm_get_apic_base(vcpu);
2392         if (irqchip_in_kernel(vcpu->kvm))
2393                 kvm_run->ready_for_interrupt_injection = 1;
2394         else
2395                 kvm_run->ready_for_interrupt_injection =
2396                                         (vcpu->arch.interrupt_window_open &&
2397                                          vcpu->arch.irq_summary == 0);
2398 }
2399
2400 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2401 {
2402         int r;
2403
2404         if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2405                 pr_debug("vcpu %d received sipi with vector # %x\n",
2406                        vcpu->vcpu_id, vcpu->arch.sipi_vector);
2407                 kvm_lapic_reset(vcpu);
2408                 r = kvm_x86_ops->vcpu_reset(vcpu);
2409                 if (r)
2410                         return r;
2411                 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
2412         }
2413
2414 preempted:
2415         if (vcpu->guest_debug.enabled)
2416                 kvm_x86_ops->guest_debug_pre(vcpu);
2417
2418 again:
2419         r = kvm_mmu_reload(vcpu);
2420         if (unlikely(r))
2421                 goto out;
2422
2423         kvm_inject_pending_timer_irqs(vcpu);
2424
2425         preempt_disable();
2426
2427         kvm_x86_ops->prepare_guest_switch(vcpu);
2428         kvm_load_guest_fpu(vcpu);
2429
2430         local_irq_disable();
2431
2432         if (signal_pending(current)) {
2433                 local_irq_enable();
2434                 preempt_enable();
2435                 r = -EINTR;
2436                 kvm_run->exit_reason = KVM_EXIT_INTR;
2437                 ++vcpu->stat.signal_exits;
2438                 goto out;
2439         }
2440
2441         if (vcpu->arch.exception.pending)
2442                 __queue_exception(vcpu);
2443         else if (irqchip_in_kernel(vcpu->kvm))
2444                 kvm_x86_ops->inject_pending_irq(vcpu);
2445         else
2446                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2447
2448         vcpu->guest_mode = 1;
2449         kvm_guest_enter();
2450
2451         if (vcpu->requests)
2452                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2453                         kvm_x86_ops->tlb_flush(vcpu);
2454
2455         kvm_x86_ops->run(vcpu, kvm_run);
2456
2457         vcpu->guest_mode = 0;
2458         local_irq_enable();
2459
2460         ++vcpu->stat.exits;
2461
2462         /*
2463          * We must have an instruction between local_irq_enable() and
2464          * kvm_guest_exit(), so the timer interrupt isn't delayed by
2465          * the interrupt shadow.  The stat.exits increment will do nicely.
2466          * But we need to prevent reordering, hence this barrier():
2467          */
2468         barrier();
2469
2470         kvm_guest_exit();
2471
2472         preempt_enable();
2473
2474         /*
2475          * Profile KVM exit RIPs:
2476          */
2477         if (unlikely(prof_on == KVM_PROFILING)) {
2478                 kvm_x86_ops->cache_regs(vcpu);
2479                 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2480         }
2481
2482         if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2483                 vcpu->arch.exception.pending = false;
2484
2485         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2486
2487         if (r > 0) {
2488                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2489                         r = -EINTR;
2490                         kvm_run->exit_reason = KVM_EXIT_INTR;
2491                         ++vcpu->stat.request_irq_exits;
2492                         goto out;
2493                 }
2494                 if (!need_resched())
2495                         goto again;
2496         }
2497
2498 out:
2499         if (r > 0) {
2500                 kvm_resched(vcpu);
2501                 goto preempted;
2502         }
2503
2504         post_kvm_run_save(vcpu, kvm_run);
2505
2506         return r;
2507 }
2508
2509 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2510 {
2511         int r;
2512         sigset_t sigsaved;
2513
2514         vcpu_load(vcpu);
2515
2516         if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2517                 kvm_vcpu_block(vcpu);
2518                 vcpu_put(vcpu);
2519                 return -EAGAIN;
2520         }
2521
2522         if (vcpu->sigset_active)
2523                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2524
2525         /* re-sync apic's tpr */
2526         if (!irqchip_in_kernel(vcpu->kvm))
2527                 set_cr8(vcpu, kvm_run->cr8);
2528
2529         if (vcpu->arch.pio.cur_count) {
2530                 r = complete_pio(vcpu);
2531                 if (r)
2532                         goto out;
2533         }
2534 #if CONFIG_HAS_IOMEM
2535         if (vcpu->mmio_needed) {
2536                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2537                 vcpu->mmio_read_completed = 1;
2538                 vcpu->mmio_needed = 0;
2539                 r = emulate_instruction(vcpu, kvm_run,
2540                                         vcpu->arch.mmio_fault_cr2, 0, 1);
2541                 if (r == EMULATE_DO_MMIO) {
2542                         /*
2543                          * Read-modify-write.  Back to userspace.
2544                          */
2545                         r = 0;
2546                         goto out;
2547                 }
2548         }
2549 #endif
2550         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2551                 kvm_x86_ops->cache_regs(vcpu);
2552                 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2553                 kvm_x86_ops->decache_regs(vcpu);
2554         }
2555
2556         r = __vcpu_run(vcpu, kvm_run);
2557
2558 out:
2559         if (vcpu->sigset_active)
2560                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2561
2562         vcpu_put(vcpu);
2563         return r;
2564 }
2565
2566 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2567 {
2568         vcpu_load(vcpu);
2569
2570         kvm_x86_ops->cache_regs(vcpu);
2571
2572         regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2573         regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2574         regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2575         regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2576         regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2577         regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2578         regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2579         regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2580 #ifdef CONFIG_X86_64
2581         regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2582         regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2583         regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2584         regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2585         regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2586         regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2587         regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2588         regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2589 #endif
2590
2591         regs->rip = vcpu->arch.rip;
2592         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2593
2594         /*
2595          * Don't leak debug flags in case they were set for guest debugging
2596          */
2597         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2598                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2599
2600         vcpu_put(vcpu);
2601
2602         return 0;
2603 }
2604
2605 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2606 {
2607         vcpu_load(vcpu);
2608
2609         vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2610         vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2611         vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2612         vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2613         vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2614         vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2615         vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2616         vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2617 #ifdef CONFIG_X86_64
2618         vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2619         vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2620         vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2621         vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2622         vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2623         vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2624         vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2625         vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2626 #endif
2627
2628         vcpu->arch.rip = regs->rip;
2629         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2630
2631         kvm_x86_ops->decache_regs(vcpu);
2632
2633         vcpu_put(vcpu);
2634
2635         return 0;
2636 }
2637
2638 static void get_segment(struct kvm_vcpu *vcpu,
2639                         struct kvm_segment *var, int seg)
2640 {
2641         return kvm_x86_ops->get_segment(vcpu, var, seg);
2642 }
2643
2644 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2645 {
2646         struct kvm_segment cs;
2647
2648         get_segment(vcpu, &cs, VCPU_SREG_CS);
2649         *db = cs.db;
2650         *l = cs.l;
2651 }
2652 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2653
2654 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2655                                   struct kvm_sregs *sregs)
2656 {
2657         struct descriptor_table dt;
2658         int pending_vec;
2659
2660         vcpu_load(vcpu);
2661
2662         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2663         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2664         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2665         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2666         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2667         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2668
2669         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2670         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2671
2672         kvm_x86_ops->get_idt(vcpu, &dt);
2673         sregs->idt.limit = dt.limit;
2674         sregs->idt.base = dt.base;
2675         kvm_x86_ops->get_gdt(vcpu, &dt);
2676         sregs->gdt.limit = dt.limit;
2677         sregs->gdt.base = dt.base;
2678
2679         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2680         sregs->cr0 = vcpu->arch.cr0;
2681         sregs->cr2 = vcpu->arch.cr2;
2682         sregs->cr3 = vcpu->arch.cr3;
2683         sregs->cr4 = vcpu->arch.cr4;
2684         sregs->cr8 = get_cr8(vcpu);
2685         sregs->efer = vcpu->arch.shadow_efer;
2686         sregs->apic_base = kvm_get_apic_base(vcpu);
2687
2688         if (irqchip_in_kernel(vcpu->kvm)) {
2689                 memset(sregs->interrupt_bitmap, 0,
2690                        sizeof sregs->interrupt_bitmap);
2691                 pending_vec = kvm_x86_ops->get_irq(vcpu);
2692                 if (pending_vec >= 0)
2693                         set_bit(pending_vec,
2694                                 (unsigned long *)sregs->interrupt_bitmap);
2695         } else
2696                 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2697                        sizeof sregs->interrupt_bitmap);
2698
2699         vcpu_put(vcpu);
2700
2701         return 0;
2702 }
2703
2704 static void set_segment(struct kvm_vcpu *vcpu,
2705                         struct kvm_segment *var, int seg)
2706 {
2707         return kvm_x86_ops->set_segment(vcpu, var, seg);
2708 }
2709
2710 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2711                                   struct kvm_sregs *sregs)
2712 {
2713         int mmu_reset_needed = 0;
2714         int i, pending_vec, max_bits;
2715         struct descriptor_table dt;
2716
2717         vcpu_load(vcpu);
2718
2719         dt.limit = sregs->idt.limit;
2720         dt.base = sregs->idt.base;
2721         kvm_x86_ops->set_idt(vcpu, &dt);
2722         dt.limit = sregs->gdt.limit;
2723         dt.base = sregs->gdt.base;
2724         kvm_x86_ops->set_gdt(vcpu, &dt);
2725
2726         vcpu->arch.cr2 = sregs->cr2;
2727         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2728         vcpu->arch.cr3 = sregs->cr3;
2729
2730         set_cr8(vcpu, sregs->cr8);
2731
2732         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2733 #ifdef CONFIG_X86_64
2734         kvm_x86_ops->set_efer(vcpu, sregs->efer);
2735 #endif
2736         kvm_set_apic_base(vcpu, sregs->apic_base);
2737
2738         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2739
2740         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2741         vcpu->arch.cr0 = sregs->cr0;
2742         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2743
2744         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2745         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2746         if (!is_long_mode(vcpu) && is_pae(vcpu))
2747                 load_pdptrs(vcpu, vcpu->arch.cr3);
2748
2749         if (mmu_reset_needed)
2750                 kvm_mmu_reset_context(vcpu);
2751
2752         if (!irqchip_in_kernel(vcpu->kvm)) {
2753                 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
2754                        sizeof vcpu->arch.irq_pending);
2755                 vcpu->arch.irq_summary = 0;
2756                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
2757                         if (vcpu->arch.irq_pending[i])
2758                                 __set_bit(i, &vcpu->arch.irq_summary);
2759         } else {
2760                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2761                 pending_vec = find_first_bit(
2762                         (const unsigned long *)sregs->interrupt_bitmap,
2763                         max_bits);
2764                 /* Only pending external irq is handled here */
2765                 if (pending_vec < max_bits) {
2766                         kvm_x86_ops->set_irq(vcpu, pending_vec);
2767                         pr_debug("Set back pending irq %d\n",
2768                                  pending_vec);
2769                 }
2770         }
2771
2772         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2773         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2774         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2775         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2776         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2777         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2778
2779         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2780         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2781
2782         vcpu_put(vcpu);
2783
2784         return 0;
2785 }
2786
2787 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2788                                     struct kvm_debug_guest *dbg)
2789 {
2790         int r;
2791
2792         vcpu_load(vcpu);
2793
2794         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2795
2796         vcpu_put(vcpu);
2797
2798         return r;
2799 }
2800
2801 /*
2802  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2803  * we have asm/x86/processor.h
2804  */
2805 struct fxsave {
2806         u16     cwd;
2807         u16     swd;
2808         u16     twd;
2809         u16     fop;
2810         u64     rip;
2811         u64     rdp;
2812         u32     mxcsr;
2813         u32     mxcsr_mask;
2814         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2815 #ifdef CONFIG_X86_64
2816         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2817 #else
2818         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2819 #endif
2820 };
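/*
 * The struct above covers only the leading, architecturally defined part
 * of the 512-byte fxsave image (header, ST and XMM register areas); the
 * reserved tail is not needed by the get/set fpu ioctls below.
 */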
2821
2822 /*
2823  * Translate a guest virtual address to a guest physical address.
2824  */
2825 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2826                                     struct kvm_translation *tr)
2827 {
2828         unsigned long vaddr = tr->linear_address;
2829         gpa_t gpa;
2830
2831         vcpu_load(vcpu);
2832         mutex_lock(&vcpu->kvm->lock);
2833         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2834         tr->physical_address = gpa;
2835         tr->valid = gpa != UNMAPPED_GVA;
2836         tr->writeable = 1;
2837         tr->usermode = 0;
2838         mutex_unlock(&vcpu->kvm->lock);
2839         vcpu_put(vcpu);
2840
2841         return 0;
2842 }
2843
2844 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2845 {
2846         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2847
2848         vcpu_load(vcpu);
2849
2850         memcpy(fpu->fpr, fxsave->st_space, 128);
2851         fpu->fcw = fxsave->cwd;
2852         fpu->fsw = fxsave->swd;
2853         fpu->ftwx = fxsave->twd;
2854         fpu->last_opcode = fxsave->fop;
2855         fpu->last_ip = fxsave->rip;
2856         fpu->last_dp = fxsave->rdp;
2857         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2858
2859         vcpu_put(vcpu);
2860
2861         return 0;
2862 }
2863
2864 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2865 {
2866         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2867
2868         vcpu_load(vcpu);
2869
2870         memcpy(fxsave->st_space, fpu->fpr, 128);
2871         fxsave->cwd = fpu->fcw;
2872         fxsave->swd = fpu->fsw;
2873         fxsave->twd = fpu->ftwx;
2874         fxsave->fop = fpu->last_opcode;
2875         fxsave->rip = fpu->last_ip;
2876         fxsave->rdp = fpu->last_dp;
2877         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2878
2879         vcpu_put(vcpu);
2880
2881         return 0;
2882 }
2883
2884 void fx_init(struct kvm_vcpu *vcpu)
2885 {
2886         unsigned after_mxcsr_mask;
2887
2888         /* Initialize guest FPU by resetting ours and saving into guest's */
2889         preempt_disable();
2890         fx_save(&vcpu->arch.host_fx_image);
2891         fpu_init();
2892         fx_save(&vcpu->arch.guest_fx_image);
2893         fx_restore(&vcpu->arch.host_fx_image);
2894         preempt_enable();
2895
2896         vcpu->arch.cr0 |= X86_CR0_ET;
2897         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
2898         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
2899         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
2900                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
2901 }
2902 EXPORT_SYMBOL_GPL(fx_init);
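/*
 * Note: 0x1f80 is the architectural default for MXCSR (all SSE exception
 * bits masked, status flags clear), i.e. the value present after
 * processor reset.
 */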
2903
2904 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
2905 {
2906         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
2907                 return;
2908
2909         vcpu->guest_fpu_loaded = 1;
2910         fx_save(&vcpu->arch.host_fx_image);
2911         fx_restore(&vcpu->arch.guest_fx_image);
2912 }
2913 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
2914
2915 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
2916 {
2917         if (!vcpu->guest_fpu_loaded)
2918                 return;
2919
2920         vcpu->guest_fpu_loaded = 0;
2921         fx_save(&vcpu->arch.guest_fx_image);
2922         fx_restore(&vcpu->arch.host_fx_image);
2923         ++vcpu->stat.fpu_reload;
2924 }
2925 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
2926
2927 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
2928 {
2929         kvm_x86_ops->vcpu_free(vcpu);
2930 }
2931
2932 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
2933                                                 unsigned int id)
2934 {
2935         return kvm_x86_ops->vcpu_create(kvm, id);
2936 }
2937
2938 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
2939 {
2940         int r;
2941
2942         /* We do fxsave: this must be aligned. */
2943         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
2944
2945         vcpu_load(vcpu);
2946         r = kvm_arch_vcpu_reset(vcpu);
2947         if (r == 0)
2948                 r = kvm_mmu_setup(vcpu);
2949         vcpu_put(vcpu);
2950         if (r < 0)
2951                 goto free_vcpu;
2952
2953         return 0;
2954 free_vcpu:
2955         kvm_x86_ops->vcpu_free(vcpu);
2956         return r;
2957 }
2958
2959 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
2960 {
2961         vcpu_load(vcpu);
2962         kvm_mmu_unload(vcpu);
2963         vcpu_put(vcpu);
2964
2965         kvm_x86_ops->vcpu_free(vcpu);
2966 }
2967
2968 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
2969 {
2970         return kvm_x86_ops->vcpu_reset(vcpu);
2971 }
2972
2973 void kvm_arch_hardware_enable(void *garbage)
2974 {
2975         kvm_x86_ops->hardware_enable(garbage);
2976 }
2977
2978 void kvm_arch_hardware_disable(void *garbage)
2979 {
2980         kvm_x86_ops->hardware_disable(garbage);
2981 }
2982
2983 int kvm_arch_hardware_setup(void)
2984 {
2985         return kvm_x86_ops->hardware_setup();
2986 }
2987
2988 void kvm_arch_hardware_unsetup(void)
2989 {
2990         kvm_x86_ops->hardware_unsetup();
2991 }
2992
2993 void kvm_arch_check_processor_compat(void *rtn)
2994 {
2995         kvm_x86_ops->check_processor_compatibility(rtn);
2996 }
2997
2998 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
2999 {
3000         struct page *page;
3001         struct kvm *kvm;
3002         int r;
3003
3004         BUG_ON(vcpu->kvm == NULL);
3005         kvm = vcpu->kvm;
3006
3007         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3008         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3009                 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3010         else
3011                 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3012
3013         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3014         if (!page) {
3015                 r = -ENOMEM;
3016                 goto fail;
3017         }
3018         vcpu->arch.pio_data = page_address(page);
3019
3020         r = kvm_mmu_create(vcpu);
3021         if (r < 0)
3022                 goto fail_free_pio_data;
3023
3024         if (irqchip_in_kernel(kvm)) {
3025                 r = kvm_create_lapic(vcpu);
3026                 if (r < 0)
3027                         goto fail_mmu_destroy;
3028         }
3029
3030         return 0;
3031
3032 fail_mmu_destroy:
3033         kvm_mmu_destroy(vcpu);
3034 fail_free_pio_data:
3035         free_page((unsigned long)vcpu->arch.pio_data);
3036 fail:
3037         return r;
3038 }
3039
3040 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3041 {
3042         kvm_free_lapic(vcpu);
3043         kvm_mmu_destroy(vcpu);
3044         free_page((unsigned long)vcpu->arch.pio_data);
3045 }
3046
3047 struct kvm *kvm_arch_create_vm(void)
3048 {
3049         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3050
3051         if (!kvm)
3052                 return ERR_PTR(-ENOMEM);
3053
3054         INIT_LIST_HEAD(&kvm->active_mmu_pages);
3055
3056         return kvm;
3057 }
3058
3059 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3060 {
3061         vcpu_load(vcpu);
3062         kvm_mmu_unload(vcpu);
3063         vcpu_put(vcpu);
3064 }
3065
3066 static void kvm_free_vcpus(struct kvm *kvm)
3067 {
3068         unsigned int i;
3069
3070         /*
3071          * Unpin any mmu pages first.
3072          */
3073         for (i = 0; i < KVM_MAX_VCPUS; ++i)
3074                 if (kvm->vcpus[i])
3075                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3076         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3077                 if (kvm->vcpus[i]) {
3078                         kvm_arch_vcpu_free(kvm->vcpus[i]);
3079                         kvm->vcpus[i] = NULL;
3080                 }
3081         }
3082
3083 }
3084
3085 void kvm_arch_destroy_vm(struct kvm *kvm)
3086 {
3087         kfree(kvm->vpic);
3088         kfree(kvm->vioapic);
3089         kvm_free_vcpus(kvm);
3090         kvm_free_physmem(kvm);
3091         kfree(kvm);
3092 }
3093
3094 int kvm_arch_set_memory_region(struct kvm *kvm,
3095                                 struct kvm_userspace_memory_region *mem,
3096                                 struct kvm_memory_slot old,
3097                                 int user_alloc)
3098 {
3099         int npages = mem->memory_size >> PAGE_SHIFT;
3100         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3101
3102         /* To keep backward compatibility with older userspace,
3103          * x86 needs to handle the !user_alloc case.
3104          */
3105         if (!user_alloc) {
3106                 if (npages && !old.rmap) {
3107                         down_write(&current->mm->mmap_sem);
3108                         memslot->userspace_addr = do_mmap(NULL, 0,
3109                                                      npages * PAGE_SIZE,
3110                                                      PROT_READ | PROT_WRITE,
3111                                                      MAP_SHARED | MAP_ANONYMOUS,
3112                                                      0);
3113                         up_write(&current->mm->mmap_sem);
3114
3115                         if (IS_ERR((void *)memslot->userspace_addr))
3116                                 return PTR_ERR((void *)memslot->userspace_addr);
3117                 } else {
3118                         if (!old.user_alloc && old.rmap) {
3119                                 int ret;
3120
3121                                 down_write(&current->mm->mmap_sem);
3122                                 ret = do_munmap(current->mm, old.userspace_addr,
3123                                                 old.npages * PAGE_SIZE);
3124                                 up_write(&current->mm->mmap_sem);
3125                                 if (ret < 0)
3126                                         printk(KERN_WARNING
3127                                        "kvm_vm_ioctl_set_memory_region: "
3128                                        "failed to munmap memory\n");
3129                         }
3130                 }
3131         }
3132
3133         if (!kvm->n_requested_mmu_pages) {
3134                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3135                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3136         }
3137
3138         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3139         kvm_flush_remote_tlbs(kvm);
3140
3141         return 0;
3142 }
3143
3144 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3145 {
3146         return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3147                || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3148 }