KVM: Remove memory alias support
[sfrench/cifs-2.6.git] / arch / x86 / kvm / mmu.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11  *
12  * Authors:
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  *   Avi Kivity   <avi@qumranet.com>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2.  See
17  * the COPYING file in the top-level directory.
18  *
19  */
20
21 #include "mmu.h"
22 #include "x86.h"
23 #include "kvm_cache_regs.h"
24
25 #include <linux/kvm_host.h>
26 #include <linux/types.h>
27 #include <linux/string.h>
28 #include <linux/mm.h>
29 #include <linux/highmem.h>
30 #include <linux/module.h>
31 #include <linux/swap.h>
32 #include <linux/hugetlb.h>
33 #include <linux/compiler.h>
34 #include <linux/srcu.h>
35 #include <linux/slab.h>
36 #include <linux/uaccess.h>
37
38 #include <asm/page.h>
39 #include <asm/cmpxchg.h>
40 #include <asm/io.h>
41 #include <asm/vmx.h>
42
/*
 * Set to true to enable Two-Dimensional Paging, in which the hardware
 * itself walks two sets of page tables:
 *   1. guest-virtual to guest-physical, and
 *   2. guest-physical to host-physical (consulted while performing walk 1).
 * When the hardware supports this, shadow paging is not needed.
 *
 * File-scope objects are zero-initialized, so this starts out false.
 */
bool tdp_enabled;
51
52 #undef MMU_DEBUG
53
54 #undef AUDIT
55
56 #ifdef AUDIT
57 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58 #else
59 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60 #endif
61
62 #ifdef MMU_DEBUG
63
64 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
65 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
66
67 #else
68
69 #define pgprintk(x...) do { } while (0)
70 #define rmap_printk(x...) do { } while (0)
71
72 #endif
73
74 #if defined(MMU_DEBUG) || defined(AUDIT)
75 static int dbg = 0;
76 module_param(dbg, bool, 0644);
77 #endif
78
79 static int oos_shadow = 1;
80 module_param(oos_shadow, bool, 0644);
81
/*
 * ASSERT(x): debug-only sanity check.
 *
 * Without MMU_DEBUG the macro expands to an empty statement and x is not
 * evaluated.  With MMU_DEBUG a failed check logs a warning with the file,
 * line and the text of the expression; execution then continues.
 *
 * Both variants are wrapped in do { } while (0) so that ASSERT(x); is a
 * single statement and can be used safely in an unbraced if/else.
 */
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x)							\
	do {								\
		if (!(x)) {						\
			printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
			       __FILE__, __LINE__, #x);			\
		}							\
	} while (0)
#endif
91
92 #define PT_FIRST_AVAIL_BITS_SHIFT 9
93 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
94
95 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
96
/*
 * Per-level helpers for 64-bit page tables: every level indexes with
 * PT64_LEVEL_BITS bits of the address, level 1 starting at PAGE_SHIFT.
 * The level argument is parenthesized so that expressions such as
 * "a ? b : c" can be passed without changing the computed shift.
 */
#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


/* Same helpers for 32-bit page tables, which use PT32_LEVEL_BITS per level. */
#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
122
123
/* Address bits of a 64-bit entry: bits PAGE_SHIFT..51. */
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
/* Address bits that remain when the region mapped at 'level' is aligned. */
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
/* Address bits that lie inside the region mapped at 'level'. */
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
/* The directory mask is the level-2 address mask. */
#define PT64_DIR_BASE_ADDR_MASK PT64_LVL_ADDR_MASK(2)

/* 32-bit counterparts, built on PAGE_MASK. */
#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))
#define PT32_DIR_BASE_ADDR_MASK PT32_LVL_ADDR_MASK(2)
140
141 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
142                         | PT64_NX_MASK)
143
144 #define RMAP_EXT 4
145
146 #define ACC_EXEC_MASK    1
147 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
148 #define ACC_USER_MASK    PT_USER_MASK
149 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
150
151 #include <trace/events/kvm.h>
152
153 #define CREATE_TRACE_POINTS
154 #include "mmutrace.h"
155
156 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
157
158 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
159
160 struct kvm_rmap_desc {
161         u64 *sptes[RMAP_EXT];
162         struct kvm_rmap_desc *more;
163 };
164
165 struct kvm_shadow_walk_iterator {
166         u64 addr;
167         hpa_t shadow_addr;
168         int level;
169         u64 *sptep;
170         unsigned index;
171 };
172
173 #define for_each_shadow_entry(_vcpu, _addr, _walker)    \
174         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
175              shadow_walk_okay(&(_walker));                      \
176              shadow_walk_next(&(_walker)))
177
178 typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
179
180 static struct kmem_cache *pte_chain_cache;
181 static struct kmem_cache *rmap_desc_cache;
182 static struct kmem_cache *mmu_page_header_cache;
183
184 static u64 __read_mostly shadow_trap_nonpresent_pte;
185 static u64 __read_mostly shadow_notrap_nonpresent_pte;
186 static u64 __read_mostly shadow_base_present_pte;
187 static u64 __read_mostly shadow_nx_mask;
188 static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
189 static u64 __read_mostly shadow_user_mask;
190 static u64 __read_mostly shadow_accessed_mask;
191 static u64 __read_mostly shadow_dirty_mask;
192
/*
 * Build a mask with bits s..e (inclusive, bit 0 is the least significant
 * bit) set and all other bits clear.
 *
 * An empty range (e < s) yields 0.  The mask is formed from
 * 2ULL << (e - s) rather than 1ULL << (e - s + 1) so that the full range
 * (s = 0, e = 63) never shifts by the width of the type, which would be
 * undefined behaviour.
 */
static inline u64 rsvd_bits(int s, int e)
{
	if (e < s)
		return 0;

	return ((2ULL << (e - s)) - 1) << s;
}
197
198 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
199 {
200         shadow_trap_nonpresent_pte = trap_pte;
201         shadow_notrap_nonpresent_pte = notrap_pte;
202 }
203 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
204
205 void kvm_mmu_set_base_ptes(u64 base_pte)
206 {
207         shadow_base_present_pte = base_pte;
208 }
209 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
210
211 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
212                 u64 dirty_mask, u64 nx_mask, u64 x_mask)
213 {
214         shadow_user_mask = user_mask;
215         shadow_accessed_mask = accessed_mask;
216         shadow_dirty_mask = dirty_mask;
217         shadow_nx_mask = nx_mask;
218         shadow_x_mask = x_mask;
219 }
220 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
221
222 static bool is_write_protection(struct kvm_vcpu *vcpu)
223 {
224         return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
225 }
226
/* PSE36 is reported unconditionally: this always returns 1. */
static int is_cpuid_PSE36(void)
{
	return 1;
}
231
232 static int is_nx(struct kvm_vcpu *vcpu)
233 {
234         return vcpu->arch.efer & EFER_NX;
235 }
236
237 static int is_shadow_present_pte(u64 pte)
238 {
239         return pte != shadow_trap_nonpresent_pte
240                 && pte != shadow_notrap_nonpresent_pte;
241 }
242
243 static int is_large_pte(u64 pte)
244 {
245         return pte & PT_PAGE_SIZE_MASK;
246 }
247
248 static int is_writable_pte(unsigned long pte)
249 {
250         return pte & PT_WRITABLE_MASK;
251 }
252
253 static int is_dirty_gpte(unsigned long pte)
254 {
255         return pte & PT_DIRTY_MASK;
256 }
257
258 static int is_rmap_spte(u64 pte)
259 {
260         return is_shadow_present_pte(pte);
261 }
262
263 static int is_last_spte(u64 pte, int level)
264 {
265         if (level == PT_PAGE_TABLE_LEVEL)
266                 return 1;
267         if (is_large_pte(pte))
268                 return 1;
269         return 0;
270 }
271
272 static pfn_t spte_to_pfn(u64 pte)
273 {
274         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
275 }
276
277 static gfn_t pse36_gfn_delta(u32 gpte)
278 {
279         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
280
281         return (gpte & PT32_DIR_PSE36_MASK) << shift;
282 }
283
284 static void __set_spte(u64 *sptep, u64 spte)
285 {
286 #ifdef CONFIG_X86_64
287         set_64bit((unsigned long *)sptep, spte);
288 #else
289         set_64bit((unsigned long long *)sptep, spte);
290 #endif
291 }
292
293 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
294                                   struct kmem_cache *base_cache, int min)
295 {
296         void *obj;
297
298         if (cache->nobjs >= min)
299                 return 0;
300         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
301                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
302                 if (!obj)
303                         return -ENOMEM;
304                 cache->objects[cache->nobjs++] = obj;
305         }
306         return 0;
307 }
308
309 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
310                                   struct kmem_cache *cache)
311 {
312         while (mc->nobjs)
313                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
314 }
315
316 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
317                                        int min)
318 {
319         struct page *page;
320
321         if (cache->nobjs >= min)
322                 return 0;
323         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
324                 page = alloc_page(GFP_KERNEL);
325                 if (!page)
326                         return -ENOMEM;
327                 cache->objects[cache->nobjs++] = page_address(page);
328         }
329         return 0;
330 }
331
332 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
333 {
334         while (mc->nobjs)
335                 free_page((unsigned long)mc->objects[--mc->nobjs]);
336 }
337
338 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
339 {
340         int r;
341
342         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
343                                    pte_chain_cache, 4);
344         if (r)
345                 goto out;
346         r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
347                                    rmap_desc_cache, 4);
348         if (r)
349                 goto out;
350         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
351         if (r)
352                 goto out;
353         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
354                                    mmu_page_header_cache, 4);
355 out:
356         return r;
357 }
358
359 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
360 {
361         mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
362         mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
363         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
364         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
365                                 mmu_page_header_cache);
366 }
367
368 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
369                                     size_t size)
370 {
371         void *p;
372
373         BUG_ON(!mc->nobjs);
374         p = mc->objects[--mc->nobjs];
375         return p;
376 }
377
378 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
379 {
380         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
381                                       sizeof(struct kvm_pte_chain));
382 }
383
384 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
385 {
386         kmem_cache_free(pte_chain_cache, pc);
387 }
388
389 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
390 {
391         return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
392                                       sizeof(struct kvm_rmap_desc));
393 }
394
395 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
396 {
397         kmem_cache_free(rmap_desc_cache, rd);
398 }
399
400 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
401 {
402         if (!sp->role.direct)
403                 return sp->gfns[index];
404
405         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
406 }
407
408 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
409 {
410         if (sp->role.direct)
411                 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
412         else
413                 sp->gfns[index] = gfn;
414 }
415
416 /*
417  * Return the pointer to the largepage write count for a given
418  * gfn, handling slots that are not large page aligned.
419  */
420 static int *slot_largepage_idx(gfn_t gfn,
421                                struct kvm_memory_slot *slot,
422                                int level)
423 {
424         unsigned long idx;
425
426         idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
427               (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
428         return &slot->lpage_info[level - 2][idx].write_count;
429 }
430
431 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
432 {
433         struct kvm_memory_slot *slot;
434         int *write_count;
435         int i;
436
437         slot = gfn_to_memslot(kvm, gfn);
438         for (i = PT_DIRECTORY_LEVEL;
439              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
440                 write_count   = slot_largepage_idx(gfn, slot, i);
441                 *write_count += 1;
442         }
443 }
444
445 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
446 {
447         struct kvm_memory_slot *slot;
448         int *write_count;
449         int i;
450
451         slot = gfn_to_memslot(kvm, gfn);
452         for (i = PT_DIRECTORY_LEVEL;
453              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
454                 write_count   = slot_largepage_idx(gfn, slot, i);
455                 *write_count -= 1;
456                 WARN_ON(*write_count < 0);
457         }
458 }
459
460 static int has_wrprotected_page(struct kvm *kvm,
461                                 gfn_t gfn,
462                                 int level)
463 {
464         struct kvm_memory_slot *slot;
465         int *largepage_idx;
466
467         slot = gfn_to_memslot(kvm, gfn);
468         if (slot) {
469                 largepage_idx = slot_largepage_idx(gfn, slot, level);
470                 return *largepage_idx;
471         }
472
473         return 1;
474 }
475
476 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
477 {
478         unsigned long page_size;
479         int i, ret = 0;
480
481         page_size = kvm_host_page_size(kvm, gfn);
482
483         for (i = PT_PAGE_TABLE_LEVEL;
484              i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
485                 if (page_size >= KVM_HPAGE_SIZE(i))
486                         ret = i;
487                 else
488                         break;
489         }
490
491         return ret;
492 }
493
494 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
495 {
496         struct kvm_memory_slot *slot;
497         int host_level, level, max_level;
498
499         slot = gfn_to_memslot(vcpu->kvm, large_gfn);
500         if (slot && slot->dirty_bitmap)
501                 return PT_PAGE_TABLE_LEVEL;
502
503         host_level = host_mapping_level(vcpu->kvm, large_gfn);
504
505         if (host_level == PT_PAGE_TABLE_LEVEL)
506                 return host_level;
507
508         max_level = kvm_x86_ops->get_lpage_level() < host_level ?
509                 kvm_x86_ops->get_lpage_level() : host_level;
510
511         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
512                 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
513                         break;
514
515         return level - 1;
516 }
517
518 /*
519  * Take gfn and return the reverse mapping to it.
520  */
521
522 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
523 {
524         struct kvm_memory_slot *slot;
525         unsigned long idx;
526
527         slot = gfn_to_memslot(kvm, gfn);
528         if (likely(level == PT_PAGE_TABLE_LEVEL))
529                 return &slot->rmap[gfn - slot->base_gfn];
530
531         idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
532                 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
533
534         return &slot->lpage_info[level - 2][idx].rmap_pde;
535 }
536
537 /*
538  * Reverse mapping data structures:
539  *
540  * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
541  * that points to page_address(page).
542  *
543  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
544  * containing more mappings.
545  *
546  * Returns the number of rmap entries before the spte was added or zero if
547  * the spte was not added.
548  *
549  */
550 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
551 {
552         struct kvm_mmu_page *sp;
553         struct kvm_rmap_desc *desc;
554         unsigned long *rmapp;
555         int i, count = 0;
556
557         if (!is_rmap_spte(*spte))
558                 return count;
559         sp = page_header(__pa(spte));
560         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
561         rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
562         if (!*rmapp) {
563                 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
564                 *rmapp = (unsigned long)spte;
565         } else if (!(*rmapp & 1)) {
566                 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
567                 desc = mmu_alloc_rmap_desc(vcpu);
568                 desc->sptes[0] = (u64 *)*rmapp;
569                 desc->sptes[1] = spte;
570                 *rmapp = (unsigned long)desc | 1;
571         } else {
572                 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
573                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
574                 while (desc->sptes[RMAP_EXT-1] && desc->more) {
575                         desc = desc->more;
576                         count += RMAP_EXT;
577                 }
578                 if (desc->sptes[RMAP_EXT-1]) {
579                         desc->more = mmu_alloc_rmap_desc(vcpu);
580                         desc = desc->more;
581                 }
582                 for (i = 0; desc->sptes[i]; ++i)
583                         ;
584                 desc->sptes[i] = spte;
585         }
586         return count;
587 }
588
589 static void rmap_desc_remove_entry(unsigned long *rmapp,
590                                    struct kvm_rmap_desc *desc,
591                                    int i,
592                                    struct kvm_rmap_desc *prev_desc)
593 {
594         int j;
595
596         for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
597                 ;
598         desc->sptes[i] = desc->sptes[j];
599         desc->sptes[j] = NULL;
600         if (j != 0)
601                 return;
602         if (!prev_desc && !desc->more)
603                 *rmapp = (unsigned long)desc->sptes[0];
604         else
605                 if (prev_desc)
606                         prev_desc->more = desc->more;
607                 else
608                         *rmapp = (unsigned long)desc->more | 1;
609         mmu_free_rmap_desc(desc);
610 }
611
612 static void rmap_remove(struct kvm *kvm, u64 *spte)
613 {
614         struct kvm_rmap_desc *desc;
615         struct kvm_rmap_desc *prev_desc;
616         struct kvm_mmu_page *sp;
617         pfn_t pfn;
618         gfn_t gfn;
619         unsigned long *rmapp;
620         int i;
621
622         if (!is_rmap_spte(*spte))
623                 return;
624         sp = page_header(__pa(spte));
625         pfn = spte_to_pfn(*spte);
626         if (*spte & shadow_accessed_mask)
627                 kvm_set_pfn_accessed(pfn);
628         if (is_writable_pte(*spte))
629                 kvm_set_pfn_dirty(pfn);
630         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
631         rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
632         if (!*rmapp) {
633                 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
634                 BUG();
635         } else if (!(*rmapp & 1)) {
636                 rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
637                 if ((u64 *)*rmapp != spte) {
638                         printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
639                                spte, *spte);
640                         BUG();
641                 }
642                 *rmapp = 0;
643         } else {
644                 rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
645                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
646                 prev_desc = NULL;
647                 while (desc) {
648                         for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
649                                 if (desc->sptes[i] == spte) {
650                                         rmap_desc_remove_entry(rmapp,
651                                                                desc, i,
652                                                                prev_desc);
653                                         return;
654                                 }
655                         prev_desc = desc;
656                         desc = desc->more;
657                 }
658                 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
659                 BUG();
660         }
661 }
662
663 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
664 {
665         struct kvm_rmap_desc *desc;
666         u64 *prev_spte;
667         int i;
668
669         if (!*rmapp)
670                 return NULL;
671         else if (!(*rmapp & 1)) {
672                 if (!spte)
673                         return (u64 *)*rmapp;
674                 return NULL;
675         }
676         desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
677         prev_spte = NULL;
678         while (desc) {
679                 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
680                         if (prev_spte == spte)
681                                 return desc->sptes[i];
682                         prev_spte = desc->sptes[i];
683                 }
684                 desc = desc->more;
685         }
686         return NULL;
687 }
688
689 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
690 {
691         unsigned long *rmapp;
692         u64 *spte;
693         int i, write_protected = 0;
694
695         rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
696
697         spte = rmap_next(kvm, rmapp, NULL);
698         while (spte) {
699                 BUG_ON(!spte);
700                 BUG_ON(!(*spte & PT_PRESENT_MASK));
701                 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
702                 if (is_writable_pte(*spte)) {
703                         __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
704                         write_protected = 1;
705                 }
706                 spte = rmap_next(kvm, rmapp, spte);
707         }
708         if (write_protected) {
709                 pfn_t pfn;
710
711                 spte = rmap_next(kvm, rmapp, NULL);
712                 pfn = spte_to_pfn(*spte);
713                 kvm_set_pfn_dirty(pfn);
714         }
715
716         /* check for huge page mappings */
717         for (i = PT_DIRECTORY_LEVEL;
718              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
719                 rmapp = gfn_to_rmap(kvm, gfn, i);
720                 spte = rmap_next(kvm, rmapp, NULL);
721                 while (spte) {
722                         BUG_ON(!spte);
723                         BUG_ON(!(*spte & PT_PRESENT_MASK));
724                         BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
725                         pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
726                         if (is_writable_pte(*spte)) {
727                                 rmap_remove(kvm, spte);
728                                 --kvm->stat.lpages;
729                                 __set_spte(spte, shadow_trap_nonpresent_pte);
730                                 spte = NULL;
731                                 write_protected = 1;
732                         }
733                         spte = rmap_next(kvm, rmapp, spte);
734                 }
735         }
736
737         return write_protected;
738 }
739
740 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
741                            unsigned long data)
742 {
743         u64 *spte;
744         int need_tlb_flush = 0;
745
746         while ((spte = rmap_next(kvm, rmapp, NULL))) {
747                 BUG_ON(!(*spte & PT_PRESENT_MASK));
748                 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
749                 rmap_remove(kvm, spte);
750                 __set_spte(spte, shadow_trap_nonpresent_pte);
751                 need_tlb_flush = 1;
752         }
753         return need_tlb_flush;
754 }
755
756 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
757                              unsigned long data)
758 {
759         int need_flush = 0;
760         u64 *spte, new_spte;
761         pte_t *ptep = (pte_t *)data;
762         pfn_t new_pfn;
763
764         WARN_ON(pte_huge(*ptep));
765         new_pfn = pte_pfn(*ptep);
766         spte = rmap_next(kvm, rmapp, NULL);
767         while (spte) {
768                 BUG_ON(!is_shadow_present_pte(*spte));
769                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
770                 need_flush = 1;
771                 if (pte_write(*ptep)) {
772                         rmap_remove(kvm, spte);
773                         __set_spte(spte, shadow_trap_nonpresent_pte);
774                         spte = rmap_next(kvm, rmapp, NULL);
775                 } else {
776                         new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
777                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
778
779                         new_spte &= ~PT_WRITABLE_MASK;
780                         new_spte &= ~SPTE_HOST_WRITEABLE;
781                         if (is_writable_pte(*spte))
782                                 kvm_set_pfn_dirty(spte_to_pfn(*spte));
783                         __set_spte(spte, new_spte);
784                         spte = rmap_next(kvm, rmapp, spte);
785                 }
786         }
787         if (need_flush)
788                 kvm_flush_remote_tlbs(kvm);
789
790         return 0;
791 }
792
793 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
794                           unsigned long data,
795                           int (*handler)(struct kvm *kvm, unsigned long *rmapp,
796                                          unsigned long data))
797 {
798         int i, j;
799         int ret;
800         int retval = 0;
801         struct kvm_memslots *slots;
802
803         slots = kvm_memslots(kvm);
804
805         for (i = 0; i < slots->nmemslots; i++) {
806                 struct kvm_memory_slot *memslot = &slots->memslots[i];
807                 unsigned long start = memslot->userspace_addr;
808                 unsigned long end;
809
810                 end = start + (memslot->npages << PAGE_SHIFT);
811                 if (hva >= start && hva < end) {
812                         gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
813
814                         ret = handler(kvm, &memslot->rmap[gfn_offset], data);
815
816                         for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
817                                 int idx = gfn_offset;
818                                 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
819                                 ret |= handler(kvm,
820                                         &memslot->lpage_info[j][idx].rmap_pde,
821                                         data);
822                         }
823                         trace_kvm_age_page(hva, memslot, ret);
824                         retval |= ret;
825                 }
826         }
827
828         return retval;
829 }
830
831 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
832 {
833         return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
834 }
835
836 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
837 {
838         kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
839 }
840
841 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
842                          unsigned long data)
843 {
844         u64 *spte;
845         int young = 0;
846
847         /*
848          * Emulate the accessed bit for EPT, by checking if this page has
849          * an EPT mapping, and clearing it if it does. On the next access,
850          * a new EPT mapping will be established.
851          * This has some overhead, but not as much as the cost of swapping
852          * out actively used pages or breaking up actively used hugepages.
853          */
854         if (!shadow_accessed_mask)
855                 return kvm_unmap_rmapp(kvm, rmapp, data);
856
857         spte = rmap_next(kvm, rmapp, NULL);
858         while (spte) {
859                 int _young;
860                 u64 _spte = *spte;
861                 BUG_ON(!(_spte & PT_PRESENT_MASK));
862                 _young = _spte & PT_ACCESSED_MASK;
863                 if (_young) {
864                         young = 1;
865                         clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
866                 }
867                 spte = rmap_next(kvm, rmapp, spte);
868         }
869         return young;
870 }
871
872 #define RMAP_RECYCLE_THRESHOLD 1000
873
874 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
875 {
876         unsigned long *rmapp;
877         struct kvm_mmu_page *sp;
878
879         sp = page_header(__pa(spte));
880
881         rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
882
883         kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
884         kvm_flush_remote_tlbs(vcpu->kvm);
885 }
886
887 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
888 {
889         return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
890 }
891
#ifdef MMU_DEBUG
/*
 * Debug helper: returns 1 if no entry of the shadow page @spt is present.
 * The first present entry found is logged and 0 is returned.
 */
static int is_empty_shadow_page(u64 *spt)
{
	unsigned long i;

	for (i = 0; i < PAGE_SIZE / sizeof(u64); i++) {
		u64 *pos = &spt[i];

		if (!is_shadow_present_pte(*pos))
			continue;
		printk(KERN_ERR "%s: %p %llx\n", __func__,
		       pos, *pos);
		return 0;
	}
	return 1;
}
#endif
907
/*
 * Release shadow page @sp: remove it from the hash chain and from the list
 * it is linked on, free the page backing its table (and the gfns page of a
 * non-direct page), free the header, and credit one page back to
 * kvm->arch.n_free_mmu_pages.
 */
908 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
909 {
910         ASSERT(is_empty_shadow_page(sp->spt));
911         hlist_del(&sp->hash_link);
912         list_del(&sp->link);
913         __free_page(virt_to_page(sp->spt));
            /* sp->gfns is only allocated for non-direct pages, see kvm_mmu_alloc_page() */
914         if (!sp->role.direct)
915                 __free_page(virt_to_page(sp->gfns));
916         kmem_cache_free(mmu_page_header_cache, sp);
917         ++kvm->arch.n_free_mmu_pages;
918 }
919
920 static unsigned kvm_page_table_hashfn(gfn_t gfn)
921 {
922         return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
923 }
924
/*
 * Build a new shadow page from the vcpu's pre-filled memory caches.
 *
 * The header comes from mmu_page_header_cache, the table page (and, for a
 * non-direct page, the gfns page) from mmu_page_cache.  The new page is put
 * on kvm->arch.active_mmu_pages with @parent_pte as its single parent, and
 * n_free_mmu_pages is decremented.  Returns the new page.
 */
925 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
926                                                u64 *parent_pte, int direct)
927 {
928         struct kvm_mmu_page *sp;
929
930         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
931         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
932         if (!direct)
933                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
934                                                   PAGE_SIZE);
            /* store a back pointer to the header in the table page's private field */
935         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
936         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
937         bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
938         sp->multimapped = 0;
939         sp->parent_pte = parent_pte;
940         --vcpu->kvm->arch.n_free_mmu_pages;
941         return sp;
942 }
943
/*
 * Record @parent_pte as a parent of shadow page @sp (no-op for a NULL
 * @parent_pte).
 *
 * A page with at most one parent keeps it in sp->parent_pte.  When a second
 * parent is added the page switches to the multimapped form: a list of
 * kvm_pte_chain blocks hanging off sp->parent_ptes, each holding up to
 * NR_PTE_CHAIN_ENTRIES parent pointers packed from index 0.
 */
944 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
945                                     struct kvm_mmu_page *sp, u64 *parent_pte)
946 {
947         struct kvm_pte_chain *pte_chain;
948         struct hlist_node *node;
949         int i;
950
951         if (!parent_pte)
952                 return;
953         if (!sp->multimapped) {
954                 u64 *old = sp->parent_pte;
955
956                 if (!old) {
957                         sp->parent_pte = parent_pte;
958                         return;
959                 }
                    /*
                     * Second parent: convert to the chain form and move the
                     * old single parent into the first chain slot.
                     * NOTE(review): 'old' is read before INIT_HLIST_HEAD(),
                     * presumably because parent_pte and parent_ptes share
                     * storage -- confirm against the struct definition.
                     */
960                 sp->multimapped = 1;
961                 pte_chain = mmu_alloc_pte_chain(vcpu);
962                 INIT_HLIST_HEAD(&sp->parent_ptes);
963                 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
964                 pte_chain->parent_ptes[0] = old;
965         }
            /* use the first free slot of the first chain block that is not full */
966         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
967                 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
968                         continue;
969                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
970                         if (!pte_chain->parent_ptes[i]) {
971                                 pte_chain->parent_ptes[i] = parent_pte;
972                                 return;
973                         }
974         }
            /* every existing block is full: add a fresh one at the list head */
975         pte_chain = mmu_alloc_pte_chain(vcpu);
976         BUG_ON(!pte_chain);
977         hlist_add_head(&pte_chain->link, &sp->parent_ptes);
978         pte_chain->parent_ptes[0] = parent_pte;
979 }
980
/*
 * Remove @parent_pte from the set of parents recorded for @sp.
 *
 * BUGs if @parent_pte is not a recorded parent.  In the multimapped form the
 * entries of a chain block stay packed from index 0; a block that becomes
 * empty is freed, and when no block is left the page goes back to the
 * single-parent form with no parent.
 */
981 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
982                                        u64 *parent_pte)
983 {
984         struct kvm_pte_chain *pte_chain;
985         struct hlist_node *node;
986         int i;
987
988         if (!sp->multimapped) {
989                 BUG_ON(sp->parent_pte != parent_pte);
990                 sp->parent_pte = NULL;
991                 return;
992         }
993         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
994                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
                            /* first empty slot ends the used part of this block */
995                         if (!pte_chain->parent_ptes[i])
996                                 break;
997                         if (pte_chain->parent_ptes[i] != parent_pte)
998                                 continue;
                            /* found: shift the following entries down by one */
999                         while (i + 1 < NR_PTE_CHAIN_ENTRIES
1000                                 && pte_chain->parent_ptes[i + 1]) {
1001                                 pte_chain->parent_ptes[i]
1002                                         = pte_chain->parent_ptes[i + 1];
1003                                 ++i;
1004                         }
1005                         pte_chain->parent_ptes[i] = NULL;
                             /* i == 0 here means the block held only the removed entry */
1006                         if (i == 0) {
1007                                 hlist_del(&pte_chain->link);
1008                                 mmu_free_pte_chain(pte_chain);
1009                                 if (hlist_empty(&sp->parent_ptes)) {
1010                                         sp->multimapped = 0;
1011                                         sp->parent_pte = NULL;
1012                                 }
1013                         }
1014                         return;
1015                 }
1016         BUG();
1017 }
1018
/*
 * Call @fn(parent_sp, parent_spte) once for every parent pte recorded for
 * @sp, where parent_sp is the shadow page containing that pte.
 */
1019 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1020 {
1021         struct kvm_pte_chain *pte_chain;
1022         struct hlist_node *node;
1023         struct kvm_mmu_page *parent_sp;
1024         int i;
1025
             /* single-parent form */
1026         if (!sp->multimapped && sp->parent_pte) {
1027                 parent_sp = page_header(__pa(sp->parent_pte));
1028                 fn(parent_sp, sp->parent_pte);
1029                 return;
1030         }
1031
             /*
              * NOTE(review): a page with no parent at all also reaches this
              * loop; presumably parent_ptes then reads as an empty list --
              * confirm against the struct layout.
              */
1032         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1033                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1034                         u64 *spte = pte_chain->parent_ptes[i];
1035
                             /* entries are packed, so the first NULL ends the block */
1036                         if (!spte)
1037                                 break;
1038                         parent_sp = page_header(__pa(spte));
1039                         fn(parent_sp, spte);
1040                 }
1041 }
1042
/* Forward declaration: mark_unsync() and kvm_mmu_mark_parents_unsync() call each other. */
1043 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
/* Apply mark_unsync() to every parent pte of @sp. */
1044 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1045 {
1046         mmu_parent_walk(sp, mark_unsync);
1047 }
1048
1049 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1050 {
1051         unsigned int index;
1052
1053         index = spte - sp->spt;
1054         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1055                 return;
1056         if (sp->unsync_children++)
1057                 return;
1058         kvm_mmu_mark_parents_unsync(sp);
1059 }
1060
1061 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1062                                     struct kvm_mmu_page *sp)
1063 {
1064         int i;
1065
1066         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1067                 sp->spt[i] = shadow_trap_nonpresent_pte;
1068 }
1069
/*
 * Stub that always returns 1 and ignores its arguments.
 * NOTE(review): presumably installed as the mmu.sync_page hook for
 * non-paging mode; callers of that hook in this file treat a non-zero
 * result as failure and zap the page -- confirm where the hook is set.
 */
1070 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1071                                struct kvm_mmu_page *sp, bool clear_unsync)
1072 {
1073         return 1;
1074 }
1075
/* Intentionally empty: nothing is done for @gva here. */
1076 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1077 {
1078 }
1079
/* Capacity of the page[] array in struct kvm_mmu_pages. */
1080 #define KVM_PAGE_ARRAY_NR 16
1081
/*
 * Fixed-size batch of shadow pages filled by mmu_pages_add().  Each entry
 * pairs a page with the idx value passed to mmu_pages_add(); nr is the
 * number of entries in use.
 */
1082 struct kvm_mmu_pages {
1083         struct mmu_page_and_offset {
1084                 struct kvm_mmu_page *sp;
1085                 unsigned int idx;
1086         } page[KVM_PAGE_ARRAY_NR];
1087         unsigned int nr;
1088 };
1089
/* Iterate @idx over the set bits among the first 512 bits of @bitmap. */
1090 #define for_each_unsync_children(bitmap, idx)           \
1091         for (idx = find_first_bit(bitmap, 512);         \
1092              idx < 512;                                 \
1093              idx = find_next_bit(bitmap, 512, idx+1))
1094
1095 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1096                          int idx)
1097 {
1098         int i;
1099
1100         if (sp->unsync)
1101                 for (i=0; i < pvec->nr; i++)
1102                         if (pvec->page[i].sp == sp)
1103                                 return 0;
1104
1105         pvec->page[pvec->nr].sp = sp;
1106         pvec->page[pvec->nr].idx = idx;
1107         pvec->nr++;
1108         return (pvec->nr == KVM_PAGE_ARRAY_NR);
1109 }
1110
/*
 * Recursively collect into @pvec the shadow pages below @sp that are
 * flagged in the unsync_child_bitmap of their parent.
 *
 * Returns the number of unsync leaf pages found, or -ENOSPC as soon as
 * @pvec becomes full.  Bitmap bits whose child turns out to carry no unsync
 * state are cleared and sp->unsync_children is decremented accordingly.
 */
1111 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1112                            struct kvm_mmu_pages *pvec)
1113 {
1114         int i, ret, nr_unsync_leaf = 0;
1115
1116         for_each_unsync_children(sp->unsync_child_bitmap, i) {
1117                 struct kvm_mmu_page *child;
1118                 u64 ent = sp->spt[i];
1119
                     /* entry no longer points at a child table: stale bit */
1120                 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1121                         goto clear_child_bitmap;
1122
1123                 child = page_header(ent & PT64_BASE_ADDR_MASK);
1124
1125                 if (child->unsync_children) {
                             /* mmu_pages_add() returns non-zero when pvec just became full */
1126                         if (mmu_pages_add(pvec, child, i))
1127                                 return -ENOSPC;
1128
1129                         ret = __mmu_unsync_walk(child, pvec);
1130                         if (!ret)
1131                                 goto clear_child_bitmap;
1132                         else if (ret > 0)
1133                                 nr_unsync_leaf += ret;
1134                         else
1135                                 return ret;
1136                 } else if (child->unsync) {
1137                         nr_unsync_leaf++;
1138                         if (mmu_pages_add(pvec, child, i))
1139                                 return -ENOSPC;
1140                 } else
1141                          goto clear_child_bitmap;
1142
1143                 continue;
1144
1145 clear_child_bitmap:
1146                 __clear_bit(i, sp->unsync_child_bitmap);
1147                 sp->unsync_children--;
1148                 WARN_ON((int)sp->unsync_children < 0);
1149         }
1150
1151
1152         return nr_unsync_leaf;
1153 }
1154
1155 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1156                            struct kvm_mmu_pages *pvec)
1157 {
1158         if (!sp->unsync_children)
1159                 return 0;
1160
1161         mmu_pages_add(pvec, sp, 0);
1162         return __mmu_unsync_walk(sp, pvec);
1163 }
1164
/*
 * Clear the unsync flag of @sp and decrement the VM's mmu_unsync statistic.
 * Warns if @sp was not marked unsync.
 */
1165 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1166 {
1167         WARN_ON(!sp->unsync);
1168         trace_kvm_mmu_sync_page(sp);
1169         sp->unsync = 0;
1170         --kvm->stat.mmu_unsync;
1171 }
1172
/* Forward declarations: both are defined further down but used by the sync helpers below. */
1173 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1174                                     struct list_head *invalid_list);
1175 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1176                                     struct list_head *invalid_list);
1177
/*
 * Iterate @sp over the shadow pages in the hash bucket of @gfn whose gfn
 * equals @gfn; @pos is the hlist cursor.  The trailing "if (...) {} else"
 * makes the statement that follows the macro the loop body for matches only.
 */
1178 #define for_each_gfn_sp(kvm, sp, gfn, pos)                              \
1179   hlist_for_each_entry(sp, pos,                                         \
1180    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
1181         if ((sp)->gfn != (gfn)) {} else
1182
/* Same as for_each_gfn_sp(), but also skips direct and invalid pages. */
1183 #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)               \
1184   hlist_for_each_entry(sp, pos,                                         \
1185    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
1186                 if ((sp)->gfn != (gfn) || (sp)->role.direct ||          \
1187                         (sp)->role.invalid) {} else
1188
1189 /* @sp->gfn should be write-protected at the call site */
/*
 * Sync @sp through the vcpu's mmu.sync_page hook.
 *
 * Returns 1 when the page could not be synced (cr4_pae mismatch with the
 * vcpu, or the hook returned non-zero); the page is then queued on
 * @invalid_list via kvm_mmu_prepare_zap_page().  Returns 0 on success after
 * calling kvm_mmu_flush_tlb().  With @clear_unsync the page's unsync flag
 * is dropped before the hook runs.
 */
1190 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1191                            struct list_head *invalid_list, bool clear_unsync)
1192 {
1193         if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1194                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1195                 return 1;
1196         }
1197
1198         if (clear_unsync)
1199                 kvm_unlink_unsync_page(vcpu->kvm, sp);
1200
1201         if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1202                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1203                 return 1;
1204         }
1205
1206         kvm_mmu_flush_tlb(vcpu);
1207         return 0;
1208 }
1209
1210 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1211                                    struct kvm_mmu_page *sp)
1212 {
1213         LIST_HEAD(invalid_list);
1214         int ret;
1215
1216         ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1217         if (ret)
1218                 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1219
1220         return ret;
1221 }
1222
/*
 * Sync @sp and clear its unsync flag; a page that fails to sync is queued
 * on the caller's @invalid_list.  Returns the result of __kvm_sync_page().
 */
1223 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1224                          struct list_head *invalid_list)
1225 {
1226         return __kvm_sync_page(vcpu, sp, invalid_list, true);
1227 }
1228
1229 /* @gfn should be write-protected at the call site */
/*
 * Sync every unsync, indirect, valid shadow page that shadows @gfn.  Pages
 * that cannot be synced are zapped; a single TLB flush is issued at the end
 * if at least one page was synced.
 */
1230 static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1231 {
1232         struct kvm_mmu_page *s;
1233         struct hlist_node *node;
1234         LIST_HEAD(invalid_list);
1235         bool flush = false;
1236
1237         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1238                 if (!s->unsync)
1239                         continue;
1240
1241                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
                     /* same failure conditions as __kvm_sync_page() */
1242                 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1243                         (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1244                         kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1245                         continue;
1246                 }
1247                 kvm_unlink_unsync_page(vcpu->kvm, s);
1248                 flush = true;
1249         }
1250
1251         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1252         if (flush)
1253                 kvm_mmu_flush_tlb(vcpu);
1254 }
1255
/*
 * Ancestors of the leaf currently visited by for_each_sp().
 * mmu_pages_next() stores a page of level L in parent[L-2] and its recorded
 * index in idx[L-1]; idx[0] holds the index recorded for the leaf itself.
 * Both arrays have PT64_ROOT_LEVEL-1 slots.
 */
1256 struct mmu_page_path {
1257         struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1258         unsigned int idx[PT64_ROOT_LEVEL-1];
1259 };
1260
/*
 * Iterate @sp over the entries of @pvec that mmu_pages_next() returns
 * (pages at PT_PAGE_TABLE_LEVEL), keeping @parents up to date.  @pvec and
 * @parents are objects, not pointers; @i is the cursor into pvec.page[].
 */
1261 #define for_each_sp(pvec, sp, parents, i)                       \
1262                 for (i = mmu_pages_next(&pvec, &parents, -1),   \
1263                         sp = pvec.page[i].sp;                   \
1264                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
1265                         i = mmu_pages_next(&pvec, &parents, i))
1266
1267 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1268                           struct mmu_page_path *parents,
1269                           int i)
1270 {
1271         int n;
1272
1273         for (n = i+1; n < pvec->nr; n++) {
1274                 struct kvm_mmu_page *sp = pvec->page[n].sp;
1275
1276                 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1277                         parents->idx[0] = pvec->page[n].idx;
1278                         return n;
1279                 }
1280
1281                 parents->parent[sp->role.level-2] = sp;
1282                 parents->idx[sp->role.level-1] = pvec->page[n].idx;
1283         }
1284
1285         return n;
1286 }
1287
/*
 * Walk up the ancestor path in @parents, starting at parent[0].  At each
 * step the ancestor's unsync_children count is decremented and the bit
 * idx[level] is cleared in its unsync_child_bitmap.  The walk stops at a
 * NULL parent, at the end of the array, or at the first ancestor that still
 * has unsync children left.
 */
1288 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1289 {
1290         struct kvm_mmu_page *sp;
1291         unsigned int level = 0;
1292
1293         do {
1294                 unsigned int idx = parents->idx[level];
1295
1296                 sp = parents->parent[level];
                     /* NULL is the sentinel stored by kvm_mmu_pages_init() */
1297                 if (!sp)
1298                         return;
1299
1300                 --sp->unsync_children;
1301                 WARN_ON((int)sp->unsync_children < 0);
1302                 __clear_bit(idx, sp->unsync_child_bitmap);
1303                 level++;
1304         } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1305 }
1306
1307 static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1308                                struct mmu_page_path *parents,
1309                                struct kvm_mmu_pages *pvec)
1310 {
1311         parents->parent[parent->role.level-1] = NULL;
1312         pvec->nr = 0;
1313 }
1314
/*
 * Sync all unsync shadow pages below @parent.
 *
 * Works in batches of at most KVM_PAGE_ARRAY_NR pages collected by
 * mmu_unsync_walk(): each batch is first write-protected (with one remote
 * TLB flush if anything changed), then every page is synced and the unsync
 * bookkeeping of its ancestors is cleared.  Pages that fail to sync are
 * zapped at the end of the batch.
 */
1315 static void mmu_sync_children(struct kvm_vcpu *vcpu,
1316                               struct kvm_mmu_page *parent)
1317 {
1318         int i;
1319         struct kvm_mmu_page *sp;
1320         struct mmu_page_path parents;
1321         struct kvm_mmu_pages pages;
1322         LIST_HEAD(invalid_list);
1323
1324         kvm_mmu_pages_init(parent, &parents, &pages);
1325         while (mmu_unsync_walk(parent, &pages)) {
1326                 int protected = 0;
1327
1328                 for_each_sp(pages, sp, parents, i)
1329                         protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1330
1331                 if (protected)
1332                         kvm_flush_remote_tlbs(vcpu->kvm);
1333
1334                 for_each_sp(pages, sp, parents, i) {
1335                         kvm_sync_page(vcpu, sp, &invalid_list);
1336                         mmu_pages_clear_parents(&parents);
1337                 }
1338                 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                     /* NOTE(review): presumably the caller holds mmu_lock -- confirm */
1339                 cond_resched_lock(&vcpu->kvm->mmu_lock);
1340                 kvm_mmu_pages_init(parent, &parents, &pages);
1341         }
1342 }
1343
/*
 * Return the shadow page for @gfn with the role derived from @level,
 * @direct, @access and the vcpu's base role, creating it if necessary, and
 * register @parent_pte as one of its parents.
 *
 * An existing page with a matching role is reused; if it is unsync it is
 * first synced transiently, and a failed sync abandons the lookup and falls
 * through to allocation.  A newly allocated page is hashed and filled by the
 * prefetch_page hook (or with trap-nonpresent entries); for a non-direct
 * page @gfn is also write-protected and accounted as shadowed.
 */
1344 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1345                                              gfn_t gfn,
1346                                              gva_t gaddr,
1347                                              unsigned level,
1348                                              int direct,
1349                                              unsigned access,
1350                                              u64 *parent_pte)
1351 {
1352         union kvm_mmu_page_role role;
1353         unsigned quadrant;
1354         struct kvm_mmu_page *sp;
1355         struct hlist_node *node;
1356         bool need_sync = false;
1357
1358         role = vcpu->arch.mmu.base_role;
1359         role.level = level;
1360         role.direct = direct;
1361         if (role.direct)
1362                 role.cr4_pae = 0;
1363         role.access = access;
             /*
              * NOTE(review): presumably the quadrant selects which part of a
              * 32-bit guest table this shadow page covers -- confirm.
              */
1364         if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1365                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1366                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1367                 role.quadrant = quadrant;
1368         }
1369         for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
                     /* remember whether any page for this gfn, of any role, is unsync */
1370                 if (!need_sync && sp->unsync)
1371                         need_sync = true;
1372
1373                 if (sp->role.word != role.word)
1374                         continue;
1375
                     /* a failed transient sync queued the page for zapping: allocate anew */
1376                 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1377                         break;
1378
1379                 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1380                 if (sp->unsync_children) {
1381                         set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1382                         kvm_mmu_mark_parents_unsync(sp);
1383                 } else if (sp->unsync)
1384                         kvm_mmu_mark_parents_unsync(sp);
1385
1386                 trace_kvm_mmu_get_page(sp, false);
1387                 return sp;
1388         }
1389         ++vcpu->kvm->stat.mmu_cache_miss;
1390         sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1391         if (!sp)
1392                 return sp;
1393         sp->gfn = gfn;
1394         sp->role = role;
1395         hlist_add_head(&sp->hash_link,
1396                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1397         if (!direct) {
1398                 if (rmap_write_protect(vcpu->kvm, gfn))
1399                         kvm_flush_remote_tlbs(vcpu->kvm);
                     /* gfn is write-protected now, as kvm_sync_pages() requires */
1400                 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1401                         kvm_sync_pages(vcpu, gfn);
1402
1403                 account_shadowed(vcpu->kvm, gfn);
1404         }
1405         if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1406                 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1407         else
1408                 nonpaging_prefetch_page(vcpu, sp);
1409         trace_kvm_mmu_get_page(sp, true);
1410         return sp;
1411 }
1412
/*
 * Start a shadow page table walk for @addr at the vcpu's shadow root.
 *
 * For a PT32E_ROOT_LEVEL shadow root the walk starts one level lower, at the
 * pae_root entry selected by bits 31:30 of @addr; if that entry is empty the
 * level is set to 0 so that shadow_walk_okay() fails immediately.
 */
1413 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1414                              struct kvm_vcpu *vcpu, u64 addr)
1415 {
1416         iterator->addr = addr;
1417         iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1418         iterator->level = vcpu->arch.mmu.shadow_root_level;
1419         if (iterator->level == PT32E_ROOT_LEVEL) {
1420                 iterator->shadow_addr
1421                         = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1422                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1423                 --iterator->level;
1424                 if (!iterator->shadow_addr)
1425                         iterator->level = 0;
1426         }
1427 }
1428
/*
 * Loop condition of the shadow walk.  Returns false when the walk is below
 * the last level, or when it has reached PT_PAGE_TABLE_LEVEL through a
 * large pte.  Otherwise computes the index and spte pointer for the current
 * level and returns true.
 */
1429 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1430 {
1431         if (iterator->level < PT_PAGE_TABLE_LEVEL)
1432                 return false;
1433
             /* sptep still refers to the entry of the previous (higher) level here */
1434         if (iterator->level == PT_PAGE_TABLE_LEVEL)
1435                 if (is_large_pte(*iterator->sptep))
1436                         return false;
1437
1438         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1439         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1440         return true;
1441 }
1442
1443 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1444 {
1445         iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1446         --iterator->level;
1447 }
1448
/*
 * Tear down every entry of shadow page @sp: a present non-leaf entry is
 * removed from its child's parent set, a present leaf entry is removed from
 * the rmap (adjusting the lpages statistic for large ptes).  All entries end
 * up as shadow_trap_nonpresent_pte.
 */
1449 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1450                                          struct kvm_mmu_page *sp)
1451 {
1452         unsigned i;
1453         u64 *pt;
1454         u64 ent;
1455
1456         pt = sp->spt;
1457
1458         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1459                 ent = pt[i];
1460
1461                 if (is_shadow_present_pte(ent)) {
1462                         if (!is_last_spte(ent, sp->role.level)) {
1463                                 ent &= PT64_BASE_ADDR_MASK;
1464                                 mmu_page_remove_parent_pte(page_header(ent),
1465                                                            &pt[i]);
1466                         } else {
1467                                 if (is_large_pte(ent))
1468                                         --kvm->stat.lpages;
1469                                 rmap_remove(kvm, &pt[i]);
1470                         }
1471                 }
1472                 pt[i] = shadow_trap_nonpresent_pte;
1473         }
1474 }
1475
/* Drop @parent_pte from the parents of @sp; thin wrapper around mmu_page_remove_parent_pte(). */
1476 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1477 {
1478         mmu_page_remove_parent_pte(sp, parent_pte);
1479 }
1480
1481 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1482 {
1483         int i;
1484         struct kvm_vcpu *vcpu;
1485
1486         kvm_for_each_vcpu(i, vcpu, kvm)
1487                 vcpu->arch.last_pte_updated = NULL;
1488 }
1489
/*
 * Detach @sp from all of its parents: each parent pte is removed from the
 * parent set and overwritten with shadow_trap_nonpresent_pte, until no
 * parent is left.
 */
1490 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1491 {
1492         u64 *parent_pte;
1493
1494         while (sp->multimapped || sp->parent_pte) {
1495                 if (!sp->multimapped)
1496                         parent_pte = sp->parent_pte;
1497                 else {
1498                         struct kvm_pte_chain *chain;
1499
                             /* take the first entry of the first chain block */
1500                         chain = container_of(sp->parent_ptes.first,
1501                                              struct kvm_pte_chain, link);
1502                         parent_pte = chain->parent_ptes[0];
1503                 }
1504                 BUG_ON(!parent_pte);
1505                 kvm_mmu_put_page(sp, parent_pte);
1506                 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1507         }
1508 }
1509
/*
 * Queue every unsync page below @parent for zapping on @invalid_list,
 * clearing the unsync bookkeeping of the ancestors as it goes.  Returns the
 * number of pages handed to kvm_mmu_prepare_zap_page(); 0 for a parent at
 * PT_PAGE_TABLE_LEVEL, which is not walked.
 */
1510 static int mmu_zap_unsync_children(struct kvm *kvm,
1511                                    struct kvm_mmu_page *parent,
1512                                    struct list_head *invalid_list)
1513 {
1514         int i, zapped = 0;
1515         struct mmu_page_path parents;
1516         struct kvm_mmu_pages pages;
1517
1518         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1519                 return 0;
1520
1521         kvm_mmu_pages_init(parent, &parents, &pages);
1522         while (mmu_unsync_walk(parent, &pages)) {
1523                 struct kvm_mmu_page *sp;
1524
1525                 for_each_sp(pages, sp, parents, i) {
1526                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1527                         mmu_pages_clear_parents(&parents);
1528                         zapped++;
1529                 }
1530                 kvm_mmu_pages_init(parent, &parents, &pages);
1531         }
1532
1533         return zapped;
1534 }
1535
/*
 * First phase of zapping @sp: unlink it from its children and parents, drop
 * its shadow accounting and unsync state, and mark it invalid.
 *
 * A page with root_count == 0 is moved to @invalid_list to be freed by
 * kvm_mmu_commit_zap_page().  A page that is still in use as a root stays on
 * the active list and all vcpus are asked to reload their mmu.
 *
 * Returns the number of pages moved to @invalid_list by this call,
 * including unsync children and, if applicable, @sp itself.
 */
1536 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1537                                     struct list_head *invalid_list)
1538 {
1539         int ret;
1540
1541         trace_kvm_mmu_prepare_zap_page(sp);
1542         ++kvm->stat.mmu_shadow_zapped;
1543         ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1544         kvm_mmu_page_unlink_children(kvm, sp);
1545         kvm_mmu_unlink_parents(kvm, sp);
             /* an already-invalid page was unaccounted when it was first zapped */
1546         if (!sp->role.invalid && !sp->role.direct)
1547                 unaccount_shadowed(kvm, sp->gfn);
1548         if (sp->unsync)
1549                 kvm_unlink_unsync_page(kvm, sp);
1550         if (!sp->root_count) {
1551                 /* Count self */
1552                 ret++;
1553                 list_move(&sp->link, invalid_list);
1554         } else {
1555                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1556                 kvm_reload_remote_mmus(kvm);
1557         }
1558
1559         sp->role.invalid = 1;
1560         kvm_mmu_reset_last_pte_updated(kvm);
1561         return ret;
1562 }
1563
1564 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1565                                     struct list_head *invalid_list)
1566 {
1567         struct kvm_mmu_page *sp;
1568
1569         if (list_empty(invalid_list))
1570                 return;
1571
1572         kvm_flush_remote_tlbs(kvm);
1573
1574         do {
1575                 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1576                 WARN_ON(!sp->role.invalid || sp->root_count);
1577                 kvm_mmu_free_page(kvm, sp);
1578         } while (!list_empty(invalid_list));
1579
1580 }
1581
1582 /*
1583  * Change the number of mmu pages allocated to the vm.
1584  * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
      *
      * When more pages are in use than the new limit allows, pages are zapped
      * from the tail of the active list until the limit is met or the list is
      * empty; the limit actually installed is then the number of pages still
      * in use, with no free pages left.  Otherwise the free count is adjusted
      * by the difference between the new and the old limit.
1585  */
1586 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1587 {
1588         int used_pages;
1589         LIST_HEAD(invalid_list);
1590
1591         used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1592         used_pages = max(0, used_pages);
1593
1594         /*
1595          * If we set the number of mmu pages to be smaller than the
1596          * number of active pages, we must free some mmu pages before we
1597          * change the value.
1598          */
1599
1600         if (used_pages > kvm_nr_mmu_pages) {
1601                 while (used_pages > kvm_nr_mmu_pages &&
1602                         !list_empty(&kvm->arch.active_mmu_pages)) {
1603                         struct kvm_mmu_page *page;
1604
1605                         page = container_of(kvm->arch.active_mmu_pages.prev,
1606                                             struct kvm_mmu_page, link);
1607                         used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1608                                                                &invalid_list);
1609                 }
1610                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1611                 kvm_nr_mmu_pages = used_pages;
1612                 kvm->arch.n_free_mmu_pages = 0;
1613         }
1614         else
1615                 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1616                                          - kvm->arch.n_alloc_mmu_pages;
1617
1618         kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1619 }
1620
1621 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1622 {
1623         struct kvm_mmu_page *sp;
1624         struct hlist_node *node;
1625         LIST_HEAD(invalid_list);
1626         int r;
1627
1628         pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1629         r = 0;
1630
1631         for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1632                 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1633                          sp->role.word);
1634                 r = 1;
1635                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1636         }
1637         kvm_mmu_commit_zap_page(kvm, &invalid_list);
1638         return r;
1639 }
1640
/*
 * Zap every indirect, valid shadow page that shadows @gfn.  Same work as
 * kvm_mmu_unprotect_page(), without reporting whether a page was found.
 */
1641 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1642 {
1643         struct kvm_mmu_page *sp;
1644         struct hlist_node *node;
1645         LIST_HEAD(invalid_list);
1646
1647         for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1648                 pgprintk("%s: zap %lx %x\n",
1649                          __func__, gfn, sp->role.word);
1650                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1651         }
1652         kvm_mmu_commit_zap_page(kvm, &invalid_list);
1653 }
1654
1655 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1656 {
1657         int slot = memslot_id(kvm, gfn);
1658         struct kvm_mmu_page *sp = page_header(__pa(pte));
1659
1660         __set_bit(slot, sp->slot_bitmap);
1661 }
1662
1663 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1664 {
1665         int i;
1666         u64 *pt = sp->spt;
1667
1668         if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1669                 return;
1670
1671         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1672                 if (pt[i] == shadow_notrap_nonpresent_pte)
1673                         __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1674         }
1675 }
1676
1677 /*
1678  * The function is based on mtrr_type_lookup() in
1679  * arch/x86/kernel/cpu/mtrr/generic.c
1680  */
1681 static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1682                          u64 start, u64 end)
1683 {
1684         int i;
1685         u64 base, mask;
1686         u8 prev_match, curr_match;
1687         int num_var_ranges = KVM_NR_VAR_MTRR;
1688
1689         if (!mtrr_state->enabled)
1690                 return 0xFF;
1691
1692         /* Make end inclusive end, instead of exclusive */
1693         end--;
1694
1695         /* Look in fixed ranges. Just return the type as per start */
1696         if (mtrr_state->have_fixed && (start < 0x100000)) {
1697                 int idx;
1698
1699                 if (start < 0x80000) {
1700                         idx = 0;
1701                         idx += (start >> 16);
1702                         return mtrr_state->fixed_ranges[idx];
1703                 } else if (start < 0xC0000) {
1704                         idx = 1 * 8;
1705                         idx += ((start - 0x80000) >> 14);
1706                         return mtrr_state->fixed_ranges[idx];
1707                 } else if (start < 0x1000000) {
1708                         idx = 3 * 8;
1709                         idx += ((start - 0xC0000) >> 12);
1710                         return mtrr_state->fixed_ranges[idx];
1711                 }
1712         }
1713
1714         /*
1715          * Look in variable ranges
1716          * Look of multiple ranges matching this address and pick type
1717          * as per MTRR precedence
1718          */
1719         if (!(mtrr_state->enabled & 2))
1720                 return mtrr_state->def_type;
1721
1722         prev_match = 0xFF;
1723         for (i = 0; i < num_var_ranges; ++i) {
1724                 unsigned short start_state, end_state;
1725
1726                 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1727                         continue;
1728
1729                 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1730                        (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1731                 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1732                        (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1733
1734                 start_state = ((start & mask) == (base & mask));
1735                 end_state = ((end & mask) == (base & mask));
1736                 if (start_state != end_state)
1737                         return 0xFE;
1738
1739                 if ((start & mask) != (base & mask))
1740                         continue;
1741
1742                 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1743                 if (prev_match == 0xFF) {
1744                         prev_match = curr_match;
1745                         continue;
1746                 }
1747
1748                 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1749                     curr_match == MTRR_TYPE_UNCACHABLE)
1750                         return MTRR_TYPE_UNCACHABLE;
1751
1752                 if ((prev_match == MTRR_TYPE_WRBACK &&
1753                      curr_match == MTRR_TYPE_WRTHROUGH) ||
1754                     (prev_match == MTRR_TYPE_WRTHROUGH &&
1755                      curr_match == MTRR_TYPE_WRBACK)) {
1756                         prev_match = MTRR_TYPE_WRTHROUGH;
1757                         curr_match = MTRR_TYPE_WRTHROUGH;
1758                 }
1759
1760                 if (prev_match != curr_match)
1761                         return MTRR_TYPE_UNCACHABLE;
1762         }
1763
1764         if (prev_match != 0xFF)
1765                 return prev_match;
1766
1767         return mtrr_state->def_type;
1768 }
1769
1770 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1771 {
1772         u8 mtrr;
1773
1774         mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1775                              (gfn << PAGE_SHIFT) + PAGE_SIZE);
1776         if (mtrr == 0xfe || mtrr == 0xff)
1777                 mtrr = MTRR_TYPE_WRBACK;
1778         return mtrr;
1779 }
1780 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1781
1782 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1783 {
1784         trace_kvm_mmu_unsync_page(sp);
1785         ++vcpu->kvm->stat.mmu_unsync;
1786         sp->unsync = 1;
1787
1788         kvm_mmu_mark_parents_unsync(sp);
1789         mmu_convert_notrap(sp);
1790 }
1791
1792 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1793 {
1794         struct kvm_mmu_page *s;
1795         struct hlist_node *node;
1796
1797         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1798                 if (s->unsync)
1799                         continue;
1800                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1801                 __kvm_unsync_page(vcpu, s);
1802         }
1803 }
1804
1805 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1806                                   bool can_unsync)
1807 {
1808         struct kvm_mmu_page *s;
1809         struct hlist_node *node;
1810         bool need_unsync = false;
1811
1812         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1813                 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1814                         return 1;
1815
1816                 if (!need_unsync && !s->unsync) {
1817                         if (!can_unsync || !oos_shadow)
1818                                 return 1;
1819                         need_unsync = true;
1820                 }
1821         }
1822         if (need_unsync)
1823                 kvm_unsync_pages(vcpu, gfn);
1824         return 0;
1825 }
1826
1827 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1828                     unsigned pte_access, int user_fault,
1829                     int write_fault, int dirty, int level,
1830                     gfn_t gfn, pfn_t pfn, bool speculative,
1831                     bool can_unsync, bool reset_host_protection)
1832 {
1833         u64 spte;
1834         int ret = 0;
1835
1836         /*
1837          * We don't set the accessed bit, since we sometimes want to see
1838          * whether the guest actually used the pte (in order to detect
1839          * demand paging).
1840          */
1841         spte = shadow_base_present_pte | shadow_dirty_mask;
1842         if (!speculative)
1843                 spte |= shadow_accessed_mask;
1844         if (!dirty)
1845                 pte_access &= ~ACC_WRITE_MASK;
1846         if (pte_access & ACC_EXEC_MASK)
1847                 spte |= shadow_x_mask;
1848         else
1849                 spte |= shadow_nx_mask;
1850         if (pte_access & ACC_USER_MASK)
1851                 spte |= shadow_user_mask;
1852         if (level > PT_PAGE_TABLE_LEVEL)
1853                 spte |= PT_PAGE_SIZE_MASK;
1854         if (tdp_enabled)
1855                 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1856                         kvm_is_mmio_pfn(pfn));
1857
1858         if (reset_host_protection)
1859                 spte |= SPTE_HOST_WRITEABLE;
1860
1861         spte |= (u64)pfn << PAGE_SHIFT;
1862
1863         if ((pte_access & ACC_WRITE_MASK)
1864             || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
1865                 && !user_fault)) {
1866
1867                 if (level > PT_PAGE_TABLE_LEVEL &&
1868                     has_wrprotected_page(vcpu->kvm, gfn, level)) {
1869                         ret = 1;
1870                         rmap_remove(vcpu->kvm, sptep);
1871                         spte = shadow_trap_nonpresent_pte;
1872                         goto set_pte;
1873                 }
1874
1875                 spte |= PT_WRITABLE_MASK;
1876
1877                 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
1878                         spte &= ~PT_USER_MASK;
1879
1880                 /*
1881                  * Optimization: for pte sync, if spte was writable the hash
1882                  * lookup is unnecessary (and expensive). Write protection
1883                  * is responsibility of mmu_get_page / kvm_sync_page.
1884                  * Same reasoning can be applied to dirty page accounting.
1885                  */
1886                 if (!can_unsync && is_writable_pte(*sptep))
1887                         goto set_pte;
1888
1889                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1890                         pgprintk("%s: found shadow page for %lx, marking ro\n",
1891                                  __func__, gfn);
1892                         ret = 1;
1893                         pte_access &= ~ACC_WRITE_MASK;
1894                         if (is_writable_pte(spte))
1895                                 spte &= ~PT_WRITABLE_MASK;
1896                 }
1897         }
1898
1899         if (pte_access & ACC_WRITE_MASK)
1900                 mark_page_dirty(vcpu->kvm, gfn);
1901
1902 set_pte:
1903         __set_spte(sptep, spte);
1904         return ret;
1905 }
1906
1907 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1908                          unsigned pt_access, unsigned pte_access,
1909                          int user_fault, int write_fault, int dirty,
1910                          int *ptwrite, int level, gfn_t gfn,
1911                          pfn_t pfn, bool speculative,
1912                          bool reset_host_protection)
1913 {
1914         int was_rmapped = 0;
1915         int was_writable = is_writable_pte(*sptep);
1916         int rmap_count;
1917
1918         pgprintk("%s: spte %llx access %x write_fault %d"
1919                  " user_fault %d gfn %lx\n",
1920                  __func__, *sptep, pt_access,
1921                  write_fault, user_fault, gfn);
1922
1923         if (is_rmap_spte(*sptep)) {
1924                 /*
1925                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1926                  * the parent of the now unreachable PTE.
1927                  */
1928                 if (level > PT_PAGE_TABLE_LEVEL &&
1929                     !is_large_pte(*sptep)) {
1930                         struct kvm_mmu_page *child;
1931                         u64 pte = *sptep;
1932
1933                         child = page_header(pte & PT64_BASE_ADDR_MASK);
1934                         mmu_page_remove_parent_pte(child, sptep);
1935                         __set_spte(sptep, shadow_trap_nonpresent_pte);
1936                         kvm_flush_remote_tlbs(vcpu->kvm);
1937                 } else if (pfn != spte_to_pfn(*sptep)) {
1938                         pgprintk("hfn old %lx new %lx\n",
1939                                  spte_to_pfn(*sptep), pfn);
1940                         rmap_remove(vcpu->kvm, sptep);
1941                         __set_spte(sptep, shadow_trap_nonpresent_pte);
1942                         kvm_flush_remote_tlbs(vcpu->kvm);
1943                 } else
1944                         was_rmapped = 1;
1945         }
1946
1947         if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1948                       dirty, level, gfn, pfn, speculative, true,
1949                       reset_host_protection)) {
1950                 if (write_fault)
1951                         *ptwrite = 1;
1952                 kvm_mmu_flush_tlb(vcpu);
1953         }
1954
1955         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
1956         pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1957                  is_large_pte(*sptep)? "2MB" : "4kB",
1958                  *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
1959                  *sptep, sptep);
1960         if (!was_rmapped && is_large_pte(*sptep))
1961                 ++vcpu->kvm->stat.lpages;
1962
1963         page_header_update_slot(vcpu->kvm, sptep, gfn);
1964         if (!was_rmapped) {
1965                 rmap_count = rmap_add(vcpu, sptep, gfn);
1966                 kvm_release_pfn_clean(pfn);
1967                 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1968                         rmap_recycle(vcpu, sptep, gfn);
1969         } else {
1970                 if (was_writable)
1971                         kvm_release_pfn_dirty(pfn);
1972                 else
1973                         kvm_release_pfn_clean(pfn);
1974         }
1975         if (speculative) {
1976                 vcpu->arch.last_pte_updated = sptep;
1977                 vcpu->arch.last_pte_gfn = gfn;
1978         }
1979 }
1980
/*
 * new_cr3 callback for MMU contexts that have nothing to do on a CR3
 * load (installed by the non-paging and TDP context setup).
 */
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
	/* Intentionally empty. */
}
1984
1985 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1986                         int level, gfn_t gfn, pfn_t pfn)
1987 {
1988         struct kvm_shadow_walk_iterator iterator;
1989         struct kvm_mmu_page *sp;
1990         int pt_write = 0;
1991         gfn_t pseudo_gfn;
1992
1993         for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
1994                 if (iterator.level == level) {
1995                         mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1996                                      0, write, 1, &pt_write,
1997                                      level, gfn, pfn, false, true);
1998                         ++vcpu->stat.pf_fixed;
1999                         break;
2000                 }
2001
2002                 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
2003                         u64 base_addr = iterator.addr;
2004
2005                         base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2006                         pseudo_gfn = base_addr >> PAGE_SHIFT;
2007                         sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2008                                               iterator.level - 1,
2009                                               1, ACC_ALL, iterator.sptep);
2010                         if (!sp) {
2011                                 pgprintk("nonpaging_map: ENOMEM\n");
2012                                 kvm_release_pfn_clean(pfn);
2013                                 return -ENOMEM;
2014                         }
2015
2016                         __set_spte(iterator.sptep,
2017                                    __pa(sp->spt)
2018                                    | PT_PRESENT_MASK | PT_WRITABLE_MASK
2019                                    | shadow_user_mask | shadow_x_mask);
2020                 }
2021         }
2022         return pt_write;
2023 }
2024
2025 static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
2026 {
2027         char buf[1];
2028         void __user *hva;
2029         int r;
2030
2031         /* Touch the page, so send SIGBUS */
2032         hva = (void __user *)gfn_to_hva(kvm, gfn);
2033         r = copy_from_user(buf, hva, 1);
2034 }
2035
2036 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2037 {
2038         kvm_release_pfn_clean(pfn);
2039         if (is_hwpoison_pfn(pfn)) {
2040                 kvm_send_hwpoison_signal(kvm, gfn);
2041                 return 0;
2042         }
2043         return 1;
2044 }
2045
2046 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2047 {
2048         int r;
2049         int level;
2050         pfn_t pfn;
2051         unsigned long mmu_seq;
2052
2053         level = mapping_level(vcpu, gfn);
2054
2055         /*
2056          * This path builds a PAE pagetable - so we can map 2mb pages at
2057          * maximum. Therefore check if the level is larger than that.
2058          */
2059         if (level > PT_DIRECTORY_LEVEL)
2060                 level = PT_DIRECTORY_LEVEL;
2061
2062         gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2063
2064         mmu_seq = vcpu->kvm->mmu_notifier_seq;
2065         smp_rmb();
2066         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2067
2068         /* mmio */
2069         if (is_error_pfn(pfn))
2070                 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2071
2072         spin_lock(&vcpu->kvm->mmu_lock);
2073         if (mmu_notifier_retry(vcpu, mmu_seq))
2074                 goto out_unlock;
2075         kvm_mmu_free_some_pages(vcpu);
2076         r = __direct_map(vcpu, v, write, level, gfn, pfn);
2077         spin_unlock(&vcpu->kvm->mmu_lock);
2078
2079
2080         return r;
2081
2082 out_unlock:
2083         spin_unlock(&vcpu->kvm->mmu_lock);
2084         kvm_release_pfn_clean(pfn);
2085         return 0;
2086 }
2087
2088
2089 static void mmu_free_roots(struct kvm_vcpu *vcpu)
2090 {
2091         int i;
2092         struct kvm_mmu_page *sp;
2093         LIST_HEAD(invalid_list);
2094
2095         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2096                 return;
2097         spin_lock(&vcpu->kvm->mmu_lock);
2098         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2099                 hpa_t root = vcpu->arch.mmu.root_hpa;
2100
2101                 sp = page_header(root);
2102                 --sp->root_count;
2103                 if (!sp->root_count && sp->role.invalid) {
2104                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2105                         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2106                 }
2107                 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2108                 spin_unlock(&vcpu->kvm->mmu_lock);
2109                 return;
2110         }
2111         for (i = 0; i < 4; ++i) {
2112                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2113
2114                 if (root) {
2115                         root &= PT64_BASE_ADDR_MASK;
2116                         sp = page_header(root);
2117                         --sp->root_count;
2118                         if (!sp->root_count && sp->role.invalid)
2119                                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2120                                                          &invalid_list);
2121                 }
2122                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2123         }
2124         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2125         spin_unlock(&vcpu->kvm->mmu_lock);
2126         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2127 }
2128
2129 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2130 {
2131         int ret = 0;
2132
2133         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2134                 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
2135                 ret = 1;
2136         }
2137
2138         return ret;
2139 }
2140
2141 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2142 {
2143         int i;
2144         gfn_t root_gfn;
2145         struct kvm_mmu_page *sp;
2146         int direct = 0;
2147         u64 pdptr;
2148
2149         root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2150
2151         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2152                 hpa_t root = vcpu->arch.mmu.root_hpa;
2153
2154                 ASSERT(!VALID_PAGE(root));
2155                 if (mmu_check_root(vcpu, root_gfn))
2156                         return 1;
2157                 if (tdp_enabled) {
2158                         direct = 1;
2159                         root_gfn = 0;
2160                 }
2161                 spin_lock(&vcpu->kvm->mmu_lock);
2162                 kvm_mmu_free_some_pages(vcpu);
2163                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2164                                       PT64_ROOT_LEVEL, direct,
2165                                       ACC_ALL, NULL);
2166                 root = __pa(sp->spt);
2167                 ++sp->root_count;
2168                 spin_unlock(&vcpu->kvm->mmu_lock);
2169                 vcpu->arch.mmu.root_hpa = root;
2170                 return 0;
2171         }
2172         direct = !is_paging(vcpu);
2173         for (i = 0; i < 4; ++i) {
2174                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2175
2176                 ASSERT(!VALID_PAGE(root));
2177                 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2178                         pdptr = kvm_pdptr_read(vcpu, i);
2179                         if (!is_present_gpte(pdptr)) {
2180                                 vcpu->arch.mmu.pae_root[i] = 0;
2181                                 continue;
2182                         }
2183                         root_gfn = pdptr >> PAGE_SHIFT;
2184                 } else if (vcpu->arch.mmu.root_level == 0)
2185                         root_gfn = 0;
2186                 if (mmu_check_root(vcpu, root_gfn))
2187                         return 1;
2188                 if (tdp_enabled) {
2189                         direct = 1;
2190                         root_gfn = i << 30;
2191                 }
2192                 spin_lock(&vcpu->kvm->mmu_lock);
2193                 kvm_mmu_free_some_pages(vcpu);
2194                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2195                                       PT32_ROOT_LEVEL, direct,
2196                                       ACC_ALL, NULL);
2197                 root = __pa(sp->spt);
2198                 ++sp->root_count;
2199                 spin_unlock(&vcpu->kvm->mmu_lock);
2200
2201                 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2202         }
2203         vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2204         return 0;
2205 }
2206
2207 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2208 {
2209         int i;
2210         struct kvm_mmu_page *sp;
2211
2212         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2213                 return;
2214         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2215                 hpa_t root = vcpu->arch.mmu.root_hpa;
2216                 sp = page_header(root);
2217                 mmu_sync_children(vcpu, sp);
2218                 return;
2219         }
2220         for (i = 0; i < 4; ++i) {
2221                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2222
2223                 if (root && VALID_PAGE(root)) {
2224                         root &= PT64_BASE_ADDR_MASK;
2225                         sp = page_header(root);
2226                         mmu_sync_children(vcpu, sp);
2227                 }
2228         }
2229 }
2230
2231 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2232 {
2233         spin_lock(&vcpu->kvm->mmu_lock);
2234         mmu_sync_roots(vcpu);
2235         spin_unlock(&vcpu->kvm->mmu_lock);
2236 }
2237
2238 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2239                                   u32 access, u32 *error)
2240 {
2241         if (error)
2242                 *error = 0;
2243         return vaddr;
2244 }
2245
2246 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2247                                 u32 error_code)
2248 {
2249         gfn_t gfn;
2250         int r;
2251
2252         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2253         r = mmu_topup_memory_caches(vcpu);
2254         if (r)
2255                 return r;
2256
2257         ASSERT(vcpu);
2258         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2259
2260         gfn = gva >> PAGE_SHIFT;
2261
2262         return nonpaging_map(vcpu, gva & PAGE_MASK,
2263                              error_code & PFERR_WRITE_MASK, gfn);
2264 }
2265
2266 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2267                                 u32 error_code)
2268 {
2269         pfn_t pfn;
2270         int r;
2271         int level;
2272         gfn_t gfn = gpa >> PAGE_SHIFT;
2273         unsigned long mmu_seq;
2274
2275         ASSERT(vcpu);
2276         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2277
2278         r = mmu_topup_memory_caches(vcpu);
2279         if (r)
2280                 return r;
2281
2282         level = mapping_level(vcpu, gfn);
2283
2284         gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2285
2286         mmu_seq = vcpu->kvm->mmu_notifier_seq;
2287         smp_rmb();
2288         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2289         if (is_error_pfn(pfn))
2290                 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2291         spin_lock(&vcpu->kvm->mmu_lock);
2292         if (mmu_notifier_retry(vcpu, mmu_seq))
2293                 goto out_unlock;
2294         kvm_mmu_free_some_pages(vcpu);
2295         r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
2296                          level, gfn, pfn);
2297         spin_unlock(&vcpu->kvm->mmu_lock);
2298
2299         return r;
2300
2301 out_unlock:
2302         spin_unlock(&vcpu->kvm->mmu_lock);
2303         kvm_release_pfn_clean(pfn);
2304         return 0;
2305 }
2306
/* free callback of the non-paging context: release the shadow roots. */
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
2311
2312 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2313 {
2314         struct kvm_mmu *context = &vcpu->arch.mmu;
2315
2316         context->new_cr3 = nonpaging_new_cr3;
2317         context->page_fault = nonpaging_page_fault;
2318         context->gva_to_gpa = nonpaging_gva_to_gpa;
2319         context->free = nonpaging_free;
2320         context->prefetch_page = nonpaging_prefetch_page;
2321         context->sync_page = nonpaging_sync_page;
2322         context->invlpg = nonpaging_invlpg;
2323         context->root_level = 0;
2324         context->shadow_root_level = PT32E_ROOT_LEVEL;
2325         context->root_hpa = INVALID_PAGE;
2326         return 0;
2327 }
2328
2329 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2330 {
2331         ++vcpu->stat.tlb_flush;
2332         set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
2333 }
2334
2335 static void paging_new_cr3(struct kvm_vcpu *vcpu)
2336 {
2337         pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
2338         mmu_free_roots(vcpu);
2339 }
2340
2341 static void inject_page_fault(struct kvm_vcpu *vcpu,
2342                               u64 addr,
2343                               u32 err_code)
2344 {
2345         kvm_inject_page_fault(vcpu, addr, err_code);
2346 }
2347
/* free callback of the paging contexts; same teardown as non-paging. */
static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}
2352
2353 static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2354 {
2355         int bit7;
2356
2357         bit7 = (gpte >> 7) & 1;
2358         return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
2359 }
2360
2361 #define PTTYPE 64
2362 #include "paging_tmpl.h"
2363 #undef PTTYPE
2364
2365 #define PTTYPE 32
2366 #include "paging_tmpl.h"
2367 #undef PTTYPE
2368
2369 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2370 {
2371         struct kvm_mmu *context = &vcpu->arch.mmu;
2372         int maxphyaddr = cpuid_maxphyaddr(vcpu);
2373         u64 exb_bit_rsvd = 0;
2374
2375         if (!is_nx(vcpu))
2376                 exb_bit_rsvd = rsvd_bits(63, 63);
2377         switch (level) {
2378         case PT32_ROOT_LEVEL:
2379                 /* no rsvd bits for 2 level 4K page table entries */
2380                 context->rsvd_bits_mask[0][1] = 0;
2381                 context->rsvd_bits_mask[0][0] = 0;
2382                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2383
2384                 if (!is_pse(vcpu)) {
2385                         context->rsvd_bits_mask[1][1] = 0;
2386                         break;
2387                 }
2388
2389                 if (is_cpuid_PSE36())
2390                         /* 36bits PSE 4MB page */
2391                         context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2392                 else
2393                         /* 32 bits PSE 4MB page */
2394                         context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2395                 break;
2396         case PT32E_ROOT_LEVEL:
2397                 context->rsvd_bits_mask[0][2] =
2398                         rsvd_bits(maxphyaddr, 63) |
2399                         rsvd_bits(7, 8) | rsvd_bits(1, 2);      /* PDPTE */
2400                 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2401                         rsvd_bits(maxphyaddr, 62);      /* PDE */
2402                 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2403                         rsvd_bits(maxphyaddr, 62);      /* PTE */
2404                 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2405                         rsvd_bits(maxphyaddr, 62) |
2406                         rsvd_bits(13, 20);              /* large page */
2407                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2408                 break;
2409         case PT64_ROOT_LEVEL:
2410                 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2411                         rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2412                 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2413                         rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2414                 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2415                         rsvd_bits(maxphyaddr, 51);
2416                 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2417                         rsvd_bits(maxphyaddr, 51);
2418                 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2419                 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2420                         rsvd_bits(maxphyaddr, 51) |
2421                         rsvd_bits(13, 29);
2422                 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2423                         rsvd_bits(maxphyaddr, 51) |
2424                         rsvd_bits(13, 20);              /* large page */
2425                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2426                 break;
2427         }
2428 }
2429
2430 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2431 {
2432         struct kvm_mmu *context = &vcpu->arch.mmu;
2433
2434         ASSERT(is_pae(vcpu));
2435         context->new_cr3 = paging_new_cr3;
2436         context->page_fault = paging64_page_fault;
2437         context->gva_to_gpa = paging64_gva_to_gpa;
2438         context->prefetch_page = paging64_prefetch_page;
2439         context->sync_page = paging64_sync_page;
2440         context->invlpg = paging64_invlpg;
2441         context->free = paging_free;
2442         context->root_level = level;
2443         context->shadow_root_level = level;
2444         context->root_hpa = INVALID_PAGE;
2445         return 0;
2446 }
2447
2448 static int paging64_init_context(struct kvm_vcpu *vcpu)
2449 {
2450         reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2451         return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2452 }
2453
2454 static int paging32_init_context(struct kvm_vcpu *vcpu)
2455 {
2456         struct kvm_mmu *context = &vcpu->arch.mmu;
2457
2458         reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2459         context->new_cr3 = paging_new_cr3;
2460         context->page_fault = paging32_page_fault;
2461         context->gva_to_gpa = paging32_gva_to_gpa;
2462         context->free = paging_free;
2463         context->prefetch_page = paging32_prefetch_page;
2464         context->sync_page = paging32_sync_page;
2465         context->invlpg = paging32_invlpg;
2466         context->root_level = PT32_ROOT_LEVEL;
2467         context->shadow_root_level = PT32E_ROOT_LEVEL;
2468         context->root_hpa = INVALID_PAGE;
2469         return 0;
2470 }
2471
2472 static int paging32E_init_context(struct kvm_vcpu *vcpu)
2473 {
2474         reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2475         return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2476 }
2477
2478 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2479 {
2480         struct kvm_mmu *context = &vcpu->arch.mmu;
2481
2482         context->new_cr3 = nonpaging_new_cr3;
2483         context->page_fault = tdp_page_fault;
2484         context->free = nonpaging_free;
2485         context->prefetch_page = nonpaging_prefetch_page;
2486         context->sync_page = nonpaging_sync_page;
2487         context->invlpg = nonpaging_invlpg;
2488         context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2489         context->root_hpa = INVALID_PAGE;
2490
2491         if (!is_paging(vcpu)) {
2492                 context->gva_to_gpa = nonpaging_gva_to_gpa;
2493                 context->root_level = 0;
2494         } else if (is_long_mode(vcpu)) {
2495                 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2496                 context->gva_to_gpa = paging64_gva_to_gpa;
2497                 context->root_level = PT64_ROOT_LEVEL;
2498         } else if (is_pae(vcpu)) {
2499                 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2500                 context->gva_to_gpa = paging64_gva_to_gpa;
2501                 context->root_level = PT32E_ROOT_LEVEL;
2502         } else {
2503                 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2504                 context->gva_to_gpa = paging32_gva_to_gpa;
2505                 context->root_level = PT32_ROOT_LEVEL;
2506         }
2507
2508         return 0;
2509 }
2510
2511 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2512 {
2513         int r;
2514
2515         ASSERT(vcpu);
2516         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2517
2518         if (!is_paging(vcpu))
2519                 r = nonpaging_init_context(vcpu);
2520         else if (is_long_mode(vcpu))
2521                 r = paging64_init_context(vcpu);
2522         else if (is_pae(vcpu))
2523                 r = paging32E_init_context(vcpu);
2524         else
2525                 r = paging32_init_context(vcpu);
2526
2527         vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2528         vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2529
2530         return r;
2531 }
2532
2533 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2534 {
2535         vcpu->arch.update_pte.pfn = bad_pfn;
2536
2537         if (tdp_enabled)
2538                 return init_kvm_tdp_mmu(vcpu);
2539         else
2540                 return init_kvm_softmmu(vcpu);
2541 }
2542
2543 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2544 {
2545         ASSERT(vcpu);
2546         if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2547                 /* mmu.free() should set root_hpa = INVALID_PAGE */
2548                 vcpu->arch.mmu.free(vcpu);
2549 }
2550
/*
 * Rebuild the vcpu's MMU context from scratch: destroy the current one,
 * then initialize a new one.  Returns the result of init_kvm_mmu().
 */
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	int r;

	destroy_kvm_mmu(vcpu);
	r = init_kvm_mmu(vcpu);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
2557
2558 int kvm_mmu_load(struct kvm_vcpu *vcpu)
2559 {
2560         int r;
2561
2562         r = mmu_topup_memory_caches(vcpu);
2563         if (r)
2564                 goto out;
2565         r = mmu_alloc_roots(vcpu);
2566         spin_lock(&vcpu->kvm->mmu_lock);
2567         mmu_sync_roots(vcpu);
2568         spin_unlock(&vcpu->kvm->mmu_lock);
2569         if (r)
2570                 goto out;
2571         /* set_cr3() should ensure TLB has been flushed */
2572         kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2573 out:
2574         return r;
2575 }
2576 EXPORT_SYMBOL_GPL(kvm_mmu_load);
2577
/* Release the vcpu's MMU roots; thin wrapper around mmu_free_roots(). */
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
2582
2583 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2584                                   struct kvm_mmu_page *sp,
2585                                   u64 *spte)
2586 {
2587         u64 pte;
2588         struct kvm_mmu_page *child;
2589
2590         pte = *spte;
2591         if (is_shadow_present_pte(pte)) {
2592                 if (is_last_spte(pte, sp->role.level))
2593                         rmap_remove(vcpu->kvm, spte);
2594                 else {
2595                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2596                         mmu_page_remove_parent_pte(child, spte);
2597                 }
2598         }
2599         __set_spte(spte, shadow_trap_nonpresent_pte);
2600         if (is_large_pte(pte))
2601                 --vcpu->kvm->stat.lpages;
2602 }
2603
2604 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2605                                   struct kvm_mmu_page *sp,
2606                                   u64 *spte,
2607                                   const void *new)
2608 {
2609         if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2610                 ++vcpu->kvm->stat.mmu_pde_zapped;
2611                 return;
2612         }
2613
2614         ++vcpu->kvm->stat.mmu_pte_updated;
2615         if (!sp->role.cr4_pae)
2616                 paging32_update_pte(vcpu, sp, spte, new);
2617         else
2618                 paging64_update_pte(vcpu, sp, spte, new);
2619 }
2620
2621 static bool need_remote_flush(u64 old, u64 new)
2622 {
2623         if (!is_shadow_present_pte(old))
2624                 return false;
2625         if (!is_shadow_present_pte(new))
2626                 return true;
2627         if ((old ^ new) & PT64_BASE_ADDR_MASK)
2628                 return true;
2629         old ^= PT64_NX_MASK;
2630         new ^= PT64_NX_MASK;
2631         return (old & ~new & PT64_PERM_MASK) != 0;
2632 }
2633
2634 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
2635                                     bool remote_flush, bool local_flush)
2636 {
2637         if (zap_page)
2638                 return;
2639
2640         if (remote_flush)
2641                 kvm_flush_remote_tlbs(vcpu->kvm);
2642         else if (local_flush)
2643                 kvm_mmu_flush_tlb(vcpu);
2644 }
2645
2646 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2647 {
2648         u64 *spte = vcpu->arch.last_pte_updated;
2649
2650         return !!(spte && (*spte & shadow_accessed_mask));
2651 }
2652
2653 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2654                                           u64 gpte)
2655 {
2656         gfn_t gfn;
2657         pfn_t pfn;
2658
2659         if (!is_present_gpte(gpte))
2660                 return;
2661         gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2662
2663         vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2664         smp_rmb();
2665         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2666
2667         if (is_error_pfn(pfn)) {
2668                 kvm_release_pfn_clean(pfn);
2669                 return;
2670         }
2671         vcpu->arch.update_pte.gfn = gfn;
2672         vcpu->arch.update_pte.pfn = pfn;
2673 }
2674
2675 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2676 {
2677         u64 *spte = vcpu->arch.last_pte_updated;
2678
2679         if (spte
2680             && vcpu->arch.last_pte_gfn == gfn
2681             && shadow_accessed_mask
2682             && !(*spte & shadow_accessed_mask)
2683             && is_shadow_present_pte(*spte))
2684                 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
2685 }
2686
/*
 * Handle a write of @bytes bytes at guest physical address @gpa that may
 * have hit a shadowed guest page table.
 *
 * @new points at the written data; when it is NULL (or for a 4-byte write
 * while the vcpu is in PAE mode) the guest entry is re-read from guest
 * memory instead.  @guest_initiated enables the write-flood accounting.
 *
 * For every shadow page visited by for_each_gfn_indirect_valid_sp() for
 * the written gfn, the page is either zapped (misaligned write or write
 * flood) or the affected spte(s) are zapped and, if a guest entry value is
 * available, re-created through mmu_pte_write_new_pte().  TLB flushing and
 * the commit of zapped pages happen before mmu_lock is dropped.
 */
2687 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2688                        const u8 *new, int bytes,
2689                        bool guest_initiated)
2690 {
2691         gfn_t gfn = gpa >> PAGE_SHIFT;
2692         struct kvm_mmu_page *sp;
2693         struct hlist_node *node;
2694         LIST_HEAD(invalid_list);
2695         u64 entry, gentry;
2696         u64 *spte;
2697         unsigned offset = offset_in_page(gpa);
2698         unsigned pte_size;
2699         unsigned page_offset;
2700         unsigned misaligned;
2701         unsigned quadrant;
2702         int level;
2703         int flooded = 0;
2704         int npte;
2705         int r;
2706         int invlpg_counter;
2707         bool remote_flush, local_flush, zap_page;
2708
2709         zap_page = remote_flush = local_flush = false;
2710
2711         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2712
             /*
              * Snapshot the counter; it is compared again under mmu_lock and
              * gentry is discarded if it changed in the meantime.
              */
2713         invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2714
2715         /*
2716          * Assume that the pte write on a page table of the same type
2717          * as the current vcpu paging mode.  This is nearly always true
2718          * (might be false while changing modes).  Note it is verified later
2719          * by update_pte().
2720          */
2721         if ((is_pae(vcpu) && bytes == 4) || !new) {
2722                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2723                 if (is_pae(vcpu)) {
2724                         gpa &= ~(gpa_t)7;
2725                         bytes = 8;
2726                 }
2727                 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2728                 if (r)
2729                         gentry = 0;
2730                 new = (const u8 *)&gentry;
2731         }
2732
             /* Only 4- and 8-byte writes yield a usable guest entry value. */
2733         switch (bytes) {
2734         case 4:
2735                 gentry = *(const u32 *)new;
2736                 break;
2737         case 8:
2738                 gentry = *(const u64 *)new;
2739                 break;
2740         default:
2741                 gentry = 0;
2742                 break;
2743         }
2744
2745         mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2746         spin_lock(&vcpu->kvm->mmu_lock);
2747         if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2748                 gentry = 0;
2749         kvm_mmu_access_page(vcpu, gfn);
2750         kvm_mmu_free_some_pages(vcpu);
2751         ++vcpu->kvm->stat.mmu_pte_write;
2752         kvm_mmu_audit(vcpu, "pre pte write");
             /*
              * Flood detection: the third consecutive guest write to the same
              * gfn, with the last updated pte still not accessed, marks the
              * write as flooded.  Any other write restarts the count.
              */
2753         if (guest_initiated) {
2754                 if (gfn == vcpu->arch.last_pt_write_gfn
2755                     && !last_updated_pte_accessed(vcpu)) {
2756                         ++vcpu->arch.last_pt_write_count;
2757                         if (vcpu->arch.last_pt_write_count >= 3)
2758                                 flooded = 1;
2759                 } else {
2760                         vcpu->arch.last_pt_write_gfn = gfn;
2761                         vcpu->arch.last_pt_write_count = 1;
2762                         vcpu->arch.last_pte_updated = NULL;
2763                 }
2764         }
2765
2766         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
                     /*
                      * NOTE(review): offset was taken from the original gpa.  When
                      * the PAE path above rounded gpa down and widened bytes to 8,
                      * a 4-byte write to the upper half of a gpte (offset % 8 == 4)
                      * evaluates as misaligned here -- confirm this is intended.
                      */
2767                 pte_size = sp->role.cr4_pae ? 8 : 4;
2768                 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2769                 misaligned |= bytes < 4;
2770                 if (misaligned || flooded) {
2771                         /*
2772                          * Misaligned accesses are too much trouble to fix
2773                          * up; also, they usually indicate a page is not used
2774                          * as a page table.
2775                          *
2776                          * If we're seeing too many writes to a page,
2777                          * it may no longer be a page table, or we may be
2778                          * forking, in which case it is better to unmap the
2779                          * page.
2780                          */
2781                         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2782                                  gpa, bytes, sp->role.word);
2783                         zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2784                                                      &invalid_list);
2785                         ++vcpu->kvm->stat.mmu_flooded;
2786                         continue;
2787                 }
2788                 page_offset = offset;
2789                 level = sp->role.level;
2790                 npte = 1;
2791                 if (!sp->role.cr4_pae) {
2792                         page_offset <<= 1;      /* 32->64 */
2793                         /*
2794                          * A 32-bit pde maps 4MB while the shadow pdes map
2795                          * only 2MB.  So we need to double the offset again
2796                          * and zap two pdes instead of one.
2797                          */
2798                         if (level == PT32_ROOT_LEVEL) {
2799                                 page_offset &= ~7; /* kill rounding error */
2800                                 page_offset <<= 1;
2801                                 npte = 2;
2802                         }
                             /* Skip shadow pages that cover a different quadrant. */
2803                         quadrant = page_offset >> PAGE_SHIFT;
2804                         page_offset &= ~PAGE_MASK;
2805                         if (quadrant != sp->role.quadrant)
2806                                 continue;
2807                 }
2808                 local_flush = true;
2809                 spte = &sp->spt[page_offset / sizeof(*spte)];
                     /* Zap each affected spte, then rebuild it if gentry is usable. */
2810                 while (npte--) {
2811                         entry = *spte;
2812                         mmu_pte_write_zap_pte(vcpu, sp, spte);
2813                         if (gentry)
2814                                 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2815                         if (!remote_flush && need_remote_flush(entry, *spte))
2816                                 remote_flush = true;
2817                         ++spte;
2818                 }
2819         }
2820         mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2821         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2822         kvm_mmu_audit(vcpu, "post pte write");
2823         spin_unlock(&vcpu->kvm->mmu_lock);
             /* Release any pfn still recorded in update_pte and reset it. */
2824         if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2825                 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2826                 vcpu->arch.update_pte.pfn = bad_pfn;
2827         }
2828 }
2829
2830 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2831 {
2832         gpa_t gpa;
2833         int r;
2834
2835         if (tdp_enabled)
2836                 return 0;
2837
2838         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2839
2840         spin_lock(&vcpu->kvm->mmu_lock);
2841         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2842         spin_unlock(&vcpu->kvm->mmu_lock);
2843         return r;
2844 }
2845 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2846
2847 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2848 {
2849         int free_pages;
2850         LIST_HEAD(invalid_list);
2851
2852         free_pages = vcpu->kvm->arch.n_free_mmu_pages;
2853         while (free_pages < KVM_REFILL_PAGES &&
2854                !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2855                 struct kvm_mmu_page *sp;
2856
2857                 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2858                                   struct kvm_mmu_page, link);
2859                 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2860                                                        &invalid_list);
2861                 ++vcpu->kvm->stat.mmu_recycled;
2862         }
2863         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2864 }
2865
2866 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2867 {
2868         int r;
2869         enum emulation_result er;
2870
2871         r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
2872         if (r < 0)
2873                 goto out;
2874
2875         if (!r) {
2876                 r = 1;
2877                 goto out;
2878         }
2879
2880         r = mmu_topup_memory_caches(vcpu);
2881         if (r)
2882                 goto out;
2883
2884         er = emulate_instruction(vcpu, cr2, error_code, 0);
2885
2886         switch (er) {
2887         case EMULATE_DONE:
2888                 return 1;
2889         case EMULATE_DO_MMIO:
2890                 ++vcpu->stat.mmio_exits;
2891                 /* fall through */
2892         case EMULATE_FAIL:
2893                 return 0;
2894         default:
2895                 BUG();
2896         }
2897 out:
2898         return r;
2899 }
2900 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
2901
2902 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2903 {
2904         vcpu->arch.mmu.invlpg(vcpu, gva);
2905         kvm_mmu_flush_tlb(vcpu);
2906         ++vcpu->stat.invlpg;
2907 }
2908 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2909
2910 void kvm_enable_tdp(void)
2911 {
2912         tdp_enabled = true;
2913 }
2914 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
2915
2916 void kvm_disable_tdp(void)
2917 {
2918         tdp_enabled = false;
2919 }
2920 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2921
2922 static void free_mmu_pages(struct kvm_vcpu *vcpu)
2923 {
2924         free_page((unsigned long)vcpu->arch.mmu.pae_root);
2925 }
2926
2927 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2928 {
2929         struct page *page;
2930         int i;
2931
2932         ASSERT(vcpu);
2933
2934         /*
2935          * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2936          * Therefore we need to allocate shadow page tables in the first
2937          * 4GB of memory, which happens to fit the DMA32 zone.
2938          */
2939         page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2940         if (!page)
2941                 return -ENOMEM;
2942
2943         vcpu->arch.mmu.pae_root = page_address(page);
2944         for (i = 0; i < 4; ++i)
2945                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2946
2947         return 0;
2948 }
2949
2950 int kvm_mmu_create(struct kvm_vcpu *vcpu)
2951 {
2952         ASSERT(vcpu);
2953         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2954
2955         return alloc_mmu_pages(vcpu);
2956 }
2957
2958 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
2959 {
2960         ASSERT(vcpu);
2961         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2962
2963         return init_kvm_mmu(vcpu);
2964 }
2965
/*
 * Release all per-vcpu MMU state: the MMU context first, then the pae_root
 * page, then the memory caches.
 */
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);

	free_mmu_pages(vcpu);

	mmu_free_memory_caches(vcpu);
}
2974
2975 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2976 {
2977         struct kvm_mmu_page *sp;
2978
2979         list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2980                 int i;
2981                 u64 *pt;
2982
2983                 if (!test_bit(slot, sp->slot_bitmap))
2984                         continue;
2985
2986                 pt = sp->spt;
2987                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2988                         /* avoid RMW */
2989                         if (is_writable_pte(pt[i]))
2990                                 pt[i] &= ~PT_WRITABLE_MASK;
2991         }
2992         kvm_flush_remote_tlbs(kvm);
2993 }
2994
2995 void kvm_mmu_zap_all(struct kvm *kvm)
2996 {
2997         struct kvm_mmu_page *sp, *node;
2998         LIST_HEAD(invalid_list);
2999
3000         spin_lock(&kvm->mmu_lock);
3001 restart:
3002         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3003                 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3004                         goto restart;
3005
3006         kvm_mmu_commit_zap_page(kvm, &invalid_list);
3007         spin_unlock(&kvm->mmu_lock);
3008 }
3009
3010 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3011                                                struct list_head *invalid_list)
3012 {
3013         struct kvm_mmu_page *page;
3014
3015         page = container_of(kvm->arch.active_mmu_pages.prev,
3016                             struct kvm_mmu_page, link);
3017         return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3018 }
3019
3020 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3021 {
3022         struct kvm *kvm;
3023         struct kvm *kvm_freed = NULL;
3024         int cache_count = 0;
3025
3026         spin_lock(&kvm_lock);
3027
3028         list_for_each_entry(kvm, &vm_list, vm_list) {
3029                 int npages, idx, freed_pages;
3030                 LIST_HEAD(invalid_list);
3031
3032                 idx = srcu_read_lock(&kvm->srcu);
3033                 spin_lock(&kvm->mmu_lock);
3034                 npages = kvm->arch.n_alloc_mmu_pages -
3035                          kvm->arch.n_free_mmu_pages;
3036                 cache_count += npages;
3037                 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3038                         freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3039                                                           &invalid_list);
3040                         cache_count -= freed_pages;
3041                         kvm_freed = kvm;
3042                 }
3043                 nr_to_scan--;
3044
3045                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3046                 spin_unlock(&kvm->mmu_lock);
3047                 srcu_read_unlock(&kvm->srcu, idx);
3048         }
3049         if (kvm_freed)
3050                 list_move_tail(&kvm_freed->vm_list, &vm_list);
3051
3052         spin_unlock(&kvm_lock);
3053
3054         return cache_count;
3055 }
3056
3057 static struct shrinker mmu_shrinker = {
3058         .shrink = mmu_shrink,
3059         .seeks = DEFAULT_SEEKS * 10,
3060 };
3061
3062 static void mmu_destroy_caches(void)
3063 {
3064         if (pte_chain_cache)
3065                 kmem_cache_destroy(pte_chain_cache);
3066         if (rmap_desc_cache)
3067                 kmem_cache_destroy(rmap_desc_cache);
3068         if (mmu_page_header_cache)
3069                 kmem_cache_destroy(mmu_page_header_cache);
3070 }
3071
3072 void kvm_mmu_module_exit(void)
3073 {
3074         mmu_destroy_caches();
3075         unregister_shrinker(&mmu_shrinker);
3076 }
3077
3078 int kvm_mmu_module_init(void)
3079 {
3080         pte_chain_cache = kmem_cache_create("kvm_pte_chain",
3081                                             sizeof(struct kvm_pte_chain),
3082                                             0, 0, NULL);
3083         if (!pte_chain_cache)
3084                 goto nomem;
3085         rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3086                                             sizeof(struct kvm_rmap_desc),
3087                                             0, 0, NULL);
3088         if (!rmap_desc_cache)
3089                 goto nomem;
3090
3091         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
3092                                                   sizeof(struct kvm_mmu_page),
3093                                                   0, 0, NULL);
3094         if (!mmu_page_header_cache)
3095                 goto nomem;
3096
3097         register_shrinker(&mmu_shrinker);
3098
3099         return 0;
3100
3101 nomem:
3102         mmu_destroy_caches();
3103         return -ENOMEM;
3104 }
3105
3106 /*
3107  * Caculate mmu pages needed for kvm.
3108  */
3109 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3110 {
3111         int i;
3112         unsigned int nr_mmu_pages;
3113         unsigned int  nr_pages = 0;
3114         struct kvm_memslots *slots;
3115
3116         slots = kvm_memslots(kvm);
3117
3118         for (i = 0; i < slots->nmemslots; i++)
3119                 nr_pages += slots->memslots[i].npages;
3120
3121         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3122         nr_mmu_pages = max(nr_mmu_pages,
3123                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3124
3125         return nr_mmu_pages;
3126 }
3127
3128 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3129                                 unsigned len)
3130 {
3131         if (len > buffer->len)
3132                 return NULL;
3133         return buffer->ptr;
3134 }
3135
3136 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3137                                 unsigned len)
3138 {
3139         void *ret;
3140
3141         ret = pv_mmu_peek_buffer(buffer, len);
3142         if (!ret)
3143                 return ret;
3144         buffer->ptr += len;
3145         buffer->len -= len;
3146         buffer->processed += len;
3147         return ret;
3148 }
3149
3150 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3151                              gpa_t addr, gpa_t value)
3152 {
3153         int bytes = 8;
3154         int r;
3155
3156         if (!is_long_mode(vcpu) && !is_pae(vcpu))
3157                 bytes = 4;
3158
3159         r = mmu_topup_memory_caches(vcpu);
3160         if (r)
3161                 return r;
3162
3163         if (!emulator_write_phys(vcpu, addr, &value, bytes))
3164                 return -EFAULT;
3165
3166         return 1;
3167 }
3168
3169 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3170 {
3171         (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3172         return 1;
3173 }
3174
3175 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
3176 {
3177         spin_lock(&vcpu->kvm->mmu_lock);
3178         mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
3179         spin_unlock(&vcpu->kvm->mmu_lock);
3180         return 1;
3181 }
3182
3183 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
3184                              struct kvm_pv_mmu_op_buffer *buffer)
3185 {
3186         struct kvm_mmu_op_header *header;
3187
3188         header = pv_mmu_peek_buffer(buffer, sizeof *header);
3189         if (!header)
3190                 return 0;
3191         switch (header->op) {
3192         case KVM_MMU_OP_WRITE_PTE: {
3193                 struct kvm_mmu_op_write_pte *wpte;
3194
3195                 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
3196                 if (!wpte)
3197                         return 0;
3198                 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
3199                                         wpte->pte_val);
3200         }
3201         case KVM_MMU_OP_FLUSH_TLB: {
3202                 struct kvm_mmu_op_flush_tlb *ftlb;
3203
3204                 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
3205                 if (!ftlb)
3206                         return 0;
3207                 return kvm_pv_mmu_flush_tlb(vcpu);
3208         }
3209         case KVM_MMU_OP_RELEASE_PT: {
3210                 struct kvm_mmu_op_release_pt *rpt;
3211
3212                 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
3213                 if (!rpt)
3214                         return 0;
3215                 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
3216         }
3217         default: return 0;
3218         }
3219 }
3220
3221 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
3222                   gpa_t addr, unsigned long *ret)
3223 {
3224         int r;
3225         struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3226
3227         buffer->ptr = buffer->buf;
3228         buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
3229         buffer->processed = 0;
3230
3231         r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3232         if (r)
3233                 goto out;
3234
3235         while (buffer->len) {
3236                 r = kvm_pv_mmu_op_one(vcpu, buffer);
3237                 if (r < 0)
3238                         goto out;
3239                 if (r == 0)
3240                         break;
3241         }
3242
3243         r = 1;
3244 out:
3245         *ret = buffer->processed;
3246         return r;
3247 }
3248
3249 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3250 {
3251         struct kvm_shadow_walk_iterator iterator;
3252         int nr_sptes = 0;
3253
3254         spin_lock(&vcpu->kvm->mmu_lock);
3255         for_each_shadow_entry(vcpu, addr, iterator) {
3256                 sptes[iterator.level-1] = *iterator.sptep;
3257                 nr_sptes++;
3258                 if (!is_shadow_present_pte(*iterator.sptep))
3259                         break;
3260         }
3261         spin_unlock(&vcpu->kvm->mmu_lock);
3262
3263         return nr_sptes;
3264 }
3265 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3266
3267 #ifdef AUDIT
3268
/*
 * Tag string printed with the audit diagnostics in this file.
 * NOTE(review): it is assigned outside the part of the file visible here.
 */
3269 static const char *audit_msg;
3270
3271 static gva_t canonicalize(gva_t gva)
3272 {
3273 #ifdef CONFIG_X86_64
3274         gva = (long long)(gva << 16) >> 16;
3275 #endif
3276         return gva;
3277 }
3278
3279
/* Callback type invoked on each last-level spte by the spte walkers below. */
3280 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3281
3282 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3283                             inspect_spte_fn fn)
3284 {
3285         int i;
3286
3287         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3288                 u64 ent = sp->spt[i];
3289
3290                 if (is_shadow_present_pte(ent)) {
3291                         if (!is_last_spte(ent, sp->role.level)) {
3292                                 struct kvm_mmu_page *child;
3293                                 child = page_header(ent & PT64_BASE_ADDR_MASK);
3294                                 __mmu_spte_walk(kvm, child, fn);
3295                         } else
3296                                 fn(kvm, &sp->spt[i]);
3297                 }
3298         }
3299 }
3300
3301 static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3302 {
3303         int i;
3304         struct kvm_mmu_page *sp;
3305
3306         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3307                 return;
3308         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3309                 hpa_t root = vcpu->arch.mmu.root_hpa;
3310                 sp = page_header(root);
3311                 __mmu_spte_walk(vcpu->kvm, sp, fn);
3312                 return;
3313         }
3314         for (i = 0; i < 4; ++i) {
3315                 hpa_t root = vcpu->arch.mmu.pae_root[i];
3316
3317                 if (root && VALID_PAGE(root)) {
3318                         root &= PT64_BASE_ADDR_MASK;
3319                         sp = page_header(root);
3320                         __mmu_spte_walk(vcpu->kvm, sp, fn);
3321                 }
3322         }
3323         return;
3324 }
3325
3326 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3327                                 gva_t va, int level)
3328 {
3329         u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3330         int i;
3331         gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3332
3333         for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3334                 u64 ent = pt[i];
3335
3336                 if (ent == shadow_trap_nonpresent_pte)
3337                         continue;
3338
3339                 va = canonicalize(va);
3340                 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3341                         audit_mappings_page(vcpu, ent, va, level - 1);
3342                 else {
3343                         gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3344                         gfn_t gfn = gpa >> PAGE_SHIFT;
3345                         pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3346                         hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3347
3348                         if (is_error_pfn(pfn)) {
3349                                 kvm_release_pfn_clean(pfn);
3350                                 continue;
3351                         }
3352
3353                         if (is_shadow_present_pte(ent)
3354                             && (ent & PT64_BASE_ADDR_MASK) != hpa)
3355                                 printk(KERN_ERR "xx audit error: (%s) levels %d"
3356                                        " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3357                                        audit_msg, vcpu->arch.mmu.root_level,
3358                                        va, gpa, hpa, ent,
3359                                        is_shadow_present_pte(ent));
3360                         else if (ent == shadow_notrap_nonpresent_pte
3361                                  && !is_error_hpa(hpa))
3362                                 printk(KERN_ERR "audit: (%s) notrap shadow,"
3363                                        " valid guest gva %lx\n", audit_msg, va);
3364                         kvm_release_pfn_clean(pfn);
3365
3366                 }
3367         }
3368 }
3369
3370 static void audit_mappings(struct kvm_vcpu *vcpu)
3371 {
3372         unsigned i;
3373
3374         if (vcpu->arch.mmu.root_level == 4)
3375                 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3376         else
3377                 for (i = 0; i < 4; ++i)
3378                         if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3379                                 audit_mappings_page(vcpu,
3380                                                     vcpu->arch.mmu.pae_root[i],
3381                                                     i << 30,
3382                                                     2);
3383 }
3384
3385 static int count_rmaps(struct kvm_vcpu *vcpu)
3386 {
3387         struct kvm *kvm = vcpu->kvm;
3388         struct kvm_memslots *slots;
3389         int nmaps = 0;
3390         int i, j, k, idx;
3391
3392         idx = srcu_read_lock(&kvm->srcu);
3393         slots = kvm_memslots(kvm);
3394         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3395                 struct kvm_memory_slot *m = &slots->memslots[i];
3396                 struct kvm_rmap_desc *d;
3397
3398                 for (j = 0; j < m->npages; ++j) {
3399                         unsigned long *rmapp = &m->rmap[j];
3400
3401                         if (!*rmapp)
3402                                 continue;
3403                         if (!(*rmapp & 1)) {
3404                                 ++nmaps;
3405                                 continue;
3406                         }
3407                         d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3408                         while (d) {
3409                                 for (k = 0; k < RMAP_EXT; ++k)
3410                                         if (d->sptes[k])
3411                                                 ++nmaps;
3412                                         else
3413                                                 break;
3414                                 d = d->more;
3415                         }
3416                 }
3417         }
3418         srcu_read_unlock(&kvm->srcu, idx);
3419         return nmaps;
3420 }
3421
3422 void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3423 {
3424         unsigned long *rmapp;
3425         struct kvm_mmu_page *rev_sp;
3426         gfn_t gfn;
3427
3428         if (is_writable_pte(*sptep)) {
3429                 rev_sp = page_header(__pa(sptep));
3430                 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3431
3432                 if (!gfn_to_memslot(kvm, gfn)) {
3433                         if (!printk_ratelimit())
3434                                 return;
3435                         printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3436                                          audit_msg, gfn);
3437                         printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3438                                audit_msg, (long int)(sptep - rev_sp->spt),
3439                                         rev_sp->gfn);
3440                         dump_stack();
3441                         return;
3442                 }
3443
3444                 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3445                 if (!*rmapp) {
3446                         if (!printk_ratelimit())
3447                                 return;
3448                         printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3449                                          audit_msg, *sptep);
3450                         dump_stack();
3451                 }
3452         }
3453
3454 }
3455
3456 void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3457 {
3458         mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3459 }
3460
3461 static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3462 {
3463         struct kvm_mmu_page *sp;
3464         int i;
3465
3466         list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3467                 u64 *pt = sp->spt;
3468
3469                 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3470                         continue;
3471
3472                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3473                         u64 ent = pt[i];
3474
3475                         if (!(ent & PT_PRESENT_MASK))
3476                                 continue;
3477                         if (!is_writable_pte(ent))
3478                                 continue;
3479                         inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3480                 }
3481         }
3482         return;
3483 }
3484
/*
 * rmap audit entry point: run the writable-mapping rmap check, then
 * count_rmaps().
 */
static void audit_rmap(struct kvm_vcpu *vcpu)
{
        check_writable_mappings_rmap(vcpu);
        /* The count returned by count_rmaps() is not consumed here. */
        (void)count_rmaps(vcpu);
}
3490
3491 static void audit_write_protection(struct kvm_vcpu *vcpu)
3492 {
3493         struct kvm_mmu_page *sp;
3494         struct kvm_memory_slot *slot;
3495         unsigned long *rmapp;
3496         u64 *spte;
3497         gfn_t gfn;
3498
3499         list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3500                 if (sp->role.direct)
3501                         continue;
3502                 if (sp->unsync)
3503                         continue;
3504
3505                 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3506                 rmapp = &slot->rmap[gfn - slot->base_gfn];
3507
3508                 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3509                 while (spte) {
3510                         if (is_writable_pte(*spte))
3511                                 printk(KERN_ERR "%s: (%s) shadow page has "
3512                                 "writable mappings: gfn %lx role %x\n",
3513                                __func__, audit_msg, sp->gfn,
3514                                sp->role.word);
3515                         spte = rmap_next(vcpu->kvm, rmapp, spte);
3516                 }
3517         }
3518 }
3519
3520 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3521 {
3522         int olddbg = dbg;
3523
3524         dbg = 0;
3525         audit_msg = msg;
3526         audit_rmap(vcpu);
3527         audit_write_protection(vcpu);
3528         if (strcmp("pre pte write", audit_msg) != 0)
3529                 audit_mappings(vcpu);
3530         audit_writable_sptes_have_rmaps(vcpu);
3531         dbg = olddbg;
3532 }
3533
3534 #endif