iommu/vt-d: Use memunmap to free memremap
drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
38 #include <linux/io.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/dma-direct.h>
49 #include <linux/crash_dump.h>
50 #include <asm/irq_remapping.h>
51 #include <asm/cacheflush.h>
52 #include <asm/iommu.h>
53
54 #include "irq_remapping.h"
55 #include "intel-pasid.h"
56
57 #define ROOT_SIZE               VTD_PAGE_SIZE
58 #define CONTEXT_SIZE            VTD_PAGE_SIZE
59
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
64
65 #define IOAPIC_RANGE_START      (0xfee00000)
66 #define IOAPIC_RANGE_END        (0xfeefffff)
67 #define IOVA_START_ADDR         (0x1000)
68
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
70
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
73
74 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
76
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
80                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
82
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN          (1)
85
86 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
87
88 /* page table handling */
89 #define LEVEL_STRIDE            (9)
90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
91
92 /*
93  * This bitmap is used to advertise the page sizes our hardware supports
94  * to the IOMMU core, which will then use this information to split
95  * physically contiguous memory regions it is mapping into page sizes
96  * that we support.
97  *
98  * Traditionally the IOMMU core just handed us the mappings directly,
99  * after making sure the size is an order of a 4KiB page and that the
100  * mapping has natural alignment.
101  *
102  * To retain this behavior, we currently advertise that we support
103  * all page sizes that are an order of 4KiB.
104  *
105  * If at some point we'd like to utilize the IOMMU core's new behavior,
106  * we could change this to advertise the real page sizes we support.
107  */
108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
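/* Note (editorial): ~0xFFFUL leaves bits 0-11 clear and sets every bit from
 * 12 upwards, i.e. it advertises every power-of-two page size of at least
 * 4KiB, matching the behaviour described above. */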
109
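/* AGAW (adjusted guest address width) helpers: each page-table level covers
 * LEVEL_STRIDE (9) bits on top of the 12-bit page offset, starting from a
 * 30-bit width at agaw 0. For example, a 48-bit width gives agaw 2 and a
 * 4-level table; the default 57-bit width gives agaw 3 and 5 levels. */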
110 static inline int agaw_to_level(int agaw)
111 {
112         return agaw + 2;
113 }
114
115 static inline int agaw_to_width(int agaw)
116 {
117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 }
119
120 static inline int width_to_agaw(int width)
121 {
122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 }
124
125 static inline unsigned int level_to_offset_bits(int level)
126 {
127         return (level - 1) * LEVEL_STRIDE;
128 }
129
130 static inline int pfn_level_offset(unsigned long pfn, int level)
131 {
132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 }
134
135 static inline unsigned long level_mask(int level)
136 {
137         return -1UL << level_to_offset_bits(level);
138 }
139
140 static inline unsigned long level_size(int level)
141 {
142         return 1UL << level_to_offset_bits(level);
143 }
144
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
146 {
147         return (pfn + level_size(level) - 1) & level_mask(level);
148 }
149
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
151 {
152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
153 }
154
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156    are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
158 {
159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
163 {
164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
165 }
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
167 {
168         return mm_to_dma_pfn(page_to_pfn(pg));
169 }
170 static inline unsigned long virt_to_dma_pfn(void *p)
171 {
172         return page_to_dma_pfn(virt_to_page(p));
173 }
174
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
177
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
180
181 /*
182  * set to 1 to panic kernel if can't successfully enable VT-d
183  * (used when kernel is launched w/ TXT)
184  */
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
187
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
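/* One 4KiB root table page holds 256 16-byte root entries, one per PCI bus. */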
189
190 /*
191  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
192  * if marked present.
193  */
194 static phys_addr_t root_entry_lctp(struct root_entry *re)
195 {
196         if (!(re->lo & 1))
197                 return 0;
198
199         return re->lo & VTD_PAGE_MASK;
200 }
201
202 /*
203  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
204  * if marked present.
205  */
206 static phys_addr_t root_entry_uctp(struct root_entry *re)
207 {
208         if (!(re->hi & 1))
209                 return 0;
210
211         return re->hi & VTD_PAGE_MASK;
212 }
213
214 static inline void context_clear_pasid_enable(struct context_entry *context)
215 {
216         context->lo &= ~(1ULL << 11);
217 }
218
219 static inline bool context_pasid_enabled(struct context_entry *context)
220 {
221         return !!(context->lo & (1ULL << 11));
222 }
223
224 static inline void context_set_copied(struct context_entry *context)
225 {
226         context->hi |= (1ull << 3);
227 }
228
229 static inline bool context_copied(struct context_entry *context)
230 {
231         return !!(context->hi & (1ULL << 3));
232 }
233
234 static inline bool __context_present(struct context_entry *context)
235 {
236         return (context->lo & 1);
237 }
238
239 bool context_present(struct context_entry *context)
240 {
241         return context_pasid_enabled(context) ?
242              __context_present(context) :
243              __context_present(context) && !context_copied(context);
244 }
245
246 static inline void context_set_present(struct context_entry *context)
247 {
248         context->lo |= 1;
249 }
250
251 static inline void context_set_fault_enable(struct context_entry *context)
252 {
253         context->lo &= (((u64)-1) << 2) | 1;
254 }
255
256 static inline void context_set_translation_type(struct context_entry *context,
257                                                 unsigned long value)
258 {
259         context->lo &= (((u64)-1) << 4) | 3;
260         context->lo |= (value & 3) << 2;
261 }
262
263 static inline void context_set_address_root(struct context_entry *context,
264                                             unsigned long value)
265 {
266         context->lo &= ~VTD_PAGE_MASK;
267         context->lo |= value & VTD_PAGE_MASK;
268 }
269
270 static inline void context_set_address_width(struct context_entry *context,
271                                              unsigned long value)
272 {
273         context->hi |= value & 7;
274 }
275
276 static inline void context_set_domain_id(struct context_entry *context,
277                                          unsigned long value)
278 {
279         context->hi |= (value & ((1 << 16) - 1)) << 8;
280 }
281
282 static inline int context_domain_id(struct context_entry *c)
283 {
284         return((c->hi >> 8) & 0xffff);
285 }
286
287 static inline void context_clear_entry(struct context_entry *context)
288 {
289         context->lo = 0;
290         context->hi = 0;
291 }
292
293 /*
294  * 0: readable
295  * 1: writable
296  * 2-6: reserved
297  * 7: super page
298  * 8-10: available
299  * 11: snoop behavior
300  * 12-63: Host physical address
301  */
302 struct dma_pte {
303         u64 val;
304 };
305
306 static inline void dma_clear_pte(struct dma_pte *pte)
307 {
308         pte->val = 0;
309 }
310
311 static inline u64 dma_pte_addr(struct dma_pte *pte)
312 {
313 #ifdef CONFIG_64BIT
314         return pte->val & VTD_PAGE_MASK;
315 #else
316         /* Must have a full atomic 64-bit read */
317         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
318 #endif
319 }
320
321 static inline bool dma_pte_present(struct dma_pte *pte)
322 {
323         return (pte->val & 3) != 0;
324 }
325
326 static inline bool dma_pte_superpage(struct dma_pte *pte)
327 {
328         return (pte->val & DMA_PTE_LARGE_PAGE);
329 }
330
331 static inline int first_pte_in_page(struct dma_pte *pte)
332 {
333         return !((unsigned long)pte & ~VTD_PAGE_MASK);
334 }
335
336 /*
337  * This domain is a static identity mapping domain.
338  *      1. This domain creates a static 1:1 mapping of all usable memory.
339  *      2. It maps to each iommu if successful.
340  *      3. Each iommu maps to this domain if successful.
341  */
342 static struct dmar_domain *si_domain;
343 static int hw_pass_through = 1;
344
345 /*
346  * Domain represents a virtual machine; more than one device
347  * across iommus may be owned by one domain, e.g. a kvm guest.
348  */
349 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
350
351 /* si_domain contains multiple devices */
352 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
353
354 #define for_each_domain_iommu(idx, domain)                      \
355         for (idx = 0; idx < g_num_of_iommus; idx++)             \
356                 if (domain->iommu_refcnt[idx])
357
358 struct dmar_rmrr_unit {
359         struct list_head list;          /* list of rmrr units   */
360         struct acpi_dmar_header *hdr;   /* ACPI header          */
361         u64     base_address;           /* reserved base address*/
362         u64     end_address;            /* reserved end address */
363         struct dmar_dev_scope *devices; /* target devices */
364         int     devices_cnt;            /* target device count */
365         struct iommu_resv_region *resv; /* reserved region handle */
366 };
367
368 struct dmar_atsr_unit {
369         struct list_head list;          /* list of ATSR units */
370         struct acpi_dmar_header *hdr;   /* ACPI header */
371         struct dmar_dev_scope *devices; /* target devices */
372         int devices_cnt;                /* target device count */
373         u8 include_all:1;               /* include all ports */
374 };
375
376 static LIST_HEAD(dmar_atsr_units);
377 static LIST_HEAD(dmar_rmrr_units);
378
379 #define for_each_rmrr_units(rmrr) \
380         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
381
382 /* bitmap for indexing intel_iommus */
383 static int g_num_of_iommus;
384
385 static void domain_exit(struct dmar_domain *domain);
386 static void domain_remove_dev_info(struct dmar_domain *domain);
387 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
388                                      struct device *dev);
389 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
390 static void domain_context_clear(struct intel_iommu *iommu,
391                                  struct device *dev);
392 static int domain_detach_iommu(struct dmar_domain *domain,
393                                struct intel_iommu *iommu);
394
395 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
396 int dmar_disabled = 0;
397 #else
398 int dmar_disabled = 1;
399 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
400
401 int intel_iommu_enabled = 0;
402 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
403
404 static int dmar_map_gfx = 1;
405 static int dmar_forcedac;
406 static int intel_iommu_strict;
407 static int intel_iommu_superpage = 1;
408 static int intel_iommu_ecs = 1;
409 static int intel_iommu_pasid28;
410 static int iommu_identity_mapping;
411
412 #define IDENTMAP_ALL            1
413 #define IDENTMAP_GFX            2
414 #define IDENTMAP_AZALIA         4
415
416 /* Broadwell and Skylake have broken ECS support — normal so-called "second
417  * level" translation of DMA requests-without-PASID doesn't actually happen
418  * unless you also set the NESTE bit in an extended context-entry. Which of
419  * course means that SVM doesn't work because it's trying to do nested
420  * translation of the physical addresses it finds in the process page tables,
421  * through the IOVA->phys mapping found in the "second level" page tables.
422  *
423  * The VT-d specification was retroactively changed to change the definition
424  * of the capability bits and pretend that Broadwell/Skylake never happened...
425  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
426  * for some reason it was the PASID capability bit which was redefined (from
427  * bit 28 on BDW/SKL to bit 40 in future).
428  *
429  * So our test for ECS needs to eschew those implementations which set the old
430  * PASID capability bit 28, since those are the ones on which ECS is broken.
431  * Unless we are working around the 'pasid28' limitations, that is, by putting
432  * the device into passthrough mode for normal DMA and thus masking the bug.
433  */
434 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
435                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
436 /* PASID support is thus enabled if ECS is enabled and *either* of the old
437  * or new capability bits is set. */
438 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
439                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
440
441 int intel_iommu_gfx_mapped;
442 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
443
444 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
445 static DEFINE_SPINLOCK(device_domain_lock);
446 static LIST_HEAD(device_domain_list);
447
448 /*
449  * Iterate over elements in device_domain_list and call the specified
450  * callback @fn against each element. This helper should only be used
451  * in a context where the device_domain_lock is already held.
452  */
453 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
454                                      void *data), void *data)
455 {
456         int ret = 0;
457         struct device_domain_info *info;
458
459         assert_spin_locked(&device_domain_lock);
460         list_for_each_entry(info, &device_domain_list, global) {
461                 ret = fn(info, data);
462                 if (ret)
463                         return ret;
464         }
465
466         return 0;
467 }
468
469 const struct iommu_ops intel_iommu_ops;
470
471 static bool translation_pre_enabled(struct intel_iommu *iommu)
472 {
473         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
474 }
475
476 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
477 {
478         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
479 }
480
481 static void init_translation_status(struct intel_iommu *iommu)
482 {
483         u32 gsts;
484
485         gsts = readl(iommu->reg + DMAR_GSTS_REG);
486         if (gsts & DMA_GSTS_TES)
487                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
488 }
489
490 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
491 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
492 {
493         return container_of(dom, struct dmar_domain, domain);
494 }
495
496 static int __init intel_iommu_setup(char *str)
497 {
498         if (!str)
499                 return -EINVAL;
500         while (*str) {
501                 if (!strncmp(str, "on", 2)) {
502                         dmar_disabled = 0;
503                         pr_info("IOMMU enabled\n");
504                 } else if (!strncmp(str, "off", 3)) {
505                         dmar_disabled = 1;
506                         pr_info("IOMMU disabled\n");
507                 } else if (!strncmp(str, "igfx_off", 8)) {
508                         dmar_map_gfx = 0;
509                         pr_info("Disable GFX device mapping\n");
510                 } else if (!strncmp(str, "forcedac", 8)) {
511                         pr_info("Forcing DAC for PCI devices\n");
512                         dmar_forcedac = 1;
513                 } else if (!strncmp(str, "strict", 6)) {
514                         pr_info("Disable batched IOTLB flush\n");
515                         intel_iommu_strict = 1;
516                 } else if (!strncmp(str, "sp_off", 6)) {
517                         pr_info("Disable supported super page\n");
518                         intel_iommu_superpage = 0;
519                 } else if (!strncmp(str, "ecs_off", 7)) {
520                         printk(KERN_INFO
521                                 "Intel-IOMMU: disable extended context table support\n");
522                         intel_iommu_ecs = 0;
523                 } else if (!strncmp(str, "pasid28", 7)) {
524                         printk(KERN_INFO
525                                 "Intel-IOMMU: enable pre-production PASID support\n");
526                         intel_iommu_pasid28 = 1;
527                         iommu_identity_mapping |= IDENTMAP_GFX;
528                 } else if (!strncmp(str, "tboot_noforce", 13)) {
529                         printk(KERN_INFO
530                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
531                         intel_iommu_tboot_noforce = 1;
532                 }
533
534                 str += strcspn(str, ",");
535                 while (*str == ',')
536                         str++;
537         }
538         return 0;
539 }
540 __setup("intel_iommu=", intel_iommu_setup);
541
542 static struct kmem_cache *iommu_domain_cache;
543 static struct kmem_cache *iommu_devinfo_cache;
544
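/* Domain IDs are looked up through a two-level table: the high 8 bits of the
 * DID index an array of pointers in iommu->domains[], each of which points to
 * a lazily allocated array of 256 struct dmar_domain pointers indexed by the
 * low 8 bits. */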
545 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
546 {
547         struct dmar_domain **domains;
548         int idx = did >> 8;
549
550         domains = iommu->domains[idx];
551         if (!domains)
552                 return NULL;
553
554         return domains[did & 0xff];
555 }
556
557 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
558                              struct dmar_domain *domain)
559 {
560         struct dmar_domain **domains;
561         int idx = did >> 8;
562
563         if (!iommu->domains[idx]) {
564                 size_t size = 256 * sizeof(struct dmar_domain *);
565                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
566         }
567
568         domains = iommu->domains[idx];
569         if (WARN_ON(!domains))
570                 return;
571         else
572                 domains[did & 0xff] = domain;
573 }
574
575 void *alloc_pgtable_page(int node)
576 {
577         struct page *page;
578         void *vaddr = NULL;
579
580         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
581         if (page)
582                 vaddr = page_address(page);
583         return vaddr;
584 }
585
586 void free_pgtable_page(void *vaddr)
587 {
588         free_page((unsigned long)vaddr);
589 }
590
591 static inline void *alloc_domain_mem(void)
592 {
593         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
594 }
595
596 static void free_domain_mem(void *vaddr)
597 {
598         kmem_cache_free(iommu_domain_cache, vaddr);
599 }
600
601 static inline void * alloc_devinfo_mem(void)
602 {
603         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
604 }
605
606 static inline void free_devinfo_mem(void *vaddr)
607 {
608         kmem_cache_free(iommu_devinfo_cache, vaddr);
609 }
610
611 static inline int domain_type_is_vm(struct dmar_domain *domain)
612 {
613         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
614 }
615
616 static inline int domain_type_is_si(struct dmar_domain *domain)
617 {
618         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
619 }
620
621 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
622 {
623         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
624                                 DOMAIN_FLAG_STATIC_IDENTITY);
625 }
626
627 static inline int domain_pfn_supported(struct dmar_domain *domain,
628                                        unsigned long pfn)
629 {
630         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
631
632         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
633 }
634
635 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
636 {
637         unsigned long sagaw;
638         int agaw = -1;
639
640         sagaw = cap_sagaw(iommu->cap);
641         for (agaw = width_to_agaw(max_gaw);
642              agaw >= 0; agaw--) {
643                 if (test_bit(agaw, &sagaw))
644                         break;
645         }
646
647         return agaw;
648 }
649
650 /*
651  * Calculate max SAGAW for each iommu.
652  */
653 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
654 {
655         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
656 }
657
658 /*
659  * Calculate agaw for each iommu.
660  * "SAGAW" may be different across iommus; use a default agaw, and fall
661  * back to a smaller supported agaw for iommus that don't support it.
662  */
663 int iommu_calculate_agaw(struct intel_iommu *iommu)
664 {
665         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
666 }
667
668 /* This function only returns a single iommu in a domain */
669 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
670 {
671         int iommu_id;
672
673         /* si_domain and vm domain should not get here. */
674         BUG_ON(domain_type_is_vm_or_si(domain));
675         for_each_domain_iommu(iommu_id, domain)
676                 break;
677
678         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
679                 return NULL;
680
681         return g_iommus[iommu_id];
682 }
683
684 static void domain_update_iommu_coherency(struct dmar_domain *domain)
685 {
686         struct dmar_drhd_unit *drhd;
687         struct intel_iommu *iommu;
688         bool found = false;
689         int i;
690
691         domain->iommu_coherency = 1;
692
693         for_each_domain_iommu(i, domain) {
694                 found = true;
695                 if (!ecap_coherent(g_iommus[i]->ecap)) {
696                         domain->iommu_coherency = 0;
697                         break;
698                 }
699         }
700         if (found)
701                 return;
702
703         /* No hardware attached; use lowest common denominator */
704         rcu_read_lock();
705         for_each_active_iommu(iommu, drhd) {
706                 if (!ecap_coherent(iommu->ecap)) {
707                         domain->iommu_coherency = 0;
708                         break;
709                 }
710         }
711         rcu_read_unlock();
712 }
713
714 static int domain_update_iommu_snooping(struct intel_iommu *skip)
715 {
716         struct dmar_drhd_unit *drhd;
717         struct intel_iommu *iommu;
718         int ret = 1;
719
720         rcu_read_lock();
721         for_each_active_iommu(iommu, drhd) {
722                 if (iommu != skip) {
723                         if (!ecap_sc_support(iommu->ecap)) {
724                                 ret = 0;
725                                 break;
726                         }
727                 }
728         }
729         rcu_read_unlock();
730
731         return ret;
732 }
733
734 static int domain_update_iommu_superpage(struct intel_iommu *skip)
735 {
736         struct dmar_drhd_unit *drhd;
737         struct intel_iommu *iommu;
738         int mask = 0xf;
739
740         if (!intel_iommu_superpage) {
741                 return 0;
742         }
743
744         /* set iommu_superpage to the smallest common denominator */
745         rcu_read_lock();
746         for_each_active_iommu(iommu, drhd) {
747                 if (iommu != skip) {
748                         mask &= cap_super_page_val(iommu->cap);
749                         if (!mask)
750                                 break;
751                 }
752         }
753         rcu_read_unlock();
754
755         return fls(mask);
756 }
757
758 /* Some capabilities may be different across iommus */
759 static void domain_update_iommu_cap(struct dmar_domain *domain)
760 {
761         domain_update_iommu_coherency(domain);
762         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
763         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
764 }
765
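/* Return the context entry for bus/devfn, optionally allocating the context
 * table. With ECS (extended context support) enabled, each root entry holds
 * two context tables (root->lo for devfn 0x00-0x7f, root->hi for 0x80-0xff)
 * and each extended entry is twice the size, hence the devfn adjustment. */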
766 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
767                                          u8 devfn, int alloc)
768 {
769         struct root_entry *root = &iommu->root_entry[bus];
770         struct context_entry *context;
771         u64 *entry;
772
773         entry = &root->lo;
774         if (ecs_enabled(iommu)) {
775                 if (devfn >= 0x80) {
776                         devfn -= 0x80;
777                         entry = &root->hi;
778                 }
779                 devfn *= 2;
780         }
781         if (*entry & 1)
782                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
783         else {
784                 unsigned long phy_addr;
785                 if (!alloc)
786                         return NULL;
787
788                 context = alloc_pgtable_page(iommu->node);
789                 if (!context)
790                         return NULL;
791
792                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
793                 phy_addr = virt_to_phys((void *)context);
794                 *entry = phy_addr | 1;
795                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
796         }
797         return &context[devfn];
798 }
799
800 static int iommu_dummy(struct device *dev)
801 {
802         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
803 }
804
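/* Find the DRHD unit (IOMMU) whose scope covers @dev and return the bus/devfn
 * that should be used to program its context entry. For SR-IOV VFs the PF is
 * used for the scope lookup, but the VF's own BDF is returned. */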
805 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
806 {
807         struct dmar_drhd_unit *drhd = NULL;
808         struct intel_iommu *iommu;
809         struct device *tmp;
810         struct pci_dev *ptmp, *pdev = NULL;
811         u16 segment = 0;
812         int i;
813
814         if (iommu_dummy(dev))
815                 return NULL;
816
817         if (dev_is_pci(dev)) {
818                 struct pci_dev *pf_pdev;
819
820                 pdev = to_pci_dev(dev);
821
822 #ifdef CONFIG_X86
823                 /* VMD child devices currently cannot be handled individually */
824                 if (is_vmd(pdev->bus))
825                         return NULL;
826 #endif
827
828                 /* VFs aren't listed in scope tables; we need to look up
829                  * the PF instead to find the IOMMU. */
830                 pf_pdev = pci_physfn(pdev);
831                 dev = &pf_pdev->dev;
832                 segment = pci_domain_nr(pdev->bus);
833         } else if (has_acpi_companion(dev))
834                 dev = &ACPI_COMPANION(dev)->dev;
835
836         rcu_read_lock();
837         for_each_active_iommu(iommu, drhd) {
838                 if (pdev && segment != drhd->segment)
839                         continue;
840
841                 for_each_active_dev_scope(drhd->devices,
842                                           drhd->devices_cnt, i, tmp) {
843                         if (tmp == dev) {
844                                 /* For a VF use its original BDF# not that of the PF
845                                  * which we used for the IOMMU lookup. Strictly speaking
846                                  * we could do this for all PCI devices; we only need to
847                                  * get the BDF# from the scope table for ACPI matches. */
848                                 if (pdev && pdev->is_virtfn)
849                                         goto got_pdev;
850
851                                 *bus = drhd->devices[i].bus;
852                                 *devfn = drhd->devices[i].devfn;
853                                 goto out;
854                         }
855
856                         if (!pdev || !dev_is_pci(tmp))
857                                 continue;
858
859                         ptmp = to_pci_dev(tmp);
860                         if (ptmp->subordinate &&
861                             ptmp->subordinate->number <= pdev->bus->number &&
862                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
863                                 goto got_pdev;
864                 }
865
866                 if (pdev && drhd->include_all) {
867                 got_pdev:
868                         *bus = pdev->bus->number;
869                         *devfn = pdev->devfn;
870                         goto out;
871                 }
872         }
873         iommu = NULL;
874  out:
875         rcu_read_unlock();
876
877         return iommu;
878 }
879
880 static void domain_flush_cache(struct dmar_domain *domain,
881                                void *addr, int size)
882 {
883         if (!domain->iommu_coherency)
884                 clflush_cache_range(addr, size);
885 }
886
887 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
888 {
889         struct context_entry *context;
890         int ret = 0;
891         unsigned long flags;
892
893         spin_lock_irqsave(&iommu->lock, flags);
894         context = iommu_context_addr(iommu, bus, devfn, 0);
895         if (context)
896                 ret = context_present(context);
897         spin_unlock_irqrestore(&iommu->lock, flags);
898         return ret;
899 }
900
901 static void free_context_table(struct intel_iommu *iommu)
902 {
903         int i;
904         unsigned long flags;
905         struct context_entry *context;
906
907         spin_lock_irqsave(&iommu->lock, flags);
908         if (!iommu->root_entry) {
909                 goto out;
910         }
911         for (i = 0; i < ROOT_ENTRY_NR; i++) {
912                 context = iommu_context_addr(iommu, i, 0, 0);
913                 if (context)
914                         free_pgtable_page(context);
915
916                 if (!ecs_enabled(iommu))
917                         continue;
918
919                 context = iommu_context_addr(iommu, i, 0x80, 0);
920                 if (context)
921                         free_pgtable_page(context);
922
923         }
924         free_pgtable_page(iommu->root_entry);
925         iommu->root_entry = NULL;
926 out:
927         spin_unlock_irqrestore(&iommu->lock, flags);
928 }
929
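/* Walk (and, if needed, build) the page table for @pfn down to *target_level.
 * Missing directory pages are allocated and installed with cmpxchg64(), so a
 * racing walker's page is used instead of ours if it gets there first. With
 * *target_level == 0 the walk stops at the first superpage or non-present
 * PTE, and the level reached is written back to *target_level. */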
930 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
931                                       unsigned long pfn, int *target_level)
932 {
933         struct dma_pte *parent, *pte = NULL;
934         int level = agaw_to_level(domain->agaw);
935         int offset;
936
937         BUG_ON(!domain->pgd);
938
939         if (!domain_pfn_supported(domain, pfn))
940                 /* Address beyond IOMMU's addressing capabilities. */
941                 return NULL;
942
943         parent = domain->pgd;
944
945         while (1) {
946                 void *tmp_page;
947
948                 offset = pfn_level_offset(pfn, level);
949                 pte = &parent[offset];
950                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
951                         break;
952                 if (level == *target_level)
953                         break;
954
955                 if (!dma_pte_present(pte)) {
956                         uint64_t pteval;
957
958                         tmp_page = alloc_pgtable_page(domain->nid);
959
960                         if (!tmp_page)
961                                 return NULL;
962
963                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
964                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
965                         if (cmpxchg64(&pte->val, 0ULL, pteval))
966                                 /* Someone else set it while we were thinking; use theirs. */
967                                 free_pgtable_page(tmp_page);
968                         else
969                                 domain_flush_cache(domain, pte, sizeof(*pte));
970                 }
971                 if (level == 1)
972                         break;
973
974                 parent = phys_to_virt(dma_pte_addr(pte));
975                 level--;
976         }
977
978         if (!*target_level)
979                 *target_level = level;
980
981         return pte;
982 }
983
984
985 /* return address's pte at specific level */
986 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
987                                          unsigned long pfn,
988                                          int level, int *large_page)
989 {
990         struct dma_pte *parent, *pte = NULL;
991         int total = agaw_to_level(domain->agaw);
992         int offset;
993
994         parent = domain->pgd;
995         while (level <= total) {
996                 offset = pfn_level_offset(pfn, total);
997                 pte = &parent[offset];
998                 if (level == total)
999                         return pte;
1000
1001                 if (!dma_pte_present(pte)) {
1002                         *large_page = total;
1003                         break;
1004                 }
1005
1006                 if (dma_pte_superpage(pte)) {
1007                         *large_page = total;
1008                         return pte;
1009                 }
1010
1011                 parent = phys_to_virt(dma_pte_addr(pte));
1012                 total--;
1013         }
1014         return NULL;
1015 }
1016
1017 /* clear last level pte; a tlb flush should follow */
1018 static void dma_pte_clear_range(struct dmar_domain *domain,
1019                                 unsigned long start_pfn,
1020                                 unsigned long last_pfn)
1021 {
1022         unsigned int large_page = 1;
1023         struct dma_pte *first_pte, *pte;
1024
1025         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1026         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1027         BUG_ON(start_pfn > last_pfn);
1028
1029         /* we don't need lock here; nobody else touches the iova range */
1030         do {
1031                 large_page = 1;
1032                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1033                 if (!pte) {
1034                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1035                         continue;
1036                 }
1037                 do {
1038                         dma_clear_pte(pte);
1039                         start_pfn += lvl_to_nr_pages(large_page);
1040                         pte++;
1041                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1042
1043                 domain_flush_cache(domain, first_pte,
1044                                    (void *)pte - (void *)first_pte);
1045
1046         } while (start_pfn && start_pfn <= last_pfn);
1047 }
1048
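/* Recursively free page-table pages covering [start_pfn, last_pfn]. A table
 * page is only freed when it sits below @retain_level and the range covers it
 * entirely; the leaf PTEs themselves are cleared by dma_pte_clear_range(). */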
1049 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1050                                int retain_level, struct dma_pte *pte,
1051                                unsigned long pfn, unsigned long start_pfn,
1052                                unsigned long last_pfn)
1053 {
1054         pfn = max(start_pfn, pfn);
1055         pte = &pte[pfn_level_offset(pfn, level)];
1056
1057         do {
1058                 unsigned long level_pfn;
1059                 struct dma_pte *level_pte;
1060
1061                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1062                         goto next;
1063
1064                 level_pfn = pfn & level_mask(level);
1065                 level_pte = phys_to_virt(dma_pte_addr(pte));
1066
1067                 if (level > 2) {
1068                         dma_pte_free_level(domain, level - 1, retain_level,
1069                                            level_pte, level_pfn, start_pfn,
1070                                            last_pfn);
1071                 }
1072
1073                 /*
1074                  * Free the page table if we're below the level we want to
1075                  * retain and the range covers the entire table.
1076                  */
1077                 if (level < retain_level && !(start_pfn > level_pfn ||
1078                       last_pfn < level_pfn + level_size(level) - 1)) {
1079                         dma_clear_pte(pte);
1080                         domain_flush_cache(domain, pte, sizeof(*pte));
1081                         free_pgtable_page(level_pte);
1082                 }
1083 next:
1084                 pfn += level_size(level);
1085         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1086 }
1087
1088 /*
1089  * clear last level (leaf) ptes and free page table pages below the
1090  * level we wish to keep intact.
1091  */
1092 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1093                                    unsigned long start_pfn,
1094                                    unsigned long last_pfn,
1095                                    int retain_level)
1096 {
1097         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1098         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1099         BUG_ON(start_pfn > last_pfn);
1100
1101         dma_pte_clear_range(domain, start_pfn, last_pfn);
1102
1103         /* We don't need lock here; nobody else touches the iova range */
1104         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1105                            domain->pgd, 0, start_pfn, last_pfn);
1106
1107         /* free pgd */
1108         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1109                 free_pgtable_page(domain->pgd);
1110                 domain->pgd = NULL;
1111         }
1112 }
1113
1114 /* When a page at a given level is being unlinked from its parent, we don't
1115    need to *modify* it at all. All we need to do is make a list of all the
1116    pages which can be freed just as soon as we've flushed the IOTLB and we
1117    know the hardware page-walk will no longer touch them.
1118    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1119    be freed. */
1120 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1121                                             int level, struct dma_pte *pte,
1122                                             struct page *freelist)
1123 {
1124         struct page *pg;
1125
1126         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1127         pg->freelist = freelist;
1128         freelist = pg;
1129
1130         if (level == 1)
1131                 return freelist;
1132
1133         pte = page_address(pg);
1134         do {
1135                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1136                         freelist = dma_pte_list_pagetables(domain, level - 1,
1137                                                            pte, freelist);
1138                 pte++;
1139         } while (!first_pte_in_page(pte));
1140
1141         return freelist;
1142 }
1143
1144 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1145                                         struct dma_pte *pte, unsigned long pfn,
1146                                         unsigned long start_pfn,
1147                                         unsigned long last_pfn,
1148                                         struct page *freelist)
1149 {
1150         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1151
1152         pfn = max(start_pfn, pfn);
1153         pte = &pte[pfn_level_offset(pfn, level)];
1154
1155         do {
1156                 unsigned long level_pfn;
1157
1158                 if (!dma_pte_present(pte))
1159                         goto next;
1160
1161                 level_pfn = pfn & level_mask(level);
1162
1163                 /* If range covers entire pagetable, free it */
1164                 if (start_pfn <= level_pfn &&
1165                     last_pfn >= level_pfn + level_size(level) - 1) {
1166                         /* These subordinate page tables are going away entirely. Don't
1167                            bother to clear them; we're just going to *free* them. */
1168                         if (level > 1 && !dma_pte_superpage(pte))
1169                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1170
1171                         dma_clear_pte(pte);
1172                         if (!first_pte)
1173                                 first_pte = pte;
1174                         last_pte = pte;
1175                 } else if (level > 1) {
1176                         /* Recurse down into a level that isn't *entirely* obsolete */
1177                         freelist = dma_pte_clear_level(domain, level - 1,
1178                                                        phys_to_virt(dma_pte_addr(pte)),
1179                                                        level_pfn, start_pfn, last_pfn,
1180                                                        freelist);
1181                 }
1182 next:
1183                 pfn += level_size(level);
1184         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1185
1186         if (first_pte)
1187                 domain_flush_cache(domain, first_pte,
1188                                    (void *)++last_pte - (void *)first_pte);
1189
1190         return freelist;
1191 }
1192
1193 /* We can't just free the pages because the IOMMU may still be walking
1194    the page tables, and may have cached the intermediate levels. The
1195    pages can only be freed after the IOTLB flush has been done. */
1196 static struct page *domain_unmap(struct dmar_domain *domain,
1197                                  unsigned long start_pfn,
1198                                  unsigned long last_pfn)
1199 {
1200         struct page *freelist = NULL;
1201
1202         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1203         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1204         BUG_ON(start_pfn > last_pfn);
1205
1206         /* we don't need lock here; nobody else touches the iova range */
1207         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1208                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1209
1210         /* free pgd */
1211         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1212                 struct page *pgd_page = virt_to_page(domain->pgd);
1213                 pgd_page->freelist = freelist;
1214                 freelist = pgd_page;
1215
1216                 domain->pgd = NULL;
1217         }
1218
1219         return freelist;
1220 }
1221
1222 static void dma_free_pagelist(struct page *freelist)
1223 {
1224         struct page *pg;
1225
1226         while ((pg = freelist)) {
1227                 freelist = pg->freelist;
1228                 free_pgtable_page(page_address(pg));
1229         }
1230 }
1231
1232 static void iova_entry_free(unsigned long data)
1233 {
1234         struct page *freelist = (struct page *)data;
1235
1236         dma_free_pagelist(freelist);
1237 }
1238
1239 /* iommu handling */
1240 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1241 {
1242         struct root_entry *root;
1243         unsigned long flags;
1244
1245         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1246         if (!root) {
1247                 pr_err("Allocating root entry for %s failed\n",
1248                         iommu->name);
1249                 return -ENOMEM;
1250         }
1251
1252         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1253
1254         spin_lock_irqsave(&iommu->lock, flags);
1255         iommu->root_entry = root;
1256         spin_unlock_irqrestore(&iommu->lock, flags);
1257
1258         return 0;
1259 }
1260
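/* Program the root table pointer: write its physical address (plus the RTT
 * bit when extended context tables are in use) to DMAR_RTADDR_REG, then issue
 * a "Set Root Table Pointer" command and wait for the status bit to latch. */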
1261 static void iommu_set_root_entry(struct intel_iommu *iommu)
1262 {
1263         u64 addr;
1264         u32 sts;
1265         unsigned long flag;
1266
1267         addr = virt_to_phys(iommu->root_entry);
1268         if (ecs_enabled(iommu))
1269                 addr |= DMA_RTADDR_RTT;
1270
1271         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1272         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1273
1274         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1275
1276         /* Make sure hardware completes it */
1277         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1278                       readl, (sts & DMA_GSTS_RTPS), sts);
1279
1280         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1281 }
1282
1283 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1284 {
1285         u32 val;
1286         unsigned long flag;
1287
1288         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1289                 return;
1290
1291         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1292         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1293
1294         /* Make sure hardware completes it */
1295         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1296                       readl, (!(val & DMA_GSTS_WBFS)), val);
1297
1298         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1299 }
1300
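/* Register-based context-cache invalidation: write the request to
 * DMAR_CCMD_REG and spin until hardware clears the ICC bit. When the IOMMU
 * supports queued invalidation, a QI-based flush routine is normally used
 * instead of this. */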
1301 /* return value determines whether we need a write buffer flush */
1302 static void __iommu_flush_context(struct intel_iommu *iommu,
1303                                   u16 did, u16 source_id, u8 function_mask,
1304                                   u64 type)
1305 {
1306         u64 val = 0;
1307         unsigned long flag;
1308
1309         switch (type) {
1310         case DMA_CCMD_GLOBAL_INVL:
1311                 val = DMA_CCMD_GLOBAL_INVL;
1312                 break;
1313         case DMA_CCMD_DOMAIN_INVL:
1314                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1315                 break;
1316         case DMA_CCMD_DEVICE_INVL:
1317                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1318                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1319                 break;
1320         default:
1321                 BUG();
1322         }
1323         val |= DMA_CCMD_ICC;
1324
1325         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1326         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1327
1328         /* Make sure hardware completes it */
1329         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1330                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1331
1332         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1333 }
1334
1335 /* return value determines whether we need a write buffer flush */
1336 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1337                                 u64 addr, unsigned int size_order, u64 type)
1338 {
1339         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1340         u64 val = 0, val_iva = 0;
1341         unsigned long flag;
1342
1343         switch (type) {
1344         case DMA_TLB_GLOBAL_FLUSH:
1345                 /* global flush doesn't need to set IVA_REG */
1346                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1347                 break;
1348         case DMA_TLB_DSI_FLUSH:
1349                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1350                 break;
1351         case DMA_TLB_PSI_FLUSH:
1352                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1353                 /* IH bit is passed in as part of address */
1354                 val_iva = size_order | addr;
1355                 break;
1356         default:
1357                 BUG();
1358         }
1359         /* Note: set drain read/write */
1360 #if 0
1361         /*
1362          * This is probably just to be extra safe. It looks like we can
1363          * ignore it without any impact.
1364          */
1365         if (cap_read_drain(iommu->cap))
1366                 val |= DMA_TLB_READ_DRAIN;
1367 #endif
1368         if (cap_write_drain(iommu->cap))
1369                 val |= DMA_TLB_WRITE_DRAIN;
1370
1371         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1372         /* Note: Only uses first TLB reg currently */
1373         if (val_iva)
1374                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1375         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1376
1377         /* Make sure hardware completes it */
1378         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1379                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1380
1381         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1382
1383         /* check IOTLB invalidation granularity */
1384         if (DMA_TLB_IAIG(val) == 0)
1385                 pr_err("Flush IOTLB failed\n");
1386         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1387                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1388                         (unsigned long long)DMA_TLB_IIRG(type),
1389                         (unsigned long long)DMA_TLB_IAIG(val));
1390 }
1391
1392 static struct device_domain_info *
1393 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1394                          u8 bus, u8 devfn)
1395 {
1396         struct device_domain_info *info;
1397
1398         assert_spin_locked(&device_domain_lock);
1399
1400         if (!iommu->qi)
1401                 return NULL;
1402
1403         list_for_each_entry(info, &domain->devices, link)
1404                 if (info->iommu == iommu && info->bus == bus &&
1405                     info->devfn == devfn) {
1406                         if (info->ats_supported && info->dev)
1407                                 return info;
1408                         break;
1409                 }
1410
1411         return NULL;
1412 }
1413
1414 static void domain_update_iotlb(struct dmar_domain *domain)
1415 {
1416         struct device_domain_info *info;
1417         bool has_iotlb_device = false;
1418
1419         assert_spin_locked(&device_domain_lock);
1420
1421         list_for_each_entry(info, &domain->devices, link) {
1422                 struct pci_dev *pdev;
1423
1424                 if (!info->dev || !dev_is_pci(info->dev))
1425                         continue;
1426
1427                 pdev = to_pci_dev(info->dev);
1428                 if (pdev->ats_enabled) {
1429                         has_iotlb_device = true;
1430                         break;
1431                 }
1432         }
1433
1434         domain->has_iotlb_device = has_iotlb_device;
1435 }
1436
1437 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1438 {
1439         struct pci_dev *pdev;
1440
1441         assert_spin_locked(&device_domain_lock);
1442
1443         if (!info || !dev_is_pci(info->dev))
1444                 return;
1445
1446         pdev = to_pci_dev(info->dev);
1447         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1448          * the PFSID to the invalidation desc of a VF so that the IOMMU HW can
1449          * gauge queue depth at the PF level. If DIT is not set, PFSID is
1450          * treated as reserved and should be set to 0.
1451          */
1452         if (!ecap_dit(info->iommu->ecap))
1453                 info->pfsid = 0;
1454         else {
1455                 struct pci_dev *pf_pdev;
1456
1457                 /* pdev is returned if the device is not a VF */
1458                 pf_pdev = pci_physfn(pdev);
1459                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1460         }
1461
1462 #ifdef CONFIG_INTEL_IOMMU_SVM
1463         /* The PCIe spec, in its wisdom, declares that the behaviour of
1464            the device if you enable PASID support after ATS support is
1465            undefined. So always enable PASID support on devices which
1466            have it, even if we can't yet know if we're ever going to
1467            use it. */
1468         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1469                 info->pasid_enabled = 1;
1470
1471         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1472                 info->pri_enabled = 1;
1473 #endif
1474         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1475                 info->ats_enabled = 1;
1476                 domain_update_iotlb(info->domain);
1477                 info->ats_qdep = pci_ats_queue_depth(pdev);
1478         }
1479 }
1480
1481 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1482 {
1483         struct pci_dev *pdev;
1484
1485         assert_spin_locked(&device_domain_lock);
1486
1487         if (!dev_is_pci(info->dev))
1488                 return;
1489
1490         pdev = to_pci_dev(info->dev);
1491
1492         if (info->ats_enabled) {
1493                 pci_disable_ats(pdev);
1494                 info->ats_enabled = 0;
1495                 domain_update_iotlb(info->domain);
1496         }
1497 #ifdef CONFIG_INTEL_IOMMU_SVM
1498         if (info->pri_enabled) {
1499                 pci_disable_pri(pdev);
1500                 info->pri_enabled = 0;
1501         }
1502         if (info->pasid_enabled) {
1503                 pci_disable_pasid(pdev);
1504                 info->pasid_enabled = 0;
1505         }
1506 #endif
1507 }
1508
1509 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1510                                   u64 addr, unsigned mask)
1511 {
1512         u16 sid, qdep;
1513         unsigned long flags;
1514         struct device_domain_info *info;
1515
1516         if (!domain->has_iotlb_device)
1517                 return;
1518
1519         spin_lock_irqsave(&device_domain_lock, flags);
1520         list_for_each_entry(info, &domain->devices, link) {
1521                 if (!info->ats_enabled)
1522                         continue;
1523
1524                 sid = info->bus << 8 | info->devfn;
1525                 qdep = info->ats_qdep;
1526                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1527                                 qdep, addr, mask);
1528         }
1529         spin_unlock_irqrestore(&device_domain_lock, flags);
1530 }
1531
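/* Page-selective IOTLB flush of @pages pages starting at @pfn for @domain on
 * @iommu. @ih passes the invalidation hint; @map is nonzero when the flush is
 * for a newly created (non-present to present) mapping. */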
1532 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1533                                   struct dmar_domain *domain,
1534                                   unsigned long pfn, unsigned int pages,
1535                                   int ih, int map)
1536 {
1537         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1538         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1539         u16 did = domain->iommu_did[iommu->seq_id];
1540
1541         BUG_ON(pages == 0);
1542
1543         if (ih)
1544                 ih = 1 << 6;
1545         /*
1546          * Fall back to domain-selective flush if there is no PSI support or the
1547          * size is too big.
1548          * PSI requires the size to be a power-of-two number of pages, and the
1549          * base address to be naturally aligned to the size.
1550          */
1551         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1552                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1553                                                 DMA_TLB_DSI_FLUSH);
1554         else
1555                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1556                                                 DMA_TLB_PSI_FLUSH);
1557
1558         /*
1559          * In caching mode, changes of pages from non-present to present require
1560          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1561          */
1562         if (!cap_caching_mode(iommu->cap) || !map)
1563                 iommu_flush_dev_iotlb(domain, addr, mask);
1564 }
1565
1566 /* Notification for newly created mappings */
1567 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1568                                         struct dmar_domain *domain,
1569                                         unsigned long pfn, unsigned int pages)
1570 {
1571         /* It's a non-present to present mapping. Only flush in caching mode */
1572         if (cap_caching_mode(iommu->cap))
1573                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1574         else
1575                 iommu_flush_write_buffer(iommu);
1576 }
1577
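/*
 * Flush callback for the deferred IOVA flush queue (registered in
 * domain_init()): domain-selective IOTLB flush on every IOMMU the domain is
 * attached to, plus a device-IOTLB flush where caching mode is not in use.
 */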
1578 static void iommu_flush_iova(struct iova_domain *iovad)
1579 {
1580         struct dmar_domain *domain;
1581         int idx;
1582
1583         domain = container_of(iovad, struct dmar_domain, iovad);
1584
1585         for_each_domain_iommu(idx, domain) {
1586                 struct intel_iommu *iommu = g_iommus[idx];
1587                 u16 did = domain->iommu_did[iommu->seq_id];
1588
1589                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1590
1591                 if (!cap_caching_mode(iommu->cap))
1592                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1593                                               0, MAX_AGAW_PFN_WIDTH);
1594         }
1595 }
1596
1597 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1598 {
1599         u32 pmen;
1600         unsigned long flags;
1601
1602         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1603         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1604         pmen &= ~DMA_PMEN_EPM;
1605         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1606
1607         /* wait for the protected region status bit to clear */
1608         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1609                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1610
1611         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1612 }
1613
1614 static void iommu_enable_translation(struct intel_iommu *iommu)
1615 {
1616         u32 sts;
1617         unsigned long flags;
1618
1619         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1620         iommu->gcmd |= DMA_GCMD_TE;
1621         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1622
1623         /* Make sure hardware completes it */
1624         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1625                       readl, (sts & DMA_GSTS_TES), sts);
1626
1627         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1628 }
1629
1630 static void iommu_disable_translation(struct intel_iommu *iommu)
1631 {
1632         u32 sts;
1633         unsigned long flag;
1634
1635         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636         iommu->gcmd &= ~DMA_GCMD_TE;
1637         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1638
1639         /* Make sure hardware completes it */
1640         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641                       readl, (!(sts & DMA_GSTS_TES)), sts);
1642
1643         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1644 }
1645
1646
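/*
 * Allocate the per-IOMMU domain-id bitmap and the two-level iommu->domains
 * array (256 domain pointers per second-level chunk). Domain-id 0 is
 * reserved; see the comment below.
 */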
1647 static int iommu_init_domains(struct intel_iommu *iommu)
1648 {
1649         u32 ndomains, nlongs;
1650         size_t size;
1651
1652         ndomains = cap_ndoms(iommu->cap);
1653         pr_debug("%s: Number of Domains supported <%d>\n",
1654                  iommu->name, ndomains);
1655         nlongs = BITS_TO_LONGS(ndomains);
1656
1657         spin_lock_init(&iommu->lock);
1658
1659         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1660         if (!iommu->domain_ids) {
1661                 pr_err("%s: Allocating domain id array failed\n",
1662                        iommu->name);
1663                 return -ENOMEM;
1664         }
1665
1666         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1667         iommu->domains = kzalloc(size, GFP_KERNEL);
1668
1669         if (iommu->domains) {
1670                 size = 256 * sizeof(struct dmar_domain *);
1671                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1672         }
1673
1674         if (!iommu->domains || !iommu->domains[0]) {
1675                 pr_err("%s: Allocating domain array failed\n",
1676                        iommu->name);
1677                 kfree(iommu->domain_ids);
1678                 kfree(iommu->domains);
1679                 iommu->domain_ids = NULL;
1680                 iommu->domains    = NULL;
1681                 return -ENOMEM;
1682         }
1683
1684
1685
1686         /*
1687          * If Caching mode is set, then invalid translations are tagged
1688          * with domain-id 0, hence we need to pre-allocate it. We also
1689          * use domain-id 0 as a marker for non-allocated domain-id, so
1690          * make sure it is not used for a real domain.
1691          */
1692         set_bit(0, iommu->domain_ids);
1693
1694         return 0;
1695 }
1696
1697 static void disable_dmar_iommu(struct intel_iommu *iommu)
1698 {
1699         struct device_domain_info *info, *tmp;
1700         unsigned long flags;
1701
1702         if (!iommu->domains || !iommu->domain_ids)
1703                 return;
1704
1705 again:
1706         spin_lock_irqsave(&device_domain_lock, flags);
1707         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1708                 struct dmar_domain *domain;
1709
1710                 if (info->iommu != iommu)
1711                         continue;
1712
1713                 if (!info->dev || !info->domain)
1714                         continue;
1715
1716                 domain = info->domain;
1717
1718                 __dmar_remove_one_dev_info(info);
1719
1720                 if (!domain_type_is_vm_or_si(domain)) {
1721                         /*
1722                          * The domain_exit() function can't be called under
1723                          * device_domain_lock, as it takes this lock itself.
1724                          * So release the lock here and re-run the loop
1725                          * afterwards.
1726                          */
1727                         spin_unlock_irqrestore(&device_domain_lock, flags);
1728                         domain_exit(domain);
1729                         goto again;
1730                 }
1731         }
1732         spin_unlock_irqrestore(&device_domain_lock, flags);
1733
1734         if (iommu->gcmd & DMA_GCMD_TE)
1735                 iommu_disable_translation(iommu);
1736 }
1737
1738 static void free_dmar_iommu(struct intel_iommu *iommu)
1739 {
1740         if ((iommu->domains) && (iommu->domain_ids)) {
1741                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1742                 int i;
1743
1744                 for (i = 0; i < elems; i++)
1745                         kfree(iommu->domains[i]);
1746                 kfree(iommu->domains);
1747                 kfree(iommu->domain_ids);
1748                 iommu->domains = NULL;
1749                 iommu->domain_ids = NULL;
1750         }
1751
1752         g_iommus[iommu->seq_id] = NULL;
1753
1754         /* free context mapping */
1755         free_context_table(iommu);
1756
1757 #ifdef CONFIG_INTEL_IOMMU_SVM
1758         if (pasid_enabled(iommu)) {
1759                 if (ecap_prs(iommu->ecap))
1760                         intel_svm_finish_prq(iommu);
1761                 intel_svm_exit(iommu);
1762         }
1763 #endif
1764 }
1765
1766 static struct dmar_domain *alloc_domain(int flags)
1767 {
1768         struct dmar_domain *domain;
1769
1770         domain = alloc_domain_mem();
1771         if (!domain)
1772                 return NULL;
1773
1774         memset(domain, 0, sizeof(*domain));
1775         domain->nid = -1;
1776         domain->flags = flags;
1777         domain->has_iotlb_device = false;
1778         INIT_LIST_HEAD(&domain->devices);
1779
1780         return domain;
1781 }
1782
1783 /* Must be called with iommu->lock */
1784 static int domain_attach_iommu(struct dmar_domain *domain,
1785                                struct intel_iommu *iommu)
1786 {
1787         unsigned long ndomains;
1788         int num;
1789
1790         assert_spin_locked(&device_domain_lock);
1791         assert_spin_locked(&iommu->lock);
1792
1793         domain->iommu_refcnt[iommu->seq_id] += 1;
1794         domain->iommu_count += 1;
1795         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1796                 ndomains = cap_ndoms(iommu->cap);
1797                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1798
1799                 if (num >= ndomains) {
1800                         pr_err("%s: No free domain ids\n", iommu->name);
1801                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1802                         domain->iommu_count -= 1;
1803                         return -ENOSPC;
1804                 }
1805
1806                 set_bit(num, iommu->domain_ids);
1807                 set_iommu_domain(iommu, num, domain);
1808
1809                 domain->iommu_did[iommu->seq_id] = num;
1810                 domain->nid                      = iommu->node;
1811
1812                 domain_update_iommu_cap(domain);
1813         }
1814
1815         return 0;
1816 }
1817
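/*
 * Drop one attachment of the domain from this IOMMU; when the last reference
 * for the IOMMU goes away, release its domain id. Must be called with
 * device_domain_lock and iommu->lock held. Returns the domain's remaining
 * total attachment count.
 */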
1818 static int domain_detach_iommu(struct dmar_domain *domain,
1819                                struct intel_iommu *iommu)
1820 {
1821         int num, count = INT_MAX;
1822
1823         assert_spin_locked(&device_domain_lock);
1824         assert_spin_locked(&iommu->lock);
1825
1826         domain->iommu_refcnt[iommu->seq_id] -= 1;
1827         count = --domain->iommu_count;
1828         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1829                 num = domain->iommu_did[iommu->seq_id];
1830                 clear_bit(num, iommu->domain_ids);
1831                 set_iommu_domain(iommu, num, NULL);
1832
1833                 domain_update_iommu_cap(domain);
1834                 domain->iommu_did[iommu->seq_id] = 0;
1835         }
1836
1837         return count;
1838 }
1839
1840 static struct iova_domain reserved_iova_list;
1841 static struct lock_class_key reserved_rbtree_key;
1842
1843 static int dmar_init_reserved_ranges(void)
1844 {
1845         struct pci_dev *pdev = NULL;
1846         struct iova *iova;
1847         int i;
1848
1849         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1850
1851         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1852                 &reserved_rbtree_key);
1853
1854         /* IOAPIC ranges shouldn't be accessed by DMA */
1855         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1856                 IOVA_PFN(IOAPIC_RANGE_END));
1857         if (!iova) {
1858                 pr_err("Reserve IOAPIC range failed\n");
1859                 return -ENODEV;
1860         }
1861
1862         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1863         for_each_pci_dev(pdev) {
1864                 struct resource *r;
1865
1866                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1867                         r = &pdev->resource[i];
1868                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1869                                 continue;
1870                         iova = reserve_iova(&reserved_iova_list,
1871                                             IOVA_PFN(r->start),
1872                                             IOVA_PFN(r->end));
1873                         if (!iova) {
1874                                 pr_err("Reserve iova failed\n");
1875                                 return -ENODEV;
1876                         }
1877                 }
1878         }
1879         return 0;
1880 }
1881
1882 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1883 {
1884         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1885 }
1886
1887 static inline int guestwidth_to_adjustwidth(int gaw)
1888 {
1889         int agaw;
1890         int r = (gaw - 12) % 9;
1891
1892         if (r == 0)
1893                 agaw = gaw;
1894         else
1895                 agaw = gaw + 9 - r;
1896         if (agaw > 64)
1897                 agaw = 64;
1898         return agaw;
1899 }
1900
1901 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1902                        int guest_width)
1903 {
1904         int adjust_width, agaw;
1905         unsigned long sagaw;
1906         int err;
1907
1908         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1909
1910         err = init_iova_flush_queue(&domain->iovad,
1911                                     iommu_flush_iova, iova_entry_free);
1912         if (err)
1913                 return err;
1914
1915         domain_reserve_special_ranges(domain);
1916
1917         /* calculate AGAW */
1918         if (guest_width > cap_mgaw(iommu->cap))
1919                 guest_width = cap_mgaw(iommu->cap);
1920         domain->gaw = guest_width;
1921         adjust_width = guestwidth_to_adjustwidth(guest_width);
1922         agaw = width_to_agaw(adjust_width);
1923         sagaw = cap_sagaw(iommu->cap);
1924         if (!test_bit(agaw, &sagaw)) {
1925                 /* hardware doesn't support it, choose a bigger one */
1926                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1927                 agaw = find_next_bit(&sagaw, 5, agaw);
1928                 if (agaw >= 5)
1929                         return -ENODEV;
1930         }
1931         domain->agaw = agaw;
1932
1933         if (ecap_coherent(iommu->ecap))
1934                 domain->iommu_coherency = 1;
1935         else
1936                 domain->iommu_coherency = 0;
1937
1938         if (ecap_sc_support(iommu->ecap))
1939                 domain->iommu_snooping = 1;
1940         else
1941                 domain->iommu_snooping = 0;
1942
1943         if (intel_iommu_superpage)
1944                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1945         else
1946                 domain->iommu_superpage = 0;
1947
1948         domain->nid = iommu->node;
1949
1950         /* always allocate the top pgd */
1951         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1952         if (!domain->pgd)
1953                 return -ENOMEM;
1954         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1955         return 0;
1956 }
1957
1958 static void domain_exit(struct dmar_domain *domain)
1959 {
1960         struct page *freelist = NULL;
1961
1962         /* Domain 0 is reserved, so don't process it */
1963         if (!domain)
1964                 return;
1965
1966         /* Remove associated devices and clear attached or cached domains */
1967         rcu_read_lock();
1968         domain_remove_dev_info(domain);
1969         rcu_read_unlock();
1970
1971         /* destroy iovas */
1972         put_iova_domain(&domain->iovad);
1973
1974         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1975
1976         dma_free_pagelist(freelist);
1977
1978         free_domain_mem(domain);
1979 }
1980
1981 static int domain_context_mapping_one(struct dmar_domain *domain,
1982                                       struct intel_iommu *iommu,
1983                                       u8 bus, u8 devfn)
1984 {
1985         u16 did = domain->iommu_did[iommu->seq_id];
1986         int translation = CONTEXT_TT_MULTI_LEVEL;
1987         struct device_domain_info *info = NULL;
1988         struct context_entry *context;
1989         unsigned long flags;
1990         struct dma_pte *pgd;
1991         int ret, agaw;
1992
1993         WARN_ON(did == 0);
1994
1995         if (hw_pass_through && domain_type_is_si(domain))
1996                 translation = CONTEXT_TT_PASS_THROUGH;
1997
1998         pr_debug("Set context mapping for %02x:%02x.%d\n",
1999                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2000
2001         BUG_ON(!domain->pgd);
2002
2003         spin_lock_irqsave(&device_domain_lock, flags);
2004         spin_lock(&iommu->lock);
2005
2006         ret = -ENOMEM;
2007         context = iommu_context_addr(iommu, bus, devfn, 1);
2008         if (!context)
2009                 goto out_unlock;
2010
2011         ret = 0;
2012         if (context_present(context))
2013                 goto out_unlock;
2014
2015         /*
2016          * For kdump cases, old valid entries may be cached due to the
2017          * in-flight DMA and copied pgtable, but there is no unmapping
2018          * behaviour for them, thus we need an explicit cache flush for
2019          * the newly-mapped device. For kdump, at this point, the device
2020          * is supposed to finish reset at its driver probe stage, so no
2021          * in-flight DMA will exist, and we don't need to worry about it
2022          * hereafter.
2023          */
2024         if (context_copied(context)) {
2025                 u16 did_old = context_domain_id(context);
2026
2027                 if (did_old < cap_ndoms(iommu->cap)) {
2028                         iommu->flush.flush_context(iommu, did_old,
2029                                                    (((u16)bus) << 8) | devfn,
2030                                                    DMA_CCMD_MASK_NOBIT,
2031                                                    DMA_CCMD_DEVICE_INVL);
2032                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2033                                                  DMA_TLB_DSI_FLUSH);
2034                 }
2035         }
2036
2037         pgd = domain->pgd;
2038
2039         context_clear_entry(context);
2040         context_set_domain_id(context, did);
2041
2042         /*
2043          * Skip the top levels of the page tables for IOMMUs whose agaw is
2044          * smaller than the default. Unnecessary for PT mode.
2045          */
2046         if (translation != CONTEXT_TT_PASS_THROUGH) {
2047                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2048                         ret = -ENOMEM;
2049                         pgd = phys_to_virt(dma_pte_addr(pgd));
2050                         if (!dma_pte_present(pgd))
2051                                 goto out_unlock;
2052                 }
2053
2054                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2055                 if (info && info->ats_supported)
2056                         translation = CONTEXT_TT_DEV_IOTLB;
2057                 else
2058                         translation = CONTEXT_TT_MULTI_LEVEL;
2059
2060                 context_set_address_root(context, virt_to_phys(pgd));
2061                 context_set_address_width(context, iommu->agaw);
2062         } else {
2063                 /*
2064                  * In pass through mode, AW must be programmed to
2065                  * indicate the largest AGAW value supported by
2066                  * hardware. And ASR is ignored by hardware.
2067                  */
2068                 context_set_address_width(context, iommu->msagaw);
2069         }
2070
2071         context_set_translation_type(context, translation);
2072         context_set_fault_enable(context);
2073         context_set_present(context);
2074         domain_flush_cache(domain, context, sizeof(*context));
2075
2076         /*
2077          * It's a non-present to present mapping. If hardware doesn't cache
2078          * non-present entries we only need to flush the write-buffer. If it
2079          * _does_ cache non-present entries, then it does so in the special
2080          * domain #0, which we have to flush:
2081          */
2082         if (cap_caching_mode(iommu->cap)) {
2083                 iommu->flush.flush_context(iommu, 0,
2084                                            (((u16)bus) << 8) | devfn,
2085                                            DMA_CCMD_MASK_NOBIT,
2086                                            DMA_CCMD_DEVICE_INVL);
2087                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2088         } else {
2089                 iommu_flush_write_buffer(iommu);
2090         }
2091         iommu_enable_dev_iotlb(info);
2092
2093         ret = 0;
2094
2095 out_unlock:
2096         spin_unlock(&iommu->lock);
2097         spin_unlock_irqrestore(&device_domain_lock, flags);
2098
2099         return ret;
2100 }
2101
2102 struct domain_context_mapping_data {
2103         struct dmar_domain *domain;
2104         struct intel_iommu *iommu;
2105 };
2106
2107 static int domain_context_mapping_cb(struct pci_dev *pdev,
2108                                      u16 alias, void *opaque)
2109 {
2110         struct domain_context_mapping_data *data = opaque;
2111
2112         return domain_context_mapping_one(data->domain, data->iommu,
2113                                           PCI_BUS_NUM(alias), alias & 0xff);
2114 }
2115
2116 static int
2117 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2118 {
2119         struct intel_iommu *iommu;
2120         u8 bus, devfn;
2121         struct domain_context_mapping_data data;
2122
2123         iommu = device_to_iommu(dev, &bus, &devfn);
2124         if (!iommu)
2125                 return -ENODEV;
2126
2127         if (!dev_is_pci(dev))
2128                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2129
2130         data.domain = domain;
2131         data.iommu = iommu;
2132
2133         return pci_for_each_dma_alias(to_pci_dev(dev),
2134                                       &domain_context_mapping_cb, &data);
2135 }
2136
2137 static int domain_context_mapped_cb(struct pci_dev *pdev,
2138                                     u16 alias, void *opaque)
2139 {
2140         struct intel_iommu *iommu = opaque;
2141
2142         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2143 }
2144
2145 static int domain_context_mapped(struct device *dev)
2146 {
2147         struct intel_iommu *iommu;
2148         u8 bus, devfn;
2149
2150         iommu = device_to_iommu(dev, &bus, &devfn);
2151         if (!iommu)
2152                 return -ENODEV;
2153
2154         if (!dev_is_pci(dev))
2155                 return device_context_mapped(iommu, bus, devfn);
2156
2157         return !pci_for_each_dma_alias(to_pci_dev(dev),
2158                                        domain_context_mapped_cb, iommu);
2159 }
2160
2161 /* Returns a number of VTD pages, but aligned to MM page size */
2162 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2163                                             size_t size)
2164 {
2165         host_addr &= ~PAGE_MASK;
2166         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2167 }
2168
2169 /* Return largest possible superpage level for a given mapping */
2170 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2171                                           unsigned long iov_pfn,
2172                                           unsigned long phy_pfn,
2173                                           unsigned long pages)
2174 {
2175         int support, level = 1;
2176         unsigned long pfnmerge;
2177
2178         support = domain->iommu_superpage;
2179
2180         /* To use a large page, the virtual *and* physical addresses
2181            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2182            of them will mean we have to use smaller pages. So just
2183            merge them and check both at once. */
2184         pfnmerge = iov_pfn | phy_pfn;
2185
2186         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2187                 pages >>= VTD_STRIDE_SHIFT;
2188                 if (!pages)
2189                         break;
2190                 pfnmerge >>= VTD_STRIDE_SHIFT;
2191                 level++;
2192                 support--;
2193         }
2194         return level;
2195 }
2196
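/*
 * Core mapping routine: populate PTEs for nr_pages VT-d pages starting at
 * iov_pfn, taking the physical pages either from the scatterlist 'sg' or,
 * when 'sg' is NULL, from the contiguous range starting at phys_pfn.
 * Superpages are used where alignment and remaining size allow.
 */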
2197 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2198                             struct scatterlist *sg, unsigned long phys_pfn,
2199                             unsigned long nr_pages, int prot)
2200 {
2201         struct dma_pte *first_pte = NULL, *pte = NULL;
2202         phys_addr_t uninitialized_var(pteval);
2203         unsigned long sg_res = 0;
2204         unsigned int largepage_lvl = 0;
2205         unsigned long lvl_pages = 0;
2206
2207         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2208
2209         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2210                 return -EINVAL;
2211
2212         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2213
2214         if (!sg) {
2215                 sg_res = nr_pages;
2216                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2217         }
2218
2219         while (nr_pages > 0) {
2220                 uint64_t tmp;
2221
2222                 if (!sg_res) {
2223                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2224
2225                         sg_res = aligned_nrpages(sg->offset, sg->length);
2226                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2227                         sg->dma_length = sg->length;
2228                         pteval = (sg_phys(sg) - pgoff) | prot;
2229                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2230                 }
2231
2232                 if (!pte) {
2233                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2234
2235                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2236                         if (!pte)
2237                                 return -ENOMEM;
2238                         /* It is a large page */
2239                         if (largepage_lvl > 1) {
2240                                 unsigned long nr_superpages, end_pfn;
2241
2242                                 pteval |= DMA_PTE_LARGE_PAGE;
2243                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2244
2245                                 nr_superpages = sg_res / lvl_pages;
2246                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2247
2248                                 /*
2249                                  * Ensure that old small page tables are
2250                                  * removed to make room for superpage(s).
2251                                  * We're adding new large pages, so make sure
2252                                  * we don't remove their parent tables.
2253                                  */
2254                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2255                                                        largepage_lvl + 1);
2256                         } else {
2257                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2258                         }
2259
2260                 }
2261                 /* We don't need a lock here, nobody else
2262                  * touches the iova range
2263                  */
2264                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2265                 if (tmp) {
2266                         static int dumps = 5;
2267                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2268                                 iov_pfn, tmp, (unsigned long long)pteval);
2269                         if (dumps) {
2270                                 dumps--;
2271                                 debug_dma_dump_mappings(NULL);
2272                         }
2273                         WARN_ON(1);
2274                 }
2275
2276                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2277
2278                 BUG_ON(nr_pages < lvl_pages);
2279                 BUG_ON(sg_res < lvl_pages);
2280
2281                 nr_pages -= lvl_pages;
2282                 iov_pfn += lvl_pages;
2283                 phys_pfn += lvl_pages;
2284                 pteval += lvl_pages * VTD_PAGE_SIZE;
2285                 sg_res -= lvl_pages;
2286
2287                 /* If the next PTE would be the first in a new page, then we
2288                    need to flush the cache on the entries we've just written.
2289                    And then we'll need to recalculate 'pte', so clear it and
2290                    let it get set again in the if (!pte) block above.
2291
2292                    If we're done (!nr_pages) we need to flush the cache too.
2293
2294                    Also if we've been setting superpages, we may need to
2295                    recalculate 'pte' and switch back to smaller pages for the
2296                    end of the mapping, if the trailing size is not enough to
2297                    use another superpage (i.e. sg_res < lvl_pages). */
2298                 pte++;
2299                 if (!nr_pages || first_pte_in_page(pte) ||
2300                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2301                         domain_flush_cache(domain, first_pte,
2302                                            (void *)pte - (void *)first_pte);
2303                         pte = NULL;
2304                 }
2305
2306                 if (!sg_res && nr_pages)
2307                         sg = sg_next(sg);
2308         }
2309         return 0;
2310 }
2311
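/*
 * Wrapper around __domain_mapping() that also notifies the IOMMU(s) backing
 * the domain about the newly created mapping via __mapping_notify_one().
 */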
2312 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2313                          struct scatterlist *sg, unsigned long phys_pfn,
2314                          unsigned long nr_pages, int prot)
2315 {
2316        int ret;
2317        struct intel_iommu *iommu;
2318
2319        /* Do the real mapping first */
2320        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2321        if (ret)
2322                return ret;
2323
2324        /* Notify about the new mapping */
2325        if (domain_type_is_vm(domain)) {
2326                /* VM typed domains can have more than one IOMMU */
2327                int iommu_id;
2328                for_each_domain_iommu(iommu_id, domain) {
2329                        iommu = g_iommus[iommu_id];
2330                        __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2331                }
2332        } else {
2333                /* General domains only have one IOMMU */
2334                iommu = domain_get_iommu(domain);
2335                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2336        }
2337
2338        return 0;
2339 }
2340
2341 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2342                                     struct scatterlist *sg, unsigned long nr_pages,
2343                                     int prot)
2344 {
2345         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2346 }
2347
2348 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2349                                      unsigned long phys_pfn, unsigned long nr_pages,
2350                                      int prot)
2351 {
2352         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2353 }
2354
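/*
 * Clear the context entry for (bus, devfn) and invalidate the context-cache
 * and IOTLB entries tagged with its old domain id.
 */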
2355 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2356 {
2357         unsigned long flags;
2358         struct context_entry *context;
2359         u16 did_old;
2360
2361         if (!iommu)
2362                 return;
2363
2364         spin_lock_irqsave(&iommu->lock, flags);
2365         context = iommu_context_addr(iommu, bus, devfn, 0);
2366         if (!context) {
2367                 spin_unlock_irqrestore(&iommu->lock, flags);
2368                 return;
2369         }
2370         did_old = context_domain_id(context);
2371         context_clear_entry(context);
2372         __iommu_flush_cache(iommu, context, sizeof(*context));
2373         spin_unlock_irqrestore(&iommu->lock, flags);
2374         iommu->flush.flush_context(iommu,
2375                                    did_old,
2376                                    (((u16)bus) << 8) | devfn,
2377                                    DMA_CCMD_MASK_NOBIT,
2378                                    DMA_CCMD_DEVICE_INVL);
2379         iommu->flush.flush_iotlb(iommu,
2380                                  did_old,
2381                                  0,
2382                                  0,
2383                                  DMA_TLB_DSI_FLUSH);
2384 }
2385
2386 static inline void unlink_domain_info(struct device_domain_info *info)
2387 {
2388         assert_spin_locked(&device_domain_lock);
2389         list_del(&info->link);
2390         list_del(&info->global);
2391         if (info->dev)
2392                 info->dev->archdata.iommu = NULL;
2393 }
2394
2395 static void domain_remove_dev_info(struct dmar_domain *domain)
2396 {
2397         struct device_domain_info *info, *tmp;
2398         unsigned long flags;
2399
2400         spin_lock_irqsave(&device_domain_lock, flags);
2401         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2402                 __dmar_remove_one_dev_info(info);
2403         spin_unlock_irqrestore(&device_domain_lock, flags);
2404 }
2405
2406 /*
2407  * find_domain
2408  * Note: we use struct device->archdata.iommu to store the info
2409  */
2410 static struct dmar_domain *find_domain(struct device *dev)
2411 {
2412         struct device_domain_info *info;
2413
2414         /* No lock here, assumes no domain exit in normal case */
2415         info = dev->archdata.iommu;
2416         if (likely(info))
2417                 return info->domain;
2418         return NULL;
2419 }
2420
2421 static inline struct device_domain_info *
2422 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2423 {
2424         struct device_domain_info *info;
2425
2426         list_for_each_entry(info, &device_domain_list, global)
2427                 if (info->iommu->segment == segment && info->bus == bus &&
2428                     info->devfn == devfn)
2429                         return info;
2430
2431         return NULL;
2432 }
2433
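/*
 * Allocate and register a device_domain_info for (bus, devfn), probing ATS,
 * PASID and PRI support, then attach the domain to the IOMMU and set up the
 * context mapping. If the device (or its DMA alias) already has a domain,
 * that existing domain is returned instead and the caller must free the one
 * it passed in.
 */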
2434 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2435                                                     int bus, int devfn,
2436                                                     struct device *dev,
2437                                                     struct dmar_domain *domain)
2438 {
2439         struct dmar_domain *found = NULL;
2440         struct device_domain_info *info;
2441         unsigned long flags;
2442         int ret;
2443
2444         info = alloc_devinfo_mem();
2445         if (!info)
2446                 return NULL;
2447
2448         info->bus = bus;
2449         info->devfn = devfn;
2450         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2451         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2452         info->ats_qdep = 0;
2453         info->dev = dev;
2454         info->domain = domain;
2455         info->iommu = iommu;
2456         info->pasid_table = NULL;
2457
2458         if (dev && dev_is_pci(dev)) {
2459                 struct pci_dev *pdev = to_pci_dev(info->dev);
2460
2461                 if (!pci_ats_disabled() &&
2462                     ecap_dev_iotlb_support(iommu->ecap) &&
2463                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2464                     dmar_find_matched_atsr_unit(pdev))
2465                         info->ats_supported = 1;
2466
2467                 if (ecs_enabled(iommu)) {
2468                         if (pasid_enabled(iommu)) {
2469                                 int features = pci_pasid_features(pdev);
2470                                 if (features >= 0)
2471                                         info->pasid_supported = features | 1;
2472                         }
2473
2474                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2475                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2476                                 info->pri_supported = 1;
2477                 }
2478         }
2479
2480         spin_lock_irqsave(&device_domain_lock, flags);
2481         if (dev)
2482                 found = find_domain(dev);
2483
2484         if (!found) {
2485                 struct device_domain_info *info2;
2486                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2487                 if (info2) {
2488                         found      = info2->domain;
2489                         info2->dev = dev;
2490                 }
2491         }
2492
2493         if (found) {
2494                 spin_unlock_irqrestore(&device_domain_lock, flags);
2495                 free_devinfo_mem(info);
2496                 /* Caller must free the original domain */
2497                 return found;
2498         }
2499
2500         spin_lock(&iommu->lock);
2501         ret = domain_attach_iommu(domain, iommu);
2502         spin_unlock(&iommu->lock);
2503
2504         if (ret) {
2505                 spin_unlock_irqrestore(&device_domain_lock, flags);
2506                 free_devinfo_mem(info);
2507                 return NULL;
2508         }
2509
2510         list_add(&info->link, &domain->devices);
2511         list_add(&info->global, &device_domain_list);
2512         if (dev)
2513                 dev->archdata.iommu = info;
2514
2515         if (dev && dev_is_pci(dev) && info->pasid_supported) {
2516                 ret = intel_pasid_alloc_table(dev);
2517                 if (ret) {
2518                         pr_warn("No pasid table for %s, pasid disabled\n",
2519                                 dev_name(dev));
2520                         info->pasid_supported = 0;
2521                 }
2522         }
2523         spin_unlock_irqrestore(&device_domain_lock, flags);
2524
2525         if (dev && domain_context_mapping(domain, dev)) {
2526                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2527                 dmar_remove_one_dev_info(domain, dev);
2528                 return NULL;
2529         }
2530
2531         return domain;
2532 }
2533
2534 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2535 {
2536         *(u16 *)opaque = alias;
2537         return 0;
2538 }
2539
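/*
 * Return the domain already used by the device's DMA alias, if any;
 * otherwise allocate and initialize a new domain with the requested guest
 * address width. The result is not yet bound to the device itself
 * (see set_domain_for_dev()).
 */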
2540 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2541 {
2542         struct device_domain_info *info = NULL;
2543         struct dmar_domain *domain = NULL;
2544         struct intel_iommu *iommu;
2545         u16 dma_alias;
2546         unsigned long flags;
2547         u8 bus, devfn;
2548
2549         iommu = device_to_iommu(dev, &bus, &devfn);
2550         if (!iommu)
2551                 return NULL;
2552
2553         if (dev_is_pci(dev)) {
2554                 struct pci_dev *pdev = to_pci_dev(dev);
2555
2556                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2557
2558                 spin_lock_irqsave(&device_domain_lock, flags);
2559                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2560                                                       PCI_BUS_NUM(dma_alias),
2561                                                       dma_alias & 0xff);
2562                 if (info) {
2563                         iommu = info->iommu;
2564                         domain = info->domain;
2565                 }
2566                 spin_unlock_irqrestore(&device_domain_lock, flags);
2567
2568                 /* DMA alias already has a domain, use it */
2569                 if (info)
2570                         goto out;
2571         }
2572
2573         /* Allocate and initialize new domain for the device */
2574         domain = alloc_domain(0);
2575         if (!domain)
2576                 return NULL;
2577         if (domain_init(domain, iommu, gaw)) {
2578                 domain_exit(domain);
2579                 return NULL;
2580         }
2581
2582 out:
2583
2584         return domain;
2585 }
2586
2587 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2588                                               struct dmar_domain *domain)
2589 {
2590         struct intel_iommu *iommu;
2591         struct dmar_domain *tmp;
2592         u16 req_id, dma_alias;
2593         u8 bus, devfn;
2594
2595         iommu = device_to_iommu(dev, &bus, &devfn);
2596         if (!iommu)
2597                 return NULL;
2598
2599         req_id = ((u16)bus << 8) | devfn;
2600
2601         if (dev_is_pci(dev)) {
2602                 struct pci_dev *pdev = to_pci_dev(dev);
2603
2604                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2605
2606                 /* register PCI DMA alias device */
2607                 if (req_id != dma_alias) {
2608                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2609                                         dma_alias & 0xff, NULL, domain);
2610
2611                         if (!tmp || tmp != domain)
2612                                 return tmp;
2613                 }
2614         }
2615
2616         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2617         if (!tmp || tmp != domain)
2618                 return tmp;
2619
2620         return domain;
2621 }
2622
2623 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2624 {
2625         struct dmar_domain *domain, *tmp;
2626
2627         domain = find_domain(dev);
2628         if (domain)
2629                 goto out;
2630
2631         domain = find_or_alloc_domain(dev, gaw);
2632         if (!domain)
2633                 goto out;
2634
2635         tmp = set_domain_for_dev(dev, domain);
2636         if (!tmp || domain != tmp) {
2637                 domain_exit(domain);
2638                 domain = tmp;
2639         }
2640
2641 out:
2642
2643         return domain;
2644 }
2645
2646 static int iommu_domain_identity_map(struct dmar_domain *domain,
2647                                      unsigned long long start,
2648                                      unsigned long long end)
2649 {
2650         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2651         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2652
2653         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2654                           dma_to_mm_pfn(last_vpfn))) {
2655                 pr_err("Reserving iova failed\n");
2656                 return -ENOMEM;
2657         }
2658
2659         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2660         /*
2661          * The RMRR range might overlap with the physical memory range,
2662          * so clear it first.
2663          */
2664         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2665
2666         return __domain_mapping(domain, first_vpfn, NULL,
2667                                 first_vpfn, last_vpfn - first_vpfn + 1,
2668                                 DMA_PTE_READ|DMA_PTE_WRITE);
2669 }
2670
2671 static int domain_prepare_identity_map(struct device *dev,
2672                                        struct dmar_domain *domain,
2673                                        unsigned long long start,
2674                                        unsigned long long end)
2675 {
2676         /* For _hardware_ passthrough, don't bother. But for software
2677            passthrough, we do it anyway -- it may indicate a memory
2678            range which is reserved in E820 and so didn't get set
2679            up to start with in si_domain */
2680         if (domain == si_domain && hw_pass_through) {
2681                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2682                         dev_name(dev), start, end);
2683                 return 0;
2684         }
2685
2686         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2687                 dev_name(dev), start, end);
2688
2689         if (end < start) {
2690                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2691                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2692                         dmi_get_system_info(DMI_BIOS_VENDOR),
2693                         dmi_get_system_info(DMI_BIOS_VERSION),
2694                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2695                 return -EIO;
2696         }
2697
2698         if (end >> agaw_to_width(domain->agaw)) {
2699                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2700                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2701                      agaw_to_width(domain->agaw),
2702                      dmi_get_system_info(DMI_BIOS_VENDOR),
2703                      dmi_get_system_info(DMI_BIOS_VERSION),
2704                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2705                 return -EIO;
2706         }
2707
2708         return iommu_domain_identity_map(domain, start, end);
2709 }
2710
2711 static int iommu_prepare_identity_map(struct device *dev,
2712                                       unsigned long long start,
2713                                       unsigned long long end)
2714 {
2715         struct dmar_domain *domain;
2716         int ret;
2717
2718         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2719         if (!domain)
2720                 return -ENOMEM;
2721
2722         ret = domain_prepare_identity_map(dev, domain, start, end);
2723         if (ret)
2724                 domain_exit(domain);
2725
2726         return ret;
2727 }
2728
2729 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2730                                          struct device *dev)
2731 {
2732         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2733                 return 0;
2734         return iommu_prepare_identity_map(dev, rmrr->base_address,
2735                                           rmrr->end_address);
2736 }
2737
2738 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2739 static inline void iommu_prepare_isa(void)
2740 {
2741         struct pci_dev *pdev;
2742         int ret;
2743
2744         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2745         if (!pdev)
2746                 return;
2747
2748         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2749         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2750
2751         if (ret)
2752                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2753
2754         pci_dev_put(pdev);
2755 }
2756 #else
2757 static inline void iommu_prepare_isa(void)
2758 {
2759         return;
2760 }
2761 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2762
2763 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2764
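/*
 * Create the static identity (si) domain and, unless hardware pass-through
 * is in use, identity-map every usable physical memory range into it.
 */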
2765 static int __init si_domain_init(int hw)
2766 {
2767         int nid, ret = 0;
2768
2769         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2770         if (!si_domain)
2771                 return -EFAULT;
2772
2773         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2774                 domain_exit(si_domain);
2775                 return -EFAULT;
2776         }
2777
2778         pr_debug("Identity mapping domain allocated\n");
2779
2780         if (hw)
2781                 return 0;
2782
2783         for_each_online_node(nid) {
2784                 unsigned long start_pfn, end_pfn;
2785                 int i;
2786
2787                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2788                         ret = iommu_domain_identity_map(si_domain,
2789                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2790                         if (ret)
2791                                 return ret;
2792                 }
2793         }
2794
2795         return 0;
2796 }
2797
2798 static int identity_mapping(struct device *dev)
2799 {
2800         struct device_domain_info *info;
2801
2802         if (likely(!iommu_identity_mapping))
2803                 return 0;
2804
2805         info = dev->archdata.iommu;
2806         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2807                 return (info->domain == si_domain);
2808
2809         return 0;
2810 }
2811
2812 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2813 {
2814         struct dmar_domain *ndomain;
2815         struct intel_iommu *iommu;
2816         u8 bus, devfn;
2817
2818         iommu = device_to_iommu(dev, &bus, &devfn);
2819         if (!iommu)
2820                 return -ENODEV;
2821
2822         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2823         if (ndomain != domain)
2824                 return -EBUSY;
2825
2826         return 0;
2827 }
2828
2829 static bool device_has_rmrr(struct device *dev)
2830 {
2831         struct dmar_rmrr_unit *rmrr;
2832         struct device *tmp;
2833         int i;
2834
2835         rcu_read_lock();
2836         for_each_rmrr_units(rmrr) {
2837                 /*
2838                  * Return TRUE if this RMRR contains the device that
2839                  * is passed in.
2840                  */
2841                 for_each_active_dev_scope(rmrr->devices,
2842                                           rmrr->devices_cnt, i, tmp)
2843                         if (tmp == dev) {
2844                                 rcu_read_unlock();
2845                                 return true;
2846                         }
2847         }
2848         rcu_read_unlock();
2849         return false;
2850 }
2851
2852  * There are a couple of cases where we need to restrict the functionality of
2853  * There are a couple cases where we need to restrict the functionality of
2854  * devices associated with RMRRs.  The first is when evaluating a device for
2855  * identity mapping because problems exist when devices are moved in and out
2856  * of domains and their respective RMRR information is lost.  This means that
2857  * a device with associated RMRRs will never be in a "passthrough" domain.
2858  * The second is use of the device through the IOMMU API.  This interface
2859  * expects to have full control of the IOVA space for the device.  We cannot
2860  * satisfy both the requirement that RMRR access is maintained and have an
2861  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2862  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2863  * We therefore prevent devices associated with an RMRR from participating in
2864  * the IOMMU API, which eliminates them from device assignment.
2865  *
2866  * In both cases we assume that PCI USB devices with RMRRs have them largely
2867  * for historical reasons and that the RMRR space is not actively used post
2868  * boot.  This exclusion may change if vendors begin to abuse it.
2869  *
2870  * The same exception is made for graphics devices, with the requirement that
2871  * any use of the RMRR regions will be torn down before assigning the device
2872  * to a guest.
2873  */
2874 static bool device_is_rmrr_locked(struct device *dev)
2875 {
2876         if (!device_has_rmrr(dev))
2877                 return false;
2878
2879         if (dev_is_pci(dev)) {
2880                 struct pci_dev *pdev = to_pci_dev(dev);
2881
2882                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2883                         return false;
2884         }
2885
2886         return true;
2887 }
2888
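/*
 * Decide whether a device should live in the static 1:1 identity domain.
 * 'startup' distinguishes the boot-time decision, when DMA masks are not yet
 * meaningful, from later re-evaluation based on the device's DMA mask.
 */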
2889 static int iommu_should_identity_map(struct device *dev, int startup)
2890 {
2891
2892         if (dev_is_pci(dev)) {
2893                 struct pci_dev *pdev = to_pci_dev(dev);
2894
2895                 if (device_is_rmrr_locked(dev))
2896                         return 0;
2897
2898                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2899                         return 1;
2900
2901                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2902                         return 1;
2903
2904                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2905                         return 0;
2906
2907                 /*
2908                  * We want to start off with all devices in the 1:1 domain, and
2909                  * take them out later if we find they can't access all of memory.
2910                  *
2911                  * However, we can't do this for PCI devices behind bridges,
2912                  * because all PCI devices behind the same bridge will end up
2913                  * with the same source-id on their transactions.
2914                  *
2915                  * Practically speaking, we can't change things around for these
2916                  * devices at run-time, because we can't be sure there'll be no
2917                  * DMA transactions in flight for any of their siblings.
2918                  *
2919                  * So PCI devices (unless they're on the root bus) as well as
2920                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2921                  * the 1:1 domain, just in _case_ one of their siblings turns out
2922                  * not to be able to map all of memory.
2923                  */
2924                 if (!pci_is_pcie(pdev)) {
2925                         if (!pci_is_root_bus(pdev->bus))
2926                                 return 0;
2927                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2928                                 return 0;
2929                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2930                         return 0;
2931         } else {
2932                 if (device_has_rmrr(dev))
2933                         return 0;
2934         }
2935
2936         /*
2937          * At boot time, we don't yet know if devices will be 64-bit capable.
2938          * Assume that they will — if they turn out not to be, then we can
2939          * take them out of the 1:1 domain later.
2940          */
2941         if (!startup) {
2942                 /*
2943                  * If the device's dma_mask is less than the system's memory
2944                  * size then this is not a candidate for identity mapping.
2945                  */
2946                 u64 dma_mask = *dev->dma_mask;
2947
2948                 if (dev->coherent_dma_mask &&
2949                     dev->coherent_dma_mask < dma_mask)
2950                         dma_mask = dev->coherent_dma_mask;
2951
2952                 return dma_mask >= dma_get_required_mask(dev);
2953         }
2954
2955         return 1;
2956 }
2957
2958 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2959 {
2960         int ret;
2961
2962         if (!iommu_should_identity_map(dev, 1))
2963                 return 0;
2964
2965         ret = domain_add_dev_info(si_domain, dev);
2966         if (!ret)
2967                 pr_info("%s identity mapping for device %s\n",
2968                         hw ? "Hardware" : "Software", dev_name(dev));
2969         else if (ret == -ENODEV)
2970                 /* device not associated with an iommu */
2971                 ret = 0;
2972
2973         return ret;
2974 }
2975
2976
2977 static int __init iommu_prepare_static_identity_mapping(int hw)
2978 {
2979         struct pci_dev *pdev = NULL;
2980         struct dmar_drhd_unit *drhd;
2981         struct intel_iommu *iommu;
2982         struct device *dev;
2983         int i;
2984         int ret = 0;
2985
2986         for_each_pci_dev(pdev) {
2987                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2988                 if (ret)
2989                         return ret;
2990         }
2991
2992         for_each_active_iommu(iommu, drhd)
2993                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2994                         struct acpi_device_physical_node *pn;
2995                         struct acpi_device *adev;
2996
2997                         if (dev->bus != &acpi_bus_type)
2998                                 continue;
2999
3000                         adev = to_acpi_device(dev);
3001                         mutex_lock(&adev->physical_node_lock);
3002                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3003                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3004                                 if (ret)
3005                                         break;
3006                         }
3007                         mutex_unlock(&adev->physical_node_lock);
3008                         if (ret)
3009                                 return ret;
3010                 }
3011
3012         return 0;
3013 }
3014
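     /*
      * Pick the invalidation interface for @iommu: queued invalidation if
      * it can be enabled, register-based invalidation otherwise.
      */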
3015 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3016 {
3017         /*
3018          * Start from a sane IOMMU hardware state.
3019          * If queued invalidation has already been initialized by us (for
3020          * example, while enabling interrupt remapping), then things are
3021          * already rolling from a sane state and we skip the reset below.
3022          */
3023         if (!iommu->qi) {
3024                 /*
3025                  * Clear any previous faults.
3026                  */
3027                 dmar_fault(-1, iommu);
3028                 /*
3029                  * Disable queued invalidation if supported and already enabled
3030                  * before OS handover.
3031                  */
3032                 dmar_disable_qi(iommu);
3033         }
3034
3035         if (dmar_enable_qi(iommu)) {
3036                 /*
3037                  * Queued invalidation is not available; use register-based invalidation
3038                  */
3039                 iommu->flush.flush_context = __iommu_flush_context;
3040                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3041                 pr_info("%s: Using Register based invalidation\n",
3042                         iommu->name);
3043         } else {
3044                 iommu->flush.flush_context = qi_flush_context;
3045                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3046                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3047         }
3048 }
3049
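     /*
      * Copy the context table(s) referenced by the old root entry @old_re
      * for @bus into freshly allocated pages and store them in @tbl.  With
      * the extended format each bus has two context tables (devfn 0x00-0x7f
      * and 0x80-0xff), so @tbl holds two slots per bus.
      */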
3050 static int copy_context_table(struct intel_iommu *iommu,
3051                               struct root_entry *old_re,
3052                               struct context_entry **tbl,
3053                               int bus, bool ext)
3054 {
3055         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3056         struct context_entry *new_ce = NULL, ce;
3057         struct context_entry *old_ce = NULL;
3058         struct root_entry re;
3059         phys_addr_t old_ce_phys;
3060
3061         tbl_idx = ext ? bus * 2 : bus;
3062         memcpy(&re, old_re, sizeof(re));
3063
3064         for (devfn = 0; devfn < 256; devfn++) {
3065                 /* Calculate the index; extended entries take two slots each */
3066                 idx = (ext ? devfn * 2 : devfn) % 256;
3067
3068                 if (idx == 0) {
3069                         /* First save what we may have and clean up */
3070                         if (new_ce) {
3071                                 tbl[tbl_idx] = new_ce;
3072                                 __iommu_flush_cache(iommu, new_ce,
3073                                                     VTD_PAGE_SIZE);
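                                     /* Table for devfn 0x80-0xff goes into the next slot */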
3074                                 pos = 1;
3075                         }
3076
3077                         if (old_ce)
3078                                 memunmap(old_ce);
3079
3080                         ret = 0;
3081                         if (devfn < 0x80)
3082                                 old_ce_phys = root_entry_lctp(&re);
3083                         else
3084                                 old_ce_phys = root_entry_uctp(&re);
3085
3086                         if (!old_ce_phys) {
3087                                 if (ext && devfn == 0) {
3088                                         /* No LCTP, try UCTP */
3089                                         devfn = 0x7f;
3090                                         continue;
3091                                 } else {
3092                                         goto out;
3093                                 }
3094                         }
3095
3096                         ret = -ENOMEM;
3097                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3098                                         MEMREMAP_WB);
3099                         if (!old_ce)
3100                                 goto out;
3101
3102                         new_ce = alloc_pgtable_page(iommu->node);
3103                         if (!new_ce)
3104                                 goto out_unmap;
3105
3106                         ret = 0;
3107                 }
3108
3109                 /* Now copy the context entry */
3110                 memcpy(&ce, old_ce + idx, sizeof(ce));
3111
3112                 if (!__context_present(&ce))
3113                         continue;
3114
3115                 did = context_domain_id(&ce);
3116                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3117                         set_bit(did, iommu->domain_ids);
3118
3119                 /*
3120                  * We need a marker for copied context entries. This
3121                  * marker needs to work for the old format as well as
3122                  * for extended context entries.
3123                  *
3124                  * Bit 67 of the context entry is used. In the old
3125                  * format this bit is available to software, in the
3126                  * extended format it is the PGE bit, but PGE is ignored
3127                  * by HW if PASIDs are disabled (and thus still
3128                  * available).
3129                  *
3130                  * So disable PASIDs first and then mark the entry
3131                  * copied. This means that we don't copy PASID
3132                  * translations from the old kernel, but this is fine as
3133                  * faults there are not fatal.
3134                  */
3135                 context_clear_pasid_enable(&ce);
3136                 context_set_copied(&ce);
3137
3138                 new_ce[idx] = ce;
3139         }
3140
3141         tbl[tbl_idx + pos] = new_ce;
3142
3143         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3144
3145 out_unmap:
3146         memunmap(old_ce);
3147
3148 out:
3149         return ret;
3150 }
3151
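     /*
      * Copy the root/context translation tables left programmed by the old
      * kernel (e.g. across a kexec/kdump handover) so that DMA already in
      * flight keeps hitting valid translations while this kernel takes over.
      */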
3152 static int copy_translation_tables(struct intel_iommu *iommu)
3153 {
3154         struct context_entry **ctxt_tbls;
3155         struct root_entry *old_rt;
3156         phys_addr_t old_rt_phys;
3157         int ctxt_table_entries;
3158         unsigned long flags;
3159         u64 rtaddr_reg;
3160         int bus, ret;
3161         bool new_ext, ext;
3162
3163         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3164         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3165         new_ext    = !!ecap_ecs(iommu->ecap);
3166
3167         /*
3168          * The RTT bit can only be changed when translation is disabled,
3169          * but disabling translation means to open a window for data
3170          * corruption. So bail out and don't copy anything if we would
3171          * have to change the bit.
3172          */
3173         if (new_ext != ext)
3174                 return -EINVAL;
3175
3176         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3177         if (!old_rt_phys)
3178                 return -EINVAL;
3179
3180         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3181         if (!old_rt)
3182                 return -ENOMEM;
3183
3184         /* This is too big for the stack - allocate it from slab */
3185         ctxt_table_entries = ext ? 512 : 256;
3186         ret = -ENOMEM;
3187         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3188         if (!ctxt_tbls)
3189                 goto out_unmap;
3190
3191         for (bus = 0; bus < 256; bus++) {
3192                 ret = copy_context_table(iommu, &old_rt[bus],
3193                                          ctxt_tbls, bus, ext);
3194                 if (ret) {
3195                         pr_err("%s: Failed to copy context table for bus %d\n",
3196                                 iommu->name, bus);
3197                         continue;
3198                 }
3199         }
3200
3201         spin_lock_irqsave(&iommu->lock, flags);
3202
3203         /* Context tables are copied, now write them to the root_entry table */
3204         for (bus = 0; bus < 256; bus++) {
3205                 int idx = ext ? bus * 2 : bus;
3206                 u64 val;
3207
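                     /* Bit 0 of the root entry is the present bit */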
3208                 if (ctxt_tbls[idx]) {
3209                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3210                         iommu->root_entry[bus].lo = val;
3211                 }