Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[sfrench/cifs-2.6.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
38 #include <linux/io.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/dma-direct.h>
49 #include <linux/crash_dump.h>
50 #include <asm/irq_remapping.h>
51 #include <asm/cacheflush.h>
52 #include <asm/iommu.h>
53
54 #include "irq_remapping.h"
55 #include "intel-pasid.h"
56
57 #define ROOT_SIZE               VTD_PAGE_SIZE
58 #define CONTEXT_SIZE            VTD_PAGE_SIZE
59
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
64
65 #define IOAPIC_RANGE_START      (0xfee00000)
66 #define IOAPIC_RANGE_END        (0xfeefffff)
67 #define IOVA_START_ADDR         (0x1000)
68
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
70
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
73
74 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
76
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
80                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
82
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN          (1)
85
86 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
87
88 /* page table handling */
89 #define LEVEL_STRIDE            (9)
90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
91
92 /*
93  * This bitmap is used to advertise the page sizes our hardware support
94  * to the IOMMU core, which will then use this information to split
95  * physically contiguous memory regions it is mapping into page sizes
96  * that we support.
97  *
98  * Traditionally the IOMMU core just handed us the mappings directly,
99  * after making sure the size is an order of a 4KiB page and that the
100  * mapping has natural alignment.
101  *
102  * To retain this behavior, we currently advertise that we support
103  * all page sizes that are an order of 4KiB.
104  *
105  * If at some point we'd like to utilize the IOMMU core's new behavior,
106  * we could change this to advertise the real page sizes we support.
107  */
108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
109
110 static inline int agaw_to_level(int agaw)
111 {
112         return agaw + 2;
113 }
114
115 static inline int agaw_to_width(int agaw)
116 {
117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 }
119
120 static inline int width_to_agaw(int width)
121 {
122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 }
124
125 static inline unsigned int level_to_offset_bits(int level)
126 {
127         return (level - 1) * LEVEL_STRIDE;
128 }
129
130 static inline int pfn_level_offset(unsigned long pfn, int level)
131 {
132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 }
134
135 static inline unsigned long level_mask(int level)
136 {
137         return -1UL << level_to_offset_bits(level);
138 }
139
140 static inline unsigned long level_size(int level)
141 {
142         return 1UL << level_to_offset_bits(level);
143 }
144
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
146 {
147         return (pfn + level_size(level) - 1) & level_mask(level);
148 }
149
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
151 {
152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
153 }
154
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156    are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
158 {
159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
163 {
164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
165 }
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
167 {
168         return mm_to_dma_pfn(page_to_pfn(pg));
169 }
170 static inline unsigned long virt_to_dma_pfn(void *p)
171 {
172         return page_to_dma_pfn(virt_to_page(p));
173 }
174
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
177
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
180
181 /*
182  * set to 1 to panic kernel if can't successfully enable VT-d
183  * (used when kernel is launched w/ TXT)
184  */
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
187
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
189
190 /*
191  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
192  * if marked present.
193  */
194 static phys_addr_t root_entry_lctp(struct root_entry *re)
195 {
196         if (!(re->lo & 1))
197                 return 0;
198
199         return re->lo & VTD_PAGE_MASK;
200 }
201
202 /*
203  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
204  * if marked present.
205  */
206 static phys_addr_t root_entry_uctp(struct root_entry *re)
207 {
208         if (!(re->hi & 1))
209                 return 0;
210
211         return re->hi & VTD_PAGE_MASK;
212 }
213
214 static inline void context_clear_pasid_enable(struct context_entry *context)
215 {
216         context->lo &= ~(1ULL << 11);
217 }
218
219 static inline bool context_pasid_enabled(struct context_entry *context)
220 {
221         return !!(context->lo & (1ULL << 11));
222 }
223
224 static inline void context_set_copied(struct context_entry *context)
225 {
226         context->hi |= (1ull << 3);
227 }
228
229 static inline bool context_copied(struct context_entry *context)
230 {
231         return !!(context->hi & (1ULL << 3));
232 }
233
234 static inline bool __context_present(struct context_entry *context)
235 {
236         return (context->lo & 1);
237 }
238
239 bool context_present(struct context_entry *context)
240 {
241         return context_pasid_enabled(context) ?
242              __context_present(context) :
243              __context_present(context) && !context_copied(context);
244 }
245
246 static inline void context_set_present(struct context_entry *context)
247 {
248         context->lo |= 1;
249 }
250
251 static inline void context_set_fault_enable(struct context_entry *context)
252 {
253         context->lo &= (((u64)-1) << 2) | 1;
254 }
255
256 static inline void context_set_translation_type(struct context_entry *context,
257                                                 unsigned long value)
258 {
259         context->lo &= (((u64)-1) << 4) | 3;
260         context->lo |= (value & 3) << 2;
261 }
262
263 static inline void context_set_address_root(struct context_entry *context,
264                                             unsigned long value)
265 {
266         context->lo &= ~VTD_PAGE_MASK;
267         context->lo |= value & VTD_PAGE_MASK;
268 }
269
270 static inline void context_set_address_width(struct context_entry *context,
271                                              unsigned long value)
272 {
273         context->hi |= value & 7;
274 }
275
276 static inline void context_set_domain_id(struct context_entry *context,
277                                          unsigned long value)
278 {
279         context->hi |= (value & ((1 << 16) - 1)) << 8;
280 }
281
282 static inline int context_domain_id(struct context_entry *c)
283 {
284         return((c->hi >> 8) & 0xffff);
285 }
286
287 static inline void context_clear_entry(struct context_entry *context)
288 {
289         context->lo = 0;
290         context->hi = 0;
291 }
292
293 /*
294  * 0: readable
295  * 1: writable
296  * 2-6: reserved
297  * 7: super page
298  * 8-10: available
299  * 11: snoop behavior
300  * 12-63: Host physcial address
301  */
302 struct dma_pte {
303         u64 val;
304 };
305
306 static inline void dma_clear_pte(struct dma_pte *pte)
307 {
308         pte->val = 0;
309 }
310
311 static inline u64 dma_pte_addr(struct dma_pte *pte)
312 {
313 #ifdef CONFIG_64BIT
314         return pte->val & VTD_PAGE_MASK;
315 #else
316         /* Must have a full atomic 64-bit read */
317         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
318 #endif
319 }
320
321 static inline bool dma_pte_present(struct dma_pte *pte)
322 {
323         return (pte->val & 3) != 0;
324 }
325
326 static inline bool dma_pte_superpage(struct dma_pte *pte)
327 {
328         return (pte->val & DMA_PTE_LARGE_PAGE);
329 }
330
331 static inline int first_pte_in_page(struct dma_pte *pte)
332 {
333         return !((unsigned long)pte & ~VTD_PAGE_MASK);
334 }
335
336 /*
337  * This domain is a statically identity mapping domain.
338  *      1. This domain creats a static 1:1 mapping to all usable memory.
339  *      2. It maps to each iommu if successful.
340  *      3. Each iommu mapps to this domain if successful.
341  */
342 static struct dmar_domain *si_domain;
343 static int hw_pass_through = 1;
344
345 /*
346  * Domain represents a virtual machine, more than one devices
347  * across iommus may be owned in one domain, e.g. kvm guest.
348  */
349 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
350
351 /* si_domain contains mulitple devices */
352 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
353
354 #define for_each_domain_iommu(idx, domain)                      \
355         for (idx = 0; idx < g_num_of_iommus; idx++)             \
356                 if (domain->iommu_refcnt[idx])
357
358 struct dmar_rmrr_unit {
359         struct list_head list;          /* list of rmrr units   */
360         struct acpi_dmar_header *hdr;   /* ACPI header          */
361         u64     base_address;           /* reserved base address*/
362         u64     end_address;            /* reserved end address */
363         struct dmar_dev_scope *devices; /* target devices */
364         int     devices_cnt;            /* target device count */
365         struct iommu_resv_region *resv; /* reserved region handle */
366 };
367
368 struct dmar_atsr_unit {
369         struct list_head list;          /* list of ATSR units */
370         struct acpi_dmar_header *hdr;   /* ACPI header */
371         struct dmar_dev_scope *devices; /* target devices */
372         int devices_cnt;                /* target device count */
373         u8 include_all:1;               /* include all ports */
374 };
375
376 static LIST_HEAD(dmar_atsr_units);
377 static LIST_HEAD(dmar_rmrr_units);
378
379 #define for_each_rmrr_units(rmrr) \
380         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
381
382 /* bitmap for indexing intel_iommus */
383 static int g_num_of_iommus;
384
385 static void domain_exit(struct dmar_domain *domain);
386 static void domain_remove_dev_info(struct dmar_domain *domain);
387 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
388                                      struct device *dev);
389 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
390 static void domain_context_clear(struct intel_iommu *iommu,
391                                  struct device *dev);
392 static int domain_detach_iommu(struct dmar_domain *domain,
393                                struct intel_iommu *iommu);
394
395 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
396 int dmar_disabled = 0;
397 #else
398 int dmar_disabled = 1;
399 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
400
401 int intel_iommu_enabled = 0;
402 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
403
404 static int dmar_map_gfx = 1;
405 static int dmar_forcedac;
406 static int intel_iommu_strict;
407 static int intel_iommu_superpage = 1;
408 static int intel_iommu_ecs = 1;
409 static int intel_iommu_pasid28;
410 static int iommu_identity_mapping;
411
412 #define IDENTMAP_ALL            1
413 #define IDENTMAP_GFX            2
414 #define IDENTMAP_AZALIA         4
415
416 /* Broadwell and Skylake have broken ECS support — normal so-called "second
417  * level" translation of DMA requests-without-PASID doesn't actually happen
418  * unless you also set the NESTE bit in an extended context-entry. Which of
419  * course means that SVM doesn't work because it's trying to do nested
420  * translation of the physical addresses it finds in the process page tables,
421  * through the IOVA->phys mapping found in the "second level" page tables.
422  *
423  * The VT-d specification was retroactively changed to change the definition
424  * of the capability bits and pretend that Broadwell/Skylake never happened...
425  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
426  * for some reason it was the PASID capability bit which was redefined (from
427  * bit 28 on BDW/SKL to bit 40 in future).
428  *
429  * So our test for ECS needs to eschew those implementations which set the old
430  * PASID capabiity bit 28, since those are the ones on which ECS is broken.
431  * Unless we are working around the 'pasid28' limitations, that is, by putting
432  * the device into passthrough mode for normal DMA and thus masking the bug.
433  */
434 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
435                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
436 /* PASID support is thus enabled if ECS is enabled and *either* of the old
437  * or new capability bits are set. */
438 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
439                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
440
441 int intel_iommu_gfx_mapped;
442 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
443
444 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
445 static DEFINE_SPINLOCK(device_domain_lock);
446 static LIST_HEAD(device_domain_list);
447
448 /*
449  * Iterate over elements in device_domain_list and call the specified
450  * callback @fn against each element. This helper should only be used
451  * in the context where the device_domain_lock has already been holden.
452  */
453 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
454                                      void *data), void *data)
455 {
456         int ret = 0;
457         struct device_domain_info *info;
458
459         assert_spin_locked(&device_domain_lock);
460         list_for_each_entry(info, &device_domain_list, global) {
461                 ret = fn(info, data);
462                 if (ret)
463                         return ret;
464         }
465
466         return 0;
467 }
468
469 const struct iommu_ops intel_iommu_ops;
470
471 static bool translation_pre_enabled(struct intel_iommu *iommu)
472 {
473         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
474 }
475
476 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
477 {
478         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
479 }
480
481 static void init_translation_status(struct intel_iommu *iommu)
482 {
483         u32 gsts;
484
485         gsts = readl(iommu->reg + DMAR_GSTS_REG);
486         if (gsts & DMA_GSTS_TES)
487                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
488 }
489
490 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
491 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
492 {
493         return container_of(dom, struct dmar_domain, domain);
494 }
495
496 static int __init intel_iommu_setup(char *str)
497 {
498         if (!str)
499                 return -EINVAL;
500         while (*str) {
501                 if (!strncmp(str, "on", 2)) {
502                         dmar_disabled = 0;
503                         pr_info("IOMMU enabled\n");
504                 } else if (!strncmp(str, "off", 3)) {
505                         dmar_disabled = 1;
506                         pr_info("IOMMU disabled\n");
507                 } else if (!strncmp(str, "igfx_off", 8)) {
508                         dmar_map_gfx = 0;
509                         pr_info("Disable GFX device mapping\n");
510                 } else if (!strncmp(str, "forcedac", 8)) {
511                         pr_info("Forcing DAC for PCI devices\n");
512                         dmar_forcedac = 1;
513                 } else if (!strncmp(str, "strict", 6)) {
514                         pr_info("Disable batched IOTLB flush\n");
515                         intel_iommu_strict = 1;
516                 } else if (!strncmp(str, "sp_off", 6)) {
517                         pr_info("Disable supported super page\n");
518                         intel_iommu_superpage = 0;
519                 } else if (!strncmp(str, "ecs_off", 7)) {
520                         printk(KERN_INFO
521                                 "Intel-IOMMU: disable extended context table support\n");
522                         intel_iommu_ecs = 0;
523                 } else if (!strncmp(str, "pasid28", 7)) {
524                         printk(KERN_INFO
525                                 "Intel-IOMMU: enable pre-production PASID support\n");
526                         intel_iommu_pasid28 = 1;
527                         iommu_identity_mapping |= IDENTMAP_GFX;
528                 } else if (!strncmp(str, "tboot_noforce", 13)) {
529                         printk(KERN_INFO
530                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
531                         intel_iommu_tboot_noforce = 1;
532                 }
533
534                 str += strcspn(str, ",");
535                 while (*str == ',')
536                         str++;
537         }
538         return 0;
539 }
540 __setup("intel_iommu=", intel_iommu_setup);
541
542 static struct kmem_cache *iommu_domain_cache;
543 static struct kmem_cache *iommu_devinfo_cache;
544
545 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
546 {
547         struct dmar_domain **domains;
548         int idx = did >> 8;
549
550         domains = iommu->domains[idx];
551         if (!domains)
552                 return NULL;
553
554         return domains[did & 0xff];
555 }
556
557 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
558                              struct dmar_domain *domain)
559 {
560         struct dmar_domain **domains;
561         int idx = did >> 8;
562
563         if (!iommu->domains[idx]) {
564                 size_t size = 256 * sizeof(struct dmar_domain *);
565                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
566         }
567
568         domains = iommu->domains[idx];
569         if (WARN_ON(!domains))
570                 return;
571         else
572                 domains[did & 0xff] = domain;
573 }
574
575 void *alloc_pgtable_page(int node)
576 {
577         struct page *page;
578         void *vaddr = NULL;
579
580         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
581         if (page)
582                 vaddr = page_address(page);
583         return vaddr;
584 }
585
586 void free_pgtable_page(void *vaddr)
587 {
588         free_page((unsigned long)vaddr);
589 }
590
591 static inline void *alloc_domain_mem(void)
592 {
593         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
594 }
595
596 static void free_domain_mem(void *vaddr)
597 {
598         kmem_cache_free(iommu_domain_cache, vaddr);
599 }
600
601 static inline void * alloc_devinfo_mem(void)
602 {
603         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
604 }
605
606 static inline void free_devinfo_mem(void *vaddr)
607 {
608         kmem_cache_free(iommu_devinfo_cache, vaddr);
609 }
610
611 static inline int domain_type_is_vm(struct dmar_domain *domain)
612 {
613         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
614 }
615
616 static inline int domain_type_is_si(struct dmar_domain *domain)
617 {
618         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
619 }
620
621 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
622 {
623         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
624                                 DOMAIN_FLAG_STATIC_IDENTITY);
625 }
626
627 static inline int domain_pfn_supported(struct dmar_domain *domain,
628                                        unsigned long pfn)
629 {
630         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
631
632         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
633 }
634
635 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
636 {
637         unsigned long sagaw;
638         int agaw = -1;
639
640         sagaw = cap_sagaw(iommu->cap);
641         for (agaw = width_to_agaw(max_gaw);
642              agaw >= 0; agaw--) {
643                 if (test_bit(agaw, &sagaw))
644                         break;
645         }
646
647         return agaw;
648 }
649
650 /*
651  * Calculate max SAGAW for each iommu.
652  */
653 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
654 {
655         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
656 }
657
658 /*
659  * calculate agaw for each iommu.
660  * "SAGAW" may be different across iommus, use a default agaw, and
661  * get a supported less agaw for iommus that don't support the default agaw.
662  */
663 int iommu_calculate_agaw(struct intel_iommu *iommu)
664 {
665         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
666 }
667
668 /* This functionin only returns single iommu in a domain */
669 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
670 {
671         int iommu_id;
672
673         /* si_domain and vm domain should not get here. */
674         BUG_ON(domain_type_is_vm_or_si(domain));
675         for_each_domain_iommu(iommu_id, domain)
676                 break;
677
678         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
679                 return NULL;
680
681         return g_iommus[iommu_id];
682 }
683
684 static void domain_update_iommu_coherency(struct dmar_domain *domain)
685 {
686         struct dmar_drhd_unit *drhd;
687         struct intel_iommu *iommu;
688         bool found = false;
689         int i;
690
691         domain->iommu_coherency = 1;
692
693         for_each_domain_iommu(i, domain) {
694                 found = true;
695                 if (!ecap_coherent(g_iommus[i]->ecap)) {
696                         domain->iommu_coherency = 0;
697                         break;
698                 }
699         }
700         if (found)
701                 return;
702
703         /* No hardware attached; use lowest common denominator */
704         rcu_read_lock();
705         for_each_active_iommu(iommu, drhd) {
706                 if (!ecap_coherent(iommu->ecap)) {
707                         domain->iommu_coherency = 0;
708                         break;
709                 }
710         }
711         rcu_read_unlock();
712 }
713
714 static int domain_update_iommu_snooping(struct intel_iommu *skip)
715 {
716         struct dmar_drhd_unit *drhd;
717         struct intel_iommu *iommu;
718         int ret = 1;
719
720         rcu_read_lock();
721         for_each_active_iommu(iommu, drhd) {
722                 if (iommu != skip) {
723                         if (!ecap_sc_support(iommu->ecap)) {
724                                 ret = 0;
725                                 break;
726                         }
727                 }
728         }
729         rcu_read_unlock();
730
731         return ret;
732 }
733
734 static int domain_update_iommu_superpage(struct intel_iommu *skip)
735 {
736         struct dmar_drhd_unit *drhd;
737         struct intel_iommu *iommu;
738         int mask = 0xf;
739
740         if (!intel_iommu_superpage) {
741                 return 0;
742         }
743
744         /* set iommu_superpage to the smallest common denominator */
745         rcu_read_lock();
746         for_each_active_iommu(iommu, drhd) {
747                 if (iommu != skip) {
748                         mask &= cap_super_page_val(iommu->cap);
749                         if (!mask)
750                                 break;
751                 }
752         }
753         rcu_read_unlock();
754
755         return fls(mask);
756 }
757
758 /* Some capabilities may be different across iommus */
759 static void domain_update_iommu_cap(struct dmar_domain *domain)
760 {
761         domain_update_iommu_coherency(domain);
762         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
763         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
764 }
765
766 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
767                                          u8 devfn, int alloc)
768 {
769         struct root_entry *root = &iommu->root_entry[bus];
770         struct context_entry *context;
771         u64 *entry;
772
773         entry = &root->lo;
774         if (ecs_enabled(iommu)) {
775                 if (devfn >= 0x80) {
776                         devfn -= 0x80;
777                         entry = &root->hi;
778                 }
779                 devfn *= 2;
780         }
781         if (*entry & 1)
782                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
783         else {
784                 unsigned long phy_addr;
785                 if (!alloc)
786                         return NULL;
787
788                 context = alloc_pgtable_page(iommu->node);
789                 if (!context)
790                         return NULL;
791
792                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
793                 phy_addr = virt_to_phys((void *)context);
794                 *entry = phy_addr | 1;
795                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
796         }
797         return &context[devfn];
798 }
799
800 static int iommu_dummy(struct device *dev)
801 {
802         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
803 }
804
805 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
806 {
807         struct dmar_drhd_unit *drhd = NULL;
808         struct intel_iommu *iommu;
809         struct device *tmp;
810         struct pci_dev *ptmp, *pdev = NULL;
811         u16 segment = 0;
812         int i;
813
814         if (iommu_dummy(dev))
815                 return NULL;
816
817         if (dev_is_pci(dev)) {
818                 struct pci_dev *pf_pdev;
819
820                 pdev = to_pci_dev(dev);
821
822 #ifdef CONFIG_X86
823                 /* VMD child devices currently cannot be handled individually */
824                 if (is_vmd(pdev->bus))
825                         return NULL;
826 #endif
827
828                 /* VFs aren't listed in scope tables; we need to look up
829                  * the PF instead to find the IOMMU. */
830                 pf_pdev = pci_physfn(pdev);
831                 dev = &pf_pdev->dev;
832                 segment = pci_domain_nr(pdev->bus);
833         } else if (has_acpi_companion(dev))
834                 dev = &ACPI_COMPANION(dev)->dev;
835
836         rcu_read_lock();
837         for_each_active_iommu(iommu, drhd) {
838                 if (pdev && segment != drhd->segment)
839                         continue;
840
841                 for_each_active_dev_scope(drhd->devices,
842                                           drhd->devices_cnt, i, tmp) {
843                         if (tmp == dev) {
844                                 /* For a VF use its original BDF# not that of the PF
845                                  * which we used for the IOMMU lookup. Strictly speaking
846                                  * we could do this for all PCI devices; we only need to
847                                  * get the BDF# from the scope table for ACPI matches. */
848                                 if (pdev && pdev->is_virtfn)
849                                         goto got_pdev;
850
851                                 *bus = drhd->devices[i].bus;
852                                 *devfn = drhd->devices[i].devfn;
853                                 goto out;
854                         }
855
856                         if (!pdev || !dev_is_pci(tmp))
857                                 continue;
858
859                         ptmp = to_pci_dev(tmp);
860                         if (ptmp->subordinate &&
861                             ptmp->subordinate->number <= pdev->bus->number &&
862                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
863                                 goto got_pdev;
864                 }
865
866                 if (pdev && drhd->include_all) {
867                 got_pdev:
868                         *bus = pdev->bus->number;
869                         *devfn = pdev->devfn;
870                         goto out;
871                 }
872         }
873         iommu = NULL;
874  out:
875         rcu_read_unlock();
876
877         return iommu;
878 }
879
880 static void domain_flush_cache(struct dmar_domain *domain,
881                                void *addr, int size)
882 {
883         if (!domain->iommu_coherency)
884                 clflush_cache_range(addr, size);
885 }
886
887 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
888 {
889         struct context_entry *context;
890         int ret = 0;
891         unsigned long flags;
892
893         spin_lock_irqsave(&iommu->lock, flags);
894         context = iommu_context_addr(iommu, bus, devfn, 0);
895         if (context)
896                 ret = context_present(context);
897         spin_unlock_irqrestore(&iommu->lock, flags);
898         return ret;
899 }
900
901 static void free_context_table(struct intel_iommu *iommu)
902 {
903         int i;
904         unsigned long flags;
905         struct context_entry *context;
906
907         spin_lock_irqsave(&iommu->lock, flags);
908         if (!iommu->root_entry) {
909                 goto out;
910         }
911         for (i = 0; i < ROOT_ENTRY_NR; i++) {
912                 context = iommu_context_addr(iommu, i, 0, 0);
913                 if (context)
914                         free_pgtable_page(context);
915
916                 if (!ecs_enabled(iommu))
917                         continue;
918
919                 context = iommu_context_addr(iommu, i, 0x80, 0);
920                 if (context)
921                         free_pgtable_page(context);
922
923         }
924         free_pgtable_page(iommu->root_entry);
925         iommu->root_entry = NULL;
926 out:
927         spin_unlock_irqrestore(&iommu->lock, flags);
928 }
929
930 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
931                                       unsigned long pfn, int *target_level)
932 {
933         struct dma_pte *parent, *pte = NULL;
934         int level = agaw_to_level(domain->agaw);
935         int offset;
936
937         BUG_ON(!domain->pgd);
938
939         if (!domain_pfn_supported(domain, pfn))
940                 /* Address beyond IOMMU's addressing capabilities. */
941                 return NULL;
942
943         parent = domain->pgd;
944
945         while (1) {
946                 void *tmp_page;
947
948                 offset = pfn_level_offset(pfn, level);
949                 pte = &parent[offset];
950                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
951                         break;
952                 if (level == *target_level)
953                         break;
954
955                 if (!dma_pte_present(pte)) {
956                         uint64_t pteval;
957
958                         tmp_page = alloc_pgtable_page(domain->nid);
959
960                         if (!tmp_page)
961                                 return NULL;
962
963                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
964                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
965                         if (cmpxchg64(&pte->val, 0ULL, pteval))
966                                 /* Someone else set it while we were thinking; use theirs. */
967                                 free_pgtable_page(tmp_page);
968                         else
969                                 domain_flush_cache(domain, pte, sizeof(*pte));
970                 }
971                 if (level == 1)
972                         break;
973
974                 parent = phys_to_virt(dma_pte_addr(pte));
975                 level--;
976         }
977
978         if (!*target_level)
979                 *target_level = level;
980
981         return pte;
982 }
983
984
985 /* return address's pte at specific level */
986 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
987                                          unsigned long pfn,
988                                          int level, int *large_page)
989 {
990         struct dma_pte *parent, *pte = NULL;
991         int total = agaw_to_level(domain->agaw);
992         int offset;
993
994         parent = domain->pgd;
995         while (level <= total) {
996                 offset = pfn_level_offset(pfn, total);
997                 pte = &parent[offset];
998                 if (level == total)
999                         return pte;
1000
1001                 if (!dma_pte_present(pte)) {
1002                         *large_page = total;
1003                         break;
1004                 }
1005
1006                 if (dma_pte_superpage(pte)) {
1007                         *large_page = total;
1008                         return pte;
1009                 }
1010
1011                 parent = phys_to_virt(dma_pte_addr(pte));
1012                 total--;
1013         }
1014         return NULL;
1015 }
1016
1017 /* clear last level pte, a tlb flush should be followed */
1018 static void dma_pte_clear_range(struct dmar_domain *domain,
1019                                 unsigned long start_pfn,
1020                                 unsigned long last_pfn)
1021 {
1022         unsigned int large_page = 1;
1023         struct dma_pte *first_pte, *pte;
1024
1025         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1026         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1027         BUG_ON(start_pfn > last_pfn);
1028
1029         /* we don't need lock here; nobody else touches the iova range */
1030         do {
1031                 large_page = 1;
1032                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1033                 if (!pte) {
1034                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1035                         continue;
1036                 }
1037                 do {
1038                         dma_clear_pte(pte);
1039                         start_pfn += lvl_to_nr_pages(large_page);
1040                         pte++;
1041                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1042
1043                 domain_flush_cache(domain, first_pte,
1044                                    (void *)pte - (void *)first_pte);
1045
1046         } while (start_pfn && start_pfn <= last_pfn);
1047 }
1048
1049 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1050                                int retain_level, struct dma_pte *pte,
1051                                unsigned long pfn, unsigned long start_pfn,
1052                                unsigned long last_pfn)
1053 {
1054         pfn = max(start_pfn, pfn);
1055         pte = &pte[pfn_level_offset(pfn, level)];
1056
1057         do {
1058                 unsigned long level_pfn;
1059                 struct dma_pte *level_pte;
1060
1061                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1062                         goto next;
1063
1064                 level_pfn = pfn & level_mask(level);
1065                 level_pte = phys_to_virt(dma_pte_addr(pte));
1066
1067                 if (level > 2) {
1068                         dma_pte_free_level(domain, level - 1, retain_level,
1069                                            level_pte, level_pfn, start_pfn,
1070                                            last_pfn);
1071                 }
1072
1073                 /*
1074                  * Free the page table if we're below the level we want to
1075                  * retain and the range covers the entire table.
1076                  */
1077                 if (level < retain_level && !(start_pfn > level_pfn ||
1078                       last_pfn < level_pfn + level_size(level) - 1)) {
1079                         dma_clear_pte(pte);
1080                         domain_flush_cache(domain, pte, sizeof(*pte));
1081                         free_pgtable_page(level_pte);
1082                 }
1083 next:
1084                 pfn += level_size(level);
1085         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1086 }
1087
1088 /*
1089  * clear last level (leaf) ptes and free page table pages below the
1090  * level we wish to keep intact.
1091  */
1092 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1093                                    unsigned long start_pfn,
1094                                    unsigned long last_pfn,
1095                                    int retain_level)
1096 {
1097         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1098         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1099         BUG_ON(start_pfn > last_pfn);
1100
1101         dma_pte_clear_range(domain, start_pfn, last_pfn);
1102
1103         /* We don't need lock here; nobody else touches the iova range */
1104         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1105                            domain->pgd, 0, start_pfn, last_pfn);
1106
1107         /* free pgd */
1108         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1109                 free_pgtable_page(domain->pgd);
1110                 domain->pgd = NULL;
1111         }
1112 }
1113
1114 /* When a page at a given level is being unlinked from its parent, we don't
1115    need to *modify* it at all. All we need to do is make a list of all the
1116    pages which can be freed just as soon as we've flushed the IOTLB and we
1117    know the hardware page-walk will no longer touch them.
1118    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1119    be freed. */
1120 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1121                                             int level, struct dma_pte *pte,
1122                                             struct page *freelist)
1123 {
1124         struct page *pg;
1125
1126         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1127         pg->freelist = freelist;
1128         freelist = pg;
1129
1130         if (level == 1)
1131                 return freelist;
1132
1133         pte = page_address(pg);
1134         do {
1135                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1136                         freelist = dma_pte_list_pagetables(domain, level - 1,
1137                                                            pte, freelist);
1138                 pte++;
1139         } while (!first_pte_in_page(pte));
1140
1141         return freelist;
1142 }
1143
1144 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1145                                         struct dma_pte *pte, unsigned long pfn,
1146                                         unsigned long start_pfn,
1147                                         unsigned long last_pfn,
1148                                         struct page *freelist)
1149 {
1150         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1151
1152         pfn = max(start_pfn, pfn);
1153         pte = &pte[pfn_level_offset(pfn, level)];
1154
1155         do {
1156                 unsigned long level_pfn;
1157
1158                 if (!dma_pte_present(pte))
1159                         goto next;
1160
1161                 level_pfn = pfn & level_mask(level);
1162
1163                 /* If range covers entire pagetable, free it */
1164                 if (start_pfn <= level_pfn &&
1165                     last_pfn >= level_pfn + level_size(level) - 1) {
1166                         /* These suborbinate page tables are going away entirely. Don't
1167                            bother to clear them; we're just going to *free* them. */
1168                         if (level > 1 && !dma_pte_superpage(pte))
1169                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1170
1171                         dma_clear_pte(pte);
1172                         if (!first_pte)
1173                                 first_pte = pte;
1174                         last_pte = pte;
1175                 } else if (level > 1) {
1176                         /* Recurse down into a level that isn't *entirely* obsolete */
1177                         freelist = dma_pte_clear_level(domain, level - 1,
1178                                                        phys_to_virt(dma_pte_addr(pte)),
1179                                                        level_pfn, start_pfn, last_pfn,
1180                                                        freelist);
1181                 }
1182 next:
1183                 pfn += level_size(level);
1184         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1185
1186         if (first_pte)
1187                 domain_flush_cache(domain, first_pte,
1188                                    (void *)++last_pte - (void *)first_pte);
1189
1190         return freelist;
1191 }
1192
1193 /* We can't just free the pages because the IOMMU may still be walking
1194    the page tables, and may have cached the intermediate levels. The
1195    pages can only be freed after the IOTLB flush has been done. */
1196 static struct page *domain_unmap(struct dmar_domain *domain,
1197                                  unsigned long start_pfn,
1198                                  unsigned long last_pfn)
1199 {
1200         struct page *freelist = NULL;
1201
1202         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1203         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1204         BUG_ON(start_pfn > last_pfn);
1205
1206         /* we don't need lock here; nobody else touches the iova range */
1207         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1208                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1209
1210         /* free pgd */
1211         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1212                 struct page *pgd_page = virt_to_page(domain->pgd);
1213                 pgd_page->freelist = freelist;
1214                 freelist = pgd_page;
1215
1216                 domain->pgd = NULL;
1217         }
1218
1219         return freelist;
1220 }
1221
1222 static void dma_free_pagelist(struct page *freelist)
1223 {
1224         struct page *pg;
1225
1226         while ((pg = freelist)) {
1227                 freelist = pg->freelist;
1228                 free_pgtable_page(page_address(pg));
1229         }
1230 }
1231
1232 static void iova_entry_free(unsigned long data)
1233 {
1234         struct page *freelist = (struct page *)data;
1235
1236         dma_free_pagelist(freelist);
1237 }
1238
1239 /* iommu handling */
1240 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1241 {
1242         struct root_entry *root;
1243         unsigned long flags;
1244
1245         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1246         if (!root) {
1247                 pr_err("Allocating root entry for %s failed\n",
1248                         iommu->name);
1249                 return -ENOMEM;
1250         }
1251
1252         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1253
1254         spin_lock_irqsave(&iommu->lock, flags);
1255         iommu->root_entry = root;
1256         spin_unlock_irqrestore(&iommu->lock, flags);
1257
1258         return 0;
1259 }
1260
1261 static void iommu_set_root_entry(struct intel_iommu *iommu)
1262 {
1263         u64 addr;
1264         u32 sts;
1265         unsigned long flag;
1266
1267         addr = virt_to_phys(iommu->root_entry);
1268         if (ecs_enabled(iommu))
1269                 addr |= DMA_RTADDR_RTT;
1270
1271         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1272         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1273
1274         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1275
1276         /* Make sure hardware complete it */
1277         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1278                       readl, (sts & DMA_GSTS_RTPS), sts);
1279
1280         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1281 }
1282
1283 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1284 {
1285         u32 val;
1286         unsigned long flag;
1287
1288         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1289                 return;
1290
1291         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1292         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1293
1294         /* Make sure hardware complete it */
1295         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1296                       readl, (!(val & DMA_GSTS_WBFS)), val);
1297
1298         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1299 }
1300
1301 /* return value determine if we need a write buffer flush */
1302 static void __iommu_flush_context(struct intel_iommu *iommu,
1303                                   u16 did, u16 source_id, u8 function_mask,
1304                                   u64 type)
1305 {
1306         u64 val = 0;
1307         unsigned long flag;
1308
1309         switch (type) {
1310         case DMA_CCMD_GLOBAL_INVL:
1311                 val = DMA_CCMD_GLOBAL_INVL;
1312                 break;
1313         case DMA_CCMD_DOMAIN_INVL:
1314                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1315                 break;
1316         case DMA_CCMD_DEVICE_INVL:
1317                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1318                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1319                 break;
1320         default:
1321                 BUG();
1322         }
1323         val |= DMA_CCMD_ICC;
1324
1325         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1326         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1327
1328         /* Make sure hardware complete it */
1329         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1330                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1331
1332         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1333 }
1334
1335 /* return value determine if we need a write buffer flush */
1336 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1337                                 u64 addr, unsigned int size_order, u64 type)
1338 {
1339         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1340         u64 val = 0, val_iva = 0;
1341         unsigned long flag;
1342
1343         switch (type) {
1344         case DMA_TLB_GLOBAL_FLUSH:
1345                 /* global flush doesn't need set IVA_REG */
1346                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1347                 break;
1348         case DMA_TLB_DSI_FLUSH:
1349                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1350                 break;
1351         case DMA_TLB_PSI_FLUSH:
1352                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1353                 /* IH bit is passed in as part of address */
1354                 val_iva = size_order | addr;
1355                 break;
1356         default:
1357                 BUG();
1358         }
1359         /* Note: set drain read/write */
1360 #if 0
1361         /*
1362          * This is probably to be super secure.. Looks like we can
1363          * ignore it without any impact.
1364          */
1365         if (cap_read_drain(iommu->cap))
1366                 val |= DMA_TLB_READ_DRAIN;
1367 #endif
1368         if (cap_write_drain(iommu->cap))
1369                 val |= DMA_TLB_WRITE_DRAIN;
1370
1371         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1372         /* Note: Only uses first TLB reg currently */
1373         if (val_iva)
1374                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1375         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1376
1377         /* Make sure hardware complete it */
1378         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1379                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1380
1381         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1382
1383         /* check IOTLB invalidation granularity */
1384         if (DMA_TLB_IAIG(val) == 0)
1385                 pr_err("Flush IOTLB failed\n");
1386         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1387                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1388                         (unsigned long long)DMA_TLB_IIRG(type),
1389                         (unsigned long long)DMA_TLB_IAIG(val));
1390 }
1391
1392 static struct device_domain_info *
1393 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1394                          u8 bus, u8 devfn)
1395 {
1396         struct device_domain_info *info;
1397
1398         assert_spin_locked(&device_domain_lock);
1399
1400         if (!iommu->qi)
1401                 return NULL;
1402
1403         list_for_each_entry(info, &domain->devices, link)
1404                 if (info->iommu == iommu && info->bus == bus &&
1405                     info->devfn == devfn) {
1406                         if (info->ats_supported && info->dev)
1407                                 return info;
1408                         break;
1409                 }
1410
1411         return NULL;
1412 }
1413
1414 static void domain_update_iotlb(struct dmar_domain *domain)
1415 {
1416         struct device_domain_info *info;
1417         bool has_iotlb_device = false;
1418
1419         assert_spin_locked(&device_domain_lock);
1420
1421         list_for_each_entry(info, &domain->devices, link) {
1422                 struct pci_dev *pdev;
1423
1424                 if (!info->dev || !dev_is_pci(info->dev))
1425                         continue;
1426
1427                 pdev = to_pci_dev(info->dev);
1428                 if (pdev->ats_enabled) {
1429                         has_iotlb_device = true;
1430                         break;
1431                 }
1432         }
1433
1434         domain->has_iotlb_device = has_iotlb_device;
1435 }
1436
1437 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1438 {
1439         struct pci_dev *pdev;
1440
1441         assert_spin_locked(&device_domain_lock);
1442
1443         if (!info || !dev_is_pci(info->dev))
1444                 return;
1445
1446         pdev = to_pci_dev(info->dev);
1447         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1448          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1449          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1450          * reserved, which should be set to 0.
1451          */
1452         if (!ecap_dit(info->iommu->ecap))
1453                 info->pfsid = 0;
1454         else {
1455                 struct pci_dev *pf_pdev;
1456
1457                 /* pdev will be returned if device is not a vf */
1458                 pf_pdev = pci_physfn(pdev);
1459                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1460         }
1461
1462 #ifdef CONFIG_INTEL_IOMMU_SVM
1463         /* The PCIe spec, in its wisdom, declares that the behaviour of
1464            the device if you enable PASID support after ATS support is
1465            undefined. So always enable PASID support on devices which
1466            have it, even if we can't yet know if we're ever going to
1467            use it. */
1468         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1469                 info->pasid_enabled = 1;
1470
1471         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1472                 info->pri_enabled = 1;
1473 #endif
1474         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1475                 info->ats_enabled = 1;
1476                 domain_update_iotlb(info->domain);
1477                 info->ats_qdep = pci_ats_queue_depth(pdev);
1478         }
1479 }
1480
1481 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1482 {
1483         struct pci_dev *pdev;
1484
1485         assert_spin_locked(&device_domain_lock);
1486
1487         if (!dev_is_pci(info->dev))
1488                 return;
1489
1490         pdev = to_pci_dev(info->dev);
1491
1492         if (info->ats_enabled) {
1493                 pci_disable_ats(pdev);
1494                 info->ats_enabled = 0;
1495                 domain_update_iotlb(info->domain);
1496         }
1497 #ifdef CONFIG_INTEL_IOMMU_SVM
1498         if (info->pri_enabled) {
1499                 pci_disable_pri(pdev);
1500                 info->pri_enabled = 0;
1501         }
1502         if (info->pasid_enabled) {
1503                 pci_disable_pasid(pdev);
1504                 info->pasid_enabled = 0;
1505         }
1506 #endif
1507 }
1508
1509 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1510                                   u64 addr, unsigned mask)
1511 {
1512         u16 sid, qdep;
1513         unsigned long flags;
1514         struct device_domain_info *info;
1515
1516         if (!domain->has_iotlb_device)
1517                 return;
1518
1519         spin_lock_irqsave(&device_domain_lock, flags);
1520         list_for_each_entry(info, &domain->devices, link) {
1521                 if (!info->ats_enabled)
1522                         continue;
1523
1524                 sid = info->bus << 8 | info->devfn;
1525                 qdep = info->ats_qdep;
1526                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1527                                 qdep, addr, mask);
1528         }
1529         spin_unlock_irqrestore(&device_domain_lock, flags);
1530 }
1531
1532 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1533                                   struct dmar_domain *domain,
1534                                   unsigned long pfn, unsigned int pages,
1535                                   int ih, int map)
1536 {
1537         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1538         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1539         u16 did = domain->iommu_did[iommu->seq_id];
1540
1541         BUG_ON(pages == 0);
1542
1543         if (ih)
1544                 ih = 1 << 6;
1545         /*
1546          * Fallback to domain selective flush if no PSI support or the size is
1547          * too big.
1548          * PSI requires page size to be 2 ^ x, and the base address is naturally
1549          * aligned to the size
1550          */
1551         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1552                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1553                                                 DMA_TLB_DSI_FLUSH);
1554         else
1555                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1556                                                 DMA_TLB_PSI_FLUSH);
1557
1558         /*
1559          * In caching mode, changes of pages from non-present to present require
1560          * flush. However, device IOTLB doesn't need to be flushed in this case.
1561          */
1562         if (!cap_caching_mode(iommu->cap) || !map)
1563                 iommu_flush_dev_iotlb(domain, addr, mask);
1564 }
1565
1566 /* Notification for newly created mappings */
1567 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1568                                         struct dmar_domain *domain,
1569                                         unsigned long pfn, unsigned int pages)
1570 {
1571         /* It's a non-present to present mapping. Only flush if caching mode */
1572         if (cap_caching_mode(iommu->cap))
1573                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1574         else
1575                 iommu_flush_write_buffer(iommu);
1576 }
1577
1578 static void iommu_flush_iova(struct iova_domain *iovad)
1579 {
1580         struct dmar_domain *domain;
1581         int idx;
1582
1583         domain = container_of(iovad, struct dmar_domain, iovad);
1584
1585         for_each_domain_iommu(idx, domain) {
1586                 struct intel_iommu *iommu = g_iommus[idx];
1587                 u16 did = domain->iommu_did[iommu->seq_id];
1588
1589                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1590
1591                 if (!cap_caching_mode(iommu->cap))
1592                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1593                                               0, MAX_AGAW_PFN_WIDTH);
1594         }
1595 }
1596
1597 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1598 {
1599         u32 pmen;
1600         unsigned long flags;
1601
1602         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1603         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1604         pmen &= ~DMA_PMEN_EPM;
1605         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1606
1607         /* wait for the protected region status bit to clear */
1608         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1609                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1610
1611         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1612 }
1613
1614 static void iommu_enable_translation(struct intel_iommu *iommu)
1615 {
1616         u32 sts;
1617         unsigned long flags;
1618
1619         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1620         iommu->gcmd |= DMA_GCMD_TE;
1621         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1622
1623         /* Make sure hardware complete it */
1624         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1625                       readl, (sts & DMA_GSTS_TES), sts);
1626
1627         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1628 }
1629
1630 static void iommu_disable_translation(struct intel_iommu *iommu)
1631 {
1632         u32 sts;
1633         unsigned long flag;
1634
1635         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636         iommu->gcmd &= ~DMA_GCMD_TE;
1637         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1638
1639         /* Make sure hardware complete it */
1640         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641                       readl, (!(sts & DMA_GSTS_TES)), sts);
1642
1643         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1644 }
1645
1646
1647 static int iommu_init_domains(struct intel_iommu *iommu)
1648 {
1649         u32 ndomains, nlongs;
1650         size_t size;
1651
1652         ndomains = cap_ndoms(iommu->cap);
1653         pr_debug("%s: Number of Domains supported <%d>\n",
1654                  iommu->name, ndomains);
1655         nlongs = BITS_TO_LONGS(ndomains);
1656
1657         spin_lock_init(&iommu->lock);
1658
1659         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1660         if (!iommu->domain_ids) {
1661                 pr_err("%s: Allocating domain id array failed\n",
1662                        iommu->name);
1663                 return -ENOMEM;
1664         }
1665
1666         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1667         iommu->domains = kzalloc(size, GFP_KERNEL);
1668
1669         if (iommu->domains) {
1670                 size = 256 * sizeof(struct dmar_domain *);
1671                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1672         }
1673
1674         if (!iommu->domains || !iommu->domains[0]) {
1675                 pr_err("%s: Allocating domain array failed\n",
1676                        iommu->name);
1677                 kfree(iommu->domain_ids);
1678                 kfree(iommu->domains);
1679                 iommu->domain_ids = NULL;
1680                 iommu->domains    = NULL;
1681                 return -ENOMEM;
1682         }
1683
1684
1685
1686         /*
1687          * If Caching mode is set, then invalid translations are tagged
1688          * with domain-id 0, hence we need to pre-allocate it. We also
1689          * use domain-id 0 as a marker for non-allocated domain-id, so
1690          * make sure it is not used for a real domain.
1691          */
1692         set_bit(0, iommu->domain_ids);
1693
1694         return 0;
1695 }
1696
1697 static void disable_dmar_iommu(struct intel_iommu *iommu)
1698 {
1699         struct device_domain_info *info, *tmp;
1700         unsigned long flags;
1701
1702         if (!iommu->domains || !iommu->domain_ids)
1703                 return;
1704
1705 again:
1706         spin_lock_irqsave(&device_domain_lock, flags);
1707         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1708                 struct dmar_domain *domain;
1709
1710                 if (info->iommu != iommu)
1711                         continue;
1712
1713                 if (!info->dev || !info->domain)
1714                         continue;
1715
1716                 domain = info->domain;
1717
1718                 __dmar_remove_one_dev_info(info);
1719
1720                 if (!domain_type_is_vm_or_si(domain)) {
1721                         /*
1722                          * The domain_exit() function  can't be called under
1723                          * device_domain_lock, as it takes this lock itself.
1724                          * So release the lock here and re-run the loop
1725                          * afterwards.
1726                          */
1727                         spin_unlock_irqrestore(&device_domain_lock, flags);
1728                         domain_exit(domain);
1729                         goto again;
1730                 }
1731         }
1732         spin_unlock_irqrestore(&device_domain_lock, flags);
1733
1734         if (iommu->gcmd & DMA_GCMD_TE)
1735                 iommu_disable_translation(iommu);
1736 }
1737
1738 static void free_dmar_iommu(struct intel_iommu *iommu)
1739 {
1740         if ((iommu->domains) && (iommu->domain_ids)) {
1741                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1742                 int i;
1743
1744                 for (i = 0; i < elems; i++)
1745                         kfree(iommu->domains[i]);
1746                 kfree(iommu->domains);
1747                 kfree(iommu->domain_ids);
1748                 iommu->domains = NULL;
1749                 iommu->domain_ids = NULL;
1750         }
1751
1752         g_iommus[iommu->seq_id] = NULL;
1753
1754         /* free context mapping */
1755         free_context_table(iommu);
1756
1757 #ifdef CONFIG_INTEL_IOMMU_SVM
1758         if (pasid_enabled(iommu)) {
1759                 if (ecap_prs(iommu->ecap))
1760                         intel_svm_finish_prq(iommu);
1761                 intel_svm_exit(iommu);
1762         }
1763 #endif
1764 }
1765
1766 static struct dmar_domain *alloc_domain(int flags)
1767 {
1768         struct dmar_domain *domain;
1769
1770         domain = alloc_domain_mem();
1771         if (!domain)
1772                 return NULL;
1773
1774         memset(domain, 0, sizeof(*domain));
1775         domain->nid = -1;
1776         domain->flags = flags;
1777         domain->has_iotlb_device = false;
1778         INIT_LIST_HEAD(&domain->devices);
1779
1780         return domain;
1781 }
1782
1783 /* Must be called with iommu->lock */
1784 static int domain_attach_iommu(struct dmar_domain *domain,
1785                                struct intel_iommu *iommu)
1786 {
1787         unsigned long ndomains;
1788         int num;
1789
1790         assert_spin_locked(&device_domain_lock);
1791         assert_spin_locked(&iommu->lock);
1792
1793         domain->iommu_refcnt[iommu->seq_id] += 1;
1794         domain->iommu_count += 1;
1795         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1796                 ndomains = cap_ndoms(iommu->cap);
1797                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1798
1799                 if (num >= ndomains) {
1800                         pr_err("%s: No free domain ids\n", iommu->name);
1801                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1802                         domain->iommu_count -= 1;
1803                         return -ENOSPC;
1804                 }
1805
1806                 set_bit(num, iommu->domain_ids);
1807                 set_iommu_domain(iommu, num, domain);
1808
1809                 domain->iommu_did[iommu->seq_id] = num;
1810                 domain->nid                      = iommu->node;
1811
1812                 domain_update_iommu_cap(domain);
1813         }
1814
1815         return 0;
1816 }
1817
1818 static int domain_detach_iommu(struct dmar_domain *domain,
1819                                struct intel_iommu *iommu)
1820 {
1821         int num, count = INT_MAX;
1822
1823         assert_spin_locked(&device_domain_lock);
1824         assert_spin_locked(&iommu->lock);
1825
1826         domain->iommu_refcnt[iommu->seq_id] -= 1;
1827         count = --domain->iommu_count;
1828         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1829                 num = domain->iommu_did[iommu->seq_id];
1830                 clear_bit(num, iommu->domain_ids);
1831                 set_iommu_domain(iommu, num, NULL);
1832
1833                 domain_update_iommu_cap(domain);
1834                 domain->iommu_did[iommu->seq_id] = 0;
1835         }
1836
1837         return count;
1838 }
1839
1840 static struct iova_domain reserved_iova_list;
1841 static struct lock_class_key reserved_rbtree_key;
1842
1843 static int dmar_init_reserved_ranges(void)
1844 {
1845         struct pci_dev *pdev = NULL;
1846         struct iova *iova;
1847         int i;
1848
1849         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1850
1851         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1852                 &reserved_rbtree_key);
1853
1854         /* IOAPIC ranges shouldn't be accessed by DMA */
1855         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1856                 IOVA_PFN(IOAPIC_RANGE_END));
1857         if (!iova) {
1858                 pr_err("Reserve IOAPIC range failed\n");
1859                 return -ENODEV;
1860         }
1861
1862         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1863         for_each_pci_dev(pdev) {
1864                 struct resource *r;
1865
1866                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1867                         r = &pdev->resource[i];
1868                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1869                                 continue;
1870                         iova = reserve_iova(&reserved_iova_list,
1871                                             IOVA_PFN(r->start),
1872                                             IOVA_PFN(r->end));
1873                         if (!iova) {
1874                                 pr_err("Reserve iova failed\n");
1875                                 return -ENODEV;
1876                         }
1877                 }
1878         }
1879         return 0;
1880 }
1881
1882 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1883 {
1884         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1885 }
1886
1887 static inline int guestwidth_to_adjustwidth(int gaw)
1888 {
1889         int agaw;
1890         int r = (gaw - 12) % 9;
1891
1892         if (r == 0)
1893                 agaw = gaw;
1894         else
1895                 agaw = gaw + 9 - r;
1896         if (agaw > 64)
1897                 agaw = 64;
1898         return agaw;
1899 }
1900
1901 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1902                        int guest_width)
1903 {
1904         int adjust_width, agaw;
1905         unsigned long sagaw;
1906         int err;
1907
1908         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1909
1910         err = init_iova_flush_queue(&domain->iovad,
1911                                     iommu_flush_iova, iova_entry_free);
1912         if (err)
1913                 return err;
1914
1915         domain_reserve_special_ranges(domain);
1916
1917         /* calculate AGAW */
1918         if (guest_width > cap_mgaw(iommu->cap))
1919                 guest_width = cap_mgaw(iommu->cap);
1920         domain->gaw = guest_width;
1921         adjust_width = guestwidth_to_adjustwidth(guest_width);
1922         agaw = width_to_agaw(adjust_width);
1923         sagaw = cap_sagaw(iommu->cap);
1924         if (!test_bit(agaw, &sagaw)) {
1925                 /* hardware doesn't support it, choose a bigger one */
1926                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1927                 agaw = find_next_bit(&sagaw, 5, agaw);
1928                 if (agaw >= 5)
1929                         return -ENODEV;
1930         }
1931         domain->agaw = agaw;
1932
1933         if (ecap_coherent(iommu->ecap))
1934                 domain->iommu_coherency = 1;
1935         else
1936                 domain->iommu_coherency = 0;
1937
1938         if (ecap_sc_support(iommu->ecap))
1939                 domain->iommu_snooping = 1;
1940         else
1941                 domain->iommu_snooping = 0;
1942
1943         if (intel_iommu_superpage)
1944                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1945         else
1946                 domain->iommu_superpage = 0;
1947
1948         domain->nid = iommu->node;
1949
1950         /* always allocate the top pgd */
1951         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1952         if (!domain->pgd)
1953                 return -ENOMEM;
1954         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1955         return 0;
1956 }
1957
1958 static void domain_exit(struct dmar_domain *domain)
1959 {
1960         struct page *freelist = NULL;
1961
1962         /* Domain 0 is reserved, so dont process it */
1963         if (!domain)
1964                 return;
1965
1966         /* Remove associated devices and clear attached or cached domains */
1967         rcu_read_lock();
1968         domain_remove_dev_info(domain);
1969         rcu_read_unlock();
1970
1971         /* destroy iovas */
1972         put_iova_domain(&domain->iovad);
1973
1974         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1975
1976         dma_free_pagelist(freelist);
1977
1978         free_domain_mem(domain);
1979 }
1980
1981 static int domain_context_mapping_one(struct dmar_domain *domain,
1982                                       struct intel_iommu *iommu,
1983                                       u8 bus, u8 devfn)
1984 {
1985         u16 did = domain->iommu_did[iommu->seq_id];
1986         int translation = CONTEXT_TT_MULTI_LEVEL;
1987         struct device_domain_info *info = NULL;
1988         struct context_entry *context;
1989         unsigned long flags;
1990         struct dma_pte *pgd;
1991         int ret, agaw;
1992
1993         WARN_ON(did == 0);
1994
1995         if (hw_pass_through && domain_type_is_si(domain))
1996                 translation = CONTEXT_TT_PASS_THROUGH;
1997
1998         pr_debug("Set context mapping for %02x:%02x.%d\n",
1999                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2000
2001         BUG_ON(!domain->pgd);
2002
2003         spin_lock_irqsave(&device_domain_lock, flags);
2004         spin_lock(&iommu->lock);
2005
2006         ret = -ENOMEM;
2007         context = iommu_context_addr(iommu, bus, devfn, 1);
2008         if (!context)
2009                 goto out_unlock;
2010
2011         ret = 0;
2012         if (context_present(context))
2013                 goto out_unlock;
2014
2015         /*
2016          * For kdump cases, old valid entries may be cached due to the
2017          * in-flight DMA and copied pgtable, but there is no unmapping
2018          * behaviour for them, thus we need an explicit cache flush for
2019          * the newly-mapped device. For kdump, at this point, the device
2020          * is supposed to finish reset at its driver probe stage, so no
2021          * in-flight DMA will exist, and we don't need to worry anymore
2022          * hereafter.
2023          */
2024         if (context_copied(context)) {
2025                 u16 did_old = context_domain_id(context);
2026
2027                 if (did_old < cap_ndoms(iommu->cap)) {
2028                         iommu->flush.flush_context(iommu, did_old,
2029                                                    (((u16)bus) << 8) | devfn,
2030                                                    DMA_CCMD_MASK_NOBIT,
2031                                                    DMA_CCMD_DEVICE_INVL);
2032                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2033                                                  DMA_TLB_DSI_FLUSH);
2034                 }
2035         }
2036
2037         pgd = domain->pgd;
2038
2039         context_clear_entry(context);
2040         context_set_domain_id(context, did);
2041
2042         /*
2043          * Skip top levels of page tables for iommu which has less agaw
2044          * than default.  Unnecessary for PT mode.
2045          */
2046         if (translation != CONTEXT_TT_PASS_THROUGH) {
2047                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2048                         ret = -ENOMEM;
2049                         pgd = phys_to_virt(dma_pte_addr(pgd));
2050                         if (!dma_pte_present(pgd))
2051                                 goto out_unlock;
2052                 }
2053
2054                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2055                 if (info && info->ats_supported)
2056                         translation = CONTEXT_TT_DEV_IOTLB;
2057                 else
2058                         translation = CONTEXT_TT_MULTI_LEVEL;
2059
2060                 context_set_address_root(context, virt_to_phys(pgd));
2061                 context_set_address_width(context, iommu->agaw);
2062         } else {
2063                 /*
2064                  * In pass through mode, AW must be programmed to
2065                  * indicate the largest AGAW value supported by
2066                  * hardware. And ASR is ignored by hardware.
2067                  */
2068                 context_set_address_width(context, iommu->msagaw);
2069         }
2070
2071         context_set_translation_type(context, translation);
2072         context_set_fault_enable(context);
2073         context_set_present(context);
2074         domain_flush_cache(domain, context, sizeof(*context));
2075
2076         /*
2077          * It's a non-present to present mapping. If hardware doesn't cache
2078          * non-present entry we only need to flush the write-buffer. If the
2079          * _does_ cache non-present entries, then it does so in the special
2080          * domain #0, which we have to flush:
2081          */
2082         if (cap_caching_mode(iommu->cap)) {
2083                 iommu->flush.flush_context(iommu, 0,
2084                                            (((u16)bus) << 8) | devfn,
2085                                            DMA_CCMD_MASK_NOBIT,
2086                                            DMA_CCMD_DEVICE_INVL);
2087                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2088         } else {
2089                 iommu_flush_write_buffer(iommu);
2090         }
2091         iommu_enable_dev_iotlb(info);
2092
2093         ret = 0;
2094
2095 out_unlock:
2096         spin_unlock(&iommu->lock);
2097         spin_unlock_irqrestore(&device_domain_lock, flags);
2098
2099         return ret;
2100 }
2101
2102 struct domain_context_mapping_data {
2103         struct dmar_domain *domain;
2104         struct intel_iommu *iommu;
2105 };
2106
2107 static int domain_context_mapping_cb(struct pci_dev *pdev,
2108                                      u16 alias, void *opaque)
2109 {
2110         struct domain_context_mapping_data *data = opaque;
2111
2112         return domain_context_mapping_one(data->domain, data->iommu,
2113                                           PCI_BUS_NUM(alias), alias & 0xff);
2114 }
2115
2116 static int
2117 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2118 {
2119         struct intel_iommu *iommu;
2120         u8 bus, devfn;
2121         struct domain_context_mapping_data data;
2122
2123         iommu = device_to_iommu(dev, &bus, &devfn);
2124         if (!iommu)
2125                 return -ENODEV;
2126
2127         if (!dev_is_pci(dev))
2128                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2129
2130         data.domain = domain;
2131         data.iommu = iommu;
2132
2133         return pci_for_each_dma_alias(to_pci_dev(dev),
2134                                       &domain_context_mapping_cb, &data);
2135 }
2136
2137 static int domain_context_mapped_cb(struct pci_dev *pdev,
2138                                     u16 alias, void *opaque)
2139 {
2140         struct intel_iommu *iommu = opaque;
2141
2142         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2143 }
2144
2145 static int domain_context_mapped(struct device *dev)
2146 {
2147         struct intel_iommu *iommu;
2148         u8 bus, devfn;
2149
2150         iommu = device_to_iommu(dev, &bus, &devfn);
2151         if (!iommu)
2152                 return -ENODEV;
2153
2154         if (!dev_is_pci(dev))
2155                 return device_context_mapped(iommu, bus, devfn);
2156
2157         return !pci_for_each_dma_alias(to_pci_dev(dev),
2158                                        domain_context_mapped_cb, iommu);
2159 }
2160
2161 /* Returns a number of VTD pages, but aligned to MM page size */
2162 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2163                                             size_t size)
2164 {
2165         host_addr &= ~PAGE_MASK;
2166         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2167 }
2168
2169 /* Return largest possible superpage level for a given mapping */
2170 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2171                                           unsigned long iov_pfn,
2172                                           unsigned long phy_pfn,
2173                                           unsigned long pages)
2174 {
2175         int support, level = 1;
2176         unsigned long pfnmerge;
2177
2178         support = domain->iommu_superpage;
2179
2180         /* To use a large page, the virtual *and* physical addresses
2181            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2182            of them will mean we have to use smaller pages. So just
2183            merge them and check both at once. */
2184         pfnmerge = iov_pfn | phy_pfn;
2185
2186         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2187                 pages >>= VTD_STRIDE_SHIFT;
2188                 if (!pages)
2189                         break;
2190                 pfnmerge >>= VTD_STRIDE_SHIFT;
2191                 level++;
2192                 support--;
2193         }
2194         return level;
2195 }
2196
2197 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2198                             struct scatterlist *sg, unsigned long phys_pfn,
2199                             unsigned long nr_pages, int prot)
2200 {
2201         struct dma_pte *first_pte = NULL, *pte = NULL;
2202         phys_addr_t uninitialized_var(pteval);
2203         unsigned long sg_res = 0;
2204         unsigned int largepage_lvl = 0;
2205         unsigned long lvl_pages = 0;
2206
2207         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2208
2209         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2210                 return -EINVAL;
2211
2212         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2213
2214         if (!sg) {
2215                 sg_res = nr_pages;
2216                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2217         }
2218
2219         while (nr_pages > 0) {
2220                 uint64_t tmp;
2221
2222                 if (!sg_res) {
2223                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2224
2225                         sg_res = aligned_nrpages(sg->offset, sg->length);
2226                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2227                         sg->dma_length = sg->length;
2228                         pteval = (sg_phys(sg) - pgoff) | prot;
2229                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2230                 }
2231
2232                 if (!pte) {
2233                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2234
2235                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2236                         if (!pte)
2237                                 return -ENOMEM;
2238                         /* It is large page*/
2239                         if (largepage_lvl > 1) {
2240                                 unsigned long nr_superpages, end_pfn;
2241
2242                                 pteval |= DMA_PTE_LARGE_PAGE;
2243                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2244
2245                                 nr_superpages = sg_res / lvl_pages;
2246                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2247
2248                                 /*
2249                                  * Ensure that old small page tables are
2250                                  * removed to make room for superpage(s).
2251                                  * We're adding new large pages, so make sure
2252                                  * we don't remove their parent tables.
2253                                  */
2254                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2255                                                        largepage_lvl + 1);
2256                         } else {
2257                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2258                         }
2259
2260                 }
2261                 /* We don't need lock here, nobody else
2262                  * touches the iova range
2263                  */
2264                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2265                 if (tmp) {
2266                         static int dumps = 5;
2267                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2268                                 iov_pfn, tmp, (unsigned long long)pteval);
2269                         if (dumps) {
2270                                 dumps--;
2271                                 debug_dma_dump_mappings(NULL);
2272                         }
2273                         WARN_ON(1);
2274                 }
2275
2276                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2277
2278                 BUG_ON(nr_pages < lvl_pages);
2279                 BUG_ON(sg_res < lvl_pages);
2280
2281                 nr_pages -= lvl_pages;
2282                 iov_pfn += lvl_pages;
2283                 phys_pfn += lvl_pages;
2284                 pteval += lvl_pages * VTD_PAGE_SIZE;
2285                 sg_res -= lvl_pages;
2286
2287                 /* If the next PTE would be the first in a new page, then we
2288                    need to flush the cache on the entries we've just written.
2289                    And then we'll need to recalculate 'pte', so clear it and
2290                    let it get set again in the if (!pte) block above.
2291
2292                    If we're done (!nr_pages) we need to flush the cache too.
2293
2294                    Also if we've been setting superpages, we may need to
2295                    recalculate 'pte' and switch back to smaller pages for the
2296                    end of the mapping, if the trailing size is not enough to
2297                    use another superpage (i.e. sg_res < lvl_pages). */
2298                 pte++;
2299                 if (!nr_pages || first_pte_in_page(pte) ||
2300                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2301                         domain_flush_cache(domain, first_pte,
2302                                            (void *)pte - (void *)first_pte);
2303                         pte = NULL;
2304                 }
2305
2306                 if (!sg_res && nr_pages)
2307                         sg = sg_next(sg);
2308         }
2309         return 0;
2310 }
2311
2312 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2313                          struct scatterlist *sg, unsigned long phys_pfn,
2314                          unsigned long nr_pages, int prot)
2315 {
2316        int ret;
2317        struct intel_iommu *iommu;
2318
2319        /* Do the real mapping first */
2320        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2321        if (ret)
2322                return ret;
2323
2324        /* Notify about the new mapping */
2325        if (domain_type_is_vm(domain)) {
2326                /* VM typed domains can have more than one IOMMUs */
2327                int iommu_id;
2328                for_each_domain_iommu(iommu_id, domain) {
2329                        iommu = g_iommus[iommu_id];
2330                        __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2331                }
2332        } else {
2333                /* General domains only have one IOMMU */
2334                iommu = domain_get_iommu(domain);
2335                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2336        }
2337
2338        return 0;
2339 }
2340
2341 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2342                                     struct scatterlist *sg, unsigned long nr_pages,
2343                                     int prot)
2344 {
2345         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2346 }
2347
2348 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2349                                      unsigned long phys_pfn, unsigned long nr_pages,
2350                                      int prot)
2351 {
2352         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2353 }
2354
2355 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2356 {
2357         unsigned long flags;
2358         struct context_entry *context;
2359         u16 did_old;
2360
2361         if (!iommu)
2362                 return;
2363
2364         spin_lock_irqsave(&iommu->lock, flags);
2365         context = iommu_context_addr(iommu, bus, devfn, 0);
2366         if (!context) {
2367                 spin_unlock_irqrestore(&iommu->lock, flags);
2368                 return;
2369         }
2370         did_old = context_domain_id(context);
2371         context_clear_entry(context);
2372         __iommu_flush_cache(iommu, context, sizeof(*context));
2373         spin_unlock_irqrestore(&iommu->lock, flags);
2374         iommu->flush.flush_context(iommu,
2375                                    did_old,
2376                                    (((u16)bus) << 8) | devfn,
2377                                    DMA_CCMD_MASK_NOBIT,
2378                                    DMA_CCMD_DEVICE_INVL);
2379         iommu->flush.flush_iotlb(iommu,
2380                                  did_old,
2381                                  0,
2382                                  0,
2383                                  DMA_TLB_DSI_FLUSH);
2384 }
2385
2386 static inline void unlink_domain_info(struct device_domain_info *info)
2387 {
2388         assert_spin_locked(&device_domain_lock);
2389         list_del(&info->link);
2390         list_del(&info->global);
2391         if (info->dev)
2392                 info->dev->archdata.iommu = NULL;
2393 }
2394
2395 static void domain_remove_dev_info(struct dmar_domain *domain)
2396 {
2397         struct device_domain_info *info, *tmp;
2398         unsigned long flags;
2399
2400         spin_lock_irqsave(&device_domain_lock, flags);
2401         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2402                 __dmar_remove_one_dev_info(info);
2403         spin_unlock_irqrestore(&device_domain_lock, flags);
2404 }
2405
2406 /*
2407  * find_domain
2408  * Note: we use struct device->archdata.iommu stores the info
2409  */
2410 static struct dmar_domain *find_domain(struct device *dev)
2411 {
2412         struct device_domain_info *info;
2413
2414         /* No lock here, assumes no domain exit in normal case */
2415         info = dev->archdata.iommu;
2416         if (likely(info))
2417                 return info->domain;
2418         return NULL;
2419 }
2420
2421 static inline struct device_domain_info *
2422 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2423 {
2424         struct device_domain_info *info;
2425
2426         list_for_each_entry(info, &device_domain_list, global)
2427                 if (info->iommu->segment == segment && info->bus == bus &&
2428                     info->devfn == devfn)
2429                         return info;
2430
2431         return NULL;
2432 }
2433
2434 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2435                                                     int bus, int devfn,
2436                                                     struct device *dev,
2437                                                     struct dmar_domain *domain)
2438 {
2439         struct dmar_domain *found = NULL;
2440         struct device_domain_info *info;
2441         unsigned long flags;
2442         int ret;
2443
2444         info = alloc_devinfo_mem();
2445         if (!info)
2446                 return NULL;
2447
2448         info->bus = bus;
2449         info->devfn = devfn;
2450         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2451         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2452         info->ats_qdep = 0;
2453         info->dev = dev;
2454         info->domain = domain;
2455         info->iommu = iommu;
2456         info->pasid_table = NULL;
2457
2458         if (dev && dev_is_pci(dev)) {
2459                 struct pci_dev *pdev = to_pci_dev(info->dev);
2460
2461                 if (!pci_ats_disabled() &&
2462                     ecap_dev_iotlb_support(iommu->ecap) &&
2463                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2464                     dmar_find_matched_atsr_unit(pdev))
2465                         info->ats_supported = 1;
2466
2467                 if (ecs_enabled(iommu)) {
2468                         if (pasid_enabled(iommu)) {
2469                                 int features = pci_pasid_features(pdev);
2470                                 if (features >= 0)
2471                                         info->pasid_supported = features | 1;
2472                         }
2473
2474                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2475                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2476                                 info->pri_supported = 1;
2477                 }
2478         }
2479
2480         spin_lock_irqsave(&device_domain_lock, flags);
2481         if (dev)
2482                 found = find_domain(dev);
2483
2484         if (!found) {
2485                 struct device_domain_info *info2;
2486                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2487                 if (info2) {
2488                         found      = info2->domain;
2489                         info2->dev = dev;
2490                 }
2491         }
2492
2493         if (found) {
2494                 spin_unlock_irqrestore(&device_domain_lock, flags);
2495                 free_devinfo_mem(info);
2496                 /* Caller must free the original domain */
2497                 return found;
2498         }
2499
2500         spin_lock(&iommu->lock);
2501         ret = domain_attach_iommu(domain, iommu);
2502         spin_unlock(&iommu->lock);
2503
2504         if (ret) {
2505                 spin_unlock_irqrestore(&device_domain_lock, flags);
2506                 free_devinfo_mem(info);
2507                 return NULL;
2508         }
2509
2510         list_add(&info->link, &domain->devices);
2511         list_add(&info->global, &device_domain_list);
2512         if (dev)
2513                 dev->archdata.iommu = info;
2514
2515         if (dev && dev_is_pci(dev) && info->pasid_supported) {
2516                 ret = intel_pasid_alloc_table(dev);
2517                 if (ret) {
2518                         pr_warn("No pasid table for %s, pasid disabled\n",
2519                                 dev_name(dev));
2520                         info->pasid_supported = 0;
2521                 }
2522         }
2523         spin_unlock_irqrestore(&device_domain_lock, flags);
2524
2525         if (dev && domain_context_mapping(domain, dev)) {
2526                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2527                 dmar_remove_one_dev_info(domain, dev);
2528                 return NULL;
2529         }
2530
2531         return domain;
2532 }
2533
2534 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2535 {
2536         *(u16 *)opaque = alias;
2537         return 0;
2538 }
2539
2540 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2541 {
2542         struct device_domain_info *info = NULL;
2543         struct dmar_domain *domain = NULL;
2544         struct intel_iommu *iommu;
2545         u16 dma_alias;
2546         unsigned long flags;
2547         u8 bus, devfn;
2548
2549         iommu = device_to_iommu(dev, &bus, &devfn);
2550         if (!iommu)
2551                 return NULL;
2552
2553         if (dev_is_pci(dev)) {
2554                 struct pci_dev *pdev = to_pci_dev(dev);
2555
2556                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2557
2558                 spin_lock_irqsave(&device_domain_lock, flags);
2559                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2560                                                       PCI_BUS_NUM(dma_alias),
2561                                                       dma_alias & 0xff);
2562                 if (info) {
2563                         iommu = info->iommu;
2564                         domain = info->domain;
2565                 }
2566                 spin_unlock_irqrestore(&device_domain_lock, flags);
2567
2568                 /* DMA alias already has a domain, use it */
2569                 if (info)
2570                         goto out;
2571         }
2572
2573         /* Allocate and initialize new domain for the device */
2574         domain = alloc_domain(0);
2575         if (!domain)
2576                 return NULL;
2577         if (domain_init(domain, iommu, gaw)) {
2578                 domain_exit(domain);
2579                 return NULL;
2580         }
2581
2582 out:
2583
2584         return domain;
2585 }
2586
2587 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2588                                               struct dmar_domain *domain)
2589 {
2590         struct intel_iommu *iommu;
2591         struct dmar_domain *tmp;
2592         u16 req_id, dma_alias;
2593         u8 bus, devfn;
2594
2595         iommu = device_to_iommu(dev, &bus, &devfn);
2596         if (!iommu)
2597                 return NULL;
2598
2599         req_id = ((u16)bus << 8) | devfn;
2600
2601         if (dev_is_pci(dev)) {
2602                 struct pci_dev *pdev = to_pci_dev(dev);
2603
2604                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2605
2606                 /* register PCI DMA alias device */
2607                 if (req_id != dma_alias) {
2608                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2609                                         dma_alias & 0xff, NULL, domain);
2610
2611                         if (!tmp || tmp != domain)
2612                                 return tmp;
2613                 }
2614         }
2615
2616         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2617         if (!tmp || tmp != domain)
2618                 return tmp;
2619
2620         return domain;
2621 }
2622
2623 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2624 {
2625         struct dmar_domain *domain, *tmp;
2626
2627         domain = find_domain(dev);
2628         if (domain)
2629                 goto out;
2630
2631         domain = find_or_alloc_domain(dev, gaw);
2632         if (!domain)
2633                 goto out;
2634
2635         tmp = set_domain_for_dev(dev, domain);
2636         if (!tmp || domain != tmp) {
2637                 domain_exit(domain);
2638                 domain = tmp;
2639         }
2640
2641 out:
2642
2643         return domain;
2644 }
2645
2646 static int iommu_domain_identity_map(struct dmar_domain *domain,
2647                                      unsigned long long start,
2648                                      unsigned long long end)
2649 {
2650         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2651         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2652
2653         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2654                           dma_to_mm_pfn(last_vpfn))) {
2655                 pr_err("Reserving iova failed\n");
2656                 return -ENOMEM;
2657         }
2658
2659         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2660         /*
2661          * RMRR range might have overlap with physical memory range,
2662          * clear it first
2663          */
2664         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2665
2666         return __domain_mapping(domain, first_vpfn, NULL,
2667                                 first_vpfn, last_vpfn - first_vpfn + 1,
2668                                 DMA_PTE_READ|DMA_PTE_WRITE);
2669 }
2670
2671 static int domain_prepare_identity_map(struct device *dev,
2672                                        struct dmar_domain *domain,
2673                                        unsigned long long start,
2674                                        unsigned long long end)
2675 {
2676         /* For _hardware_ passthrough, don't bother. But for software
2677            passthrough, we do it anyway -- it may indicate a memory
2678            range which is reserved in E820, so which didn't get set
2679            up to start with in si_domain */
2680         if (domain == si_domain && hw_pass_through) {
2681                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2682                         dev_name(dev), start, end);
2683                 return 0;
2684         }
2685
2686         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2687                 dev_name(dev), start, end);
2688
2689         if (end < start) {
2690                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2691                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2692                         dmi_get_system_info(DMI_BIOS_VENDOR),
2693                         dmi_get_system_info(DMI_BIOS_VERSION),
2694                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2695                 return -EIO;
2696         }
2697
2698         if (end >> agaw_to_width(domain->agaw)) {
2699                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2700                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2701                      agaw_to_width(domain->agaw),
2702                      dmi_get_system_info(DMI_BIOS_VENDOR),
2703                      dmi_get_system_info(DMI_BIOS_VERSION),
2704                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2705                 return -EIO;
2706         }
2707
2708         return iommu_domain_identity_map(domain, start, end);
2709 }
2710
2711 static int iommu_prepare_identity_map(struct device *dev,
2712                                       unsigned long long start,
2713                                       unsigned long long end)
2714 {
2715         struct dmar_domain *domain;
2716         int ret;
2717
2718         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2719         if (!domain)
2720                 return -ENOMEM;
2721
2722         ret = domain_prepare_identity_map(dev, domain, start, end);
2723         if (ret)
2724                 domain_exit(domain);
2725
2726         return ret;
2727 }
2728
2729 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2730                                          struct device *dev)
2731 {
2732         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2733                 return 0;
2734         return iommu_prepare_identity_map(dev, rmrr->base_address,
2735                                           rmrr->end_address);
2736 }
2737
2738 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2739 static inline void iommu_prepare_isa(void)
2740 {
2741         struct pci_dev *pdev;
2742         int ret;
2743
2744         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2745         if (!pdev)
2746                 return;
2747
2748         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2749         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2750
2751         if (ret)
2752                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2753
2754         pci_dev_put(pdev);
2755 }
2756 #else
2757 static inline void iommu_prepare_isa(void)
2758 {
2759         return;
2760 }
2761 #endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */
2762
2763 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2764
2765 static int __init si_domain_init(int hw)
2766 {
2767         int nid, ret = 0;
2768
2769         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2770         if (!si_domain)
2771                 return -EFAULT;
2772
2773         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2774                 domain_exit(si_domain);
2775                 return -EFAULT;
2776         }
2777
2778         pr_debug("Identity mapping domain allocated\n");
2779
2780         if (hw)
2781                 return 0;
2782
2783         for_each_online_node(nid) {
2784                 unsigned long start_pfn, end_pfn;
2785                 int i;
2786
2787                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2788                         ret = iommu_domain_identity_map(si_domain,
2789                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2790                         if (ret)
2791                                 return ret;
2792                 }
2793         }
2794
2795         return 0;
2796 }
2797
2798 static int identity_mapping(struct device *dev)
2799 {
2800         struct device_domain_info *info;
2801
2802         if (likely(!iommu_identity_mapping))
2803                 return 0;
2804
2805         info = dev->archdata.iommu;
2806         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2807                 return (info->domain == si_domain);
2808
2809         return 0;
2810 }
2811
2812 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2813 {
2814         struct dmar_domain *ndomain;
2815         struct intel_iommu *iommu;
2816         u8 bus, devfn;
2817
2818         iommu = device_to_iommu(dev, &bus, &devfn);
2819         if (!iommu)
2820                 return -ENODEV;
2821
2822         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2823         if (ndomain != domain)
2824                 return -EBUSY;
2825
2826         return 0;
2827 }
2828
2829 static bool device_has_rmrr(struct device *dev)
2830 {
2831         struct dmar_rmrr_unit *rmrr;
2832         struct device *tmp;
2833         int i;
2834
2835         rcu_read_lock();
2836         for_each_rmrr_units(rmrr) {
2837                 /*
2838                  * Return TRUE if this RMRR contains the device that
2839                  * is passed in.
2840                  */
2841                 for_each_active_dev_scope(rmrr->devices,
2842                                           rmrr->devices_cnt, i, tmp)
2843                         if (tmp == dev) {
2844                                 rcu_read_unlock();
2845                                 return true;
2846                         }
2847         }
2848         rcu_read_unlock();
2849         return false;
2850 }
2851
2852 /*
2853  * There are a couple cases where we need to restrict the functionality of
2854  * devices associated with RMRRs.  The first is when evaluating a device for
2855  * identity mapping because problems exist when devices are moved in and out
2856  * of domains and their respective RMRR information is lost.  This means that
2857  * a device with associated RMRRs will never be in a "passthrough" domain.
2858  * The second is use of the device through the IOMMU API.  This interface
2859  * expects to have full control of the IOVA space for the device.  We cannot
2860  * satisfy both the requirement that RMRR access is maintained and have an
2861  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2862  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2863  * We therefore prevent devices associated with an RMRR from participating in
2864  * the IOMMU API, which eliminates them from device assignment.
2865  *
2866  * In both cases we assume that PCI USB devices with RMRRs have them largely
2867  * for historical reasons and that the RMRR space is not actively used post
2868  * boot.  This exclusion may change if vendors begin to abuse it.
2869  *
2870  * The same exception is made for graphics devices, with the requirement that
2871  * any use of the RMRR regions will be torn down before assigning the device
2872  * to a guest.
2873  */
2874 static bool device_is_rmrr_locked(struct device *dev)
2875 {
2876         if (!device_has_rmrr(dev))
2877                 return false;
2878
2879         if (dev_is_pci(dev)) {
2880                 struct pci_dev *pdev = to_pci_dev(dev);
2881
2882                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2883                         return false;
2884         }
2885
2886         return true;
2887 }
2888
2889 static int iommu_should_identity_map(struct device *dev, int startup)
2890 {
2891
2892         if (dev_is_pci(dev)) {
2893                 struct pci_dev *pdev = to_pci_dev(dev);
2894
2895                 if (device_is_rmrr_locked(dev))
2896                         return 0;
2897
2898                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2899                         return 1;
2900
2901                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2902                         return 1;
2903
2904                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2905                         return 0;
2906
2907                 /*
2908                  * We want to start off with all devices in the 1:1 domain, and
2909                  * take them out later if we find they can't access all of memory.
2910                  *
2911                  * However, we can't do this for PCI devices behind bridges,
2912                  * because all PCI devices behind the same bridge will end up
2913                  * with the same source-id on their transactions.
2914                  *
2915                  * Practically speaking, we can't change things around for these
2916                  * devices at run-time, because we can't be sure there'll be no
2917                  * DMA transactions in flight for any of their siblings.
2918                  *
2919                  * So PCI devices (unless they're on the root bus) as well as
2920                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2921                  * the 1:1 domain, just in _case_ one of their siblings turns out
2922                  * not to be able to map all of memory.
2923                  */
2924                 if (!pci_is_pcie(pdev)) {
2925                         if (!pci_is_root_bus(pdev->bus))
2926                                 return 0;
2927                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2928                                 return 0;
2929                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2930                         return 0;
2931         } else {
2932                 if (device_has_rmrr(dev))
2933                         return 0;
2934         }
2935
2936         /*
2937          * At boot time, we don't yet know if devices will be 64-bit capable.
2938          * Assume that they will — if they turn out not to be, then we can
2939          * take them out of the 1:1 domain later.
2940          */
2941         if (!startup) {
2942                 /*
2943                  * If the device's dma_mask is less than the system's memory
2944                  * size then this is not a candidate for identity mapping.
2945                  */
2946                 u64 dma_mask = *dev->dma_mask;
2947
2948                 if (dev->coherent_dma_mask &&
2949                     dev->coherent_dma_mask < dma_mask)
2950                         dma_mask = dev->coherent_dma_mask;
2951
2952                 return dma_mask >= dma_get_required_mask(dev);
2953         }
2954
2955         return 1;
2956 }
2957
2958 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2959 {
2960         int ret;
2961
2962         if (!iommu_should_identity_map(dev, 1))
2963                 return 0;
2964
2965         ret = domain_add_dev_info(si_domain, dev);
2966         if (!ret)
2967                 pr_info("%s identity mapping for device %s\n",
2968                         hw ? "Hardware" : "Software", dev_name(dev));
2969         else if (ret == -ENODEV)
2970                 /* device not associated with an iommu */
2971                 ret = 0;
2972
2973         return ret;
2974 }
2975
2976
2977 static int __init iommu_prepare_static_identity_mapping(int hw)
2978 {
2979         struct pci_dev *pdev = NULL;
2980         struct dmar_drhd_unit *drhd;
2981         struct intel_iommu *iommu;
2982         struct device *dev;
2983         int i;
2984         int ret = 0;
2985
2986         for_each_pci_dev(pdev) {
2987                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2988                 if (ret)
2989                         return ret;
2990         }
2991
2992         for_each_active_iommu(iommu, drhd)
2993                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2994                         struct acpi_device_physical_node *pn;
2995                         struct acpi_device *adev;
2996
2997                         if (dev->bus != &acpi_bus_type)
2998                                 continue;
2999
3000                         adev= to_acpi_device(dev);
3001                         mutex_lock(&adev->physical_node_lock);
3002                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3003                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3004                                 if (ret)
3005                                         break;
3006                         }
3007                         mutex_unlock(&adev->physical_node_lock);
3008                         if (ret)
3009                                 return ret;
3010                 }
3011
3012         return 0;
3013 }
3014
3015 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3016 {
3017         /*
3018          * Start from the sane iommu hardware state.
3019          * If the queued invalidation is already initialized by us
3020          * (for example, while enabling interrupt-remapping) then
3021          * we got the things already rolling from a sane state.
3022          */
3023         if (!iommu->qi) {
3024                 /*
3025                  * Clear any previous faults.
3026                  */
3027                 dmar_fault(-1, iommu);
3028                 /*
3029                  * Disable queued invalidation if supported and already enabled
3030                  * before OS handover.
3031                  */
3032                 dmar_disable_qi(iommu);
3033         }
3034
3035         if (dmar_enable_qi(iommu)) {
3036                 /*
3037                  * Queued Invalidate not enabled, use Register Based Invalidate
3038                  */
3039                 iommu->flush.flush_context = __iommu_flush_context;
3040                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3041                 pr_info("%s: Using Register based invalidation\n",
3042                         iommu->name);
3043         } else {
3044                 iommu->flush.flush_context = qi_flush_context;
3045                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3046                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3047         }
3048 }
3049
3050 static int copy_context_table(struct intel_iommu *iommu,
3051                               struct root_entry *old_re,
3052                               struct context_entry **tbl,
3053                               int bus, bool ext)
3054 {
3055         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3056         struct context_entry *new_ce = NULL, ce;
3057         struct context_entry *old_ce = NULL;
3058         struct root_entry re;
3059         phys_addr_t old_ce_phys;
3060
3061         tbl_idx = ext ? bus * 2 : bus;
3062         memcpy(&re, old_re, sizeof(re));
3063
3064         for (devfn = 0; devfn < 256; devfn++) {
3065                 /* First calculate the correct index */
3066                 idx = (ext ? devfn * 2 : devfn) % 256;
3067
3068                 if (idx == 0) {
3069                         /* First save what we may have and clean up */
3070                         if (new_ce) {
3071                                 tbl[tbl_idx] = new_ce;
3072                                 __iommu_flush_cache(iommu, new_ce,
3073                                                     VTD_PAGE_SIZE);
3074                                 pos = 1;
3075                         }
3076
3077                         if (old_ce)
3078                                 iounmap(old_ce);
3079
3080                         ret = 0;
3081                         if (devfn < 0x80)
3082                                 old_ce_phys = root_entry_lctp(&re);
3083                         else
3084                                 old_ce_phys = root_entry_uctp(&re);
3085
3086                         if (!old_ce_phys) {
3087                                 if (ext && devfn == 0) {
3088                                         /* No LCTP, try UCTP */
3089                                         devfn = 0x7f;
3090                                         continue;
3091                                 } else {
3092                                         goto out;
3093                                 }
3094                         }
3095
3096                         ret = -ENOMEM;
3097                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3098                                         MEMREMAP_WB);
3099                         if (!old_ce)
3100                                 goto out;
3101
3102                         new_ce = alloc_pgtable_page(iommu->node);
3103                         if (!new_ce)
3104                                 goto out_unmap;
3105
3106                         ret = 0;
3107                 }
3108
3109                 /* Now copy the context entry */
3110                 memcpy(&ce, old_ce + idx, sizeof(ce));
3111
3112                 if (!__context_present(&ce))
3113                         continue;
3114
3115                 did = context_domain_id(&ce);
3116                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3117                         set_bit(did, iommu->domain_ids);
3118
3119                 /*
3120                  * We need a marker for copied context entries. This
3121                  * marker needs to work for the old format as well as
3122                  * for extended context entries.
3123                  *
3124                  * Bit 67 of the context entry is used. In the old
3125                  * format this bit is available to software, in the
3126                  * extended format it is the PGE bit, but PGE is ignored
3127                  * by HW if PASIDs are disabled (and thus still
3128                  * available).
3129                  *
3130                  * So disable PASIDs first and then mark the entry
3131                  * copied. This means that we don't copy PASID
3132                  * translations from the old kernel, but this is fine as
3133                  * faults there are not fatal.
3134                  */
3135                 context_clear_pasid_enable(&ce);
3136                 context_set_copied(&ce);
3137
3138                 new_ce[idx] = ce;
3139         }
3140
3141         tbl[tbl_idx + pos] = new_ce;
3142
3143         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3144
3145 out_unmap:
3146         memunmap(old_ce);
3147
3148 out:
3149         return ret;
3150 }
3151
3152 static int copy_translation_tables(struct intel_iommu *iommu)
3153 {
3154         struct context_entry **ctxt_tbls;
3155         struct root_entry *old_rt;
3156         phys_addr_t old_rt_phys;
3157         int ctxt_table_entries;
3158         unsigned long flags;
3159         u64 rtaddr_reg;
3160         int bus, ret;
3161         bool new_ext, ext;
3162
3163         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3164         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3165         new_ext    = !!ecap_ecs(iommu->ecap);
3166
3167         /*
3168          * The RTT bit can only be changed when translation is disabled,
3169          * but disabling translation means to open a window for data
3170          * corruption. So bail out and don't copy anything if we would
3171          * have to change the bit.
3172          */
3173         if (new_ext != ext)
3174                 return -EINVAL;
3175
3176         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3177         if (!old_rt_phys)
3178                 return -EINVAL;
3179
3180         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3181         if (!old_rt)
3182                 return -ENOMEM;
3183
3184         /* This is too big for the stack - allocate it from slab */
3185         ctxt_table_entries = ext ? 512 : 256;
3186         ret = -ENOMEM;
3187         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3188         if (!ctxt_tbls)
3189                 goto out_unmap;
3190
3191         for (bus = 0; bus < 256; bus++) {
3192                 ret = copy_context_table(iommu, &old_rt[bus],
3193                                          ctxt_tbls, bus, ext);
3194                 if (ret) {
3195                         pr_err("%s: Failed to copy context table for bus %d\n",
3196                                 iommu->name, bus);
3197                         continue;
3198                 }
3199         }
3200
3201         spin_lock_irqsave(&iommu->lock, flags);
3202
3203         /* Context tables are copied, now write them to the root_entry table */
3204         for (bus = 0; bus < 256; bus++) {
3205                 int idx = ext ? bus * 2 : bus;
3206                 u64 val;
3207
3208                 if (ctxt_tbls[idx]) {
3209                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3210                         iommu->root_entry[bus].lo = val;
3211                 }
3212
3213                 if (!ext || !ctxt_tbls[idx + 1])
3214                         continue;
3215
3216                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3217                 iommu->root_entry[bus].hi = val;
3218         }
3219
3220         spin_unlock_irqrestore(&iommu->lock, flags);
3221
3222         kfree(ctxt_tbls);
3223
3224         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3225
3226         ret = 0;
3227
3228 out_unmap:
3229         memunmap(old_rt);
3230
3231         return ret;
3232 }
3233
3234 static int __init init_dmars(void)
3235 {
3236         struct dmar_drhd_unit *drhd;
3237         struct dmar_rmrr_unit *rmrr;
3238         bool copied_tables = false;
3239         struct device *dev;
3240         struct intel_iommu *iommu;
3241         int i, ret;
3242
3243         /*
3244          * for each drhd
3245          *    allocate root
3246          *    initialize and program root entry to not present
3247          * endfor
3248          */
3249         for_each_drhd_unit(drhd) {
3250                 /*
3251                  * lock not needed as this is only incremented in the single
3252                  * threaded kernel __init code path all other access are read
3253                  * only
3254                  */
3255                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3256                         g_num_of_iommus++;
3257                         continue;
3258                 }
3259                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3260         }
3261
3262         /* Preallocate enough resources for IOMMU hot-addition */
3263         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3264                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3265
3266         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3267                         GFP_KERNEL);
3268         if (!g_iommus) {
3269                 pr_err("Allocating global iommu array failed\n");
3270                 ret = -ENOMEM;
3271                 goto error;
3272         }
3273
3274         for_each_active_iommu(iommu, drhd) {
3275                 /*
3276                  * Find the max pasid size of all IOMMU's in the system.
3277                  * We need to ensure the system pasid table is no bigger
3278                  * than the smallest supported.
3279                  */
3280                 if (pasid_enabled(iommu)) {
3281                         u32 temp = 2 << ecap_pss(iommu->ecap);
3282
3283                         intel_pasid_max_id = min_t(u32, temp,
3284                                                    intel_pasid_max_id);
3285                 }
3286
3287                 g_iommus[iommu->seq_id] = iommu;
3288
3289                 intel_iommu_init_qi(iommu);
3290
3291                 ret = iommu_init_domains(iommu);
3292                 if (ret)
3293                         goto free_iommu;
3294
3295                 init_translation_status(iommu);
3296
3297                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3298                         iommu_disable_translation(iommu);
3299                         clear_translation_pre_enabled(iommu);
3300                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3301                                 iommu->name);
3302                 }
3303
3304                 /*
3305                  * TBD:
3306                  * we could share the same root & context tables
3307                  * among all IOMMU's. Need to Split it later.
3308                  */
3309                 ret = iommu_alloc_root_entry(iommu);
3310                 if (ret)
3311                         goto free_iommu;
3312
3313                 if (translation_pre_enabled(iommu)) {
3314                         pr_info("Translation already enabled - trying to copy translation structures\n");
3315
3316                         ret = copy_translation_tables(iommu);
3317                         if (ret) {
3318                                 /*
3319                                  * We found the IOMMU with translation
3320                                  * enabled - but failed to copy over the
3321                                  * old root-entry table. Try to proceed
3322                                  * by disabling translation now and
3323                                  * allocating a clean root-entry table.
3324                                  * This might cause DMAR faults, but
3325                                  * probably the dump will still succeed.
3326                                  */
3327                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3328                                        iommu->name);
3329                                 iommu_disable_translation(iommu);
3330                                 clear_translation_pre_enabled(iommu);
3331                         } else {
3332                                 pr_info("Copied translation tables from previous kernel for %s\n",
3333                                         iommu->name);
3334                                 copied_tables = true;
3335                         }
3336                 }
3337
3338                 if (!ecap_pass_through(iommu->ecap))
3339                         hw_pass_through = 0;
3340 #ifdef CONFIG_INTEL_IOMMU_SVM
3341                 if (pasid_enabled(iommu))
3342                         intel_svm_init(iommu);
3343 #endif
3344         }
3345
3346         /*
3347          * Now that qi is enabled on all iommus, set the root entry and flush
3348          * caches. This is required on some Intel X58 chipsets, otherwise the
3349          * flush_context function will loop forever and the boot hangs.
3350          */
3351         for_each_active_iommu(iommu, drhd) {
3352                 iommu_flush_write_buffer(iommu);
3353                 iommu_set_root_entry(iommu);
3354                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3355                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3356         }
3357
3358         if (iommu_pass_through)
3359                 iommu_identity_mapping |= IDENTMAP_ALL;
3360
3361 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3362         iommu_identity_mapping |= IDENTMAP_GFX;
3363 #endif
3364
3365         check_tylersburg_isoch();
3366
3367         if (iommu_identity_mapping) {
3368                 ret = si_domain_init(hw_pass_through);
3369                 if (ret)
3370                         goto free_iommu;
3371         }
3372
3373
3374         /*
3375          * If we copied translations from a previous kernel in the kdump
3376          * case, we can not assign the devices to domains now, as that
3377          * would eliminate the old mappings. So skip this part and defer
3378          * the assignment to device driver initialization time.
3379          */
3380         if (copied_tables)
3381                 goto domains_done;
3382
3383         /*
3384          * If pass through is not set or not enabled, setup context entries for
3385          * identity mappings for rmrr, gfx, and isa and may fall back to static
3386          * identity mapping if iommu_identity_mapping is set.
3387          */
3388         if (iommu_identity_mapping) {
3389                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3390                 if (ret) {
3391                         pr_crit("Failed to setup IOMMU pass-through\n");
3392                         goto free_iommu;
3393                 }
3394         }
3395         /*
3396          * For each rmrr
3397          *   for each dev attached to rmrr
3398          *   do
3399          *     locate drhd for dev, alloc domain for dev
3400          *     allocate free domain
3401          *     allocate page table entries for rmrr
3402          *     if context not allocated for bus
3403          *           allocate and init context
3404          *           set present in root table for this bus
3405          *     init context with domain, translation etc
3406          *    endfor
3407          * endfor
3408          */
3409         pr_info("Setting RMRR:\n");
3410         for_each_rmrr_units(rmrr) {
3411                 /* some BIOS lists non-exist devices in DMAR table. */
3412                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3413                                           i, dev) {
3414                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3415                         if (ret)
3416                                 pr_err("Mapping reserved region failed\n");
3417                 }
3418         }
3419
3420         iommu_prepare_isa();
3421
3422 domains_done:
3423
3424         /*
3425          * for each drhd
3426          *   enable fault log
3427          *   global invalidate context cache
3428          *   global invalidate iotlb
3429          *   enable translation
3430          */
3431         for_each_iommu(iommu, drhd) {
3432                 if (drhd->ignored) {
3433                         /*
3434                          * we always have to disable PMRs or DMA may fail on
3435                          * this device
3436                          */
3437                         if (force_on)
3438                                 iommu_disable_protect_mem_regions(iommu);
3439                         continue;
3440                 }
3441
3442                 iommu_flush_write_buffer(iommu);
3443
3444 #ifdef CONFIG_INTEL_IOMMU_SVM
3445                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3446                         ret = intel_svm_enable_prq(iommu);
3447                         if (ret)
3448                                 goto free_iommu;
3449                 }
3450 #endif
3451                 ret = dmar_set_interrupt(iommu);
3452                 if (ret)
3453                         goto free_iommu;
3454
3455                 if (!translation_pre_enabled(iommu))
3456                         iommu_enable_translation(iommu);
3457
3458                 iommu_disable_protect_mem_regions(iommu);
3459         }
3460
3461         return 0;
3462
3463 free_iommu:
3464         for_each_active_iommu(iommu, drhd) {
3465                 disable_dmar_iommu(iommu);
3466                 free_dmar_iommu(iommu);
3467         }
3468
3469         kfree(g_iommus);
3470
3471 error:
3472         return ret;
3473 }
3474
3475 /* This takes a number of _MM_ pages, not VTD pages */
3476 static unsigned long intel_alloc_iova(struct device *dev,
3477                                      struct dmar_domain *domain,
3478                                      unsigned long nrpages, uint64_t dma_mask)
3479 {
3480         unsigned long iova_pfn = 0;
3481
3482         /* Restrict dma_mask to the width that the iommu can handle */
3483         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3484         /* Ensure we reserve the whole size-aligned region */
3485         nrpages = __roundup_pow_of_two(nrpages);
3486
3487         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3488                 /*
3489                  * First try to allocate an io virtual address in
3490                  * DMA_BIT_MASK(32) and if that fails then try allocating
3491                  * from higher range
3492                  */
3493                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3494                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3495                 if (iova_pfn)
3496                         return iova_pfn;
3497         }
3498         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3499                                    IOVA_PFN(dma_mask), true);
3500         if (unlikely(!iova_pfn)) {
3501                 pr_err("Allocating %ld-page iova for %s failed",
3502                        nrpages, dev_name(dev));
3503                 return 0;
3504         }
3505
3506         return iova_pfn;
3507 }
3508
3509 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3510 {
3511         struct dmar_domain *domain, *tmp;
3512         struct dmar_rmrr_unit *rmrr;
3513         struct device *i_dev;
3514         int i, ret;
3515
3516         domain = find_domain(dev);
3517         if (domain)
3518                 goto out;
3519
3520         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3521         if (!domain)
3522                 goto out;
3523
3524         /* We have a new domain - setup possible RMRRs for the device */
3525         rcu_read_lock();
3526         for_each_rmrr_units(rmrr) {
3527                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3528                                           i, i_dev) {
3529                         if (i_dev != dev)
3530                                 continue;
3531
3532                         ret = domain_prepare_identity_map(dev, domain,
3533                                                           rmrr->base_address,
3534                                                           rmrr->end_address);
3535                         if (ret)
3536                                 dev_err(dev, "Mapping reserved region failed\n");
3537                 }
3538         }
3539         rcu_read_unlock();
3540
3541         tmp = set_domain_for_dev(dev, domain);
3542         if (!tmp || domain != tmp) {
3543                 domain_exit(domain);
3544                 domain = tmp;
3545         }
3546
3547 out:
3548
3549         if (!domain)
3550                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3551
3552
3553         return domain;
3554 }
3555
3556 /* Check if the dev needs to go through non-identity map and unmap process.*/
3557 static int iommu_no_mapping(struct device *dev)
3558 {
3559         int found;
3560
3561         if (iommu_dummy(dev))
3562                 return 1;
3563
3564         if (!iommu_identity_mapping)
3565                 return 0;
3566
3567         found = identity_mapping(dev);
3568         if (found) {
3569                 if (iommu_should_identity_map(dev, 0))
3570                         return 1;
3571                 else {
3572                         /*
3573                          * 32 bit DMA is removed from si_domain and fall back
3574                          * to non-identity mapping.
3575                          */
3576                         dmar_remove_one_dev_info(si_domain, dev);
3577                         pr_info("32bit %s uses non-identity mapping\n",
3578                                 dev_name(dev));
3579                         return 0;
3580                 }
3581         } else {
3582                 /*
3583                  * In case of a detached 64 bit DMA device from vm, the device
3584                  * is put into si_domain for identity mapping.
3585                  */
3586                 if (iommu_should_identity_map(dev, 0)) {
3587                         int ret;
3588                         ret = domain_add_dev_info(si_domain, dev);
3589                         if (!ret) {
3590                                 pr_info("64bit %s uses identity mapping\n",
3591                                         dev_name(dev));
3592                                 return 1;
3593                         }
3594                 }
3595         }
3596
3597         return 0;
3598 }
3599
3600 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3601                                      size_t size, int dir, u64 dma_mask)
3602 {
3603         struct dmar_domain *domain;
3604         phys_addr_t start_paddr;
3605         unsigned long iova_pfn;
3606         int prot = 0;
3607         int ret;
3608         struct intel_iommu *iommu;
3609         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3610
3611         BUG_ON(dir == DMA_NONE);
3612
3613         if (iommu_no_mapping(dev))
3614                 return paddr;
3615
3616         domain = get_valid_domain_for_dev(dev);
3617         if (!domain)
3618                 return 0;
3619
3620         iommu = domain_get_iommu(domain);
3621         size = aligned_nrpages(paddr, size);
3622
3623         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3624         if (!iova_pfn)
3625                 goto error;
3626
3627         /*
3628          * Check if DMAR supports zero-length reads on write only
3629          * mappings..
3630          */
3631         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3632                         !cap_zlr(iommu->cap))
3633                 prot |= DMA_PTE_READ;
3634         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3635                 prot |= DMA_PTE_WRITE;
3636         /*
3637          * paddr - (paddr + size) might be partial page, we should map the whole
3638          * page.  Note: if two part of one page are separately mapped, we
3639          * might have two guest_addr mapping to the same host paddr, but this
3640          * is not a big problem
3641          */
3642         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3643                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3644         if (ret)
3645                 goto error;
3646
3647         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3648         start_paddr += paddr & ~PAGE_MASK;
3649         return start_paddr;
3650
3651 error:
3652         if (iova_pfn)
3653                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3654         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3655                 dev_name(dev), size, (unsigned long long)paddr, dir);
3656         return 0;
3657 }
3658
3659 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3660                                  unsigned long offset, size_t size,
3661                                  enum dma_data_direction dir,
3662                                  unsigned long attrs)
3663 {
3664         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3665                                   dir, *dev->dma_mask);
3666 }
3667
3668 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3669 {
3670         struct dmar_domain *domain;
3671         unsigned long start_pfn, last_pfn;
3672         unsigned long nrpages;
3673         unsigned long iova_pfn;
3674         struct intel_iommu *iommu;
3675         struct page *freelist;
3676
3677         if (iommu_no_mapping(dev))
3678                 return;
3679
3680         domain = find_domain(dev);
3681         BUG_ON(!domain);
3682
3683         iommu = domain_get_iommu(domain);
3684
3685         iova_pfn = IOVA_PFN(dev_addr);
3686
3687         nrpages = aligned_nrpages(dev_addr, size);
3688         start_pfn = mm_to_dma_pfn(iova_pfn);
3689         last_pfn = start_pfn + nrpages - 1;
3690
3691         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3692                  dev_name(dev), start_pfn, last_pfn);
3693
3694         freelist = domain_unmap(domain, start_pfn, last_pfn);
3695
3696         if (intel_iommu_strict) {
3697                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3698                                       nrpages, !freelist, 0);
3699                 /* free iova */
3700                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3701                 dma_free_pagelist(freelist);
3702         } else {
3703                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3704                            (unsigned long)freelist);
3705                 /*
3706                  * queue up the release of the unmap to save the 1/6th of the
3707                  * cpu used up by the iotlb flush operation...
3708                  */
3709         }
3710 }
3711
3712 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3713                              size_t size, enum dma_data_direction dir,
3714                              unsigned long attrs)
3715 {
3716         intel_unmap(dev, dev_addr, size);
3717 }
3718
3719 static void *intel_alloc_coherent(struct device *dev, size_t size,
3720                                   dma_addr_t *dma_handle, gfp_t flags,
3721                                   unsigned long attrs)
3722 {
3723         struct page *page = NULL;
3724         int order;
3725
3726         size = PAGE_ALIGN(size);
3727         order = get_order(size);
3728
3729         if (!iommu_no_mapping(dev))
3730                 flags &= ~(GFP_DMA | GFP_DMA32);
3731         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3732                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3733                         flags |= GFP_DMA;
3734                 else
3735                         flags |= GFP_DMA32;
3736         }
3737
3738         if (gfpflags_allow_blocking(flags)) {
3739                 unsigned int count = size >> PAGE_SHIFT;
3740
3741                 page = dma_alloc_from_contiguous(dev, count, order,
3742                                                  flags & __GFP_NOWARN);
3743                 if (page && iommu_no_mapping(dev) &&
3744                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3745                         dma_release_from_contiguous(dev, page, count);
3746                         page = NULL;
3747                 }
3748         }
3749
3750         if (!page)
3751                 page = alloc_pages(flags, order);
3752         if (!page)
3753                 return NULL;
3754         memset(page_address(page), 0, size);
3755
3756         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3757                                          DMA_BIDIRECTIONAL,
3758                                          dev->coherent_dma_mask);
3759         if (*dma_handle)
3760                 return page_address(page);
3761         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3762                 __free_pages(page, order);
3763
3764         return NULL;
3765 }
3766
3767 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3768                                 dma_addr_t dma_handle, unsigned long attrs)
3769 {
3770         int order;
3771         struct page *page = virt_to_page(vaddr);
3772
3773         size = PAGE_ALIGN(size);
3774         order = get_order(size);
3775
3776         intel_unmap(dev, dma_handle, size);
3777         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3778                 __free_pages(page, order);
3779 }
3780
3781 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3782                            int nelems, enum dma_data_direction dir,
3783                            unsigned long attrs)
3784 {
3785         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3786         unsigned long nrpages = 0;
3787         struct scatterlist *sg;
3788         int i;
3789
3790         for_each_sg(sglist, sg, nelems, i) {
3791                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3792         }
3793
3794         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3795 }
3796
3797 static int intel_nontranslate_map_sg(struct device *hddev,
3798         struct scatterlist *sglist, int nelems, int dir)
3799 {
3800         int i;
3801         struct scatterlist *sg;
3802
3803         for_each_sg(sglist, sg, nelems, i) {
3804                 BUG_ON(!sg_page(sg));
3805                 sg->dma_address = sg_phys(sg);
3806                 sg->dma_length = sg->length;
3807         }
3808         return nelems;
3809 }
3810
3811 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3812                         enum dma_data_direction dir, unsigned long attrs)
3813 {
3814         int i;
3815         struct dmar_domain *domain;
3816         size_t size = 0;
3817         int prot = 0;
3818         unsigned long iova_pfn;
3819         int ret;
3820         struct scatterlist *sg;
3821         unsigned long start_vpfn;
3822         struct intel_iommu *iommu;
3823
3824         BUG_ON(dir == DMA_NONE);
3825         if (iommu_no_mapping(dev))
3826                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3827
3828         domain = get_valid_domain_for_dev(dev);
3829         if (!domain)
3830                 return 0;
3831
3832         iommu = domain_get_iommu(domain);
3833
3834         for_each_sg(sglist, sg, nelems, i)
3835                 size += aligned_nrpages(sg->offset, sg->length);
3836
3837         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3838                                 *dev->dma_mask);
3839         if (!iova_pfn) {
3840                 sglist->dma_length = 0;
3841                 return 0;
3842         }
3843
3844         /*
3845          * Check if DMAR supports zero-length reads on write only
3846          * mappings..
3847          */
3848         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3849                         !cap_zlr(iommu->cap))
3850                 prot |= DMA_PTE_READ;
3851         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3852                 prot |= DMA_PTE_WRITE;
3853
3854         start_vpfn = mm_to_dma_pfn(iova_pfn);
3855
3856         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3857         if (unlikely(ret)) {
3858                 dma_pte_free_pagetable(domain, start_vpfn,
3859                                        start_vpfn + size - 1,
3860                                        agaw_to_level(domain->agaw) + 1);
3861                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3862                 return 0;
3863         }
3864
3865         return nelems;
3866 }
3867
3868 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3869 {
3870         return !dma_addr;
3871 }
3872
3873 static const struct dma_map_ops intel_dma_ops = {
3874         .alloc = intel_alloc_coherent,
3875         .free = intel_free_coherent,
3876         .map_sg = intel_map_sg,
3877         .unmap_sg = intel_unmap_sg,
3878         .map_page = intel_map_page,
3879         .unmap_page = intel_unmap_page,
3880         .mapping_error = intel_mapping_error,
3881         .dma_supported = dma_direct_supported,
3882 };
3883
3884 static inline int iommu_domain_cache_init(void)
3885 {
3886         int ret = 0;
3887
3888         iommu_domain_cache = kmem_cache_create("iommu_domain",
3889                                          sizeof(struct dmar_domain),
3890                                          0,
3891                                          SLAB_HWCACHE_ALIGN,
3892
3893                                          NULL);
3894         if (!iommu_domain_cache) {
3895                 pr_err("Couldn't create iommu_domain cache\n");
3896                 ret = -ENOMEM;
3897         }
3898
3899         return ret;
3900 }
3901
3902 static inline int iommu_devinfo_cache_init(void)
3903 {
3904         int ret = 0;
3905
3906         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3907                                          sizeof(struct device_domain_info),
3908                                          0,
3909                                          SLAB_HWCACHE_ALIGN,
3910                                          NULL);
3911         if (!iommu_devinfo_cache) {
3912                 pr_err("Couldn't create devinfo cache\n");
3913                 ret = -ENOMEM;
3914         }
3915
3916         return ret;
3917 }
3918
3919 static int __init iommu_init_mempool(void)
3920 {
3921         int ret;
3922         ret = iova_cache_get();
3923         if (ret)
3924                 return ret;
3925
3926         ret = iommu_domain_cache_init();
3927         if (ret)
3928                 goto domain_error;
3929
3930         ret = iommu_devinfo_cache_init();
3931         if (!ret)
3932                 return ret;
3933
3934         kmem_cache_destroy(iommu_domain_cache);
3935 domain_error:
3936         iova_cache_put();
3937
3938         return -ENOMEM;
3939 }
3940
3941 static void __init iommu_exit_mempool(void)
3942 {
3943         kmem_cache_destroy(iommu_devinfo_cache);
3944         kmem_cache_destroy(iommu_domain_cache);
3945         iova_cache_put();
3946 }
3947
3948 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3949 {
3950         struct dmar_drhd_unit *drhd;
3951         u32 vtbar;
3952         int rc;
3953
3954         /* We know that this device on this chipset has its own IOMMU.
3955          * If we find it under a different IOMMU, then the BIOS is lying
3956          * to us. Hope that the IOMMU for this device is actually
3957          * disabled, and it needs no translation...
3958          */
3959         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3960         if (rc) {
3961                 /* "can't" happen */
3962                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3963                 return;
3964         }
3965         vtbar &= 0xffff0000;
3966
3967         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3968         drhd = dmar_find_matched_drhd_unit(pdev);
3969         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3970                             TAINT_FIRMWARE_WORKAROUND,
3971                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3972                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3973 }
3974 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3975
3976 static void __init init_no_remapping_devices(void)
3977 {
3978         struct dmar_drhd_unit *drhd;
3979         struct device *dev;
3980         int i;
3981
3982         for_each_drhd_unit(drhd) {
3983                 if (!drhd->include_all) {
3984                         for_each_active_dev_scope(drhd->devices,
3985                                                   drhd->devices_cnt, i, dev)
3986                                 break;
3987                         /* ignore DMAR unit if no devices exist */
3988                         if (i == drhd->devices_cnt)
3989                                 drhd->ignored = 1;
3990                 }
3991         }
3992
3993         for_each_active_drhd_unit(drhd) {
3994                 if (drhd->include_all)
3995                         continue;
3996
3997                 for_each_active_dev_scope(drhd->devices,
3998                                           drhd->devices_cnt, i, dev)
3999                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4000                                 break;
4001                 if (i < drhd->devices_cnt)
4002                         continue;
4003
4004                 /* This IOMMU has *only* gfx devices. Either bypass it or
4005                    set the gfx_mapped flag, as appropriate */
4006                 if (dmar_map_gfx) {
4007                         intel_iommu_gfx_mapped = 1;
4008                 } else {
4009                         drhd->ignored = 1;
4010                         for_each_active_dev_scope(drhd->devices,
4011                                                   drhd->devices_cnt, i, dev)
4012                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4013                 }
4014         }
4015 }
4016
4017 #ifdef CONFIG_SUSPEND
4018 static int init_iommu_hw(void)
4019 {
4020         struct dmar_drhd_unit *drhd;
4021         struct intel_iommu *iommu = NULL;
4022
4023         for_each_active_iommu(iommu, drhd)
4024                 if (iommu->qi)
4025                         dmar_reenable_qi(iommu);
4026
4027         for_each_iommu(iommu, drhd) {
4028                 if (drhd->ignored) {
4029                         /*
4030                          * we always have to disable PMRs or DMA may fail on
4031                          * this device
4032                          */
4033                         if (force_on)
4034                                 iommu_disable_protect_mem_regions(iommu);
4035                         continue;
4036                 }
4037         
4038                 iommu_flush_write_buffer(iommu);
4039
4040                 iommu_set_root_entry(iommu);
4041
4042                 iommu->flush.flush_context(iommu, 0, 0, 0,
4043                                            DMA_CCMD_GLOBAL_INVL);
4044                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4045                 iommu_enable_translation(iommu);
4046                 iommu_disable_protect_mem_regions(iommu);
4047         }
4048
4049         return 0;
4050 }
4051
4052 static void iommu_flush_all(void)
4053 {
4054         struct dmar_drhd_unit *drhd;
4055         struct intel_iommu *iommu;
4056
4057         for_each_active_iommu(iommu, drhd) {
4058                 iommu->flush.flush_context(iommu, 0, 0, 0,
4059                                            DMA_CCMD_GLOBAL_INVL);
4060                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4061                                          DMA_TLB_GLOBAL_FLUSH);
4062         }
4063 }
4064
4065 static int iommu_suspend(void)
4066 {
4067         struct dmar_drhd_unit *drhd;
4068         struct intel_iommu *iommu = NULL;
4069         unsigned long flag;
4070
4071         for_each_active_iommu(iommu, drhd) {
4072                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4073                                                  GFP_ATOMIC);
4074                 if (!iommu->iommu_state)
4075                         goto nomem;
4076         }
4077
4078         iommu_flush_all();
4079
4080         for_each_active_iommu(iommu, drhd) {
4081                 iommu_disable_translation(iommu);
4082
4083                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4084
4085                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4086                         readl(iommu->reg + DMAR_FECTL_REG);
4087                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4088                         readl(iommu->reg + DMAR_FEDATA_REG);
4089                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4090                         readl(iommu->reg + DMAR_FEADDR_REG);
4091                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4092                         readl(iommu->reg + DMAR_FEUADDR_REG);
4093
4094                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4095         }
4096         return 0;
4097
4098 nomem:
4099         for_each_active_iommu(iommu, drhd)
4100                 kfree(iommu->iommu_state);
4101
4102         return -ENOMEM;
4103 }
4104
4105 static void iommu_resume(void)
4106 {
4107         struct dmar_drhd_unit *drhd;
4108         struct intel_iommu *iommu = NULL;
4109         unsigned long flag;
4110
4111         if (init_iommu_hw()) {
4112                 if (force_on)
4113                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4114                 else
4115                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4116                 return;
4117         }
4118
4119         for_each_active_iommu(iommu, drhd) {
4120
4121                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4122
4123                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4124                         iommu->reg + DMAR_FECTL_REG);
4125                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4126                         iommu->reg + DMAR_FEDATA_REG);
4127                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4128                         iommu->reg + DMAR_FEADDR_REG);
4129                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4130                         iommu->reg + DMAR_FEUADDR_REG);
4131
4132                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4133         }
4134
4135         for_each_active_iommu(iommu, drhd)
4136                 kfree(iommu->iommu_state);
4137 }
4138
4139 static struct syscore_ops iommu_syscore_ops = {
4140         .resume         = iommu_resume,
4141         .suspend        = iommu_suspend,
4142 };
4143
4144 static void __init init_iommu_pm_ops(void)
4145 {
4146         register_syscore_ops(&iommu_syscore_ops);
4147 }
4148
4149 #else
4150 static inline void init_iommu_pm_ops(void) {}
4151 #endif  /* CONFIG_PM */
4152
4153
4154 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4155 {
4156         struct acpi_dmar_reserved_memory *rmrr;
4157         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4158         struct dmar_rmrr_unit *rmrru;
4159         size_t length;
4160
4161         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4162         if (!rmrru)
4163                 goto out;
4164
4165         rmrru->hdr = header;
4166         rmrr = (struct acpi_dmar_reserved_memory *)header;
4167         rmrru->base_address = rmrr->base_address;
4168         rmrru->end_address = rmrr->end_address;
4169
4170         length = rmrr->end_address - rmrr->base_address + 1;
4171         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4172                                               IOMMU_RESV_DIRECT);
4173         if (!rmrru->resv)
4174                 goto free_rmrru;
4175
4176         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4177                                 ((void *)rmrr) + rmrr->header.length,
4178                                 &rmrru->devices_cnt);
4179         if (rmrru->devices_cnt && rmrru->devices == NULL)
4180                 goto free_all;
4181
4182         list_add(&rmrru->list, &dmar_rmrr_units);
4183
4184         return 0;
4185 free_all:
4186         kfree(rmrru->resv);
4187 free_rmrru:
4188         kfree(rmrru);
4189 out:
4190         return -ENOMEM;
4191 }
4192
4193 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4194 {
4195         struct dmar_atsr_unit *atsru;
4196         struct acpi_dmar_atsr *tmp;
4197
4198         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4199                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4200                 if (atsr->segment != tmp->segment)
4201                         continue;
4202                 if (atsr->header.length != tmp->header.length)
4203                         continue;
4204                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4205                         return atsru;
4206         }
4207
4208         return NULL;
4209 }
4210
4211 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4212 {
4213         struct acpi_dmar_atsr *atsr;
4214         struct dmar_atsr_unit *atsru;
4215
4216         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4217                 return 0;
4218
4219         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4220         atsru = dmar_find_atsr(atsr);
4221         if (atsru)
4222                 return 0;
4223
4224         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4225         if (!atsru)
4226                 return -ENOMEM;
4227
4228         /*
4229          * If memory is allocated from slab by ACPI _DSM method, we need to
4230          * copy the memory content because the memory buffer will be freed
4231          * on return.
4232          */
4233         atsru->hdr = (void *)(atsru + 1);
4234         memcpy(atsru->hdr, hdr, hdr->length);
4235         atsru->include_all = atsr->flags & 0x1;
4236         if (!atsru->include_all) {
4237                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4238                                 (void *)atsr + atsr->header.length,
4239                                 &atsru->devices_cnt);
4240                 if (atsru->devices_cnt && atsru->devices == NULL) {
4241                         kfree(atsru);
4242                         return -ENOMEM;
4243                 }
4244         }
4245
4246         list_add_rcu(&atsru->list, &dmar_atsr_units);
4247
4248         return 0;
4249 }
4250
4251 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4252 {
4253         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4254         kfree(atsru);
4255 }
4256
4257 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4258 {
4259         struct acpi_dmar_atsr *atsr;
4260         struct dmar_atsr_unit *atsru;
4261
4262         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4263         atsru = dmar_find_atsr(atsr);
4264         if (atsru) {
4265                 list_del_rcu(&atsru->list);
4266                 synchronize_rcu();
4267                 intel_iommu_free_atsr(atsru);
4268         }
4269
4270         return 0;
4271 }
4272
4273 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4274 {
4275         int i;
4276         struct device *dev;
4277         struct acpi_dmar_atsr *atsr;
4278         struct dmar_atsr_unit *atsru;
4279
4280         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4281         atsru = dmar_find_atsr(atsr);
4282         if (!atsru)
4283                 return 0;
4284
4285         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4286                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4287                                           i, dev)
4288                         return -EBUSY;
4289         }
4290
4291         return 0;
4292 }
4293
4294 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4295 {
4296         int sp, ret = 0;
4297         struct intel_iommu *iommu = dmaru->iommu;
4298
4299         if (g_iommus[iommu->seq_id])
4300                 return 0;
4301
4302         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4303                 pr_warn("%s: Doesn't support hardware pass through.\n",
4304                         iommu->name);
4305                 return -ENXIO;
4306         }
4307         if (!ecap_sc_support(iommu->ecap) &&
4308             domain_update_iommu_snooping(iommu)) {
4309                 pr_warn("%s: Doesn't support snooping.\n",
4310                         iommu->name);
4311                 return -ENXIO;
4312         }
4313         sp = domain_update_iommu_superpage(iommu) - 1;
4314         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4315                 pr_warn("%s: Doesn't support large page.\n",
4316                         iommu->name);
4317                 return -ENXIO;
4318         }
4319
4320         /*
4321          * Disable translation if already enabled prior to OS handover.
4322          */
4323         if (iommu->gcmd & DMA_GCMD_TE)
4324                 iommu_disable_translation(iommu);
4325
4326         g_iommus[iommu->seq_id] = iommu;
4327         ret = iommu_init_domains(iommu);
4328         if (ret == 0)
4329                 ret = iommu_alloc_root_entry(iommu);
4330         if (ret)
4331                 goto out;
4332
4333 #ifdef CONFIG_INTEL_IOMMU_SVM
4334         if (pasid_enabled(iommu))
4335                 intel_svm_init(iommu);
4336 #endif
4337
4338         if (dmaru->ignored) {
4339                 /*
4340                  * we always have to disable PMRs or DMA may fail on this device
4341                  */
4342                 if (force_on)
4343                         iommu_disable_protect_mem_regions(iommu);
4344                 return 0;
4345         }
4346
4347         intel_iommu_init_qi(iommu);
4348         iommu_flush_write_buffer(iommu);
4349
4350 #ifdef CONFIG_INTEL_IOMMU_SVM
4351         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4352                 ret = intel_svm_enable_prq(iommu);
4353                 if (ret)
4354                         goto disable_iommu;
4355         }
4356 #endif
4357         ret = dmar_set_interrupt(iommu);
4358         if (ret)
4359                 goto disable_iommu;
4360
4361         iommu_set_root_entry(iommu);
4362         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4363         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4364         iommu_enable_translation(iommu);
4365
4366         iommu_disable_protect_mem_regions(iommu);
4367         return 0;
4368
4369 disable_iommu:
4370         disable_dmar_iommu(iommu);
4371 out:
4372         free_dmar_iommu(iommu);
4373         return ret;
4374 }
4375
4376 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4377 {
4378         int ret = 0;
4379         struct intel_iommu *iommu = dmaru->iommu;
4380
4381         if (!intel_iommu_enabled)
4382                 return 0;
4383         if (iommu == NULL)
4384                 return -EINVAL;
4385
4386         if (insert) {
4387                 ret = intel_iommu_add(dmaru);
4388         } else {
4389                 disable_dmar_iommu(iommu);
4390                 free_dmar_iommu(iommu);
4391         }
4392
4393         return ret;
4394 }
4395
4396 static void intel_iommu_free_dmars(void)
4397 {
4398         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4399         struct dmar_atsr_unit *atsru, *atsr_n;
4400
4401         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4402                 list_del(&rmrru->list);
4403                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4404                 kfree(rmrru->resv);
4405                 kfree(rmrru);
4406         }
4407
4408         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4409                 list_del(&atsru->list);
4410                 intel_iommu_free_atsr(atsru);
4411         }
4412 }
4413
4414 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4415 {
4416         int i, ret = 1;
4417         struct pci_bus *bus;
4418         struct pci_dev *bridge = NULL;
4419         struct device *tmp;
4420         struct acpi_dmar_atsr *atsr;
4421         struct dmar_atsr_unit *atsru;
4422
4423         dev = pci_physfn(dev);
4424         for (bus = dev->bus; bus; bus = bus->parent) {
4425                 bridge = bus->self;
4426                 /* If it's an integrated device, allow ATS */
4427                 if (!bridge)
4428                         return 1;
4429                 /* Connected via non-PCIe: no ATS */
4430                 if (!pci_is_pcie(bridge) ||
4431                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4432                         return 0;
4433                 /* If we found the root port, look it up in the ATSR */
4434                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4435                         break;
4436         }
4437
4438         rcu_read_lock();
4439         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4440                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4441                 if (atsr->segment != pci_domain_nr(dev->bus))
4442                         continue;
4443
4444                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4445                         if (tmp == &bridge->dev)
4446                                 goto out;
4447
4448                 if (atsru->include_all)
4449                         goto out;
4450         }
4451         ret = 0;
4452 out:
4453         rcu_read_unlock();
4454
4455         return ret;
4456 }
4457
4458 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4459 {
4460         int ret = 0;
4461         struct dmar_rmrr_unit *rmrru;
4462         struct dmar_atsr_unit *atsru;
4463         struct acpi_dmar_atsr *atsr;
4464         struct acpi_dmar_reserved_memory *rmrr;
4465
4466         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4467                 return 0;
4468
4469         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4470                 rmrr = container_of(rmrru->hdr,
4471                                     struct acpi_dmar_reserved_memory, header);
4472                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4473                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4474                                 ((void *)rmrr) + rmrr->header.length,
4475                                 rmrr->segment, rmrru->devices,
4476                                 rmrru->devices_cnt);
4477                         if(ret < 0)
4478                                 return ret;
4479                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4480                         dmar_remove_dev_scope(info, rmrr->segment,
4481                                 rmrru->devices, rmrru->devices_cnt);
4482                 }
4483         }
4484
4485         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4486                 if (atsru->include_all)
4487                         continue;
4488
4489                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4490                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4491                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4492                                         (void *)atsr + atsr->header.length,
4493                                         atsr->segment, atsru->devices,
4494                                         atsru->devices_cnt);
4495                         if (ret > 0)
4496                                 break;
4497                         else if(ret < 0)
4498                                 return ret;
4499                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4500                         if (dmar_remove_dev_scope(info, atsr->segment,
4501                                         atsru->devices, atsru->devices_cnt))
4502                                 break;
4503                 }
4504         }
4505
4506         return 0;
4507 }
4508
4509 /*
4510  * Here we only respond to action of unbound device from driver.
4511  *
4512  * Added device is not attached to its DMAR domain here yet. That will happen
4513  * when mapping the device to iova.
4514  */
4515 static int device_notifier(struct notifier_block *nb,
4516                                   unsigned long action, void *data)
4517 {
4518         struct device *dev = data;
4519         struct dmar_domain *domain;
4520
4521         if (iommu_dummy(dev))
4522                 return 0;
4523
4524         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4525                 return 0;
4526
4527         domain = find_domain(dev);
4528         if (!domain)
4529                 return 0;
4530
4531         dmar_remove_one_dev_info(domain, dev);
4532         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4533                 domain_exit(domain);
4534
4535         return 0;
4536 }
4537
4538 static struct notifier_block device_nb = {
4539         .notifier_call = device_notifier,
4540 };
4541
4542 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4543                                        unsigned long val, void *v)
4544 {
4545         struct memory_notify *mhp = v;
4546         unsigned long long start, end;
4547         unsigned long start_vpfn, last_vpfn;
4548
4549         switch (val) {
4550         case MEM_GOING_ONLINE:
4551                 start = mhp->start_pfn << PAGE_SHIFT;
4552                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4553                 if (iommu_domain_identity_map(si_domain, start, end)) {
4554                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4555                                 start, end);
4556                         return NOTIFY_BAD;
4557                 }
4558                 break;
4559
4560         case MEM_OFFLINE:
4561         case MEM_CANCEL_ONLINE:
4562                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4563                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4564                 while (start_vpfn <= last_vpfn) {
4565                         struct iova *iova;
4566                         struct dmar_drhd_unit *drhd;
4567                         struct intel_iommu *iommu;
4568                         struct page *freelist;
4569
4570                         iova = find_iova(&si_domain->iovad, start_vpfn);
4571                         if (iova == NULL) {
4572                                 pr_debug("Failed get IOVA for PFN %lx\n",
4573                                          start_vpfn);
4574                                 break;
4575                         }
4576
4577                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4578                                                      start_vpfn, last_vpfn);
4579                         if (iova == NULL) {
4580                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4581                                         start_vpfn, last_vpfn);
4582                                 return NOTIFY_BAD;
4583                         }
4584
4585                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4586                                                iova->pfn_hi);
4587
4588                         rcu_read_lock();
4589                         for_each_active_iommu(iommu, drhd)
4590                                 iommu_flush_iotlb_psi(iommu, si_domain,
4591                                         iova->pfn_lo, iova_size(iova),
4592                                         !freelist, 0);
4593                         rcu_read_unlock();
4594                         dma_free_pagelist(freelist);
4595
4596                         start_vpfn = iova->pfn_hi + 1;
4597                         free_iova_mem(iova);
4598                 }
4599                 break;
4600         }
4601
4602         return NOTIFY_OK;
4603 }
4604
4605 static struct notifier_block intel_iommu_memory_nb = {
4606         .notifier_call = intel_iommu_memory_notifier,
4607         .priority = 0
4608 };
4609
4610 static void free_all_cpu_cached_iovas(unsigned int cpu)
4611 {
4612         int i;
4613
4614         for (i = 0; i < g_num_of_iommus; i++) {
4615                 struct intel_iommu *iommu = g_iommus[i];
4616                 struct dmar_domain *domain;
4617                 int did;
4618
4619                 if (!iommu)
4620                         continue;
4621
4622                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4623                         domain = get_iommu_domain(iommu, (u16)did);
4624
4625                         if (!domain)
4626                                 continue;
4627                         free_cpu_cached_iovas(cpu, &domain->iovad);
4628                 }
4629         }
4630 }
4631
4632 static int intel_iommu_cpu_dead(unsigned int cpu)
4633 {
4634         free_all_cpu_cached_iovas(cpu);
4635         return 0;
4636 }
4637
4638 static void intel_disable_iommus(void)
4639 {
4640         struct intel_iommu *iommu = NULL;
4641         struct dmar_drhd_unit *drhd;
4642
4643         for_each_iommu(iommu, drhd)
4644                 iommu_disable_translation(iommu);
4645 }
4646
4647 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4648 {
4649         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4650
4651         return container_of(iommu_dev, struct intel_iommu, iommu);
4652 }
4653
4654 static ssize_t intel_iommu_show_version(struct device *dev,
4655                                         struct device_attribute *attr,
4656                                         char *buf)
4657 {
4658         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4659         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4660         return sprintf(buf, "%d:%d\n",
4661                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4662 }
4663 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4664
4665 static ssize_t intel_iommu_show_address(struct device *dev,
4666                                         struct device_attribute *attr,
4667                                         char *buf)
4668 {
4669         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4670         return sprintf(buf, "%llx\n", iommu->reg_phys);
4671 }
4672 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4673
4674 static ssize_t intel_iommu_show_cap(struct device *dev,
4675                                     struct device_attribute *attr,
4676                                     char *buf)
4677 {
4678         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4679         return sprintf(buf, "%llx\n", iommu->cap);
4680 }
4681 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4682
4683 static ssize_t intel_iommu_show_ecap(struct device *dev,
4684                                     struct device_attribute *attr,
4685                                     char *buf)
4686 {
4687         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4688         return sprintf(buf, "%llx\n", iommu->ecap);
4689 }
4690 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4691
4692 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4693                                       struct device_attribute *attr,
4694                                       char *buf)
4695 {
4696         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4697         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4698 }
4699 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4700
4701 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4702                                            struct device_attribute *attr,
4703                                            char *buf)
4704 {
4705         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4706         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4707                                                   cap_ndoms(iommu->cap)));
4708 }
4709 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4710
4711 static struct attribute *intel_iommu_attrs[] = {
4712         &dev_attr_version.attr,
4713         &dev_attr_address.attr,
4714         &dev_attr_cap.attr,
4715         &dev_attr_ecap.attr,
4716         &dev_attr_domains_supported.attr,
4717         &dev_attr_domains_used.attr,
4718         NULL,
4719 };
4720
4721 static struct attribute_group intel_iommu_group = {
4722         .name = "intel-iommu",
4723         .attrs = intel_iommu_attrs,
4724 };
4725
4726 const struct attribute_group *intel_iommu_groups[] = {
4727         &intel_iommu_group,
4728         NULL,
4729 };
4730
4731 int __init intel_iommu_init(void)
4732 {
4733         int ret = -ENODEV;
4734         struct dmar_drhd_unit *drhd;
4735         struct intel_iommu *iommu;
4736
4737         /* VT-d is required for a TXT/tboot launch, so enforce that */
4738         force_on = tboot_force_iommu();
4739
4740         if (iommu_init_mempool()) {
4741                 if (force_on)
4742                         panic("tboot: Failed to initialize iommu memory\n");
4743                 return -ENOMEM;
4744         }
4745
4746         down_write(&dmar_global_lock);
4747         if (dmar_table_init()) {
4748                 if (force_on)
4749                         panic("tboot: Failed to initialize DMAR table\n");
4750                 goto out_free_dmar;
4751         }
4752
4753         if (dmar_dev_scope_init() < 0) {
4754                 if (force_on)
4755                         panic("tboot: Failed to initialize DMAR device scope\n");
4756                 goto out_free_dmar;
4757         }
4758
4759         up_write(&dmar_global_lock);
4760
4761         /*
4762          * The bus notifier takes the dmar_global_lock, so lockdep will
4763          * complain later when we register it under the lock.
4764          */
4765         dmar_register_bus_notifier();
4766
4767         down_write(&dmar_global_lock);
4768
4769         if (no_iommu || dmar_disabled) {
4770                 /*
4771                  * We exit the function here to ensure IOMMU's remapping and
4772                  * mempool aren't setup, which means that the IOMMU's PMRs
4773                  * won't be disabled via the call to init_dmars(). So disable
4774                  * it explicitly here. The PMRs were setup by tboot prior to
4775                  * calling SENTER, but the kernel is expected to reset/tear
4776                  * down the PMRs.
4777                  */
4778                 if (intel_iommu_tboot_noforce) {
4779                         for_each_iommu(iommu, drhd)
4780                                 iommu_disable_protect_mem_regions(iommu);
4781                 }
4782
4783                 /*
4784                  * Make sure the IOMMUs are switched off, even when we
4785                  * boot into a kexec kernel and the previous kernel left
4786                  * them enabled
4787                  */
4788                 intel_disable_iommus();
4789                 goto out_free_dmar;
4790         }
4791
4792         if (list_empty(&dmar_rmrr_units))
4793                 pr_info("No RMRR found\n");
4794
4795         if (list_empty(&dmar_atsr_units))
4796                 pr_info("No ATSR found\n");
4797
4798         if (dmar_init_reserved_ranges()) {
4799                 if (force_on)
4800                         panic("tboot: Failed to reserve iommu ranges\n");
4801                 goto out_free_reserved_range;
4802         }
4803
4804         init_no_remapping_devices();
4805
4806         ret = init_dmars();
4807         if (ret) {
4808                 if (force_on)
4809                         panic("tboot: Failed to initialize DMARs\n");
4810                 pr_err("Initialization failed\n");
4811                 goto out_free_reserved_range;
4812         }
4813         up_write(&dmar_global_lock);
4814         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4815
4816 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4817         swiotlb = 0;
4818 #endif
4819         dma_ops = &intel_dma_ops;
4820
4821         init_iommu_pm_ops();
4822
4823         for_each_active_iommu(iommu, drhd) {
4824                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4825                                        intel_iommu_groups,
4826                                        "%s", iommu->name);
4827                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4828                 iommu_device_register(&iommu->iommu);
4829         }
4830
4831         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4832         bus_register_notifier(&pci_bus_type, &device_nb);
4833         if (si_domain && !hw_pass_through)
4834                 register_memory_notifier(&intel_iommu_memory_nb);
4835         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4836                           intel_iommu_cpu_dead);
4837         intel_iommu_enabled = 1;
4838         intel_iommu_debugfs_init();
4839
4840         return 0;
4841
4842 out_free_reserved_range:
4843         put_iova_domain(&reserved_iova_list);
4844 out_free_dmar:
4845         intel_iommu_free_dmars();
4846         up_write(&dmar_global_lock);
4847         iommu_exit_mempool();
4848         return ret;
4849 }
4850
4851 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4852 {
4853         struct intel_iommu *iommu = opaque;
4854
4855         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4856         return 0;
4857 }
4858
4859 /*
4860  * NB - intel-iommu lacks any sort of reference counting for the users of
4861  * dependent devices.  If multiple endpoints have intersecting dependent
4862  * devices, unbinding the driver from any one of them will possibly leave
4863  * the others unable to operate.
4864  */
4865 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4866 {
4867         if (!iommu || !dev || !dev_is_pci(dev))
4868                 return;
4869
4870         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4871 }
4872
4873 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4874 {
4875         struct intel_iommu *iommu;
4876         unsigned long flags;
4877
4878         assert_spin_locked(&device_domain_lock);
4879
4880         if (WARN_ON(!info))
4881                 return;
4882
4883         iommu = info->iommu;
4884
4885         if (info->dev) {
4886                 iommu_disable_dev_iotlb(info);
4887                 domain_context_clear(iommu, info->dev);
4888                 intel_pasid_free_table(info->dev);
4889         }
4890
4891         unlink_domain_info(info);
4892
4893         spin_lock_irqsave(&iommu->lock, flags);
4894         domain_detach_iommu(info->domain, iommu);
4895         spin_unlock_irqrestore(&iommu->lock, flags);
4896
4897         free_devinfo_mem(info);
4898 }
4899
4900 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4901                                      struct device *dev)
4902 {
4903         struct device_domain_info *info;
4904         unsigned long flags;
4905
4906         spin_lock_irqsave(&device_domain_lock, flags);
4907         info = dev->archdata.iommu;
4908         __dmar_remove_one_dev_info(info);
4909         spin_unlock_irqrestore(&device_domain_lock, flags);
4910 }
4911
4912 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4913 {
4914         int adjust_width;
4915
4916         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4917         domain_reserve_special_ranges(domain);
4918
4919         /* calculate AGAW */
4920         domain->gaw = guest_width;
4921         adjust_width = guestwidth_to_adjustwidth(guest_width);
4922         domain->agaw = width_to_agaw(adjust_width);
4923
4924         domain->iommu_coherency = 0;
4925         domain->iommu_snooping = 0;
4926         domain->iommu_superpage = 0;
4927         domain->max_addr = 0;
4928
4929         /* always allocate the top pgd */
4930         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4931         if (!domain->pgd)
4932                 return -ENOMEM;
4933         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4934         return 0;
4935 }
4936
4937 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4938 {
4939         struct dmar_domain *dmar_domain;
4940         struct iommu_domain *domain;
4941
4942         if (type != IOMMU_DOMAIN_UNMANAGED)
4943                 return NULL;
4944
4945         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4946         if (!dmar_domain) {
4947                 pr_err("Can't allocate dmar_domain\n");
4948                 return NULL;
4949         }
4950         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4951                 pr_err("Domain initialization failed\n");
4952                 domain_exit(dmar_domain);
4953                 return NULL;
4954         }
4955         domain_update_iommu_cap(dmar_domain);
4956
4957         domain = &dmar_domain->domain;
4958         domain->geometry.aperture_start = 0;
4959         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4960         domain->geometry.force_aperture = true;
4961
4962         return domain;
4963 }
4964
4965 static void intel_iommu_domain_free(struct iommu_domain *domain)
4966 {
4967         domain_exit(to_dmar_domain(domain));
4968 }
4969
4970 static int intel_iommu_attach_device(struct iommu_domain *domain,
4971                                      struct device *dev)
4972 {
4973         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4974         struct intel_iommu *iommu;
4975         int addr_width;
4976         u8 bus, devfn;
4977
4978         if (device_is_rmrr_locked(dev)) {
4979                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4980                 return -EPERM;
4981         }
4982
4983         /* normally dev is not mapped */
4984         if (unlikely(domain_context_mapped(dev))) {
4985                 struct dmar_domain *old_domain;
4986
4987                 old_domain = find_domain(dev);
4988                 if (old_domain) {
4989                         rcu_read_lock();
4990                         dmar_remove_one_dev_info(old_domain, dev);
4991                         rcu_read_unlock();
4992
4993                         if (!domain_type_is_vm_or_si(old_domain) &&
4994                              list_empty(&old_domain->devices))
4995                                 domain_exit(old_domain);
4996                 }
4997         }
4998
4999         iommu = device_to_iommu(dev, &bus, &devfn);
5000         if (!iommu)
5001                 return -ENODEV;
5002
5003         /* check if this iommu agaw is sufficient for max mapped address */
5004         addr_width = agaw_to_width(iommu->agaw);
5005         if (addr_width > cap_mgaw(iommu->cap))
5006                 addr_width = cap_mgaw(iommu->cap);
5007
5008         if (dmar_domain->max_addr > (1LL << addr_width)) {
5009                 pr_err("%s: iommu width (%d) is not "
5010                        "sufficient for the mapped address (%llx)\n",
5011                        __func__, addr_width, dmar_domain->max_addr);
5012                 return -EFAULT;
5013         }
5014         dmar_domain->gaw = addr_width;
5015
5016         /*
5017          * Knock out extra levels of page tables if necessary
5018          */
5019         while (iommu->agaw < dmar_domain->agaw) {
5020                 struct dma_pte *pte;
5021
5022                 pte = dmar_domain->pgd;
5023                 if (dma_pte_present(pte)) {
5024                         dmar_domain->pgd = (struct dma_pte *)
5025                                 phys_to_virt(dma_pte_addr(pte));
5026                         free_pgtable_page(pte);
5027                 }
5028                 dmar_domain->agaw--;
5029         }
5030
5031         return domain_add_dev_info(dmar_domain, dev);
5032 }
5033
5034 static void intel_iommu_detach_device(struct iommu_domain *domain,
5035                                       struct device *dev)
5036 {
5037         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5038 }
5039
5040 static int intel_iommu_map(struct iommu_domain *domain,
5041                            unsigned long iova, phys_addr_t hpa,
5042                            size_t size, int iommu_prot)
5043 {
5044         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5045         u64 max_addr;
5046         int prot = 0;
5047         int ret;
5048
5049         if (iommu_prot & IOMMU_READ)
5050                 prot |= DMA_PTE_READ;
5051         if (iommu_prot & IOMMU_WRITE)
5052                 prot |= DMA_PTE_WRITE;
5053         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5054                 prot |= DMA_PTE_SNP;
5055
5056         max_addr = iova + size;
5057         if (dmar_domain->max_addr < max_addr) {
5058                 u64 end;
5059
5060                 /* check if minimum agaw is sufficient for mapped address */
5061                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5062                 if (end < max_addr) {
5063                         pr_err("%s: iommu width (%d) is not "
5064                                "sufficient for the mapped address (%llx)\n",
5065                                __func__, dmar_domain->gaw, max_addr);
5066                         return -EFAULT;
5067                 }
5068                 dmar_domain->max_addr = max_addr;
5069         }
5070         /* Round up size to next multiple of PAGE_SIZE, if it and
5071            the low bits of hpa would take us onto the next page */
5072         size = aligned_nrpages(hpa, size);
5073         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5074                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5075         return ret;
5076 }
5077
5078 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5079                                 unsigned long iova, size_t size)
5080 {
5081         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5082         struct page *freelist = NULL;
5083         unsigned long start_pfn, last_pfn;
5084         unsigned int npages;
5085         int iommu_id, level = 0;
5086
5087         /* Cope with horrid API which requires us to unmap more than the
5088            size argument if it happens to be a large-page mapping. */
5089         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5090
5091         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5092                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5093
5094         start_pfn = iova >> VTD_PAGE_SHIFT;
5095         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5096
5097         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5098
5099         npages = last_pfn - start_pfn + 1;
5100
5101         for_each_domain_iommu(iommu_id, dmar_domain)
5102                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5103                                       start_pfn, npages, !freelist, 0);
5104
5105         dma_free_pagelist(freelist);
5106
5107         if (dmar_domain->max_addr == iova + size)
5108                 dmar_domain->max_addr = iova;
5109
5110         return size;
5111 }
5112
5113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5114                                             dma_addr_t iova)
5115 {
5116         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5117         struct dma_pte *pte;
5118         int level = 0;
5119         u64 phys = 0;
5120
5121         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5122         if (pte)
5123                 phys = dma_pte_addr(pte);
5124
5125         return phys;
5126 }
5127
5128 static bool intel_iommu_capable(enum iommu_cap cap)
5129 {
5130         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5131                 return domain_update_iommu_snooping(NULL) == 1;
5132         if (cap == IOMMU_CAP_INTR_REMAP)
5133                 return irq_remapping_enabled == 1;
5134
5135         return false;
5136 }
5137
5138 static int intel_iommu_add_device(struct device *dev)
5139 {
5140         struct intel_iommu *iommu;
5141         struct iommu_group *group;
5142         u8 bus, devfn;
5143
5144         iommu = device_to_iommu(dev, &bus, &devfn);
5145         if (!iommu)
5146                 return -ENODEV;
5147
5148         iommu_device_link(&iommu->iommu, dev);
5149
5150         group = iommu_group_get_for_dev(dev);
5151
5152         if (IS_ERR(group))
5153                 return PTR_ERR(group);
5154
5155         iommu_group_put(group);
5156         return 0;
5157 }
5158
5159 static void intel_iommu_remove_device(struct device *dev)
5160 {
5161         struct intel_iommu *iommu;
5162         u8 bus, devfn;
5163
5164         iommu = device_to_iommu(dev, &bus, &devfn);
5165         if (!iommu)
5166                 return;
5167
5168         iommu_group_remove_device(dev);
5169
5170         iommu_device_unlink(&iommu->iommu, dev);
5171 }
5172
5173 static void intel_iommu_get_resv_regions(struct device *device,
5174                                          struct list_head *head)
5175 {
5176         struct iommu_resv_region *reg;
5177         struct dmar_rmrr_unit *rmrr;
5178         struct device *i_dev;
5179         int i;
5180
5181         rcu_read_lock();
5182         for_each_rmrr_units(rmrr) {
5183                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5184                                           i, i_dev) {
5185                         if (i_dev != device)
5186                                 continue;
5187
5188                         list_add_tail(&rmrr->resv->list, head);
5189                 }
5190         }
5191         rcu_read_unlock();
5192
5193         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5194                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5195                                       0, IOMMU_RESV_MSI);
5196         if (!reg)
5197                 return;
5198         list_add_tail(&reg->list, head);
5199 }
5200
5201 static void intel_iommu_put_resv_regions(struct device *dev,
5202                                          struct list_head *head)
5203 {
5204         struct iommu_resv_region *entry, *next;
5205
5206         list_for_each_entry_safe(entry, next, head, list) {
5207                 if (entry->type == IOMMU_RESV_RESERVED)
5208                         kfree(entry);
5209         }
5210 }
5211
5212 #ifdef CONFIG_INTEL_IOMMU_SVM
5213 #define MAX_NR_PASID_BITS (20)
5214 static inline unsigned long intel_iommu_get_pts(struct device *dev)
5215 {
5216         int pts, max_pasid;
5217
5218         max_pasid = intel_pasid_get_dev_max_id(dev);
5219         pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
5220         if (pts < 5)
5221                 return 0;
5222
5223         return pts - 5;
5224 }
5225
5226 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5227 {
5228         struct device_domain_info *info;
5229         struct context_entry *context;
5230         struct dmar_domain *domain;
5231         unsigned long flags;
5232         u64 ctx_lo;
5233         int ret;
5234
5235         domain = get_valid_domain_for_dev(sdev->dev);
5236         if (!domain)
5237                 return -EINVAL;
5238
5239         spin_lock_irqsave(&device_domain_lock, flags);
5240         spin_lock(&iommu->lock);
5241
5242         ret = -EINVAL;
5243         info = sdev->dev->archdata.iommu;
5244         if (!info || !info->pasid_supported)
5245                 goto out;
5246
5247         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5248         if (WARN_ON(!context))
5249                 goto out;
5250
5251         ctx_lo = context[0].lo;
5252
5253         sdev->did = domain->iommu_did[iommu->seq_id];
5254         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5255
5256         if (!(ctx_lo & CONTEXT_PASIDE)) {
5257                 if (iommu->pasid_state_table)
5258                         context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5259                 context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
5260                         intel_iommu_get_pts(sdev->dev);
5261
5262                 wmb();
5263                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5264                  * extended to permit requests-with-PASID if the PASIDE bit
5265                  * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5266                  * however, the PASIDE bit is ignored and requests-with-PASID
5267                  * are unconditionally blocked. Which makes less sense.
5268                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5269                  * "guest mode" translation types depending on whether ATS
5270                  * is available or not. Annoyingly, we can't use the new
5271                  * modes *unless* PASIDE is set. */
5272                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5273                         ctx_lo &= ~CONTEXT_TT_MASK;
5274                         if (info->ats_supported)
5275                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5276                         else
5277                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5278                 }
5279                 ctx_lo |= CONTEXT_PASIDE;
5280                 if (iommu->pasid_state_table)
5281                         ctx_lo |= CONTEXT_DINVE;
5282                 if (info->pri_supported)
5283                         ctx_lo |= CONTEXT_PRS;
5284                 context[0].lo = ctx_lo;
5285                 wmb();
5286                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5287                                            DMA_CCMD_MASK_NOBIT,
5288                                            DMA_CCMD_DEVICE_INVL);
5289         }
5290
5291         /* Enable PASID support in the device, if it wasn't already */
5292         if (!info->pasid_enabled)
5293                 iommu_enable_dev_iotlb(info);
5294
5295         if (info->ats_enabled) {
5296                 sdev->dev_iotlb = 1;
5297                 sdev->qdep = info->ats_qdep;
5298                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5299                         sdev->qdep = 0;
5300         }
5301         ret = 0;
5302
5303  out:
5304         spin_unlock(&iommu->lock);
5305         spin_unlock_irqrestore(&device_domain_lock, flags);
5306
5307         return ret;
5308 }
5309
5310 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5311 {
5312         struct intel_iommu *iommu;
5313         u8 bus, devfn;
5314
5315         if (iommu_dummy(dev)) {
5316                 dev_warn(dev,
5317                          "No IOMMU translation for device; cannot enable SVM\n");
5318                 return NULL;
5319         }
5320
5321         iommu = device_to_iommu(dev, &bus, &devfn);
5322         if ((!iommu)) {
5323                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5324                 return NULL;
5325         }
5326
5327         return iommu;
5328 }
5329 #endif /* CONFIG_INTEL_IOMMU_SVM */
5330
5331 const struct iommu_ops intel_iommu_ops = {
5332         .capable                = intel_iommu_capable,
5333         .domain_alloc           = intel_iommu_domain_alloc,
5334         .domain_free            = intel_iommu_domain_free,
5335         .attach_dev             = intel_iommu_attach_device,
5336         .detach_dev             = intel_iommu_detach_device,
5337         .map                    = intel_iommu_map,
5338         .unmap                  = intel_iommu_unmap,
5339         .iova_to_phys           = intel_iommu_iova_to_phys,
5340         .add_device             = intel_iommu_add_device,
5341         .remove_device          = intel_iommu_remove_device,
5342         .get_resv_regions       = intel_iommu_get_resv_regions,
5343         .put_resv_regions       = intel_iommu_put_resv_regions,
5344         .device_group           = pci_device_group,
5345         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5346 };
5347
5348 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5349 {
5350         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5351         pr_info("Disabling IOMMU for graphics on this chipset\n");
5352         dmar_map_gfx = 0;
5353 }
5354
5355 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5356 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5357 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5358 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5359 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5360 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5361 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5362
5363 static void quirk_iommu_rwbf(struct pci_dev *dev)
5364 {
5365         /*
5366          * Mobile 4 Series Chipset neglects to set RWBF capability,
5367          * but needs it. Same seems to hold for the desktop versions.
5368          */
5369         pr_info("Forcing write-buffer flush capability\n");
5370         rwbf_quirk = 1;
5371 }
5372
5373 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5374 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5375 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5376 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5377 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5378 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5379 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5380
5381 #define GGC 0x52
5382 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5383 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5384 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5385 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5386 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5387 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5388 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5389 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5390
5391 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5392 {
5393         unsigned short ggc;
5394
5395         if (pci_read_config_word(dev, GGC, &ggc))
5396                 return;
5397
5398         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5399                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5400                 dmar_map_gfx = 0;
5401         } else if (dmar_map_gfx) {
5402                 /* we have to ensure the gfx device is idle before we flush */
5403                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5404                 intel_iommu_strict = 1;
5405        }
5406 }
5407 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5408 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5409 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5410 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5411
5412 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5413    ISOCH DMAR unit for the Azalia sound device, but not give it any
5414    TLB entries, which causes it to deadlock. Check for that.  We do
5415    this in a function called from init_dmars(), instead of in a PCI
5416    quirk, because we don't want to print the obnoxious "BIOS broken"
5417    message if VT-d is actually disabled.
5418 */
5419 static void __init check_tylersburg_isoch(void)
5420 {
5421         struct pci_dev *pdev;
5422         uint32_t vtisochctrl;
5423
5424         /* If there's no Azalia in the system anyway, forget it. */
5425         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5426         if (!pdev)
5427                 return;
5428         pci_dev_put(pdev);
5429
5430         /* System Management Registers. Might be hidden, in which case
5431            we can't do the sanity check. But that's OK, because the
5432            known-broken BIOSes _don't_ actually hide it, so far. */
5433         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5434         if (!pdev)
5435                 return;
5436
5437         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5438                 pci_dev_put(pdev);
5439                 return;
5440         }
5441
5442         pci_dev_put(pdev);
5443
5444         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5445         if (vtisochctrl & 1)
5446                 return;
5447
5448         /* Drop all bits other than the number of TLB entries */
5449         vtisochctrl &= 0x1c;
5450
5451         /* If we have the recommended number of TLB entries (16), fine. */
5452         if (vtisochctrl == 0x10)
5453                 return;
5454
5455         /* Zero TLB entries? You get to ride the short bus to school. */
5456         if (!vtisochctrl) {
5457                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5458                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5459                      dmi_get_system_info(DMI_BIOS_VENDOR),
5460                      dmi_get_system_info(DMI_BIOS_VERSION),
5461                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5462                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5463                 return;
5464         }
5465
5466         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5467                vtisochctrl);
5468 }