1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
50
51 #define ROOT_SIZE               VTD_PAGE_SIZE
52 #define CONTEXT_SIZE            VTD_PAGE_SIZE
53
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58
59 #define IOAPIC_RANGE_START      (0xfee00000)
60 #define IOAPIC_RANGE_END        (0xfeefffff)
61 #define IOVA_START_ADDR         (0x1000)
62
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
64
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
67
68 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
70
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
74                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
76
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN          (1)
79
80 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
81
82 /* page table handling */
83 #define LEVEL_STRIDE            (9)
84 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
85
86 /*
87  * This bitmap is used to advertise the page sizes our hardware supports
88  * to the IOMMU core, which will then use this information to split
89  * physically contiguous memory regions it is mapping into page sizes
90  * that we support.
91  *
92  * Traditionally the IOMMU core just handed us the mappings directly,
93  * after making sure the size is a power-of-two multiple of 4KiB and
94  * that the mapping has natural alignment.
95  *
96  * To retain this behavior, we currently advertise that we support
97  * all page sizes that are a power-of-two multiple of 4KiB.
98  *
99  * If at some point we'd like to utilize the IOMMU core's new behavior,
100  * we could change this to advertise the real page sizes we support.
101  */
102 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
103
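/*
 * Adjusted Guest Address Width (AGAW) helpers: an AGAW value of N
 * selects an (N + 2)-level page table and an address width of
 * 30 + N * LEVEL_STRIDE bits, capped at MAX_AGAW_WIDTH.
 */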
104 static inline int agaw_to_level(int agaw)
105 {
106         return agaw + 2;
107 }
108
109 static inline int agaw_to_width(int agaw)
110 {
111         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 }
113
114 static inline int width_to_agaw(int width)
115 {
116         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117 }
118
119 static inline unsigned int level_to_offset_bits(int level)
120 {
121         return (level - 1) * LEVEL_STRIDE;
122 }
123
124 static inline int pfn_level_offset(unsigned long pfn, int level)
125 {
126         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 }
128
129 static inline unsigned long level_mask(int level)
130 {
131         return -1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long level_size(int level)
135 {
136         return 1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 {
141         return (pfn + level_size(level) - 1) & level_mask(level);
142 }
143
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 {
146         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 }
148
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150    are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 {
153         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 {
158         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 {
162         return mm_to_dma_pfn(page_to_pfn(pg));
163 }
164 static inline unsigned long virt_to_dma_pfn(void *p)
165 {
166         return page_to_dma_pfn(virt_to_page(p));
167 }
168
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
171
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
174
175 /*
176  * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
177  * (used when the kernel is launched with TXT).
178  */
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
182
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
184
185 /*
186  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
187  * if marked present.
188  */
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
190 {
191         if (!(re->lo & 1))
192                 return 0;
193
194         return re->lo & VTD_PAGE_MASK;
195 }
196
197 /*
198  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
202 {
203         if (!(re->hi & 1))
204                 return 0;
205
206         return re->hi & VTD_PAGE_MASK;
207 }
208
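/*
 * Helpers below manipulate individual context-entry fields. The
 * "copied" flag lives in a software-available bit and marks entries
 * inherited from a previous kernel (e.g. across a kdump kexec).
 */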
209 static inline void context_clear_pasid_enable(struct context_entry *context)
210 {
211         context->lo &= ~(1ULL << 11);
212 }
213
214 static inline bool context_pasid_enabled(struct context_entry *context)
215 {
216         return !!(context->lo & (1ULL << 11));
217 }
218
219 static inline void context_set_copied(struct context_entry *context)
220 {
221         context->hi |= (1ull << 3);
222 }
223
224 static inline bool context_copied(struct context_entry *context)
225 {
226         return !!(context->hi & (1ULL << 3));
227 }
228
229 static inline bool __context_present(struct context_entry *context)
230 {
231         return (context->lo & 1);
232 }
233
234 bool context_present(struct context_entry *context)
235 {
236         return context_pasid_enabled(context) ?
237              __context_present(context) :
238              __context_present(context) && !context_copied(context);
239 }
240
241 static inline void context_set_present(struct context_entry *context)
242 {
243         context->lo |= 1;
244 }
245
246 static inline void context_set_fault_enable(struct context_entry *context)
247 {
248         context->lo &= (((u64)-1) << 2) | 1;
249 }
250
251 static inline void context_set_translation_type(struct context_entry *context,
252                                                 unsigned long value)
253 {
254         context->lo &= (((u64)-1) << 4) | 3;
255         context->lo |= (value & 3) << 2;
256 }
257
258 static inline void context_set_address_root(struct context_entry *context,
259                                             unsigned long value)
260 {
261         context->lo &= ~VTD_PAGE_MASK;
262         context->lo |= value & VTD_PAGE_MASK;
263 }
264
265 static inline void context_set_address_width(struct context_entry *context,
266                                              unsigned long value)
267 {
268         context->hi |= value & 7;
269 }
270
271 static inline void context_set_domain_id(struct context_entry *context,
272                                          unsigned long value)
273 {
274         context->hi |= (value & ((1 << 16) - 1)) << 8;
275 }
276
277 static inline int context_domain_id(struct context_entry *c)
278 {
279         return((c->hi >> 8) & 0xffff);
280 }
281
282 static inline void context_clear_entry(struct context_entry *context)
283 {
284         context->lo = 0;
285         context->hi = 0;
286 }
287
288 /*
289  * This domain is a static identity mapping domain.
290  *      1. This domain creates a static 1:1 mapping to all usable memory.
291  *      2. It maps to each iommu if successful.
292  *      3. Each iommu maps to this domain if successful.
293  */
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
296
297 /* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
299
300 /*
301  * This is a DMA domain allocated through the iommu domain allocation
302  * interface, but one or more devices belonging to this domain have
303  * been chosen to use a private domain. We should avoid using the
304  * map/unmap/iova_to_phys APIs on it.
305  */
306 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
307
308 #define for_each_domain_iommu(idx, domain)                      \
309         for (idx = 0; idx < g_num_of_iommus; idx++)             \
310                 if (domain->iommu_refcnt[idx])
311
312 struct dmar_rmrr_unit {
313         struct list_head list;          /* list of rmrr units   */
314         struct acpi_dmar_header *hdr;   /* ACPI header          */
315         u64     base_address;           /* reserved base address*/
316         u64     end_address;            /* reserved end address */
317         struct dmar_dev_scope *devices; /* target devices */
318         int     devices_cnt;            /* target device count */
319 };
320
321 struct dmar_atsr_unit {
322         struct list_head list;          /* list of ATSR units */
323         struct acpi_dmar_header *hdr;   /* ACPI header */
324         struct dmar_dev_scope *devices; /* target devices */
325         int devices_cnt;                /* target device count */
326         u8 include_all:1;               /* include all ports */
327 };
328
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
331
332 #define for_each_rmrr_units(rmrr) \
333         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
334
335 /* number of IOMMUs in the system; used to size the g_iommus array */
336 static int g_num_of_iommus;
337
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static void domain_context_clear(struct intel_iommu *iommu,
343                                  struct device *dev);
344 static int domain_detach_iommu(struct dmar_domain *domain,
345                                struct intel_iommu *iommu);
346 static bool device_is_rmrr_locked(struct device *dev);
347 static int intel_iommu_attach_device(struct iommu_domain *domain,
348                                      struct device *dev);
349
350 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
351 int dmar_disabled = 0;
352 #else
353 int dmar_disabled = 1;
354 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355
356 int intel_iommu_sm;
357 int intel_iommu_enabled = 0;
358 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
359
360 static int dmar_map_gfx = 1;
361 static int dmar_forcedac;
362 static int intel_iommu_strict;
363 static int intel_iommu_superpage = 1;
364 static int iommu_identity_mapping;
365
366 #define IDENTMAP_ALL            1
367 #define IDENTMAP_GFX            2
368 #define IDENTMAP_AZALIA         4
369
370 int intel_iommu_gfx_mapped;
371 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
372
373 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
374 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
375 static DEFINE_SPINLOCK(device_domain_lock);
376 static LIST_HEAD(device_domain_list);
377
378 /*
379  * Iterate over elements in device_domain_list and call the specified
380  * callback @fn against each element.
381  */
382 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
383                                      void *data), void *data)
384 {
385         int ret = 0;
386         unsigned long flags;
387         struct device_domain_info *info;
388
389         spin_lock_irqsave(&device_domain_lock, flags);
390         list_for_each_entry(info, &device_domain_list, global) {
391                 ret = fn(info, data);
392                 if (ret) {
393                         spin_unlock_irqrestore(&device_domain_lock, flags);
394                         return ret;
395                 }
396         }
397         spin_unlock_irqrestore(&device_domain_lock, flags);
398
399         return 0;
400 }
401
402 const struct iommu_ops intel_iommu_ops;
403
404 static bool translation_pre_enabled(struct intel_iommu *iommu)
405 {
406         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 }
408
409 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
410 {
411         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 }
413
414 static void init_translation_status(struct intel_iommu *iommu)
415 {
416         u32 gsts;
417
418         gsts = readl(iommu->reg + DMAR_GSTS_REG);
419         if (gsts & DMA_GSTS_TES)
420                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
421 }
422
423 /* Convert a generic struct iommu_domain to the private struct dmar_domain */
424 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
425 {
426         return container_of(dom, struct dmar_domain, domain);
427 }
428
429 static int __init intel_iommu_setup(char *str)
430 {
431         if (!str)
432                 return -EINVAL;
433         while (*str) {
434                 if (!strncmp(str, "on", 2)) {
435                         dmar_disabled = 0;
436                         pr_info("IOMMU enabled\n");
437                 } else if (!strncmp(str, "off", 3)) {
438                         dmar_disabled = 1;
439                         no_platform_optin = 1;
440                         pr_info("IOMMU disabled\n");
441                 } else if (!strncmp(str, "igfx_off", 8)) {
442                         dmar_map_gfx = 0;
443                         pr_info("Disable GFX device mapping\n");
444                 } else if (!strncmp(str, "forcedac", 8)) {
445                         pr_info("Forcing DAC for PCI devices\n");
446                         dmar_forcedac = 1;
447                 } else if (!strncmp(str, "strict", 6)) {
448                         pr_info("Disable batched IOTLB flush\n");
449                         intel_iommu_strict = 1;
450                 } else if (!strncmp(str, "sp_off", 6)) {
451                         pr_info("Disable supported super page\n");
452                         intel_iommu_superpage = 0;
453                 } else if (!strncmp(str, "sm_on", 5)) {
454                         pr_info("Intel-IOMMU: scalable mode supported\n");
455                         intel_iommu_sm = 1;
456                 } else if (!strncmp(str, "tboot_noforce", 13)) {
457                         printk(KERN_INFO
458                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
459                         intel_iommu_tboot_noforce = 1;
460                 }
461
462                 str += strcspn(str, ",");
463                 while (*str == ',')
464                         str++;
465         }
466         return 0;
467 }
468 __setup("intel_iommu=", intel_iommu_setup);
469
470 static struct kmem_cache *iommu_domain_cache;
471 static struct kmem_cache *iommu_devinfo_cache;
472
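/*
 * Per-IOMMU domain-id to dmar_domain lookup. iommu->domains is a
 * two-level array: the top level is indexed by did >> 8 and each
 * second-level chunk holds 256 domain pointers, allocated on demand.
 */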
473 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
474 {
475         struct dmar_domain **domains;
476         int idx = did >> 8;
477
478         domains = iommu->domains[idx];
479         if (!domains)
480                 return NULL;
481
482         return domains[did & 0xff];
483 }
484
485 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
486                              struct dmar_domain *domain)
487 {
488         struct dmar_domain **domains;
489         int idx = did >> 8;
490
491         if (!iommu->domains[idx]) {
492                 size_t size = 256 * sizeof(struct dmar_domain *);
493                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494         }
495
496         domains = iommu->domains[idx];
497         if (WARN_ON(!domains))
498                 return;
499         else
500                 domains[did & 0xff] = domain;
501 }
502
503 void *alloc_pgtable_page(int node)
504 {
505         struct page *page;
506         void *vaddr = NULL;
507
508         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
509         if (page)
510                 vaddr = page_address(page);
511         return vaddr;
512 }
513
514 void free_pgtable_page(void *vaddr)
515 {
516         free_page((unsigned long)vaddr);
517 }
518
519 static inline void *alloc_domain_mem(void)
520 {
521         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 }
523
524 static void free_domain_mem(void *vaddr)
525 {
526         kmem_cache_free(iommu_domain_cache, vaddr);
527 }
528
529 static inline void *alloc_devinfo_mem(void)
530 {
531         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 }
533
534 static inline void free_devinfo_mem(void *vaddr)
535 {
536         kmem_cache_free(iommu_devinfo_cache, vaddr);
537 }
538
539 static inline int domain_type_is_si(struct dmar_domain *domain)
540 {
541         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 }
543
544 static inline int domain_pfn_supported(struct dmar_domain *domain,
545                                        unsigned long pfn)
546 {
547         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
548
549         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
550 }
551
552 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
553 {
554         unsigned long sagaw;
555         int agaw = -1;
556
557         sagaw = cap_sagaw(iommu->cap);
558         for (agaw = width_to_agaw(max_gaw);
559              agaw >= 0; agaw--) {
560                 if (test_bit(agaw, &sagaw))
561                         break;
562         }
563
564         return agaw;
565 }
566
567 /*
568  * Calculate max SAGAW for each iommu.
569  */
570 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
571 {
572         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
573 }
574
575 /*
576  * Calculate the agaw for each iommu.
577  * "SAGAW" may differ across iommus, so use a default agaw and fall
578  * back to a smaller supported agaw for iommus that don't support the default.
579  */
580 int iommu_calculate_agaw(struct intel_iommu *iommu)
581 {
582         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
583 }
584
585 /* This function only returns a single iommu in a domain */
586 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
587 {
588         int iommu_id;
589
590         /* si_domain and vm domain should not get here. */
591         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592                 return NULL;
593
594         for_each_domain_iommu(iommu_id, domain)
595                 break;
596
597         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598                 return NULL;
599
600         return g_iommus[iommu_id];
601 }
602
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
604 {
605         struct dmar_drhd_unit *drhd;
606         struct intel_iommu *iommu;
607         bool found = false;
608         int i;
609
610         domain->iommu_coherency = 1;
611
612         for_each_domain_iommu(i, domain) {
613                 found = true;
614                 if (!ecap_coherent(g_iommus[i]->ecap)) {
615                         domain->iommu_coherency = 0;
616                         break;
617                 }
618         }
619         if (found)
620                 return;
621
622         /* No hardware attached; use lowest common denominator */
623         rcu_read_lock();
624         for_each_active_iommu(iommu, drhd) {
625                 if (!ecap_coherent(iommu->ecap)) {
626                         domain->iommu_coherency = 0;
627                         break;
628                 }
629         }
630         rcu_read_unlock();
631 }
632
633 static int domain_update_iommu_snooping(struct intel_iommu *skip)
634 {
635         struct dmar_drhd_unit *drhd;
636         struct intel_iommu *iommu;
637         int ret = 1;
638
639         rcu_read_lock();
640         for_each_active_iommu(iommu, drhd) {
641                 if (iommu != skip) {
642                         if (!ecap_sc_support(iommu->ecap)) {
643                                 ret = 0;
644                                 break;
645                         }
646                 }
647         }
648         rcu_read_unlock();
649
650         return ret;
651 }
652
653 static int domain_update_iommu_superpage(struct intel_iommu *skip)
654 {
655         struct dmar_drhd_unit *drhd;
656         struct intel_iommu *iommu;
657         int mask = 0xf;
658
659         if (!intel_iommu_superpage) {
660                 return 0;
661         }
662
663         /* set iommu_superpage to the smallest common denominator */
664         rcu_read_lock();
665         for_each_active_iommu(iommu, drhd) {
666                 if (iommu != skip) {
667                         mask &= cap_super_page_val(iommu->cap);
668                         if (!mask)
669                                 break;
670                 }
671         }
672         rcu_read_unlock();
673
674         return fls(mask);
675 }
676
677 /* Some capabilities may be different across iommus */
678 static void domain_update_iommu_cap(struct dmar_domain *domain)
679 {
680         domain_update_iommu_coherency(domain);
681         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
682         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 }
684
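/*
 * Return the context entry for @bus/@devfn, optionally allocating the
 * context table. In scalable mode a root entry holds two context-table
 * pointers (lo for devfn 0-127, hi for devfn 128-255) and context
 * entries are twice the legacy size, hence the devfn adjustment below.
 */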
685 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686                                          u8 devfn, int alloc)
687 {
688         struct root_entry *root = &iommu->root_entry[bus];
689         struct context_entry *context;
690         u64 *entry;
691
692         entry = &root->lo;
693         if (sm_supported(iommu)) {
694                 if (devfn >= 0x80) {
695                         devfn -= 0x80;
696                         entry = &root->hi;
697                 }
698                 devfn *= 2;
699         }
700         if (*entry & 1)
701                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
702         else {
703                 unsigned long phy_addr;
704                 if (!alloc)
705                         return NULL;
706
707                 context = alloc_pgtable_page(iommu->node);
708                 if (!context)
709                         return NULL;
710
711                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
712                 phy_addr = virt_to_phys((void *)context);
713                 *entry = phy_addr | 1;
714                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
715         }
716         return &context[devfn];
717 }
718
719 static int iommu_dummy(struct device *dev)
720 {
721         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
722 }
723
724 /**
725  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
726  *                               sub-hierarchy of a candidate PCI-PCI bridge
727  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
728  * @bridge: the candidate PCI-PCI bridge
729  *
730  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
731  */
732 static bool
733 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
734 {
735         struct pci_dev *pdev, *pbridge;
736
737         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
738                 return false;
739
740         pdev = to_pci_dev(dev);
741         pbridge = to_pci_dev(bridge);
742
743         if (pbridge->subordinate &&
744             pbridge->subordinate->number <= pdev->bus->number &&
745             pbridge->subordinate->busn_res.end >= pdev->bus->number)
746                 return true;
747
748         return false;
749 }
750
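/*
 * Find the IOMMU covering @dev according to the DMAR device scopes and
 * return the source-id to use for it through @bus/@devfn. For PCI VFs
 * the scope lookup is done on the PF, but the VF's own BDF is returned.
 */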
751 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
752 {
753         struct dmar_drhd_unit *drhd = NULL;
754         struct intel_iommu *iommu;
755         struct device *tmp;
756         struct pci_dev *pdev = NULL;
757         u16 segment = 0;
758         int i;
759
760         if (iommu_dummy(dev))
761                 return NULL;
762
763         if (dev_is_pci(dev)) {
764                 struct pci_dev *pf_pdev;
765
766                 pdev = to_pci_dev(dev);
767
768 #ifdef CONFIG_X86
769                 /* VMD child devices currently cannot be handled individually */
770                 if (is_vmd(pdev->bus))
771                         return NULL;
772 #endif
773
774                 /* VFs aren't listed in scope tables; we need to look up
775                  * the PF instead to find the IOMMU. */
776                 pf_pdev = pci_physfn(pdev);
777                 dev = &pf_pdev->dev;
778                 segment = pci_domain_nr(pdev->bus);
779         } else if (has_acpi_companion(dev))
780                 dev = &ACPI_COMPANION(dev)->dev;
781
782         rcu_read_lock();
783         for_each_active_iommu(iommu, drhd) {
784                 if (pdev && segment != drhd->segment)
785                         continue;
786
787                 for_each_active_dev_scope(drhd->devices,
788                                           drhd->devices_cnt, i, tmp) {
789                         if (tmp == dev) {
790                                 /* For a VF use its original BDF# not that of the PF
791                                  * which we used for the IOMMU lookup. Strictly speaking
792                                  * we could do this for all PCI devices; we only need to
793                                  * get the BDF# from the scope table for ACPI matches. */
794                                 if (pdev && pdev->is_virtfn)
795                                         goto got_pdev;
796
797                                 *bus = drhd->devices[i].bus;
798                                 *devfn = drhd->devices[i].devfn;
799                                 goto out;
800                         }
801
802                         if (is_downstream_to_pci_bridge(dev, tmp))
803                                 goto got_pdev;
804                 }
805
806                 if (pdev && drhd->include_all) {
807                 got_pdev:
808                         *bus = pdev->bus->number;
809                         *devfn = pdev->devfn;
810                         goto out;
811                 }
812         }
813         iommu = NULL;
814  out:
815         rcu_read_unlock();
816
817         return iommu;
818 }
819
820 static void domain_flush_cache(struct dmar_domain *domain,
821                                void *addr, int size)
822 {
823         if (!domain->iommu_coherency)
824                 clflush_cache_range(addr, size);
825 }
826
827 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
828 {
829         struct context_entry *context;
830         int ret = 0;
831         unsigned long flags;
832
833         spin_lock_irqsave(&iommu->lock, flags);
834         context = iommu_context_addr(iommu, bus, devfn, 0);
835         if (context)
836                 ret = context_present(context);
837         spin_unlock_irqrestore(&iommu->lock, flags);
838         return ret;
839 }
840
841 static void free_context_table(struct intel_iommu *iommu)
842 {
843         int i;
844         unsigned long flags;
845         struct context_entry *context;
846
847         spin_lock_irqsave(&iommu->lock, flags);
848         if (!iommu->root_entry) {
849                 goto out;
850         }
851         for (i = 0; i < ROOT_ENTRY_NR; i++) {
852                 context = iommu_context_addr(iommu, i, 0, 0);
853                 if (context)
854                         free_pgtable_page(context);
855
856                 if (!sm_supported(iommu))
857                         continue;
858
859                 context = iommu_context_addr(iommu, i, 0x80, 0);
860                 if (context)
861                         free_pgtable_page(context);
862
863         }
864         free_pgtable_page(iommu->root_entry);
865         iommu->root_entry = NULL;
866 out:
867         spin_unlock_irqrestore(&iommu->lock, flags);
868 }
869
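/*
 * Walk the page table down to *target_level for @pfn, allocating any
 * missing intermediate tables. On return *target_level holds the level
 * at which the walk actually stopped (e.g. on a superpage PTE). Returns
 * NULL if @pfn exceeds the domain's address width or allocation fails.
 */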
870 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
871                                       unsigned long pfn, int *target_level)
872 {
873         struct dma_pte *parent, *pte;
874         int level = agaw_to_level(domain->agaw);
875         int offset;
876
877         BUG_ON(!domain->pgd);
878
879         if (!domain_pfn_supported(domain, pfn))
880                 /* Address beyond IOMMU's addressing capabilities. */
881                 return NULL;
882
883         parent = domain->pgd;
884
885         while (1) {
886                 void *tmp_page;
887
888                 offset = pfn_level_offset(pfn, level);
889                 pte = &parent[offset];
890                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
891                         break;
892                 if (level == *target_level)
893                         break;
894
895                 if (!dma_pte_present(pte)) {
896                         uint64_t pteval;
897
898                         tmp_page = alloc_pgtable_page(domain->nid);
899
900                         if (!tmp_page)
901                                 return NULL;
902
903                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
904                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
905                         if (cmpxchg64(&pte->val, 0ULL, pteval))
906                                 /* Someone else set it while we were thinking; use theirs. */
907                                 free_pgtable_page(tmp_page);
908                         else
909                                 domain_flush_cache(domain, pte, sizeof(*pte));
910                 }
911                 if (level == 1)
912                         break;
913
914                 parent = phys_to_virt(dma_pte_addr(pte));
915                 level--;
916         }
917
918         if (!*target_level)
919                 *target_level = level;
920
921         return pte;
922 }
923
924 /* Return the pte of an address at a specific level */
925 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
926                                          unsigned long pfn,
927                                          int level, int *large_page)
928 {
929         struct dma_pte *parent, *pte;
930         int total = agaw_to_level(domain->agaw);
931         int offset;
932
933         parent = domain->pgd;
934         while (level <= total) {
935                 offset = pfn_level_offset(pfn, total);
936                 pte = &parent[offset];
937                 if (level == total)
938                         return pte;
939
940                 if (!dma_pte_present(pte)) {
941                         *large_page = total;
942                         break;
943                 }
944
945                 if (dma_pte_superpage(pte)) {
946                         *large_page = total;
947                         return pte;
948                 }
949
950                 parent = phys_to_virt(dma_pte_addr(pte));
951                 total--;
952         }
953         return NULL;
954 }
955
956 /* Clear last-level ptes; a TLB flush should follow */
957 static void dma_pte_clear_range(struct dmar_domain *domain,
958                                 unsigned long start_pfn,
959                                 unsigned long last_pfn)
960 {
961         unsigned int large_page;
962         struct dma_pte *first_pte, *pte;
963
964         BUG_ON(!domain_pfn_supported(domain, start_pfn));
965         BUG_ON(!domain_pfn_supported(domain, last_pfn));
966         BUG_ON(start_pfn > last_pfn);
967
968         /* we don't need lock here; nobody else touches the iova range */
969         do {
970                 large_page = 1;
971                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
972                 if (!pte) {
973                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
974                         continue;
975                 }
976                 do {
977                         dma_clear_pte(pte);
978                         start_pfn += lvl_to_nr_pages(large_page);
979                         pte++;
980                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
981
982                 domain_flush_cache(domain, first_pte,
983                                    (void *)pte - (void *)first_pte);
984
985         } while (start_pfn && start_pfn <= last_pfn);
986 }
987
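/*
 * Recursively free page-table pages covering [start_pfn, last_pfn]
 * below @retain_level. Leaf PTEs are expected to have been cleared
 * already by dma_pte_clear_range(); only intermediate tables whose
 * entire range is covered are freed.
 */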
988 static void dma_pte_free_level(struct dmar_domain *domain, int level,
989                                int retain_level, struct dma_pte *pte,
990                                unsigned long pfn, unsigned long start_pfn,
991                                unsigned long last_pfn)
992 {
993         pfn = max(start_pfn, pfn);
994         pte = &pte[pfn_level_offset(pfn, level)];
995
996         do {
997                 unsigned long level_pfn;
998                 struct dma_pte *level_pte;
999
1000                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001                         goto next;
1002
1003                 level_pfn = pfn & level_mask(level);
1004                 level_pte = phys_to_virt(dma_pte_addr(pte));
1005
1006                 if (level > 2) {
1007                         dma_pte_free_level(domain, level - 1, retain_level,
1008                                            level_pte, level_pfn, start_pfn,
1009                                            last_pfn);
1010                 }
1011
1012                 /*
1013                  * Free the page table if we're below the level we want to
1014                  * retain and the range covers the entire table.
1015                  */
1016                 if (level < retain_level && !(start_pfn > level_pfn ||
1017                       last_pfn < level_pfn + level_size(level) - 1)) {
1018                         dma_clear_pte(pte);
1019                         domain_flush_cache(domain, pte, sizeof(*pte));
1020                         free_pgtable_page(level_pte);
1021                 }
1022 next:
1023                 pfn += level_size(level);
1024         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1025 }
1026
1027 /*
1028  * clear last level (leaf) ptes and free page table pages below the
1029  * level we wish to keep intact.
1030  */
1031 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1032                                    unsigned long start_pfn,
1033                                    unsigned long last_pfn,
1034                                    int retain_level)
1035 {
1036         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1037         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1038         BUG_ON(start_pfn > last_pfn);
1039
1040         dma_pte_clear_range(domain, start_pfn, last_pfn);
1041
1042         /* We don't need lock here; nobody else touches the iova range */
1043         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1044                            domain->pgd, 0, start_pfn, last_pfn);
1045
1046         /* free pgd */
1047         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1048                 free_pgtable_page(domain->pgd);
1049                 domain->pgd = NULL;
1050         }
1051 }
1052
1053 /* When a page at a given level is being unlinked from its parent, we don't
1054    need to *modify* it at all. All we need to do is make a list of all the
1055    pages which can be freed just as soon as we've flushed the IOTLB and we
1056    know the hardware page-walk will no longer touch them.
1057    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1058    be freed. */
1059 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1060                                             int level, struct dma_pte *pte,
1061                                             struct page *freelist)
1062 {
1063         struct page *pg;
1064
1065         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1066         pg->freelist = freelist;
1067         freelist = pg;
1068
1069         if (level == 1)
1070                 return freelist;
1071
1072         pte = page_address(pg);
1073         do {
1074                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1075                         freelist = dma_pte_list_pagetables(domain, level - 1,
1076                                                            pte, freelist);
1077                 pte++;
1078         } while (!first_pte_in_page(pte));
1079
1080         return freelist;
1081 }
1082
1083 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1084                                         struct dma_pte *pte, unsigned long pfn,
1085                                         unsigned long start_pfn,
1086                                         unsigned long last_pfn,
1087                                         struct page *freelist)
1088 {
1089         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1090
1091         pfn = max(start_pfn, pfn);
1092         pte = &pte[pfn_level_offset(pfn, level)];
1093
1094         do {
1095                 unsigned long level_pfn;
1096
1097                 if (!dma_pte_present(pte))
1098                         goto next;
1099
1100                 level_pfn = pfn & level_mask(level);
1101
1102                 /* If range covers entire pagetable, free it */
1103                 if (start_pfn <= level_pfn &&
1104                     last_pfn >= level_pfn + level_size(level) - 1) {
1105                         /* These subordinate page tables are going away entirely. Don't
1106                            bother to clear them; we're just going to *free* them. */
1107                         if (level > 1 && !dma_pte_superpage(pte))
1108                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1109
1110                         dma_clear_pte(pte);
1111                         if (!first_pte)
1112                                 first_pte = pte;
1113                         last_pte = pte;
1114                 } else if (level > 1) {
1115                         /* Recurse down into a level that isn't *entirely* obsolete */
1116                         freelist = dma_pte_clear_level(domain, level - 1,
1117                                                        phys_to_virt(dma_pte_addr(pte)),
1118                                                        level_pfn, start_pfn, last_pfn,
1119                                                        freelist);
1120                 }
1121 next:
1122                 pfn += level_size(level);
1123         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124
1125         if (first_pte)
1126                 domain_flush_cache(domain, first_pte,
1127                                    (void *)++last_pte - (void *)first_pte);
1128
1129         return freelist;
1130 }
1131
1132 /* We can't just free the pages because the IOMMU may still be walking
1133    the page tables, and may have cached the intermediate levels. The
1134    pages can only be freed after the IOTLB flush has been done. */
1135 static struct page *domain_unmap(struct dmar_domain *domain,
1136                                  unsigned long start_pfn,
1137                                  unsigned long last_pfn)
1138 {
1139         struct page *freelist;
1140
1141         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1142         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1143         BUG_ON(start_pfn > last_pfn);
1144
1145         /* we don't need lock here; nobody else touches the iova range */
1146         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1147                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1148
1149         /* free pgd */
1150         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1151                 struct page *pgd_page = virt_to_page(domain->pgd);
1152                 pgd_page->freelist = freelist;
1153                 freelist = pgd_page;
1154
1155                 domain->pgd = NULL;
1156         }
1157
1158         return freelist;
1159 }
1160
1161 static void dma_free_pagelist(struct page *freelist)
1162 {
1163         struct page *pg;
1164
1165         while ((pg = freelist)) {
1166                 freelist = pg->freelist;
1167                 free_pgtable_page(page_address(pg));
1168         }
1169 }
1170
1171 static void iova_entry_free(unsigned long data)
1172 {
1173         struct page *freelist = (struct page *)data;
1174
1175         dma_free_pagelist(freelist);
1176 }
1177
1178 /* iommu handling */
1179 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1180 {
1181         struct root_entry *root;
1182         unsigned long flags;
1183
1184         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1185         if (!root) {
1186                 pr_err("Allocating root entry for %s failed\n",
1187                         iommu->name);
1188                 return -ENOMEM;
1189         }
1190
1191         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1192
1193         spin_lock_irqsave(&iommu->lock, flags);
1194         iommu->root_entry = root;
1195         spin_unlock_irqrestore(&iommu->lock, flags);
1196
1197         return 0;
1198 }
1199
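/*
 * Program the root table address (with the scalable-mode flag when
 * supported) and issue the Set Root Table Pointer command, waiting for
 * hardware to acknowledge it.
 */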
1200 static void iommu_set_root_entry(struct intel_iommu *iommu)
1201 {
1202         u64 addr;
1203         u32 sts;
1204         unsigned long flag;
1205
1206         addr = virt_to_phys(iommu->root_entry);
1207         if (sm_supported(iommu))
1208                 addr |= DMA_RTADDR_SMT;
1209
1210         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1211         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1212
1213         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1214
1215         /* Make sure the hardware completes it */
1216         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217                       readl, (sts & DMA_GSTS_RTPS), sts);
1218
1219         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 }
1221
1222 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1223 {
1224         u32 val;
1225         unsigned long flag;
1226
1227         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228                 return;
1229
1230         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1232
1233         /* Make sure the hardware completes it */
1234         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235                       readl, (!(val & DMA_GSTS_WBFS)), val);
1236
1237         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 }
1239
1240 /* Invalidate context-cache entries with the requested granularity */
1241 static void __iommu_flush_context(struct intel_iommu *iommu,
1242                                   u16 did, u16 source_id, u8 function_mask,
1243                                   u64 type)
1244 {
1245         u64 val = 0;
1246         unsigned long flag;
1247
1248         switch (type) {
1249         case DMA_CCMD_GLOBAL_INVL:
1250                 val = DMA_CCMD_GLOBAL_INVL;
1251                 break;
1252         case DMA_CCMD_DOMAIN_INVL:
1253                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1254                 break;
1255         case DMA_CCMD_DEVICE_INVL:
1256                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1257                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1258                 break;
1259         default:
1260                 BUG();
1261         }
1262         val |= DMA_CCMD_ICC;
1263
1264         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1265         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1266
1267         /* Make sure the hardware completes it */
1268         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1269                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1270
1271         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1272 }
1273
1274 /* Invalidate IOTLB entries using register-based invalidation */
1275 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1276                                 u64 addr, unsigned int size_order, u64 type)
1277 {
1278         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1279         u64 val = 0, val_iva = 0;
1280         unsigned long flag;
1281
1282         switch (type) {
1283         case DMA_TLB_GLOBAL_FLUSH:
1284                 /* global flush doesn't need to set IVA_REG */
1285                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1286                 break;
1287         case DMA_TLB_DSI_FLUSH:
1288                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1289                 break;
1290         case DMA_TLB_PSI_FLUSH:
1291                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1292                 /* IH bit is passed in as part of address */
1293                 val_iva = size_order | addr;
1294                 break;
1295         default:
1296                 BUG();
1297         }
1298         /* Note: set drain read/write */
1299 #if 0
1300         /*
1301          * This is probably just for extra safety. It looks like we can
1302          * ignore it without any impact.
1303          */
1304         if (cap_read_drain(iommu->cap))
1305                 val |= DMA_TLB_READ_DRAIN;
1306 #endif
1307         if (cap_write_drain(iommu->cap))
1308                 val |= DMA_TLB_WRITE_DRAIN;
1309
1310         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1311         /* Note: Only uses first TLB reg currently */
1312         if (val_iva)
1313                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1314         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1315
1316         /* Make sure the hardware completes it */
1317         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1318                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1319
1320         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1321
1322         /* check IOTLB invalidation granularity */
1323         if (DMA_TLB_IAIG(val) == 0)
1324                 pr_err("Flush IOTLB failed\n");
1325         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1326                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1327                         (unsigned long long)DMA_TLB_IIRG(type),
1328                         (unsigned long long)DMA_TLB_IAIG(val));
1329 }
1330
1331 static struct device_domain_info *
1332 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1333                          u8 bus, u8 devfn)
1334 {
1335         struct device_domain_info *info;
1336
1337         assert_spin_locked(&device_domain_lock);
1338
1339         if (!iommu->qi)
1340                 return NULL;
1341
1342         list_for_each_entry(info, &domain->devices, link)
1343                 if (info->iommu == iommu && info->bus == bus &&
1344                     info->devfn == devfn) {
1345                         if (info->ats_supported && info->dev)
1346                                 return info;
1347                         break;
1348                 }
1349
1350         return NULL;
1351 }
1352
1353 static void domain_update_iotlb(struct dmar_domain *domain)
1354 {
1355         struct device_domain_info *info;
1356         bool has_iotlb_device = false;
1357
1358         assert_spin_locked(&device_domain_lock);
1359
1360         list_for_each_entry(info, &domain->devices, link) {
1361                 struct pci_dev *pdev;
1362
1363                 if (!info->dev || !dev_is_pci(info->dev))
1364                         continue;
1365
1366                 pdev = to_pci_dev(info->dev);
1367                 if (pdev->ats_enabled) {
1368                         has_iotlb_device = true;
1369                         break;
1370                 }
1371         }
1372
1373         domain->has_iotlb_device = has_iotlb_device;
1374 }
1375
1376 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1377 {
1378         struct pci_dev *pdev;
1379
1380         assert_spin_locked(&device_domain_lock);
1381
1382         if (!info || !dev_is_pci(info->dev))
1383                 return;
1384
1385         pdev = to_pci_dev(info->dev);
1386         /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1387          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1388          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1389          * reserved, which should be set to 0.
1390          */
1391         if (!ecap_dit(info->iommu->ecap))
1392                 info->pfsid = 0;
1393         else {
1394                 struct pci_dev *pf_pdev;
1395
1396                 /* pdev will be returned if device is not a vf */
1397                 pf_pdev = pci_physfn(pdev);
1398                 info->pfsid = pci_dev_id(pf_pdev);
1399         }
1400
1401 #ifdef CONFIG_INTEL_IOMMU_SVM
1402         /* The PCIe spec, in its wisdom, declares that the behaviour of
1403            the device if you enable PASID support after ATS support is
1404            undefined. So always enable PASID support on devices which
1405            have it, even if we can't yet know if we're ever going to
1406            use it. */
1407         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1408                 info->pasid_enabled = 1;
1409
1410         if (info->pri_supported &&
1411             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1412             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1413                 info->pri_enabled = 1;
1414 #endif
1415         if (!pdev->untrusted && info->ats_supported &&
1416             pci_ats_page_aligned(pdev) &&
1417             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1418                 info->ats_enabled = 1;
1419                 domain_update_iotlb(info->domain);
1420                 info->ats_qdep = pci_ats_queue_depth(pdev);
1421         }
1422 }
1423
1424 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1425 {
1426         struct pci_dev *pdev;
1427
1428         assert_spin_locked(&device_domain_lock);
1429
1430         if (!dev_is_pci(info->dev))
1431                 return;
1432
1433         pdev = to_pci_dev(info->dev);
1434
1435         if (info->ats_enabled) {
1436                 pci_disable_ats(pdev);
1437                 info->ats_enabled = 0;
1438                 domain_update_iotlb(info->domain);
1439         }
1440 #ifdef CONFIG_INTEL_IOMMU_SVM
1441         if (info->pri_enabled) {
1442                 pci_disable_pri(pdev);
1443                 info->pri_enabled = 0;
1444         }
1445         if (info->pasid_enabled) {
1446                 pci_disable_pasid(pdev);
1447                 info->pasid_enabled = 0;
1448         }
1449 #endif
1450 }
1451
1452 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1453                                   u64 addr, unsigned mask)
1454 {
1455         u16 sid, qdep;
1456         unsigned long flags;
1457         struct device_domain_info *info;
1458
1459         if (!domain->has_iotlb_device)
1460                 return;
1461
1462         spin_lock_irqsave(&device_domain_lock, flags);
1463         list_for_each_entry(info, &domain->devices, link) {
1464                 if (!info->ats_enabled)
1465                         continue;
1466
1467                 sid = info->bus << 8 | info->devfn;
1468                 qdep = info->ats_qdep;
1469                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470                                 qdep, addr, mask);
1471         }
1472         spin_unlock_irqrestore(&device_domain_lock, flags);
1473 }
1474
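/*
 * Flush the IOTLB for @pages pages starting at @pfn in @domain. @ih sets
 * the invalidation-hint bit (cached non-leaf entries need not be
 * flushed); @map indicates a non-present to present change, for which
 * the device IOTLB does not need to be flushed.
 */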
1475 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1476                                   struct dmar_domain *domain,
1477                                   unsigned long pfn, unsigned int pages,
1478                                   int ih, int map)
1479 {
1480         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1481         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1482         u16 did = domain->iommu_did[iommu->seq_id];
1483
1484         BUG_ON(pages == 0);
1485
1486         if (ih)
1487                 ih = 1 << 6;
1488         /*
1489          * Fall back to domain-selective flush if there is no PSI support or
1490          * the size is too big.
1491          * PSI requires the page size to be a power of two and the base
1492          * address to be naturally aligned to that size.
1493          */
1494         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1495                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496                                                 DMA_TLB_DSI_FLUSH);
1497         else
1498                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1499                                                 DMA_TLB_PSI_FLUSH);
1500
1501         /*
1502          * In caching mode, changes of pages from non-present to present require
1503          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1504          */
1505         if (!cap_caching_mode(iommu->cap) || !map)
1506                 iommu_flush_dev_iotlb(domain, addr, mask);
1507 }
1508
1509 /* Notification for newly created mappings */
1510 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1511                                         struct dmar_domain *domain,
1512                                         unsigned long pfn, unsigned int pages)
1513 {
1514         /* It's a non-present to present mapping. Only flush in caching mode */
1515         if (cap_caching_mode(iommu->cap))
1516                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1517         else
1518                 iommu_flush_write_buffer(iommu);
1519 }
1520
1521 static void iommu_flush_iova(struct iova_domain *iovad)
1522 {
1523         struct dmar_domain *domain;
1524         int idx;
1525
1526         domain = container_of(iovad, struct dmar_domain, iovad);
1527
1528         for_each_domain_iommu(idx, domain) {
1529                 struct intel_iommu *iommu = g_iommus[idx];
1530                 u16 did = domain->iommu_did[iommu->seq_id];
1531
1532                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1533
1534                 if (!cap_caching_mode(iommu->cap))
1535                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1536                                               0, MAX_AGAW_PFN_WIDTH);
1537         }
1538 }
1539
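/*
 * Disable the protected low/high memory regions (PLMR/PHMR) that may
 * have been programmed by firmware or tboot, and wait for the protected
 * region status bit to clear.
 */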
1540 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541 {
1542         u32 pmen;
1543         unsigned long flags;
1544
1545         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546                 return;
1547
1548         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1549         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1550         pmen &= ~DMA_PMEN_EPM;
1551         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1552
1553         /* wait for the protected region status bit to clear */
1554         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1555                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1556
1557         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1558 }
1559
1560 static void iommu_enable_translation(struct intel_iommu *iommu)
1561 {
1562         u32 sts;
1563         unsigned long flags;
1564
1565         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566         iommu->gcmd |= DMA_GCMD_TE;
1567         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1568
1569         /* Make sure the hardware has completed it */
1570         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571                       readl, (sts & DMA_GSTS_TES), sts);
1572
1573         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1574 }
1575
1576 static void iommu_disable_translation(struct intel_iommu *iommu)
1577 {
1578         u32 sts;
1579         unsigned long flag;
1580
1581         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1582         iommu->gcmd &= ~DMA_GCMD_TE;
1583         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1584
1585         /* Make sure the hardware has completed it */
1586         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587                       readl, (!(sts & DMA_GSTS_TES)), sts);
1588
1589         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1590 }
1591
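/*
 * Allocate the domain-id bitmap and the two-level domain pointer array
 * for @iommu, and reserve the domain ids that must never be handed out:
 * domain id 0, and FLPT_DEFAULT_DID when scalable mode is supported.
 */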
1592 static int iommu_init_domains(struct intel_iommu *iommu)
1593 {
1594         u32 ndomains, nlongs;
1595         size_t size;
1596
1597         ndomains = cap_ndoms(iommu->cap);
1598         pr_debug("%s: Number of Domains supported <%d>\n",
1599                  iommu->name, ndomains);
1600         nlongs = BITS_TO_LONGS(ndomains);
1601
1602         spin_lock_init(&iommu->lock);
1603
1604         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1605         if (!iommu->domain_ids) {
1606                 pr_err("%s: Allocating domain id array failed\n",
1607                        iommu->name);
1608                 return -ENOMEM;
1609         }
1610
1611         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1612         iommu->domains = kzalloc(size, GFP_KERNEL);
1613
1614         if (iommu->domains) {
1615                 size = 256 * sizeof(struct dmar_domain *);
1616                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617         }
1618
1619         if (!iommu->domains || !iommu->domains[0]) {
1620                 pr_err("%s: Allocating domain array failed\n",
1621                        iommu->name);
1622                 kfree(iommu->domain_ids);
1623                 kfree(iommu->domains);
1624                 iommu->domain_ids = NULL;
1625                 iommu->domains    = NULL;
1626                 return -ENOMEM;
1627         }
1628
1629         /*
1630          * If Caching mode is set, then invalid translations are tagged
1631          * with domain-id 0, hence we need to pre-allocate it. We also
1632          * use domain-id 0 as a marker for non-allocated domain-id, so
1633          * make sure it is not used for a real domain.
1634          */
1635         set_bit(0, iommu->domain_ids);
1636
1637         /*
1638          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1639          * entry for first-level or pass-through translation modes should
1640          * be programmed with a domain id different from those used for
1641          * second-level or nested translation. We reserve a domain id for
1642          * this purpose.
1643          */
1644         if (sm_supported(iommu))
1645                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1646
1647         return 0;
1648 }
1649
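/*
 * Detach every device that is attached through @iommu and, if
 * translation is currently enabled, turn it off.
 */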
1650 static void disable_dmar_iommu(struct intel_iommu *iommu)
1651 {
1652         struct device_domain_info *info, *tmp;
1653         unsigned long flags;
1654
1655         if (!iommu->domains || !iommu->domain_ids)
1656                 return;
1657
1658         spin_lock_irqsave(&device_domain_lock, flags);
1659         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1660                 if (info->iommu != iommu)
1661                         continue;
1662
1663                 if (!info->dev || !info->domain)
1664                         continue;
1665
1666                 __dmar_remove_one_dev_info(info);
1667         }
1668         spin_unlock_irqrestore(&device_domain_lock, flags);
1669
1670         if (iommu->gcmd & DMA_GCMD_TE)
1671                 iommu_disable_translation(iommu);
1672 }
1673
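/*
 * Release the domain bookkeeping arrays and the context tables of
 * @iommu, and tear down the SVM page request queue if it was set up.
 */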
1674 static void free_dmar_iommu(struct intel_iommu *iommu)
1675 {
1676         if ((iommu->domains) && (iommu->domain_ids)) {
1677                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678                 int i;
1679
1680                 for (i = 0; i < elems; i++)
1681                         kfree(iommu->domains[i]);
1682                 kfree(iommu->domains);
1683                 kfree(iommu->domain_ids);
1684                 iommu->domains = NULL;
1685                 iommu->domain_ids = NULL;
1686         }
1687
1688         g_iommus[iommu->seq_id] = NULL;
1689
1690         /* free context mapping */
1691         free_context_table(iommu);
1692
1693 #ifdef CONFIG_INTEL_IOMMU_SVM
1694         if (pasid_supported(iommu)) {
1695                 if (ecap_prs(iommu->ecap))
1696                         intel_svm_finish_prq(iommu);
1697         }
1698 #endif
1699 }
1700
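/*
 * Allocate and zero-initialize a dmar_domain with the given @flags.
 * The domain is not yet attached to any IOMMU; see domain_attach_iommu().
 */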
1701 static struct dmar_domain *alloc_domain(int flags)
1702 {
1703         struct dmar_domain *domain;
1704
1705         domain = alloc_domain_mem();
1706         if (!domain)
1707                 return NULL;
1708
1709         memset(domain, 0, sizeof(*domain));
1710         domain->nid = NUMA_NO_NODE;
1711         domain->flags = flags;
1712         domain->has_iotlb_device = false;
1713         INIT_LIST_HEAD(&domain->devices);
1714
1715         return domain;
1716 }
1717
1718 /* Must be called with device_domain_lock and iommu->lock held */
1719 static int domain_attach_iommu(struct dmar_domain *domain,
1720                                struct intel_iommu *iommu)
1721 {
1722         unsigned long ndomains;
1723         int num;
1724
1725         assert_spin_locked(&device_domain_lock);
1726         assert_spin_locked(&iommu->lock);
1727
1728         domain->iommu_refcnt[iommu->seq_id] += 1;
1729         domain->iommu_count += 1;
1730         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1731                 ndomains = cap_ndoms(iommu->cap);
1732                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1733
1734                 if (num >= ndomains) {
1735                         pr_err("%s: No free domain ids\n", iommu->name);
1736                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1737                         domain->iommu_count -= 1;
1738                         return -ENOSPC;
1739                 }
1740
1741                 set_bit(num, iommu->domain_ids);
1742                 set_iommu_domain(iommu, num, domain);
1743
1744                 domain->iommu_did[iommu->seq_id] = num;
1745                 domain->nid                      = iommu->node;
1746
1747                 domain_update_iommu_cap(domain);
1748         }
1749
1750         return 0;
1751 }
1752
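/*
 * Drop @domain's reference on @iommu. When the last reference for this
 * IOMMU goes away, its domain id is released. Must be called with
 * device_domain_lock and iommu->lock held. Returns the remaining
 * attachment count of the domain.
 */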
1753 static int domain_detach_iommu(struct dmar_domain *domain,
1754                                struct intel_iommu *iommu)
1755 {
1756         int num, count;
1757
1758         assert_spin_locked(&device_domain_lock);
1759         assert_spin_locked(&iommu->lock);
1760
1761         domain->iommu_refcnt[iommu->seq_id] -= 1;
1762         count = --domain->iommu_count;
1763         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1764                 num = domain->iommu_did[iommu->seq_id];
1765                 clear_bit(num, iommu->domain_ids);
1766                 set_iommu_domain(iommu, num, NULL);
1767
1768                 domain_update_iommu_cap(domain);
1769                 domain->iommu_did[iommu->seq_id] = 0;
1770         }
1771
1772         return count;
1773 }
1774
1775 static struct iova_domain reserved_iova_list;
1776 static struct lock_class_key reserved_rbtree_key;
1777
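/*
 * Build the global list of IOVA ranges that must never be allocated:
 * the IOAPIC MMIO window and every PCI MMIO resource, so that DMA
 * cannot reach those addresses via peer-to-peer transactions.
 */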
1778 static int dmar_init_reserved_ranges(void)
1779 {
1780         struct pci_dev *pdev = NULL;
1781         struct iova *iova;
1782         int i;
1783
1784         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1785
1786         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1787                 &reserved_rbtree_key);
1788
1789         /* IOAPIC ranges shouldn't be accessed by DMA */
1790         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1791                 IOVA_PFN(IOAPIC_RANGE_END));
1792         if (!iova) {
1793                 pr_err("Reserve IOAPIC range failed\n");
1794                 return -ENODEV;
1795         }
1796
1797         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1798         for_each_pci_dev(pdev) {
1799                 struct resource *r;
1800
1801                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1802                         r = &pdev->resource[i];
1803                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1804                                 continue;
1805                         iova = reserve_iova(&reserved_iova_list,
1806                                             IOVA_PFN(r->start),
1807                                             IOVA_PFN(r->end));
1808                         if (!iova) {
1809                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1810                                 return -ENODEV;
1811                         }
1812                 }
1813         }
1814         return 0;
1815 }
1816
1817 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1818 {
1819         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1820 }
1821
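/*
 * Round the guest address width @gaw up to the next width the page
 * tables can actually represent (12 bits plus a multiple of the 9-bit
 * level stride), capped at 64 bits.
 */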
1822 static inline int guestwidth_to_adjustwidth(int gaw)
1823 {
1824         int agaw;
1825         int r = (gaw - 12) % 9;
1826
1827         if (r == 0)
1828                 agaw = gaw;
1829         else
1830                 agaw = gaw + 9 - r;
1831         if (agaw > 64)
1832                 agaw = 64;
1833         return agaw;
1834 }
1835
1836 static void domain_exit(struct dmar_domain *domain)
1837 {
1838         struct page *freelist;
1839
1840         /* Remove associated devices and clear attached or cached domains */
1841         domain_remove_dev_info(domain);
1842
1843         /* destroy iovas */
1844         put_iova_domain(&domain->iovad);
1845
1846         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1847
1848         dma_free_pagelist(freelist);
1849
1850         free_domain_mem(domain);
1851 }
1852
1853 /*
1854  * Get the PASID directory size for scalable mode context entry.
1855  * Value of X in the PDTS field of a scalable mode context entry
1856  * indicates PASID directory with 2^(X + 7) entries.
1857  */
1858 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1859 {
1860         int pds, max_pde;
1861
1862         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1863         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1864         if (pds < 7)
1865                 return 0;
1866
1867         return pds - 7;
1868 }
1869
1870 /*
1871  * Set the RID_PASID field of a scalable mode context entry. The
1872  * IOMMU hardware will use the PASID value set in this field for
1873  * translations of DMA requests that arrive without a PASID.
1874  */
1875 static inline void
1876 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1877 {
1878         context->hi |= pasid & ((1 << 20) - 1);
1879         context->hi |= (1 << 20);
1880 }
1881
1882 /*
1883  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1884  * entry.
1885  */
1886 static inline void context_set_sm_dte(struct context_entry *context)
1887 {
1888         context->lo |= (1 << 2);
1889 }
1890
1891 /*
1892  * Set the PRE(Page Request Enable) field of a scalable mode context
1893  * entry.
1894  */
1895 static inline void context_set_sm_pre(struct context_entry *context)
1896 {
1897         context->lo |= (1 << 4);
1898 }
1899
1900 /* Convert value to context PASID directory size field coding. */
1901 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1902
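/*
 * Program the context entry for (@bus, @devfn) on @iommu so that it
 * points at @domain's second-level page tables, or at the PASID
 * directory described by @table in scalable mode, and then perform the
 * cache and TLB flushes required for the update.
 */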
1903 static int domain_context_mapping_one(struct dmar_domain *domain,
1904                                       struct intel_iommu *iommu,
1905                                       struct pasid_table *table,
1906                                       u8 bus, u8 devfn)
1907 {
1908         u16 did = domain->iommu_did[iommu->seq_id];
1909         int translation = CONTEXT_TT_MULTI_LEVEL;
1910         struct device_domain_info *info = NULL;
1911         struct context_entry *context;
1912         unsigned long flags;
1913         int ret;
1914
1915         WARN_ON(did == 0);
1916
1917         if (hw_pass_through && domain_type_is_si(domain))
1918                 translation = CONTEXT_TT_PASS_THROUGH;
1919
1920         pr_debug("Set context mapping for %02x:%02x.%d\n",
1921                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1922
1923         BUG_ON(!domain->pgd);
1924
1925         spin_lock_irqsave(&device_domain_lock, flags);
1926         spin_lock(&iommu->lock);
1927
1928         ret = -ENOMEM;
1929         context = iommu_context_addr(iommu, bus, devfn, 1);
1930         if (!context)
1931                 goto out_unlock;
1932
1933         ret = 0;
1934         if (context_present(context))
1935                 goto out_unlock;
1936
1937         /*
1938          * For kdump cases, old valid entries may be cached due to the
1939          * in-flight DMA and copied pgtable, but there is no unmapping
1940          * behaviour for them, thus we need an explicit cache flush for
1941          * the newly-mapped device. For kdump, at this point, the device
1942          * is supposed to have finished reset at its driver probe stage, so no
1943          * in-flight DMA will exist, and we don't need to worry about it
1944          * hereafter.
1945          */
1946         if (context_copied(context)) {
1947                 u16 did_old = context_domain_id(context);
1948
1949                 if (did_old < cap_ndoms(iommu->cap)) {
1950                         iommu->flush.flush_context(iommu, did_old,
1951                                                    (((u16)bus) << 8) | devfn,
1952                                                    DMA_CCMD_MASK_NOBIT,
1953                                                    DMA_CCMD_DEVICE_INVL);
1954                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1955                                                  DMA_TLB_DSI_FLUSH);
1956                 }
1957         }
1958
1959         context_clear_entry(context);
1960
1961         if (sm_supported(iommu)) {
1962                 unsigned long pds;
1963
1964                 WARN_ON(!table);
1965
1966                 /* Setup the PASID DIR pointer: */
1967                 pds = context_get_sm_pds(table);
1968                 context->lo = (u64)virt_to_phys(table->table) |
1969                                 context_pdts(pds);
1970
1971                 /* Setup the RID_PASID field: */
1972                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1973
1974                 /*
1975                  * Setup the Device-TLB enable bit and Page request
1976                  * Enable bit:
1977                  */
1978                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1979                 if (info && info->ats_supported)
1980                         context_set_sm_dte(context);
1981                 if (info && info->pri_supported)
1982                         context_set_sm_pre(context);
1983         } else {
1984                 struct dma_pte *pgd = domain->pgd;
1985                 int agaw;
1986
1987                 context_set_domain_id(context, did);
1988
1989                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1990                         /*
1991                          * Skip top levels of page tables for iommu which has
1992                          * less agaw than default. Unnecessary for PT mode.
1993                          */
1994                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1995                                 ret = -ENOMEM;
1996                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1997                                 if (!dma_pte_present(pgd))
1998                                         goto out_unlock;
1999                         }
2000
2001                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2002                         if (info && info->ats_supported)
2003                                 translation = CONTEXT_TT_DEV_IOTLB;
2004                         else
2005                                 translation = CONTEXT_TT_MULTI_LEVEL;
2006
2007                         context_set_address_root(context, virt_to_phys(pgd));
2008                         context_set_address_width(context, agaw);
2009                 } else {
2010                         /*
2011                          * In pass through mode, AW must be programmed to
2012                          * indicate the largest AGAW value supported by
2013                          * hardware. And ASR is ignored by hardware.
2014                          */
2015                         context_set_address_width(context, iommu->msagaw);
2016                 }
2017
2018                 context_set_translation_type(context, translation);
2019         }
2020
2021         context_set_fault_enable(context);
2022         context_set_present(context);
2023         domain_flush_cache(domain, context, sizeof(*context));
2024
2025         /*
2026          * It's a non-present to present mapping. If hardware doesn't cache
2027          * non-present entries we only need to flush the write-buffer. If it
2028          * _does_ cache non-present entries, then it does so in the special
2029          * domain #0, which we have to flush:
2030          */
2031         if (cap_caching_mode(iommu->cap)) {
2032                 iommu->flush.flush_context(iommu, 0,
2033                                            (((u16)bus) << 8) | devfn,
2034                                            DMA_CCMD_MASK_NOBIT,
2035                                            DMA_CCMD_DEVICE_INVL);
2036                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2037         } else {
2038                 iommu_flush_write_buffer(iommu);
2039         }
2040         iommu_enable_dev_iotlb(info);
2041
2042         ret = 0;
2043
2044 out_unlock:
2045         spin_unlock(&iommu->lock);
2046         spin_unlock_irqrestore(&device_domain_lock, flags);
2047
2048         return ret;
2049 }
2050
2051 struct domain_context_mapping_data {
2052         struct dmar_domain *domain;
2053         struct intel_iommu *iommu;
2054         struct pasid_table *table;
2055 };
2056
2057 static int domain_context_mapping_cb(struct pci_dev *pdev,
2058                                      u16 alias, void *opaque)
2059 {
2060         struct domain_context_mapping_data *data = opaque;
2061
2062         return domain_context_mapping_one(data->domain, data->iommu,
2063                                           data->table, PCI_BUS_NUM(alias),
2064                                           alias & 0xff);
2065 }
2066
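/*
 * Set up context entries for @dev and, for PCI devices, for every DMA
 * alias the device may use.
 */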
2067 static int
2068 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2069 {
2070         struct domain_context_mapping_data data;
2071         struct pasid_table *table;
2072         struct intel_iommu *iommu;
2073         u8 bus, devfn;
2074
2075         iommu = device_to_iommu(dev, &bus, &devfn);
2076         if (!iommu)
2077                 return -ENODEV;
2078
2079         table = intel_pasid_get_table(dev);
2080
2081         if (!dev_is_pci(dev))
2082                 return domain_context_mapping_one(domain, iommu, table,
2083                                                   bus, devfn);
2084
2085         data.domain = domain;
2086         data.iommu = iommu;
2087         data.table = table;
2088
2089         return pci_for_each_dma_alias(to_pci_dev(dev),
2090                                       &domain_context_mapping_cb, &data);
2091 }
2092
2093 static int domain_context_mapped_cb(struct pci_dev *pdev,
2094                                     u16 alias, void *opaque)
2095 {
2096         struct intel_iommu *iommu = opaque;
2097
2098         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2099 }
2100
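/*
 * Check whether @dev and all of its DMA aliases already have a present
 * context entry.
 */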
2101 static int domain_context_mapped(struct device *dev)
2102 {
2103         struct intel_iommu *iommu;
2104         u8 bus, devfn;
2105
2106         iommu = device_to_iommu(dev, &bus, &devfn);
2107         if (!iommu)
2108                 return -ENODEV;
2109
2110         if (!dev_is_pci(dev))
2111                 return device_context_mapped(iommu, bus, devfn);
2112
2113         return !pci_for_each_dma_alias(to_pci_dev(dev),
2114                                        domain_context_mapped_cb, iommu);
2115 }
2116
2117 /* Returns the number of VT-d pages needed, rounded up to the MM page size */
2118 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2119                                             size_t size)
2120 {
2121         host_addr &= ~PAGE_MASK;
2122         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2123 }
2124
2125 /* Return largest possible superpage level for a given mapping */
2126 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2127                                           unsigned long iov_pfn,
2128                                           unsigned long phy_pfn,
2129                                           unsigned long pages)
2130 {
2131         int support, level = 1;
2132         unsigned long pfnmerge;
2133
2134         support = domain->iommu_superpage;
2135
2136         /* To use a large page, the virtual *and* physical addresses
2137            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2138            of them will mean we have to use smaller pages. So just
2139            merge them and check both at once. */
2140         pfnmerge = iov_pfn | phy_pfn;
2141
2142         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2143                 pages >>= VTD_STRIDE_SHIFT;
2144                 if (!pages)
2145                         break;
2146                 pfnmerge >>= VTD_STRIDE_SHIFT;
2147                 level++;
2148                 support--;
2149         }
2150         return level;
2151 }
2152
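/*
 * Fill in page-table entries for @nr_pages pages starting at @iov_pfn,
 * taking the physical pages either from the scatterlist @sg or from the
 * contiguous range starting at @phys_pfn. Superpages are used where the
 * addresses and remaining length are suitably aligned, and the CPU cache
 * is flushed over each page of PTEs as it is completed.
 */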
2153 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2154                             struct scatterlist *sg, unsigned long phys_pfn,
2155                             unsigned long nr_pages, int prot)
2156 {
2157         struct dma_pte *first_pte = NULL, *pte = NULL;
2158         phys_addr_t uninitialized_var(pteval);
2159         unsigned long sg_res = 0;
2160         unsigned int largepage_lvl = 0;
2161         unsigned long lvl_pages = 0;
2162
2163         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2164
2165         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2166                 return -EINVAL;
2167
2168         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2169
2170         if (!sg) {
2171                 sg_res = nr_pages;
2172                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2173         }
2174
2175         while (nr_pages > 0) {
2176                 uint64_t tmp;
2177
2178                 if (!sg_res) {
2179                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2180
2181                         sg_res = aligned_nrpages(sg->offset, sg->length);
2182                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2183                         sg->dma_length = sg->length;
2184                         pteval = (sg_phys(sg) - pgoff) | prot;
2185                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2186                 }
2187
2188                 if (!pte) {
2189                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2190
2191                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2192                         if (!pte)
2193                                 return -ENOMEM;
2194                         /* It is a large page */
2195                         if (largepage_lvl > 1) {
2196                                 unsigned long nr_superpages, end_pfn;
2197
2198                                 pteval |= DMA_PTE_LARGE_PAGE;
2199                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2200
2201                                 nr_superpages = sg_res / lvl_pages;
2202                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2203
2204                                 /*
2205                                  * Ensure that old small page tables are
2206                                  * removed to make room for superpage(s).
2207                                  * We're adding new large pages, so make sure
2208                                  * we don't remove their parent tables.
2209                                  */
2210                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2211                                                        largepage_lvl + 1);
2212                         } else {
2213                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2214                         }
2215
2216                 }
2217                 /* We don't need a lock here; nobody else
2218                  * touches this IOVA range.
2219                  */
2220                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2221                 if (tmp) {
2222                         static int dumps = 5;
2223                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2224                                 iov_pfn, tmp, (unsigned long long)pteval);
2225                         if (dumps) {
2226                                 dumps--;
2227                                 debug_dma_dump_mappings(NULL);
2228                         }
2229                         WARN_ON(1);
2230                 }
2231
2232                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2233
2234                 BUG_ON(nr_pages < lvl_pages);
2235                 BUG_ON(sg_res < lvl_pages);
2236
2237                 nr_pages -= lvl_pages;
2238                 iov_pfn += lvl_pages;
2239                 phys_pfn += lvl_pages;
2240                 pteval += lvl_pages * VTD_PAGE_SIZE;
2241                 sg_res -= lvl_pages;
2242
2243                 /* If the next PTE would be the first in a new page, then we
2244                    need to flush the cache on the entries we've just written.
2245                    And then we'll need to recalculate 'pte', so clear it and
2246                    let it get set again in the if (!pte) block above.
2247
2248                    If we're done (!nr_pages) we need to flush the cache too.
2249
2250                    Also if we've been setting superpages, we may need to
2251                    recalculate 'pte' and switch back to smaller pages for the
2252                    end of the mapping, if the trailing size is not enough to
2253                    use another superpage (i.e. sg_res < lvl_pages). */
2254                 pte++;
2255                 if (!nr_pages || first_pte_in_page(pte) ||
2256                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2257                         domain_flush_cache(domain, first_pte,
2258                                            (void *)pte - (void *)first_pte);
2259                         pte = NULL;
2260                 }
2261
2262                 if (!sg_res && nr_pages)
2263                         sg = sg_next(sg);
2264         }
2265         return 0;
2266 }
2267
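/*
 * Perform the actual mapping and then notify every IOMMU the domain is
 * attached to, so that IOTLBs or write buffers are flushed as needed
 * for the new (non-present to present) entries.
 */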
2268 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2269                           struct scatterlist *sg, unsigned long phys_pfn,
2270                           unsigned long nr_pages, int prot)
2271 {
2272         int iommu_id, ret;
2273         struct intel_iommu *iommu;
2274
2275         /* Do the real mapping first */
2276         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2277         if (ret)
2278                 return ret;
2279
2280         for_each_domain_iommu(iommu_id, domain) {
2281                 iommu = g_iommus[iommu_id];
2282                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2283         }
2284
2285         return 0;
2286 }
2287
2288 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2289                                     struct scatterlist *sg, unsigned long nr_pages,
2290                                     int prot)
2291 {
2292         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2293 }
2294
2295 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2296                                      unsigned long phys_pfn, unsigned long nr_pages,
2297                                      int prot)
2298 {
2299         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2300 }
2301
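/*
 * Clear the context entry for (@bus, @devfn) and invalidate the context
 * cache and IOTLB entries tagged with its old domain id.
 */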
2302 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2303 {
2304         unsigned long flags;
2305         struct context_entry *context;
2306         u16 did_old;
2307
2308         if (!iommu)
2309                 return;
2310
2311         spin_lock_irqsave(&iommu->lock, flags);
2312         context = iommu_context_addr(iommu, bus, devfn, 0);
2313         if (!context) {
2314                 spin_unlock_irqrestore(&iommu->lock, flags);
2315                 return;
2316         }
2317         did_old = context_domain_id(context);
2318         context_clear_entry(context);
2319         __iommu_flush_cache(iommu, context, sizeof(*context));
2320         spin_unlock_irqrestore(&iommu->lock, flags);
2321         iommu->flush.flush_context(iommu,
2322                                    did_old,
2323                                    (((u16)bus) << 8) | devfn,
2324                                    DMA_CCMD_MASK_NOBIT,
2325                                    DMA_CCMD_DEVICE_INVL);
2326         iommu->flush.flush_iotlb(iommu,
2327                                  did_old,
2328                                  0,
2329                                  0,
2330                                  DMA_TLB_DSI_FLUSH);
2331 }
2332
2333 static inline void unlink_domain_info(struct device_domain_info *info)
2334 {
2335         assert_spin_locked(&device_domain_lock);
2336         list_del(&info->link);
2337         list_del(&info->global);
2338         if (info->dev)
2339                 info->dev->archdata.iommu = NULL;
2340 }
2341
2342 static void domain_remove_dev_info(struct dmar_domain *domain)
2343 {
2344         struct device_domain_info *info, *tmp;
2345         unsigned long flags;
2346
2347         spin_lock_irqsave(&device_domain_lock, flags);
2348         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2349                 __dmar_remove_one_dev_info(info);
2350         spin_unlock_irqrestore(&device_domain_lock, flags);
2351 }
2352
2353 /*
2354  * find_domain
2355  * Note: we use struct device->archdata.iommu to store the device_domain_info
2356  */
2357 static struct dmar_domain *find_domain(struct device *dev)
2358 {
2359         struct device_domain_info *info;
2360
2361         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2362                 struct iommu_domain *domain;
2363
2364                 dev->archdata.iommu = NULL;
2365                 domain = iommu_get_domain_for_dev(dev);
2366                 if (domain)
2367                         intel_iommu_attach_device(domain, dev);
2368         }
2369
2370         /* No lock here, assumes no domain exit in normal case */
2371         info = dev->archdata.iommu;
2372
2373         if (likely(info))
2374                 return info->domain;
2375         return NULL;
2376 }
2377
2378 static inline struct device_domain_info *
2379 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2380 {
2381         struct device_domain_info *info;
2382
2383         list_for_each_entry(info, &device_domain_list, global)
2384                 if (info->iommu->segment == segment && info->bus == bus &&
2385                     info->devfn == devfn)
2386                         return info;
2387
2388         return NULL;
2389 }
2390
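/*
 * Allocate device_domain_info for (@bus, @devfn)/@dev and attach it to
 * @domain. If the device or its DMA alias already has a domain, that
 * existing domain is returned instead and the caller must free the one
 * it passed in. In scalable mode this also allocates the PASID table and
 * sets up the RID2PASID entry; finally the context entry is programmed
 * via domain_context_mapping().
 */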
2391 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2392                                                     int bus, int devfn,
2393                                                     struct device *dev,
2394                                                     struct dmar_domain *domain)
2395 {
2396         struct dmar_domain *found = NULL;
2397         struct device_domain_info *info;
2398         unsigned long flags;
2399         int ret;
2400
2401         info = alloc_devinfo_mem();
2402         if (!info)
2403                 return NULL;
2404
2405         info->bus = bus;
2406         info->devfn = devfn;
2407         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2408         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2409         info->ats_qdep = 0;
2410         info->dev = dev;
2411         info->domain = domain;
2412         info->iommu = iommu;
2413         info->pasid_table = NULL;
2414         info->auxd_enabled = 0;
2415         INIT_LIST_HEAD(&info->auxiliary_domains);
2416
2417         if (dev && dev_is_pci(dev)) {
2418                 struct pci_dev *pdev = to_pci_dev(info->dev);
2419
2420                 if (!pdev->untrusted &&
2421                     !pci_ats_disabled() &&
2422                     ecap_dev_iotlb_support(iommu->ecap) &&
2423                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2424                     dmar_find_matched_atsr_unit(pdev))
2425                         info->ats_supported = 1;
2426
2427                 if (sm_supported(iommu)) {
2428                         if (pasid_supported(iommu)) {
2429                                 int features = pci_pasid_features(pdev);
2430                                 if (features >= 0)
2431                                         info->pasid_supported = features | 1;
2432                         }
2433
2434                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2435                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2436                                 info->pri_supported = 1;
2437                 }
2438         }
2439
2440         spin_lock_irqsave(&device_domain_lock, flags);
2441         if (dev)
2442                 found = find_domain(dev);
2443
2444         if (!found) {
2445                 struct device_domain_info *info2;
2446                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2447                 if (info2) {
2448                         found      = info2->domain;
2449                         info2->dev = dev;
2450                 }
2451         }
2452
2453         if (found) {
2454                 spin_unlock_irqrestore(&device_domain_lock, flags);
2455                 free_devinfo_mem(info);
2456                 /* Caller must free the original domain */
2457                 return found;
2458         }
2459
2460         spin_lock(&iommu->lock);
2461         ret = domain_attach_iommu(domain, iommu);
2462         spin_unlock(&iommu->lock);
2463
2464         if (ret) {
2465                 spin_unlock_irqrestore(&device_domain_lock, flags);
2466                 free_devinfo_mem(info);
2467                 return NULL;
2468         }
2469
2470         list_add(&info->link, &domain->devices);
2471         list_add(&info->global, &device_domain_list);
2472         if (dev)
2473                 dev->archdata.iommu = info;
2474         spin_unlock_irqrestore(&device_domain_lock, flags);
2475
2476         /* PASID table is mandatory for a PCI device in scalable mode. */
2477         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2478                 ret = intel_pasid_alloc_table(dev);
2479                 if (ret) {
2480                         dev_err(dev, "PASID table allocation failed\n");
2481                         dmar_remove_one_dev_info(dev);
2482                         return NULL;
2483                 }
2484
2485                 /* Setup the PASID entry for requests without PASID: */
2486                 spin_lock(&iommu->lock);
2487                 if (hw_pass_through && domain_type_is_si(domain))
2488                         ret = intel_pasid_setup_pass_through(iommu, domain,
2489                                         dev, PASID_RID2PASID);
2490                 else
2491                         ret = intel_pasid_setup_second_level(iommu, domain,
2492                                         dev, PASID_RID2PASID);
2493                 spin_unlock(&iommu->lock);
2494                 if (ret) {
2495                         dev_err(dev, "Setup RID2PASID failed\n");
2496                         dmar_remove_one_dev_info(dev);
2497                         return NULL;
2498                 }
2499         }
2500
2501         if (dev && domain_context_mapping(domain, dev)) {
2502                 dev_err(dev, "Domain context map failed\n");
2503                 dmar_remove_one_dev_info(dev);
2504                 return NULL;
2505         }
2506
2507         return domain;
2508 }
2509
2510 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2511 {
2512         *(u16 *)opaque = alias;
2513         return 0;
2514 }
2515
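/*
 * Initialize a freshly allocated @domain for the given @guest_width:
 * set up its IOVA allocator with the reserved ranges, derive the
 * adjusted address width, and allocate the top-level page table.
 */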
2516 static int domain_init(struct dmar_domain *domain, int guest_width)
2517 {
2518         int adjust_width;
2519
2520         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
2521         domain_reserve_special_ranges(domain);
2522
2523         /* calculate AGAW */
2524         domain->gaw = guest_width;
2525         adjust_width = guestwidth_to_adjustwidth(guest_width);
2526         domain->agaw = width_to_agaw(adjust_width);
2527
2528         domain->iommu_coherency = 0;
2529         domain->iommu_snooping = 0;
2530         domain->iommu_superpage = 0;
2531         domain->max_addr = 0;
2532
2533         /* always allocate the top pgd */
2534         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
2535         if (!domain->pgd)
2536                 return -ENOMEM;
2537         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2538         return 0;
2539 }
2540
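/*
 * Return the domain already used by @dev's DMA alias if there is one,
 * otherwise allocate and initialize a new domain with guest address
 * width @gaw.
 */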
2541 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2542 {
2543         struct device_domain_info *info;
2544         struct dmar_domain *domain = NULL;
2545         struct intel_iommu *iommu;
2546         u16 dma_alias;
2547         unsigned long flags;
2548         u8 bus, devfn;
2549
2550         iommu = device_to_iommu(dev, &bus, &devfn);
2551         if (!iommu)
2552                 return NULL;
2553
2554         if (dev_is_pci(dev)) {
2555                 struct pci_dev *pdev = to_pci_dev(dev);
2556
2557                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2558
2559                 spin_lock_irqsave(&device_domain_lock, flags);
2560                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2561                                                       PCI_BUS_NUM(dma_alias),
2562                                                       dma_alias & 0xff);
2563                 if (info) {
2564                         iommu = info->iommu;
2565                         domain = info->domain;
2566                 }
2567                 spin_unlock_irqrestore(&device_domain_lock, flags);
2568
2569                 /* DMA alias already has a domain, use it */
2570                 if (info)
2571                         goto out;
2572         }
2573
2574         /* Allocate and initialize new domain for the device */
2575         domain = alloc_domain(0);
2576         if (!domain)
2577                 return NULL;
2578
2579         if (domain_init(domain, gaw)) {
2580                 domain_exit(domain);
2581                 return NULL;
2582         }
2583
2584         if (init_iova_flush_queue(&domain->iovad,
2585                                   iommu_flush_iova,
2586                                   iova_entry_free)) {
2587                 pr_warn("iova flush queue initialization failed\n");
2588                 intel_iommu_strict = 1;
2589         }
2590
2591 out:
2592         return domain;
2593 }
2594
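/*
 * Attach @dev (and, for PCI devices, its DMA alias) to @domain. Returns
 * the domain actually in use, which may differ from @domain if the
 * device was already attached elsewhere, or NULL on failure.
 */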
2595 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2596                                               struct dmar_domain *domain)
2597 {
2598         struct intel_iommu *iommu;
2599         struct dmar_domain *tmp;
2600         u16 req_id, dma_alias;
2601         u8 bus, devfn;
2602
2603         iommu = device_to_iommu(dev, &bus, &devfn);
2604         if (!iommu)
2605                 return NULL;
2606
2607         req_id = ((u16)bus << 8) | devfn;
2608
2609         if (dev_is_pci(dev)) {
2610                 struct pci_dev *pdev = to_pci_dev(dev);
2611
2612                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2613
2614                 /* register PCI DMA alias device */
2615                 if (req_id != dma_alias) {
2616                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2617                                         dma_alias & 0xff, NULL, domain);
2618
2619                         if (!tmp || tmp != domain)
2620                                 return tmp;
2621                 }
2622         }
2623
2624         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2625         if (!tmp || tmp != domain)
2626                 return tmp;
2627
2628         return domain;
2629 }
2630
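/*
 * Reserve the IOVA range covering [@start, @end] in @domain and install
 * a 1:1 mapping for it.
 */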
2631 static int iommu_domain_identity_map(struct dmar_domain *domain,
2632                                      unsigned long long start,
2633                                      unsigned long long end)
2634 {
2635         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2636         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2637
2638         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2639                           dma_to_mm_pfn(last_vpfn))) {
2640                 pr_err("Reserving iova failed\n");
2641                 return -ENOMEM;
2642         }
2643
2644         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2645         /*
2646          * RMRR range might have overlap with physical memory range,
2647          * clear it first
2648          */
2649         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2650
2651         return __domain_mapping(domain, first_vpfn, NULL,
2652                                 first_vpfn, last_vpfn - first_vpfn + 1,
2653                                 DMA_PTE_READ|DMA_PTE_WRITE);
2654 }
2655
2656 static int domain_prepare_identity_map(struct device *dev,
2657                                        struct dmar_domain *domain,
2658                                        unsigned long long start,
2659                                        unsigned long long end)
2660 {
2661         /* For _hardware_ passthrough, don't bother. But for software
2662            passthrough, we do it anyway -- it may indicate a memory
2663            range which is reserved in E820 and so didn't get set up
2664            in si_domain to start with. */
2665         if (domain == si_domain && hw_pass_through) {
2666                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2667                          start, end);
2668                 return 0;
2669         }
2670
2671         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2672
2673         if (end < start) {
2674                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2675                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2676                         dmi_get_system_info(DMI_BIOS_VENDOR),
2677                         dmi_get_system_info(DMI_BIOS_VERSION),
2678                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2679                 return -EIO;
2680         }
2681
2682         if (end >> agaw_to_width(domain->agaw)) {
2683                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2684                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2685                      agaw_to_width(domain->agaw),
2686                      dmi_get_system_info(DMI_BIOS_VENDOR),
2687                      dmi_get_system_info(DMI_BIOS_VERSION),
2688                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2689                 return -EIO;
2690         }
2691
2692         return iommu_domain_identity_map(domain, start, end);
2693 }
2694
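/*
 * Create the static identity (si) domain. With software passthrough
 * (@hw == 0) all usable physical memory is mapped 1:1, along with the
 * RMRR ranges of devices whose RMRRs are relaxable.
 */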
2695 static int __init si_domain_init(int hw)
2696 {
2697         struct dmar_rmrr_unit *rmrr;
2698         struct device *dev;
2699         int i, nid, ret;
2700
2701         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2702         if (!si_domain)
2703                 return -EFAULT;
2704
2705         if (domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2706                 domain_exit(si_domain);
2707                 return -EFAULT;
2708         }
2709
2710         if (hw)
2711                 return 0;
2712
2713         for_each_online_node(nid) {
2714                 unsigned long start_pfn, end_pfn;
2715                 int i;
2716
2717                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2718                         ret = iommu_domain_identity_map(si_domain,
2719                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2720                         if (ret)
2721                                 return ret;
2722                 }
2723         }
2724
2725         /*
2726          * Normally we use DMA domains for devices which have RMRRs. But we
2727          * relax this requirement for graphics and USB devices. Identity map
2728          * the RMRRs for graphics and USB devices so that they can use the
2729          * si_domain.
2730          */
2731         for_each_rmrr_units(rmrr) {
2732                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2733                                           i, dev) {
2734                         unsigned long long start = rmrr->base_address;
2735                         unsigned long long end = rmrr->end_address;
2736
2737                         if (device_is_rmrr_locked(dev))
2738                                 continue;
2739
2740                         if (WARN_ON(end < start ||
2741                                     end >> agaw_to_width(si_domain->agaw)))
2742                                 continue;
2743
2744                         ret = iommu_domain_identity_map(si_domain, start, end);
2745                         if (ret)
2746                                 return ret;
2747                 }
2748         }
2749
2750         return 0;
2751 }
2752
2753 static int identity_mapping(struct device *dev)
2754 {
2755         struct device_domain_info *info;
2756
2757         info = dev->archdata.iommu;
2758         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2759                 return (info->domain == si_domain);
2760
2761         return 0;
2762 }
2763
2764 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2765 {
2766         struct dmar_domain *ndomain;
2767         struct intel_iommu *iommu;
2768         u8 bus, devfn;
2769
2770         iommu = device_to_iommu(dev, &bus, &devfn);
2771         if (!iommu)
2772                 return -ENODEV;
2773
2774         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2775         if (ndomain != domain)
2776                 return -EBUSY;
2777
2778         return 0;
2779 }
2780
2781 static bool device_has_rmrr(struct device *dev)
2782 {
2783         struct dmar_rmrr_unit *rmrr;
2784         struct device *tmp;
2785         int i;
2786
2787         rcu_read_lock();
2788         for_each_rmrr_units(rmrr) {
2789                 /*
2790                  * Return TRUE if this RMRR contains the device that
2791                  * is passed in.
2792                  */
2793                 for_each_active_dev_scope(rmrr->devices,
2794                                           rmrr->devices_cnt, i, tmp)
2795                         if (tmp == dev ||
2796                             is_downstream_to_pci_bridge(dev, tmp)) {
2797                                 rcu_read_unlock();
2798                                 return true;
2799                         }
2800         }
2801         rcu_read_unlock();
2802         return false;
2803 }
2804
2805 /**
2806  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2807  * is relaxable (i.e. is allowed to go unenforced under some conditions)
2808  * @dev: device handle
2809  *
2810  * We assume that PCI USB devices with RMRRs have them largely
2811  * for historical reasons and that the RMRR space is not actively used post
2812  * boot.  This exclusion may change if vendors begin to abuse it.
2813  *
2814  * The same exception is made for graphics devices, with the requirement that
2815  * any use of the RMRR regions will be torn down before assigning the device
2816  * to a guest.
2817  *
2818  * Return: true if the RMRR is relaxable, false otherwise
2819  */
2820 static bool device_rmrr_is_relaxable(struct device *dev)
2821 {
2822         struct pci_dev *pdev;
2823
2824         if (!dev_is_pci(dev))
2825                 return false;
2826
2827         pdev = to_pci_dev(dev);
2828         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2829                 return true;
2830         else
2831                 return false;
2832 }
2833
2834 /*
2835  * There are a couple of cases where we need to restrict the functionality of
2836  * devices associated with RMRRs.  The first is when evaluating a device for
2837  * identity mapping because problems exist when devices are moved in and out
2838  * of domains and their respective RMRR information is lost.  This means that
2839  * a device with associated RMRRs will never be in a "passthrough" domain.
2840  * The second is use of the device through the IOMMU API.  This interface
2841  * expects to have full control of the IOVA space for the device.  We cannot
2842  * satisfy both the requirement that RMRR access is maintained and have an
2843  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2844  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2845  * We therefore prevent devices associated with an RMRR from participating in
2846  * the IOMMU API, which eliminates them from device assignment.
2847  *
2848  * In both cases, devices which have relaxable RMRRs are not concerned by this
2849  * restriction. See device_rmrr_is_relaxable comment.
2850  */
2851 static bool device_is_rmrr_locked(struct device *dev)
2852 {
2853         if (!device_has_rmrr(dev))
2854                 return false;
2855
2856         if (device_rmrr_is_relaxable(dev))
2857                 return false;
2858
2859         return true;
2860 }
2861
2862 /*
2863  * Return the required default domain type for a specific device.
2864  *
2865  * @dev: the device being queried
2867  *
2868  * Returns:
2869  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2870  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2871  *  - 0: both identity and dynamic domains work for this device
2872  */
2873 static int device_def_domain_type(struct device *dev)
2874 {
2875         if (dev_is_pci(dev)) {
2876                 struct pci_dev *pdev = to_pci_dev(dev);
2877
2878                 if (device_is_rmrr_locked(dev))
2879                         return IOMMU_DOMAIN_DMA;
2880
2881                 /*
2882                  * Prevent any device marked as untrusted from getting
2883                  * placed into the static identity mapping domain.
2884                  */
2885                 if (pdev->untrusted)
2886                         return IOMMU_DOMAIN_DMA;
2887
2888                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889                         return IOMMU_DOMAIN_IDENTITY;
2890
2891                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892                         return IOMMU_DOMAIN_IDENTITY;
2893
2894                 /*
2895                  * We want to start off with all devices in the 1:1 domain, and
2896                  * take them out later if we find they can't access all of memory.
2897                  *
2898                  * However, we can't do this for PCI devices behind bridges,
2899                  * because all PCI devices behind the same bridge will end up
2900                  * with the same source-id on their transactions.
2901                  *
2902                  * Practically speaking, we can't change things around for these
2903                  * devices at run-time, because we can't be sure there'll be no
2904                  * DMA transactions in flight for any of their siblings.
2905                  *
2906                  * So PCI devices (unless they're on the root bus) as well as
2907                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2908                  * the 1:1 domain, just in _case_ one of their siblings turns out
2909                  * not to be able to map all of memory.
2910                  */
2911                 if (!pci_is_pcie(pdev)) {
2912                         if (!pci_is_root_bus(pdev->bus))
2913                                 return IOMMU_DOMAIN_DMA;
2914                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2915                                 return IOMMU_DOMAIN_DMA;
2916                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2917                         return IOMMU_DOMAIN_DMA;
2918         } else {
2919                 if (device_has_rmrr(dev))
2920                         return IOMMU_DOMAIN_DMA;
2921         }
2922
2923         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2924                         IOMMU_DOMAIN_IDENTITY : 0;
2925 }
2926
2927 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2928 {
2929         /*
2930          * Start from a sane IOMMU hardware state.
2931          * If queued invalidation was already initialized by us
2932          * (for example, while enabling interrupt remapping), then
2933          * things are already rolling from a sane state.
2934          */
2935         if (!iommu->qi) {
2936                 /*
2937                  * Clear any previous faults.
2938                  */
2939                 dmar_fault(-1, iommu);
2940                 /*
2941                  * Disable queued invalidation if supported and already enabled
2942                  * before OS handover.
2943                  */
2944                 dmar_disable_qi(iommu);
2945         }
2946
2947         if (dmar_enable_qi(iommu)) {
2948                 /*
2949                  * Queued invalidation could not be enabled; use register-based invalidation
2950                  */
2951                 iommu->flush.flush_context = __iommu_flush_context;
2952                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2953                 pr_info("%s: Using Register based invalidation\n",
2954                         iommu->name);
2955         } else {
2956                 iommu->flush.flush_context = qi_flush_context;
2957                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2958                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2959         }
2960 }
2961
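/*
 * Copy one bus worth of context entries from the previous kernel's
 * table at @old_re into newly allocated pages in @tbl, marking each
 * entry as copied and reserving the domain ids it references. Used
 * when inheriting translation structures from an old kernel, as in
 * the kdump case.
 */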
2962 static int copy_context_table(struct intel_iommu *iommu,
2963                               struct root_entry *old_re,
2964                               struct context_entry **tbl,
2965                               int bus, bool ext)
2966 {
2967         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2968         struct context_entry *new_ce = NULL, ce;
2969         struct context_entry *old_ce = NULL;
2970         struct root_entry re;
2971         phys_addr_t old_ce_phys;
2972
2973         tbl_idx = ext ? bus * 2 : bus;
2974         memcpy(&re, old_re, sizeof(re));
2975
2976         for (devfn = 0; devfn < 256; devfn++) {
2977                 /* First calculate the correct index */
2978                 idx = (ext ? devfn * 2 : devfn) % 256;
2979
2980                 if (idx == 0) {
2981                         /* First save what we may have and clean up */
2982                         if (new_ce) {
2983                                 tbl[tbl_idx] = new_ce;
2984                                 __iommu_flush_cache(iommu, new_ce,
2985                                                     VTD_PAGE_SIZE);
2986                                 pos = 1;
2987                         }
2988
2989                         if (old_ce)
2990                                 memunmap(old_ce);
2991
2992                         ret = 0;
2993                         if (devfn < 0x80)
2994                                 old_ce_phys = root_entry_lctp(&re);
2995                         else
2996                                 old_ce_phys = root_entry_uctp(&re);
2997
2998                         if (!old_ce_phys) {
2999                                 if (ext && devfn == 0) {
3000                                         /* No LCTP, try UCTP */
3001                                         devfn = 0x7f;
3002                                         continue;
3003                                 } else {
3004                                         goto out;
3005                                 }
3006                         }
3007
3008                         ret = -ENOMEM;
3009                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3010                                         MEMREMAP_WB);
3011                         if (!old_ce)
3012                                 goto out;
3013
3014                         new_ce = alloc_pgtable_page(iommu->node);
3015                         if (!new_ce)
3016                                 goto out_unmap;
3017
3018                         ret = 0;
3019                 }
3020
3021                 /* Now copy the context entry */
3022                 memcpy(&ce, old_ce + idx, sizeof(ce));
3023
3024                 if (!__context_present(&ce))
3025                         continue;
3026
3027                 did = context_domain_id(&ce);
3028                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3029                         set_bit(did, iommu->domain_ids);
3030
3031                 /*
3032                  * We need a marker for copied context entries. This
3033                  * marker needs to work for the old format as well as
3034                  * for extended context entries.
3035                  *
3036                  * Bit 67 of the context entry is used. In the old
3037                  * format this bit is available to software, in the
3038                  * extended format it is the PGE bit, but PGE is ignored
3039                  * by HW if PASIDs are disabled (and thus still
3040                  * available).
3041                  *
3042                  * So disable PASIDs first and then mark the entry
3043                  * copied. This means that we don't copy PASID
3044                  * translations from the old kernel, but this is fine as
3045                  * faults there are not fatal.
3046                  */
3047                 context_clear_pasid_enable(&ce);
3048                 context_set_copied(&ce);
3049
3050                 new_ce[idx] = ce;
3051         }
3052
3053         tbl[tbl_idx + pos] = new_ce;
3054
3055         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3056
3057 out_unmap:
3058         memunmap(old_ce);
3059
3060 out:
3061         return ret;
3062 }
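/*
 * Worked example for the indexing in copy_context_table() above
 * (illustrative only, not driver code; the helper name is made up).
 * In extended mode a context entry is twice the size of a legacy one,
 * so each bus needs two 4KiB tables: devfns 0x00-0x7f sit behind the
 * lower context-table pointer (LCTP) and devfns 0x80-0xff behind the
 * upper one (UCTP).
 *
 *	static int ext_ctx_slot(int bus, int devfn)
 *	{
 *		int tbl = bus * 2;		// tbl[] slot for the LCTP table
 *		int idx = (devfn * 2) % 256;	// entry inside that table
 *
 *		if (devfn >= 0x80)
 *			tbl++;			// UCTP table for this bus
 *		return tbl * 256 + idx;		// flattened position
 *	}
 *
 * For bus 0x03, devfn 0x81 this gives tbl = 7 and idx = 2, which is why
 * the loop above stores the second table at tbl[tbl_idx + pos] once
 * devfn crosses 0x7f.
 */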
3063
3064 static int copy_translation_tables(struct intel_iommu *iommu)
3065 {
3066         struct context_entry **ctxt_tbls;
3067         struct root_entry *old_rt;
3068         phys_addr_t old_rt_phys;
3069         int ctxt_table_entries;
3070         unsigned long flags;
3071         u64 rtaddr_reg;
3072         int bus, ret;
3073         bool new_ext, ext;
3074
3075         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3076         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3077         new_ext    = !!ecap_ecs(iommu->ecap);
3078
3079         /*
3080          * The RTT bit can only be changed when translation is disabled,
3081          * but disabling translation would open a window for data
3082          * corruption. So bail out and don't copy anything if we would
3083          * have to change the bit.
3084          */
3085         if (new_ext != ext)
3086                 return -EINVAL;
3087
3088         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3089         if (!old_rt_phys)
3090                 return -EINVAL;
3091
3092         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3093         if (!old_rt)
3094                 return -ENOMEM;
3095
3096         /* This is too big for the stack - allocate it from slab */
3097         ctxt_table_entries = ext ? 512 : 256;
3098         ret = -ENOMEM;
3099         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3100         if (!ctxt_tbls)
3101                 goto out_unmap;
3102
3103         for (bus = 0; bus < 256; bus++) {
3104                 ret = copy_context_table(iommu, &old_rt[bus],
3105                                          ctxt_tbls, bus, ext);
3106                 if (ret) {
3107                         pr_err("%s: Failed to copy context table for bus %d\n",
3108                                 iommu->name, bus);
3109                         continue;
3110                 }
3111         }
3112
3113         spin_lock_irqsave(&iommu->lock, flags);
3114
3115         /* Context tables are copied, now write them to the root_entry table */
3116         for (bus = 0; bus < 256; bus++) {
3117                 int idx = ext ? bus * 2 : bus;
3118                 u64 val;
3119
3120                 if (ctxt_tbls[idx]) {
3121                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3122                         iommu->root_entry[bus].lo = val;
3123                 }
3124
3125                 if (!ext || !ctxt_tbls[idx + 1])
3126                         continue;
3127
3128                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3129                 iommu->root_entry[bus].hi = val;
3130         }
3131
3132         spin_unlock_irqrestore(&iommu->lock, flags);
3133
3134         kfree(ctxt_tbls);
3135
3136         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3137
3138         ret = 0;
3139
3140 out_unmap:
3141         memunmap(old_rt);
3142
3143         return ret;
3144 }
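/*
 * Sketch of what the root-table writeback in copy_translation_tables()
 * produces (illustrative; the table names are placeholders, and it assumes
 * bit 0 of each root-entry half is the present bit, as the "| 1" above
 * suggests):
 *
 *	struct root_entry *re = &iommu->root_entry[bus];
 *
 *	re->lo = virt_to_phys(lower_ctx_tbl) | 1;		// legacy / LCTP half
 *	if (ext)
 *		re->hi = virt_to_phys(upper_ctx_tbl) | 1;	// UCTP half
 *
 * A bus with no copied context table keeps a not-present entry and faults
 * instead of reusing stale translations from the old kernel.
 */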
3145
3146 static int __init init_dmars(void)
3147 {
3148         struct dmar_drhd_unit *drhd;
3149         struct intel_iommu *iommu;
3150         int ret;
3151
3152         /*
3153          * for each drhd
3154          *    allocate root
3155          *    initialize and program root entry to not present
3156          * endfor
3157          */
3158         for_each_drhd_unit(drhd) {
3159                 /*
3160                  * Lock not needed: this is only incremented in the single-
3161                  * threaded kernel __init code path; all other accesses are
3162                  * read only.
3163                  */
3164                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3165                         g_num_of_iommus++;
3166                         continue;
3167                 }
3168                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3169         }
3170
3171         /* Preallocate enough resources for IOMMU hot-addition */
3172         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3173                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3174
3175         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3176                         GFP_KERNEL);
3177         if (!g_iommus) {
3178                 pr_err("Allocating global iommu array failed\n");
3179                 ret = -ENOMEM;
3180                 goto error;
3181         }
3182
3183         for_each_iommu(iommu, drhd) {
3184                 if (drhd->ignored) {
3185                         iommu_disable_translation(iommu);
3186                         continue;
3187                 }
3188
3189                 /*
3190                  * Find the smallest maximum PASID size supported by any
3191                  * IOMMU in the system; the system-wide PASID table must be
3192                  * no bigger than the smallest any IOMMU supports.
3193                  */
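                /*
                 * Worked example (illustrative): if ecap_pss() reports 19,
                 * i.e. 20-bit PASIDs, then temp below becomes
                 * 2 << 19 = 1048576 possible PASID values, and
                 * intel_pasid_max_id keeps the minimum of these across all
                 * IOMMUs.
                 */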
3194                 if (pasid_supported(iommu)) {
3195                         u32 temp = 2 << ecap_pss(iommu->ecap);
3196
3197                         intel_pasid_max_id = min_t(u32, temp,
3198                                                    intel_pasid_max_id);
3199                 }
3200
3201                 g_iommus[iommu->seq_id] = iommu;
3202
3203                 intel_iommu_init_qi(iommu);
3204
3205                 ret = iommu_init_domains(iommu);
3206                 if (ret)
3207                         goto free_iommu;
3208
3209                 init_translation_status(iommu);
3210
3211                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3212                         iommu_disable_translation(iommu);
3213                         clear_translation_pre_enabled(iommu);
3214                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3215                                 iommu->name);
3216                 }
3217
3218                 /*
3219                  * TBD:
3220                  * we could share the same root & context tables
3221                  * among all IOMMUs. Need to split it later.
3222                  */
3223                 ret = iommu_alloc_root_entry(iommu);
3224                 if (ret)
3225                         goto free_iommu;
3226
3227                 if (translation_pre_enabled(iommu)) {
3228                         pr_info("Translation already enabled - trying to copy translation structures\n");
3229
3230                         ret = copy_translation_tables(iommu);
3231                         if (ret) {
3232                                 /*
3233                                  * We found the IOMMU with translation
3234                                  * enabled - but failed to copy over the
3235                                  * old root-entry table. Try to proceed
3236                                  * by disabling translation now and
3237                                  * allocating a clean root-entry table.
3238                                  * This might cause DMAR faults, but
3239                                  * probably the dump will still succeed.
3240                                  */
3241                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3242                                        iommu->name);
3243                                 iommu_disable_translation(iommu);
3244                                 clear_translation_pre_enabled(iommu);
3245                         } else {
3246                                 pr_info("Copied translation tables from previous kernel for %s\n",
3247                                         iommu->name);
3248                         }
3249                 }
3250
3251                 if (!ecap_pass_through(iommu->ecap))
3252                         hw_pass_through = 0;
3253 #ifdef CONFIG_INTEL_IOMMU_SVM
3254                 if (pasid_supported(iommu))
3255                         intel_svm_init(iommu);
3256 #endif
3257         }
3258
3259         /*
3260          * Now that qi is enabled on all iommus, set the root entry and flush
3261          * caches. This is required on some Intel X58 chipsets, otherwise the
3262          * flush_context function will loop forever and the boot hangs.
3263          */
3264         for_each_active_iommu(iommu, drhd) {
3265                 iommu_flush_write_buffer(iommu);
3266                 iommu_set_root_entry(iommu);
3267                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3268                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3269         }
3270
3271         if (iommu_pass_through)
3272                 iommu_identity_mapping |= IDENTMAP_ALL;
3273
3274 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3275         dmar_map_gfx = 0;
3276 #endif
3277
3278         if (!dmar_map_gfx)
3279                 iommu_identity_mapping |= IDENTMAP_GFX;
3280
3281         check_tylersburg_isoch();
3282
3283         ret = si_domain_init(hw_pass_through);
3284         if (ret)
3285                 goto free_iommu;
3286
3287         /*
3288          * for each drhd
3289          *   enable fault log
3290          *   global invalidate context cache
3291          *   global invalidate iotlb
3292          *   enable translation
3293          */
3294         for_each_iommu(iommu, drhd) {
3295                 if (drhd->ignored) {
3296                         /*
3297                          * we always have to disable PMRs or DMA may fail on
3298                          * this device
3299                          */
3300                         if (force_on)
3301                                 iommu_disable_protect_mem_regions(iommu);
3302                         continue;
3303                 }
3304
3305                 iommu_flush_write_buffer(iommu);
3306
3307 #ifdef CONFIG_INTEL_IOMMU_SVM
3308                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3309                         /*
3310                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3311                          * could cause a lock race condition, so drop the lock here.
3312                          */
3313                         up_write(&dmar_global_lock);
3314                         ret = intel_svm_enable_prq(iommu);
3315                         down_write(&dmar_global_lock);
3316                         if (ret)
3317                                 goto free_iommu;
3318                 }
3319 #endif
3320                 ret = dmar_set_interrupt(iommu);
3321                 if (ret)
3322                         goto free_iommu;
3323         }
3324
3325         return 0;
3326
3327 free_iommu:
3328         for_each_active_iommu(iommu, drhd) {
3329                 disable_dmar_iommu(iommu);
3330                 free_dmar_iommu(iommu);
3331         }
3332
3333         kfree(g_iommus);
3334
3335 error:
3336         return ret;
3337 }
3338
3339 /* This takes a number of _MM_ pages, not VTD pages */
3340 static unsigned long intel_alloc_iova(struct device *dev,
3341                                      struct dmar_domain *domain,
3342                                      unsigned long nrpages, uint64_t dma_mask)
3343 {
3344         unsigned long iova_pfn;
3345
3346         /* Restrict dma_mask to the width that the iommu can handle */
3347         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3348         /* Ensure we reserve the whole size-aligned region */
3349         nrpages = __roundup_pow_of_two(nrpages);
3350
3351         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3352                 /*
3353                  * First try to allocate an IO virtual address below
3354                  * DMA_BIT_MASK(32); if that fails, fall back to allocating
3355                  * from the higher range.
3356                  */
3357                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3358                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3359                 if (iova_pfn)
3360                         return iova_pfn;
3361         }
3362         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3363                                    IOVA_PFN(dma_mask), true);
3364         if (unlikely(!iova_pfn)) {
3365                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3366                 return 0;
3367         }
3368
3369         return iova_pfn;
3370 }
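/*
 * Worked example for intel_alloc_iova() (illustrative only): a request for
 * 3 MM pages from a 64-bit capable device in a 48-bit wide domain ends up
 * with
 *
 *	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(48), DMA_BIT_MASK(64));
 *	nrpages  = __roundup_pow_of_two(3);	// 4, so the whole size-aligned
 *						// region gets reserved
 *
 * and, unless dmar_forcedac is set, the first alloc_iova_fast() call is
 * capped at IOVA_PFN(DMA_BIT_MASK(32)) so the sub-4GiB space is tried
 * first; only when that range is exhausted does the second call use the
 * full clamped mask.
 */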
3371
3372 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3373 {
3374         struct dmar_domain *domain, *tmp;
3375         struct dmar_rmrr_unit *rmrr;
3376         struct device *i_dev;
3377         int i, ret;
3378
3379         /* Device shouldn't be attached to any domain yet. */
3380         domain = find_domain(dev);
3381         if (domain)
3382                 return NULL;
3383
3384         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3385         if (!domain)
3386                 goto out;
3387
3388         /* We have a new domain - set up possible RMRRs for the device */
3389         rcu_read_lock();
3390         for_each_rmrr_units(rmrr) {
3391                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3392                                           i, i_dev) {
3393                         if (i_dev != dev)
3394                                 continue;
3395
3396                         ret = domain_prepare_identity_map(dev, domain,
3397                                                           rmrr->base_address,
3398                                                           rmrr->end_address);
3399                         if (ret)
3400                                 dev_err(dev, "Mapping reserved region failed\n");
3401                 }
3402         }
3403         rcu_read_unlock();
3404
3405         tmp = set_domain_for_dev(dev, domain);
3406         if (!tmp || domain != tmp) {
3407                 domain_exit(domain);
3408                 domain = tmp;
3409         }
3410
3411 out:
3412         if (!domain)
3413                 dev_err(dev, "Allocating domain failed\n");
3414         else
3415                 domain->domain.type = IOMMU_DOMAIN_DMA;
3416
3417         return domain;
3418 }
3419
3420 /* Check if the dev needs to go through the non-identity map and unmap process. */
3421 static bool iommu_need_mapping(struct device *dev)
3422 {
3423         int ret;
3424
3425         if (iommu_dummy(dev))
3426                 return false;
3427
3428         ret = identity_mapping(dev);
3429         if (ret) {
3430                 u64 dma_mask = *dev->dma_mask;
3431
3432                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3433                         dma_mask = dev->coherent_dma_mask;
3434
3435                 if (dma_mask >= dma_get_required_mask(dev))
3436                         return false;
3437
3438                 /*
3439                  * 32 bit DMA devices are removed from si_domain and fall
3440                  * back to non-identity mapping.
3441                  */
3442                 dmar_remove_one_dev_info(dev);
3443                 ret = iommu_request_dma_domain_for_dev(dev);
3444                 if (ret) {
3445                         struct iommu_domain *domain;
3446                         struct dmar_domain *dmar_domain;
3447
3448                         domain = iommu_get_domain_for_dev(dev);
3449                         if (domain) {
3450                                 dmar_domain = to_dmar_domain(domain);
3451                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3452                         }
3453                         get_private_domain_for_dev(dev);
3454                 }
3455
3456                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3457         }
3458
3459         return true;
3460 }
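/*
 * Example of the fallback above (illustrative): a device that only sets a
 * 32-bit DMA mask on a machine with memory above 4GiB sees
 * dma_get_required_mask() report more than 32 bits, so it cannot stay
 * identity mapped.  It is detached from si_domain and given a DMA domain
 * (or a private one if that request fails), after which its map/unmap
 * calls go through the IOVA allocator instead of the direct-mapping path.
 */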
3461
3462 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3463                                      size_t size, int dir, u64 dma_mask)
3464 {
3465         struct dmar_domain *domain;
3466         phys_addr_t start_paddr;
3467         unsigned long iova_pfn;
3468         int prot = 0;
3469         int ret;
3470         struct intel_iommu *iommu;
3471         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3472
3473         BUG_ON(dir == DMA_NONE);
3474
3475         domain = find_domain(dev);
3476         if (!domain)
3477                 return DMA_MAPPING_ERROR;
3478
3479         iommu = domain_get_iommu(domain);
3480         size = aligned_nrpages(paddr, size);
3481
3482         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3483         if (!iova_pfn)
3484                 goto error;
3485
3486         /*
3487          * Check if DMAR supports zero-length reads on write-only
3488          * mappings.
3489          */
3490         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3491                         !cap_zlr(iommu->cap))
3492                 prot |= DMA_PTE_READ;
3493         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3494                 prot |= DMA_PTE_WRITE;
3495         /*
3496          * The range paddr .. paddr + size might cover a partial page, so map
3497          * the whole page.  Note: if two parts of one page are separately
3498          * mapped, we might have two guest addresses mapping to the same host
3499          * paddr, but this is not a big problem.
3500          */
3501         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3502                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3503         if (ret)
3504                 goto error;
3505
3506         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3507         start_paddr += paddr & ~PAGE_MASK;
3508         return start_paddr;
3509
3510 error:
3511         if (iova_pfn)
3512                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3513         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3514                 size, (unsigned long long)paddr, dir);
3515         return DMA_MAPPING_ERROR;
3516 }
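/*
 * Sketch of the handle returned by __intel_map_single() (illustrative,
 * made-up values, 4KiB pages assumed): mapping paddr 0x12345678 of size
 * 0x100 allocates a page-aligned IOVA, say pfn 0xabcd, and returns
 *
 *	dma_addr_t handle = ((phys_addr_t)0xabcd << PAGE_SHIFT)
 *			    | (0x12345678 & ~PAGE_MASK);	// 0xabcd678
 *
 * i.e. the page offset of the original buffer is preserved inside the
 * IOVA page, while DMA_PTE_READ/WRITE in prot reflect the DMA direction
 * (plus READ when the hardware cannot do zero-length reads).
 */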
3517
3518 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3519                                  unsigned long offset, size_t size,
3520                                  enum dma_data_direction dir,
3521                                  unsigned long attrs)
3522 {
3523         if (iommu_need_mapping(dev))
3524                 return __intel_map_single(dev, page_to_phys(page) + offset,
3525                                 size, dir, *dev->dma_mask);
3526         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3527 }
3528
3529 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3530                                      size_t size, enum dma_data_direction dir,
3531                                      unsigned long attrs)
3532 {
3533         if (iommu_need_mapping(dev))
3534                 return __intel_map_single(dev, phys_addr, size, dir,
3535                                 *dev->dma_mask);
3536         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3537 }
3538
3539 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3540 {
3541         struct dmar_domain *domain;
3542         unsigned long start_pfn, last_pfn;
3543         unsigned long nrpages;
3544         unsigned long iova_pfn;
3545         struct intel_iommu *iommu;
3546         struct page *freelist;
3547         struct pci_dev *pdev = NULL;
3548
3549         domain = find_domain(dev);
3550         BUG_ON(!domain);
3551
3552         iommu = domain_get_iommu(domain);
3553
3554         iova_pfn = IOVA_PFN(dev_addr);
3555
3556         nrpages = aligned_nrpages(dev_addr, size);
3557         start_pfn = mm_to_dma_pfn(iova_pfn);
3558         last_pfn = start_pfn + nrpages - 1;
3559
3560         if (dev_is_pci(dev))
3561                 pdev = to_pci_dev(dev);
3562
3563         dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3564
3565         freelist = domain_unmap(domain, start_pfn, last_pfn);
3566
3567         if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3568                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3569                                       nrpages, !freelist, 0);
3570                 /* free iova */
3571                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3572                 dma_free_pagelist(freelist);
3573         } else {
3574                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3575                            (unsigned long)freelist);
3576                 /*
3577                  * Queue up the release of the unmap to save the roughly 1/6th
3578                  * of the CPU time otherwise used up by the IOTLB flush operation.
3579                  */
3580         }
3581 }
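/*
 * Worked example for intel_unmap() (illustrative, 4KiB pages assumed, and
 * assuming aligned_nrpages() counts every page touched by offset + size):
 * unmapping dev_addr 0x12345678 with size 0x2000 gives
 *
 *	nrpages   = 3;		// offset 0x678 + 0x2000 spans three pages
 *	start_pfn = 0x12345;
 *	last_pfn  = 0x12347;
 *
 * In strict mode, or for untrusted devices, the IOTLB is flushed and the
 * IOVA freed before returning; otherwise the range is queued and the
 * flush is batched later.
 */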
3582
3583 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3584                              size_t size, enum dma_data_direction dir,
3585                              unsigned long attrs)
3586 {
3587         if (iommu_need_mapping(dev))
3588                 intel_unmap(dev, dev_addr, size);
3589         else
3590                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3591 }
3592
3593 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3594                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3595 {
3596         if (iommu_need_mapping(dev))
3597                 intel_unmap(dev, dev_addr, size);
3598 }
3599
3600 static void *intel_alloc_coherent(struct device *dev, size_t size,
3601                                   dma_addr_t *dma_handle, gfp_t flags,
3602                                   unsigned long attrs)
3603 {
3604         struct page *page = NULL;
3605         int order;
3606
3607         if (!iommu_need_mapping(dev))
3608                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3609
3610         size = PAGE_ALIGN(size);
3611         order = get_order(size);
3612
3613         if (gfpflags_allow_blocking(flags)) {
3614                 unsigned int count = size >> PAGE_SHIFT;
3615
3616                 page = dma_alloc_from_contiguous(dev, count, order,
3617                                                  flags & __GFP_NOWARN);
3618         }
3619
3620         if (!page)
3621                 page = alloc_pages(flags, order);
3622         if (!page)
3623                 return NULL;
3624         memset(page_address(page), 0, size);
3625
3626         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3627                                          DMA_BIDIRECTIONAL,
3628                                          dev->coherent_dma_mask);
3629         if (*dma_handle != DMA_MAPPING_ERROR)
3630                 return page_address(page);
3631         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3632                 __free_pages(page, order);
3633
3634         return NULL;
3635 }
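/*
 * Note on the allocation order above (illustrative summary): callers that
 * may block are steered to dma_alloc_from_contiguous() first so large
 * coherent buffers can come from CMA, with alloc_pages() as the fallback;
 * either way the zeroed buffer is mapped DMA_BIDIRECTIONAL against the
 * device's coherent DMA mask and handed back to the same pool if the
 * mapping fails.
 */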
3636
3637 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3638                                 dma_addr_t dma_handle, unsigned long attrs)
3639 {
3640         int order;
3641         struct page *page = virt_to_page(vaddr);
3642
3643         if (!iommu_need_mapping(dev))
3644                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3645
3646         size = PAGE_ALIGN(size);
3647         order = get_order(size);
3648
3649         intel_unmap(dev, dma_handle, size);
3650         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3651                 __free_pages(page, order);
3652 }
3653
3654 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3655                            int nelems, enum dma_data_direction dir,
3656                            unsigned long attrs)
3657 {
3658         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3659         unsigned long nrpages = 0;
3660         struct scatterlist *sg;
3661         int i;
3662
3663         if (!iommu_need_mapping(dev))
3664                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3665
3666         for_each_sg(sglist, sg, nelems, i) {
3667                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3668         }
3669
3670         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3671 }
3672
3673 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3674                         enum dma_data_direction dir, unsigned long attrs)
3675 {
3676         int i;
3677         struct dmar_domain *domain;
3678         size_t size = 0;
3679         int prot = 0;
3680         unsigned long iova_pfn;
3681         int ret;
3682         struct scatterlist *sg;
3683         unsigned long start_vpfn;
3684         struct intel_iommu *iommu;
3685
3686         BUG_ON(dir == DMA_NONE);
3687         if (!iommu_need_mapping(dev))
3688                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3689
3690         domain = find_domain(dev);
3691         if (!domain)
3692                 return 0;
3693
3694         iommu = domain_get_iommu(domain);
3695
3696         for_each_sg(sglist, sg, nelems, i)
3697                 size += aligned_nrpages(sg->offset, sg->length);
3698
3699         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3700                                 *dev->dma_mask);
3701         if (!iova_pfn) {
3702                 sglist->dma_length = 0;
3703                 return 0;
3704         }
3705
3706         /*
3707          * Check if DMAR supports zero-length reads on write-only
3708          * mappings.
3709          */
3710         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3711                         !cap_zlr(iommu->cap))
3712                 prot |= DMA_PTE_READ;
3713         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3714                 prot |= DMA_PTE_WRITE;
3715
3716         start_vpfn = mm_to_dma_pfn(iova_pfn);
3717
3718         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3719         if (unlikely(ret)) {
3720                 dma_pte_free_pagetable(domain, start_vpfn,
3721                                        start_vpfn + size - 1,
3722                                        agaw_to_level(domain->agaw) + 1);
3723                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3724                 return 0;
3725         }
3726
3727         return nelems;
3728 }
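/*
 * Sketch of how intel_map_sg() sizes its single IOVA allocation
 * (illustrative, 4KiB pages assumed, and assuming aligned_nrpages()
 * counts every page touched by offset + length): for a two-entry
 * scatterlist of (offset 0xe00, len 0x400) and (offset 0x0, len 0x2000),
 *
 *	size = aligned_nrpages(0xe00, 0x400)	// 2 pages, crosses a boundary
 *	     + aligned_nrpages(0x0, 0x2000);	// 2 pages
 *
 * so one 4-page IOVA region is allocated and domain_sg_mapping() lays the
 * entries out back to back inside it, which is why a failure tears down
 * page tables for the whole [start_vpfn, start_vpfn + size) range.
 */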
3729
3730 static const struct dma_map_ops intel_dma_ops = {
3731         .alloc = intel_alloc_coherent,
3732         .free = intel_free_coherent,
3733         .map_sg = intel_map_sg,
3734         .unmap_sg = intel_unmap_sg,
3735         .map_page = intel_map_page,
3736         .unmap_page = intel_unmap_page,
3737         .map_resource = intel_map_resource,
3738         .unmap_resource = intel_unmap_resource,
3739         .dma_supported = dma_direct_supported,
3740 };
3741
3742 static inline int iommu_domain_cache_init(void)
3743 {
3744         int ret = 0;
3745
3746         iommu_domain_cache = kmem_cache_create("iommu_domain",
3747                                          sizeof(struct dmar_domain),
3748                                          0,
3749                                          SLAB_HWCACHE_ALIGN,
3750                                          NULL);
3751
3752         if (!iommu_domain_cache) {
3753                 pr_err("Couldn't create iommu_domain cache\n");
3754                 ret = -ENOMEM;
3755         }
3756
3757         return ret;
3758 }
3759
3760 static inline int iommu_devinfo_cache_init(void)
3761 {
3762         int ret = 0;
3763
3764         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3765                                          sizeof(struct device_domain_info),
3766                                          0,
3767                                          SLAB_HWCACHE_ALIGN,
3768                                          NULL);
3769         if (!iommu_devinfo_cache) {
3770                 pr_err("Couldn't create devinfo cache\n");
3771                 ret = -ENOMEM;
3772         }
3773
3774         return ret;
3775 }
3776
3777 static int __init iommu_init_mempool(void)
3778 {
3779         int ret;
3780         ret = iova_cache_get();
3781         if (ret)
3782                 return ret;
3783
3784         ret = iommu_domain_cache_init();
3785         if (ret)
3786                 goto domain_error;
3787
3788         ret = iommu_devinfo_cache_init();
3789         if (!ret)
3790                 return ret;
3791
3792         kmem_cache_destroy(iommu_domain_cache);
3793 domain_error:
3794         iova_cache_put();
3795
3796         return -ENOMEM;
3797 }
3798
3799 static void __init iommu_exit_mempool(void)
3800 {
3801         kmem_cache_destroy(iommu_devinfo_cache);
3802         kmem_cache_destroy(iommu_domain_cache);
3803         iova_cache_put();
3804 }
3805
3806 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3807 {
3808         struct dmar_drhd_unit *drhd;
3809         u32 vtbar;
3810         int rc;
3811
3812         /* We know that this device on this chipset has its own IOMMU.
3813          * If we find it under a different IOMMU, then the BIOS is lying
3814          * to us. Hope that the IOMMU for this device is actually
3815          * disabled, and it needs no translation...
3816          */
3817         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3818         if (rc) {
3819                 /* "can't" happen */
3820                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3821                 return;
3822         }
3823         vtbar &= 0xffff0000;
3824
3825         /* we know that this iommu should be at offset 0xa000 from vtbar */
3826         drhd = dmar_find_matched_drhd_unit(pdev);
3827         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3828                             TAINT_FIRMWARE_WORKAROUND,
3829                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3830                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3831 }
3832 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3833
3834 static void __init init_no_remapping_devices(void)
3835 {
3836         struct dmar_drhd_unit *drhd;
3837         struct device *dev;
3838         int i;
3839
3840         for_each_drhd_unit(drhd) {
3841                 if (!drhd->include_all) {
3842                         for_each_active_dev_scope(drhd->devices,
3843                                                   drhd->devices_cnt, i, dev)
3844                                 break;
3845                         /* ignore DMAR unit if no devices exist */
3846                         if (i == drhd->devices_cnt)
3847                                 drhd->ignored = 1;
3848                 }
3849         }
3850
3851         for_each_active_drhd_unit(drhd) {
3852                 if (drhd->include_all)
3853                         continue;
3854
3855                 for_each_active_dev_scope(drhd->devices,
3856                                           drhd->devices_cnt, i, dev)
3857                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3858                                 break;
3859                 if (i < drhd->devices_cnt)
3860                         continue;
3861
3862                 /* This IOMMU has *only* gfx devices. If the graphics devices
3863                    are not meant to be mapped, bypass it entirely. */
3864                 if (!dmar_map_gfx) {
3865                         drhd->ignored = 1;
3866                         for_each_active_dev_scope(drhd->devices,
3867                                                   drhd->devices_cnt, i, dev)
3868                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3869                 }
3870         }
3871 }
3872
3873 #ifdef CONFIG_SUSPEND
3874 static int init_iommu_hw(void)
3875 {
3876         struct dmar_drhd_unit *drhd;
3877         struct intel_iommu *iommu = NULL;
3878
3879         for_each_active_iommu(iommu, drhd)
3880                 if (iommu->qi)
3881                         dmar_reenable_qi(iommu);
3882
3883         for_each_iommu(iommu, drhd) {
3884                 if (drhd->ignored) {
3885                         /*
3886                          * we always have to disable PMRs or DMA may fail on
3887                          * this device
3888                          */
3889                         if (force_on)
3890                                 iommu_disable_protect_mem_regions(iommu);
3891                         continue;
3892                 }
3893
3894                 iommu_flush_write_buffer(iommu);
3895
3896                 iommu_set_root_entry(iommu);
3897
3898                 iommu->flush.flush_context(iommu, 0, 0, 0,
3899                                            DMA_CCMD_GLOBAL_INVL);
3900                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3901                 iommu_enable_translation(iommu);
3902                 iommu_disable_protect_mem_regions(iommu);
3903         }
3904
3905         return 0;
3906 }
3907
3908 static void iommu_flush_all(void)
3909 {
3910         struct dmar_drhd_unit *drhd;
3911         struct intel_iommu *iommu;
3912
3913         for_each_active_iommu(iommu, drhd) {
3914                 iommu->flush.flush_context(iommu, 0, 0, 0,
3915                                            DMA_CCMD_GLOBAL_INVL);
3916                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3917                                          DMA_TLB_GLOBAL_FLUSH);
3918         }
3919 }
3920
3921 static int iommu_suspend(void)
3922 {
3923         struct dmar_drhd_unit *drhd;
3924         struct intel_iommu *iommu = NULL;
3925         unsigned long flag;
3926
3927         for_each_active_iommu(iommu, drhd) {
3928                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3929                                                  GFP_ATOMIC);
3930                 if (!iommu->iommu_state)
3931                         goto nomem;
3932         }
3933
3934         iommu_flush_all();
3935
3936         for_each_active_iommu(iommu, drhd) {
3937                 iommu_disable_translation(iommu);
3938
3939                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3940
3941                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3942                         readl(iommu->reg + DMAR_FECTL_REG);
3943                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3944                         readl(iommu->reg + DMAR_FEDATA_REG);
3945                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3946                         readl(iommu->reg + DMAR_FEADDR_REG);
3947                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3948                         readl(iommu->reg + DMAR_FEUADDR_REG);
3949
3950                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3951         }
3952         return 0;
3953
3954 nomem:
3955         for_each_active_iommu(iommu, drhd)
3956                 kfree(iommu->iommu_state);
3957
3958         return -ENOMEM;
3959 }
3960
3961 static void iommu_resume(void)
3962 {
3963         struct dmar_drhd_unit *drhd;
3964         struct intel_iommu *iommu = NULL;
3965         unsigned long flag;
3966
3967         if (init_iommu_hw()) {
3968                 if (force_on)
3969                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3970                 else
3971                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3972                 return;
3973         }
3974
3975         for_each_active_iommu(iommu, drhd) {
3976
3977                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3978
3979                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3980                         iommu->reg + DMAR_FECTL_REG);
3981                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3982                         iommu->reg + DMAR_FEDATA_REG);
3983                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3984                         iommu->reg + DMAR_FEADDR_REG);
3985                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3986                         iommu->reg + DMAR_FEUADDR_REG);
3987
3988                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3989         }
3990
3991         for_each_active_iommu(iommu, drhd)
3992                 kfree(iommu->iommu_state);
3993 }
3994
3995 static struct syscore_ops iommu_syscore_ops = {
3996         .resume         = iommu_resume,
3997         .suspend        = iommu_suspend,
3998 };
3999
4000 static void __init init_iommu_pm_ops(void)
4001 {
4002         register_syscore_ops(&iommu_syscore_ops);
4003 }
4004
4005 #else
4006 static inline void init_iommu_pm_ops(void) {}
4007 #endif  /* CONFIG_SUSPEND */
4008
4009 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4010 {
4011         struct acpi_dmar_reserved_memory *rmrr;
4012         struct dmar_rmrr_unit *rmrru;
4013
4014         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4015         if (!rmrru)
4016                 goto out;
4017
4018         rmrru->hdr = header;
4019         rmrr = (struct acpi_dmar_reserved_memory *)header;
4020         rmrru->base_address = rmrr->base_address;
4021         rmrru->end_address = rmrr->end_address;
4022
4023         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4024                                 ((void *)rmrr) + rmrr->header.length,
4025                                 &rmrru->devices_cnt);
4026         if (rmrru->devices_cnt && rmrru->devices == NULL)
4027                 goto free_rmrru;
4028
4029         list_add(&rmrru->list, &dmar_rmrr_units);
4030
4031         return 0;
4032 free_rmrru:
4033         kfree(rmrru);
4034 out:
4035         return -ENOMEM;
4036 }
4037
4038 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4039 {
4040         struct dmar_atsr_unit *atsru;
4041         struct acpi_dmar_atsr *tmp;
4042
4043         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4044                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4045                 if (atsr->segment != tmp->segment)
4046                         continue;
4047                 if (atsr->header.length != tmp->header.length)
4048                         continue;
4049                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4050                         return atsru;
4051         }
4052
4053         return NULL;
4054 }
4055
4056 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4057 {
4058         struct acpi_dmar_atsr *atsr;
4059         struct dmar_atsr_unit *atsru;
4060
4061         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4062                 return 0;
4063
4064         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4065         atsru = dmar_find_atsr(atsr);
4066         if (atsru)
4067                 return 0;
4068
4069         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4070         if (!atsru)
4071                 return -ENOMEM;
4072
4073         /*
4074          * If memory is allocated from slab by ACPI _DSM method, we need to
4075          * copy the memory content because the memory buffer will be freed
4076          * on return.
4077          */
4078         atsru->hdr = (void *)(atsru + 1);
4079         memcpy(atsru->hdr, hdr, hdr->length);
4080         atsru->include_all = atsr->flags & 0x1;
4081         if (!atsru->include_all) {
4082                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4083                                 (void *)atsr + atsr->header.length,
4084                                 &atsru->devices_cnt);
4085                 if (atsru->devices_cnt && atsru->devices == NULL) {
4086                         kfree(atsru);
4087                         return -ENOMEM;
4088                 }
4089         }
4090
4091         list_add_rcu(&atsru->list, &dmar_atsr_units);
4092
4093         return 0;
4094 }
4095
4096 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4097 {
4098         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4099         kfree(atsru);
4100 }
4101
4102 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4103 {
4104         struct acpi_dmar_atsr *atsr;
4105         struct dmar_atsr_unit *atsru;
4106
4107         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4108         atsru = dmar_find_atsr(atsr);
4109         if (atsru) {
4110                 list_del_rcu(&atsru->list);
4111                 synchronize_rcu();
4112                 intel_iommu_free_atsr(atsru);
4113         }
4114
4115         return 0;
4116 }
4117
4118 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4119 {
4120         int i;
4121         struct device *dev;
4122         struct acpi_dmar_atsr *atsr;
4123         struct dmar_atsr_unit *atsru;
4124
4125         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4126         atsru = dmar_find_atsr(atsr);
4127         if (!atsru)
4128                 return 0;
4129
4130         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4131                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4132                                           i, dev)
4133                         return -EBUSY;
4134         }
4135
4136         return 0;
4137 }
4138
4139 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4140 {
4141         int sp, ret;
4142         struct intel_iommu *iommu = dmaru->iommu;
4143
4144         if (g_iommus[iommu->seq_id])
4145                 return 0;
4146
4147         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4148                 pr_warn("%s: Doesn't support hardware pass through.\n",
4149                         iommu->name);
4150                 return -ENXIO;
4151         }
4152         if (!ecap_sc_support(iommu->ecap) &&
4153             domain_update_iommu_snooping(iommu)) {
4154                 pr_warn("%s: Doesn't support snooping.\n",
4155                         iommu->name);
4156                 return -ENXIO;
4157         }
4158         sp = domain_update_iommu_superpage(iommu) - 1;
4159         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4160                 pr_warn("%s: Doesn't support large page.\n",
4161                         iommu->name);
4162                 return -ENXIO;
4163         }
4164
4165         /*
4166          * Disable translation if already enabled prior to OS handover.
4167          */
4168         if (iommu->gcmd & DMA_GCMD_TE)
4169                 iommu_disable_translation(iommu);
4170
4171         g_iommus[iommu->seq_id] = iommu;
4172         ret = iommu_init_domains(iommu);
4173         if (ret == 0)
4174                 ret = iommu_alloc_root_entry(iommu);
4175         if (ret)
4176                 goto out;
4177
4178 #ifdef CONFIG_INTEL_IOMMU_SVM
4179         if (pasid_supported(iommu))
4180                 intel_svm_init(iommu);
4181 #endif
4182
4183         if (dmaru->ignored) {
4184                 /*
4185                  * we always have to disable PMRs or DMA may fail on this device
4186                  */
4187                 if (force_on)
4188                         iommu_disable_protect_mem_regions(iommu);
4189                 return 0;
4190         }
4191
4192         intel_iommu_init_qi(iommu);
4193         iommu_flush_write_buffer(iommu);
4194
4195 #ifdef CONFIG_INTEL_IOMMU_SVM
4196         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4197                 ret = intel_svm_enable_prq(iommu);
4198                 if (ret)
4199                         goto disable_iommu;
4200         }
4201 #endif
4202         ret = dmar_set_interrupt(iommu);
4203         if (ret)
4204                 goto disable_iommu;
4205
4206         iommu_set_root_entry(iommu);
4207         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4208         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4209         iommu_enable_translation(iommu);
4210
4211         iommu_disable_protect_mem_regions(iommu);
4212         return 0;
4213
4214 disable_iommu:
4215         disable_dmar_iommu(iommu);
4216 out:
4217         free_dmar_iommu(iommu);
4218         return ret;
4219 }
4220
4221 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4222 {
4223         int ret = 0;
4224         struct intel_iommu *iommu = dmaru->iommu;
4225
4226         if (!intel_iommu_enabled)
4227                 return 0;
4228         if (iommu == NULL)
4229                 return -EINVAL;
4230
4231         if (insert) {
4232                 ret = intel_iommu_add(dmaru);
4233         } else {
4234                 disable_dmar_iommu(iommu);
4235                 free_dmar_iommu(iommu);
4236         }
4237
4238         return ret;
4239 }
4240
4241 static void intel_iommu_free_dmars(void)
4242 {
4243         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4244         struct dmar_atsr_unit *atsru, *atsr_n;
4245
4246         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4247                 list_del(&rmrru->list);
4248                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4249                 kfree(rmrru);
4250         }
4251
4252         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4253                 list_del(&atsru->list);
4254                 intel_iommu_free_atsr(atsru);
4255         }
4256 }
4257
4258 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4259 {
4260         int i, ret = 1;
4261         struct pci_bus *bus;
4262         struct pci_dev *bridge = NULL;
4263         struct device *tmp;
4264         struct acpi_dmar_atsr *atsr;
4265         struct dmar_atsr_unit *atsru;
4266
4267         dev = pci_physfn(dev);
4268         for (bus = dev->bus; bus; bus = bus->parent) {
4269                 bridge = bus->self;
4270                 /* If it's an integrated device, allow ATS */
4271                 if (!bridge)
4272                         return 1;
4273                 /* Connected via non-PCIe: no ATS */
4274                 if (!pci_is_pcie(bridge) ||
4275                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4276                         return 0;
4277                 /* If we found the root port, look it up in the ATSR */
4278                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4279                         break;
4280         }
4281
4282         rcu_read_lock();
4283         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4284                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4285                 if (atsr->segment != pci_domain_nr(dev->bus))
4286                         continue;
4287
4288                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4289                         if (tmp == &bridge->dev)
4290                                 goto out;
4291
4292                 if (atsru->include_all)
4293                         goto out;
4294         }
4295         ret = 0;
4296 out:
4297         rcu_read_unlock();
4298
4299         return ret;
4300 }
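/*
 * Example of the ATSR lookup above (illustrative, made-up topology): for
 * an endpoint at 0000:04:00.0 below root port 0000:00:1c.0, the loop
 * walks up to the root port, and ATS is allowed only if that root port is
 * listed in an ATSR device scope for segment 0 or the segment's ATSR has
 * its include-all flag set.  A device reached through a PCIe-to-PCI
 * bridge is refused outright, and a root-integrated device (no bridge
 * above it) is always allowed.
 */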
4301
4302 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4303 {
4304         int ret;
4305         struct dmar_rmrr_unit *rmrru;
4306         struct dmar_atsr_unit *atsru;
4307         struct acpi_dmar_atsr *atsr;
4308         struct acpi_dmar_reserved_memory *rmrr;
4309
4310         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4311                 return 0;
4312
4313         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4314                 rmrr = container_of(rmrru->hdr,
4315                                     struct acpi_dmar_reserved_memory, header);
4316                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4317                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4318                                 ((void *)rmrr) + rmrr->header.length,
4319                                 rmrr->segment, rmrru->devices,
4320                                 rmrru->devices_cnt);
4321                         if (ret < 0)
4322                                 return ret;
4323                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4324                         dmar_remove_dev_scope(info, rmrr->segment,
4325                                 rmrru->devices, rmrru->devices_cnt);
4326                 }
4327         }
4328
4329         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4330                 if (atsru->include_all)
4331                         continue;
4332
4333                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4334                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4335                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4336                                         (void *)atsr + atsr->header.length,
4337                                         atsr->segment, atsru->devices,
4338                                         atsru->devices_cnt);
4339                         if (ret > 0)
4340                                 break;
4341                         else if (ret < 0)
4342                                 return ret;
4343                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4344                         if (dmar_remove_dev_scope(info, atsr->segment,
4345                                         atsru->devices, atsru->devices_cnt))
4346                                 break;
4347                 }
4348         }
4349
4350         return 0;
4351 }
4352
4353 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4354                                        unsigned long val, void *v)
4355 {
4356         struct memory_notify *mhp = v;
4357         unsigned long long start, end;
4358         unsigned long start_vpfn, last_vpfn;
4359
4360         switch (val) {
4361         case MEM_GOING_ONLINE:
4362                 start = mhp->start_pfn << PAGE_SHIFT;
4363                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4364                 if (iommu_domain_identity_map(si_domain, start, end)) {
4365                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4366                                 start, end);
4367                         return NOTIFY_BAD;
4368                 }
4369                 break;
4370
4371         case MEM_OFFLINE:
4372         case MEM_CANCEL_ONLINE:
4373                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4374                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4375                 while (start_vpfn <= last_vpfn) {
4376                         struct iova *iova;
4377                         struct dmar_drhd_unit *drhd;
4378                         struct intel_iommu *iommu;
4379                         struct page *freelist;
4380
4381                         iova = find_iova(&si_domain->iovad, start_vpfn);
4382                         if (iova == NULL) {
4383                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4384                                          start_vpfn);
4385                                 break;
4386                         }
4387
4388                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4389                                                      start_vpfn, last_vpfn);
4390                         if (iova == NULL) {
4391                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4392                                         start_vpfn, last_vpfn);
4393                                 return NOTIFY_BAD;
4394                         }
4395
4396                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4397                                                iova->pfn_hi);
4398
4399                         rcu_read_lock();
4400                         for_each_active_iommu(iommu, drhd)
4401                                 iommu_flush_iotlb_psi(iommu, si_domain,
4402                                         iova->pfn_lo, iova_size(iova),
4403                                         !freelist, 0);
4404                         rcu_read_unlock();
4405                         dma_free_pagelist(freelist);
4406
4407                         start_vpfn = iova->pfn_hi + 1;
4408                         free_iova_mem(iova);
4409                 }
4410                 break;
4411         }
4412
4413         return NOTIFY_OK;
4414 }
4415
4416 static struct notifier_block intel_iommu_memory_nb = {
4417         .notifier_call = intel_iommu_memory_notifier,
4418         .priority = 0
4419 };
4420
4421 static void free_all_cpu_cached_iovas(unsigned int cpu)
4422 {
4423         int i;
4424
4425         for (i = 0; i < g_num_of_iommus; i++) {
4426                 struct intel_iommu *iommu = g_iommus[i];
4427                 struct dmar_domain *domain;
4428                 int did;
4429
4430                 if (!iommu)
4431                         continue;
4432
4433                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4434                         domain = get_iommu_domain(iommu, (u16)did);
4435
4436                         if (!domain)
4437                                 continue;
4438                         free_cpu_cached_iovas(cpu, &domain->iovad);
4439                 }
4440         }
4441 }
4442
4443 static int intel_iommu_cpu_dead(unsigned int cpu)
4444 {
4445         free_all_cpu_cached_iovas(cpu);
4446         return 0;
4447 }
4448
4449 static void intel_disable_iommus(void)
4450 {
4451         struct intel_iommu *iommu = NULL;
4452         struct dmar_drhd_unit *drhd;
4453
4454         for_each_iommu(iommu, drhd)
4455                 iommu_disable_translation(iommu);
4456 }
4457
4458 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4459 {
4460         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4461
4462         return container_of(iommu_dev, struct intel_iommu, iommu);
4463 }
4464
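/*
 * Read-only sysfs attributes for each IOMMU, grouped under "intel-iommu"
 * (typically visible as /sys/class/iommu/dmar<N>/intel-iommu/ once
 * iommu_device_sysfs_add() has run in intel_iommu_init()).
 */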
4465 static ssize_t intel_iommu_show_version(struct device *dev,
4466                                         struct device_attribute *attr,
4467                                         char *buf)
4468 {
4469         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4470         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4471         return sprintf(buf, "%d:%d\n",
4472                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4473 }
4474 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4475
4476 static ssize_t intel_iommu_show_address(struct device *dev,
4477                                         struct device_attribute *attr,
4478                                         char *buf)
4479 {
4480         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4481         return sprintf(buf, "%llx\n", iommu->reg_phys);
4482 }
4483 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4484
4485 static ssize_t intel_iommu_show_cap(struct device *dev,
4486                                     struct device_attribute *attr,
4487                                     char *buf)
4488 {
4489         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4490         return sprintf(buf, "%llx\n", iommu->cap);
4491 }
4492 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4493
4494 static ssize_t intel_iommu_show_ecap(struct device *dev,
4495                                     struct device_attribute *attr,
4496                                     char *buf)
4497 {
4498         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4499         return sprintf(buf, "%llx\n", iommu->ecap);
4500 }
4501 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4502
4503 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4504                                       struct device_attribute *attr,
4505                                       char *buf)
4506 {
4507         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4508         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4509 }
4510 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4511
4512 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4513                                            struct device_attribute *attr,
4514                                            char *buf)
4515 {
4516         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4517         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4518                                                   cap_ndoms(iommu->cap)));
4519 }
4520 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4521
4522 static struct attribute *intel_iommu_attrs[] = {
4523         &dev_attr_version.attr,
4524         &dev_attr_address.attr,
4525         &dev_attr_cap.attr,
4526         &dev_attr_ecap.attr,
4527         &dev_attr_domains_supported.attr,
4528         &dev_attr_domains_used.attr,
4529         NULL,
4530 };
4531
4532 static struct attribute_group intel_iommu_group = {
4533         .name = "intel-iommu",
4534         .attrs = intel_iommu_attrs,
4535 };
4536
4537 const struct attribute_group *intel_iommu_groups[] = {
4538         &intel_iommu_group,
4539         NULL,
4540 };
4541
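/*
 * Honour the DMAR platform opt-in: if the firmware requests DMA protection
 * and at least one untrusted device (e.g. one behind an externally exposed
 * port, as marked by pdev->untrusted) is present, force the IOMMU on even if
 * it was disabled on the command line. Returns 1 if the IOMMU was force
 * enabled, 0 otherwise.
 */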
4542 static int __init platform_optin_force_iommu(void)
4543 {
4544         struct pci_dev *pdev = NULL;
4545         bool has_untrusted_dev = false;
4546
4547         if (!dmar_platform_optin() || no_platform_optin)
4548                 return 0;
4549
4550         for_each_pci_dev(pdev) {
4551                 if (pdev->untrusted) {
4552                         has_untrusted_dev = true;
4553                         break;
4554                 }
4555         }
4556
4557         if (!has_untrusted_dev)
4558                 return 0;
4559
4560         if (no_iommu || dmar_disabled)
4561                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4562
4563         /*
4564          * If Intel-IOMMU is disabled by default, we will apply identity
4565          * map for all devices except those marked as being untrusted.
4566          */
4567         if (dmar_disabled)
4568                 iommu_identity_mapping |= IDENTMAP_ALL;
4569
4570         dmar_disabled = 0;
4571 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4572         swiotlb = 0;
4573 #endif
4574         no_iommu = 0;
4575
4576         return 1;
4577 }
4578
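/*
 * Probe the ACPI name-space devices listed in the DRHD device scopes so that
 * each of their physical nodes gets an IOMMU group and a default domain;
 * nodes that already belong to a group are skipped.
 */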
4579 static int __init probe_acpi_namespace_devices(void)
4580 {
4581         struct dmar_drhd_unit *drhd;
4582         /* To avoid a -Wunused-but-set-variable warning. */
4583         struct intel_iommu *iommu __maybe_unused;
4584         struct device *dev;
4585         int i, ret = 0;
4586
4587         for_each_active_iommu(iommu, drhd) {
4588                 for_each_active_dev_scope(drhd->devices,
4589                                           drhd->devices_cnt, i, dev) {
4590                         struct acpi_device_physical_node *pn;
4591                         struct iommu_group *group;
4592                         struct acpi_device *adev;
4593
4594                         if (dev->bus != &acpi_bus_type)
4595                                 continue;
4596
4597                         adev = to_acpi_device(dev);
4598                         mutex_lock(&adev->physical_node_lock);
4599                         list_for_each_entry(pn,
4600                                             &adev->physical_node_list, node) {
4601                                 group = iommu_group_get(pn->dev);
4602                                 if (group) {
4603                                         iommu_group_put(group);
4604                                         continue;
4605                                 }
4606
4607                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4608                                 ret = iommu_probe_device(pn->dev);
4609                                 if (ret)
4610                                         break;
4611                         }
4612                         mutex_unlock(&adev->physical_node_lock);
4613
4614                         if (ret)
4615                                 return ret;
4616                 }
4617         }
4618
4619         return 0;
4620 }
4621
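/*
 * Main VT-d initialization: parse the DMAR table and device scopes, build
 * the DMA-remapping structures via init_dmars(), register the sysfs,
 * iommu-core, memory-hotplug and CPU-hotplug hooks, and finally enable
 * translation on every non-ignored unit where it is not already pre-enabled.
 */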
4622 int __init intel_iommu_init(void)
4623 {
4624         int ret = -ENODEV;
4625         struct dmar_drhd_unit *drhd;
4626         struct intel_iommu *iommu;
4627
4628         /*
4629          * Intel IOMMU is required for a TXT/tboot launch or platform
4630          * opt in, so enforce that.
4631          */
4632         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4633
4634         if (iommu_init_mempool()) {
4635                 if (force_on)
4636                         panic("tboot: Failed to initialize iommu memory\n");
4637                 return -ENOMEM;
4638         }
4639
4640         down_write(&dmar_global_lock);
4641         if (dmar_table_init()) {
4642                 if (force_on)
4643                         panic("tboot: Failed to initialize DMAR table\n");
4644                 goto out_free_dmar;
4645         }
4646
4647         if (dmar_dev_scope_init() < 0) {
4648                 if (force_on)
4649                         panic("tboot: Failed to initialize DMAR device scope\n");
4650                 goto out_free_dmar;
4651         }
4652
4653         up_write(&dmar_global_lock);
4654
4655         /*
4656          * The bus notifier takes the dmar_global_lock, so lockdep will
4657          * complain later when we register it under the lock.
4658          */
4659         dmar_register_bus_notifier();
4660
4661         down_write(&dmar_global_lock);
4662
4663         if (no_iommu || dmar_disabled) {
4664                 /*
4665                  * We exit the function here to ensure the IOMMU's remapping and
4666                  * mempool aren't set up, which means that the IOMMU's PMRs
4667                  * won't be disabled via the call to init_dmars(). So disable
4668                  * them explicitly here. The PMRs were set up by tboot prior to
4669                  * calling SENTER, but the kernel is expected to reset/tear
4670                  * down the PMRs.
4671                  */
4672                 if (intel_iommu_tboot_noforce) {
4673                         for_each_iommu(iommu, drhd)
4674                                 iommu_disable_protect_mem_regions(iommu);
4675                 }
4676
4677                 /*
4678                  * Make sure the IOMMUs are switched off, even when we
4679                  * boot into a kexec kernel and the previous kernel left
4680                  * them enabled
4681                  */
4682                 intel_disable_iommus();
4683                 goto out_free_dmar;
4684         }
4685
4686         if (list_empty(&dmar_rmrr_units))
4687                 pr_info("No RMRR found\n");
4688
4689         if (list_empty(&dmar_atsr_units))
4690                 pr_info("No ATSR found\n");
4691
4692         if (dmar_init_reserved_ranges()) {
4693                 if (force_on)
4694                         panic("tboot: Failed to reserve iommu ranges\n");
4695                 goto out_free_reserved_range;
4696         }
4697
4698         if (dmar_map_gfx)
4699                 intel_iommu_gfx_mapped = 1;
4700
4701         init_no_remapping_devices();
4702
4703         ret = init_dmars();
4704         if (ret) {
4705                 if (force_on)
4706                         panic("tboot: Failed to initialize DMARs\n");
4707                 pr_err("Initialization failed\n");
4708                 goto out_free_reserved_range;
4709         }
4710         up_write(&dmar_global_lock);
4711
4712 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4713         swiotlb = 0;
4714 #endif
4715         dma_ops = &intel_dma_ops;
4716
4717         init_iommu_pm_ops();
4718
4719         for_each_active_iommu(iommu, drhd) {
4720                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4721                                        intel_iommu_groups,
4722                                        "%s", iommu->name);
4723                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4724                 iommu_device_register(&iommu->iommu);
4725         }
4726
4727         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4728         if (si_domain && !hw_pass_through)
4729                 register_memory_notifier(&intel_iommu_memory_nb);
4730         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4731                           intel_iommu_cpu_dead);
4732
4733         down_read(&dmar_global_lock);
4734         if (probe_acpi_namespace_devices())
4735                 pr_warn("ACPI name space devices didn't probe correctly\n");
4736         up_read(&dmar_global_lock);
4737
4738         /* Finally, we enable the DMA remapping hardware. */
4739         for_each_iommu(iommu, drhd) {
4740                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4741                         iommu_enable_translation(iommu);
4742
4743                 iommu_disable_protect_mem_regions(iommu);
4744         }
4745         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4746
4747         intel_iommu_enabled = 1;
4748         intel_iommu_debugfs_init();
4749
4750         return 0;
4751
4752 out_free_reserved_range:
4753         put_iova_domain(&reserved_iova_list);
4754 out_free_dmar:
4755         intel_iommu_free_dmars();
4756         up_write(&dmar_global_lock);
4757         iommu_exit_mempool();
4758         return ret;
4759 }
4760
4761 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4762 {
4763         struct intel_iommu *iommu = opaque;
4764
4765         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4766         return 0;
4767 }
4768
4769 /*
4770  * NB - intel-iommu lacks any sort of reference counting for the users of
4771  * dependent devices.  If multiple endpoints have intersecting dependent
4772  * devices, unbinding the driver from any one of them will possibly leave
4773  * the others unable to operate.
4774  */
4775 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4776 {
4777         if (!iommu || !dev || !dev_is_pci(dev))
4778                 return;
4779
4780         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4781 }
4782
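/*
 * Tear down all translation state for one device: its RID2PASID entry (in
 * scalable mode), the context entries for all of its DMA aliases, the device
 * IOTLB and its PASID table; then unlink it from its domain and drop the
 * domain's reference on the IOMMU. Caller must hold device_domain_lock.
 */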
4783 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4784 {
4785         struct dmar_domain *domain;
4786         struct intel_iommu *iommu;
4787         unsigned long flags;
4788
4789         assert_spin_locked(&device_domain_lock);
4790
4791         if (WARN_ON(!info))
4792                 return;
4793
4794         iommu = info->iommu;
4795         domain = info->domain;
4796
4797         if (info->dev) {
4798                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4799                         intel_pasid_tear_down_entry(iommu, info->dev,
4800                                         PASID_RID2PASID);
4801
4802                 iommu_disable_dev_iotlb(info);
4803                 domain_context_clear(iommu, info->dev);
4804                 intel_pasid_free_table(info->dev);
4805         }
4806
4807         unlink_domain_info(info);
4808
4809         spin_lock_irqsave(&iommu->lock, flags);
4810         domain_detach_iommu(domain, iommu);
4811         spin_unlock_irqrestore(&iommu->lock, flags);
4812
4813         /* free the private domain */
4814         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4815             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY))
4816                 domain_exit(info->domain);
4817
4818         free_devinfo_mem(info);
4819 }
4820
4821 static void dmar_remove_one_dev_info(struct device *dev)
4822 {
4823         struct device_domain_info *info;
4824         unsigned long flags;
4825
4826         spin_lock_irqsave(&device_domain_lock, flags);
4827         info = dev->archdata.iommu;
4828         __dmar_remove_one_dev_info(info);
4829         spin_unlock_irqrestore(&device_domain_lock, flags);
4830 }
4831
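/*
 * iommu_ops->domain_alloc: DMA and unmanaged domains get a fresh dmar_domain
 * sized to DEFAULT_DOMAIN_ADDRESS_WIDTH (DMA domains also get a deferred-flush
 * IOVA queue); identity requests share the global si_domain.
 */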
4832 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4833 {
4834         struct dmar_domain *dmar_domain;
4835         struct iommu_domain *domain;
4836
4837         switch (type) {
4838         case IOMMU_DOMAIN_DMA:
4839         /* fallthrough */
4840         case IOMMU_DOMAIN_UNMANAGED:
4841                 dmar_domain = alloc_domain(0);
4842                 if (!dmar_domain) {
4843                         pr_err("Can't allocate dmar_domain\n");
4844                         return NULL;
4845                 }
4846                 if (domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4847                         pr_err("Domain initialization failed\n");
4848                         domain_exit(dmar_domain);
4849                         return NULL;
4850                 }
4851
4852                 if (type == IOMMU_DOMAIN_DMA &&
4853                     init_iova_flush_queue(&dmar_domain->iovad,
4854                                           iommu_flush_iova, iova_entry_free)) {
4855                         pr_warn("iova flush queue initialization failed\n");
4856                         intel_iommu_strict = 1;
4857                 }
4858
4859                 domain_update_iommu_cap(dmar_domain);
4860
4861                 domain = &dmar_domain->domain;
4862                 domain->geometry.aperture_start = 0;
4863                 domain->geometry.aperture_end   =
4864                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4865                 domain->geometry.force_aperture = true;
4866
4867                 return domain;
4868         case IOMMU_DOMAIN_IDENTITY:
4869                 return &si_domain->domain;
4870         default:
4871                 return NULL;
4872         }
4873
4874         return NULL;
4875 }
4876
4877 static void intel_iommu_domain_free(struct iommu_domain *domain)
4878 {
4879         if (domain != &si_domain->domain)
4880                 domain_exit(to_dmar_domain(domain));
4881 }
4882
4883 /*
4884  * Check whether a @domain could be attached to the @dev through the
4885  * aux-domain attach/detach APIs.
4886  */
4887 static inline bool
4888 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4889 {
4890         struct device_domain_info *info = dev->archdata.iommu;
4891
4892         return info && info->auxd_enabled &&
4893                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4894 }
4895
4896 static void auxiliary_link_device(struct dmar_domain *domain,
4897                                   struct device *dev)
4898 {
4899         struct device_domain_info *info = dev->archdata.iommu;
4900
4901         assert_spin_locked(&device_domain_lock);
4902         if (WARN_ON(!info))
4903                 return;
4904
4905         domain->auxd_refcnt++;
4906         list_add(&domain->auxd, &info->auxiliary_domains);
4907 }
4908
4909 static void auxiliary_unlink_device(struct dmar_domain *domain,
4910                                     struct device *dev)
4911 {
4912         struct device_domain_info *info = dev->archdata.iommu;
4913
4914         assert_spin_locked(&device_domain_lock);
4915         if (WARN_ON(!info))
4916                 return;
4917
4918         list_del(&domain->auxd);
4919         domain->auxd_refcnt--;
4920
4921         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4922                 intel_pasid_free_id(domain->default_pasid);
4923 }
4924
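/*
 * Attach a domain to a device as an auxiliary domain: allocate the domain's
 * default PASID on first use, attach the domain to the device's IOMMU and
 * program a second-level PASID entry, then link the domain into the device's
 * auxiliary-domain list.
 */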
4925 static int aux_domain_add_dev(struct dmar_domain *domain,
4926                               struct device *dev)
4927 {
4928         int ret;
4929         u8 bus, devfn;
4930         unsigned long flags;
4931         struct intel_iommu *iommu;
4932
4933         iommu = device_to_iommu(dev, &bus, &devfn);
4934         if (!iommu)
4935                 return -ENODEV;
4936
4937         if (domain->default_pasid <= 0) {
4938                 int pasid;
4939
4940                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4941                                              pci_max_pasids(to_pci_dev(dev)),
4942                                              GFP_KERNEL);
4943                 if (pasid <= 0) {
4944                         pr_err("Can't allocate default pasid\n");
4945                         return -ENODEV;
4946                 }
4947                 domain->default_pasid = pasid;
4948         }
4949
4950         spin_lock_irqsave(&device_domain_lock, flags);
4951         /*
4952          * iommu->lock must be held to attach domain to iommu and setup the
4953          * pasid entry for second level translation.
4954          */
4955         spin_lock(&iommu->lock);
4956         ret = domain_attach_iommu(domain, iommu);
4957         if (ret)
4958                 goto attach_failed;
4959
4960         /* Set up the PASID entry for mediated devices: */
4961         ret = intel_pasid_setup_second_level(iommu, domain, dev,
4962                                              domain->default_pasid);
4963         if (ret)
4964                 goto table_failed;
4965         spin_unlock(&iommu->lock);
4966
4967         auxiliary_link_device(domain, dev);
4968
4969         spin_unlock_irqrestore(&device_domain_lock, flags);
4970
4971         return 0;
4972
4973 table_failed:
4974         domain_detach_iommu(domain, iommu);
4975 attach_failed:
4976         spin_unlock(&iommu->lock);
4977         spin_unlock_irqrestore(&device_domain_lock, flags);
4978         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4979                 intel_pasid_free_id(domain->default_pasid);
4980
4981         return ret;
4982 }
4983
4984 static void aux_domain_remove_dev(struct dmar_domain *domain,
4985                                   struct device *dev)
4986 {
4987         struct device_domain_info *info;
4988         struct intel_iommu *iommu;
4989         unsigned long flags;
4990
4991         if (!is_aux_domain(dev, &domain->domain))
4992                 return;
4993
4994         spin_lock_irqsave(&device_domain_lock, flags);
4995         info = dev->archdata.iommu;
4996         iommu = info->iommu;
4997
4998         auxiliary_unlink_device(domain, dev);
4999
5000         spin_lock(&iommu->lock);
5001         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5002         domain_detach_iommu(domain, iommu);
5003         spin_unlock(&iommu->lock);
5004
5005         spin_unlock_irqrestore(&device_domain_lock, flags);
5006 }
5007
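/*
 * Before attaching, clamp the domain's address width to what this IOMMU can
 * handle (both AGAW and MGAW), reject the attach if already-mapped addresses
 * would not fit, and strip any extra top-level page-table levels so the
 * domain's AGAW matches the IOMMU's.
 */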
5008 static int prepare_domain_attach_device(struct iommu_domain *domain,
5009                                         struct device *dev)
5010 {
5011         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5012         struct intel_iommu *iommu;
5013         int addr_width;
5014         u8 bus, devfn;
5015
5016         iommu = device_to_iommu(dev, &bus, &devfn);
5017         if (!iommu)
5018                 return -ENODEV;
5019
5020         /* check if this iommu agaw is sufficient for max mapped address */
5021         addr_width = agaw_to_width(iommu->agaw);
5022         if (addr_width > cap_mgaw(iommu->cap))
5023                 addr_width = cap_mgaw(iommu->cap);
5024
5025         if (dmar_domain->max_addr > (1LL << addr_width)) {
5026                 dev_err(dev, "%s: iommu width (%d) is not "
5027                         "sufficient for the mapped address (%llx)\n",
5028                         __func__, addr_width, dmar_domain->max_addr);
5029                 return -EFAULT;
5030         }
5031         dmar_domain->gaw = addr_width;
5032
5033         /*
5034          * Knock out extra levels of page tables if necessary
5035          */
5036         while (iommu->agaw < dmar_domain->agaw) {
5037                 struct dma_pte *pte;
5038
5039                 pte = dmar_domain->pgd;
5040                 if (dma_pte_present(pte)) {
5041                         dmar_domain->pgd = (struct dma_pte *)
5042                                 phys_to_virt(dma_pte_addr(pte));
5043                         free_pgtable_page(pte);
5044                 }
5045                 dmar_domain->agaw--;
5046         }
5047
5048         return 0;
5049 }
5050
5051 static int intel_iommu_attach_device(struct iommu_domain *domain,
5052                                      struct device *dev)
5053 {
5054         int ret;
5055
5056         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5057             device_is_rmrr_locked(dev)) {
5058                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5059                 return -EPERM;
5060         }
5061
5062         if (is_aux_domain(dev, domain))
5063                 return -EPERM;
5064
5065         /* normally dev is not mapped */
5066         if (unlikely(domain_context_mapped(dev))) {
5067                 struct dmar_domain *old_domain;
5068
5069                 old_domain = find_domain(dev);
5070                 if (old_domain)
5071                         dmar_remove_one_dev_info(dev);
5072         }
5073
5074         ret = prepare_domain_attach_device(domain, dev);
5075         if (ret)
5076                 return ret;
5077
5078         return domain_add_dev_info(to_dmar_domain(domain), dev);
5079 }
5080
5081 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5082                                          struct device *dev)
5083 {
5084         int ret;
5085
5086         if (!is_aux_domain(dev, domain))
5087                 return -EPERM;
5088
5089         ret = prepare_domain_attach_device(domain, dev);
5090         if (ret)
5091                 return ret;
5092
5093         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5094 }
5095
5096 static void intel_iommu_detach_device(struct iommu_domain *domain,
5097                                       struct device *dev)
5098 {
5099         dmar_remove_one_dev_info(dev);
5100 }
5101
5102 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5103                                           struct device *dev)
5104 {
5105         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5106 }
5107
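/*
 * iommu_ops->map: translate IOMMU_READ/WRITE/CACHE into DMA_PTE_* bits, grow
 * the domain's max_addr bookkeeping (rejecting addresses beyond the domain's
 * address width) and install the PTEs with domain_pfn_mapping().
 */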
5108 static int intel_iommu_map(struct iommu_domain *domain,
5109                            unsigned long iova, phys_addr_t hpa,
5110                            size_t size, int iommu_prot)
5111 {
5112         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5113         u64 max_addr;
5114         int prot = 0;
5115         int ret;
5116
5117         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5118                 return -EINVAL;
5119
5120         if (iommu_prot & IOMMU_READ)
5121                 prot |= DMA_PTE_READ;
5122         if (iommu_prot & IOMMU_WRITE)
5123                 prot |= DMA_PTE_WRITE;
5124         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5125                 prot |= DMA_PTE_SNP;
5126
5127         max_addr = iova + size;
5128         if (dmar_domain->max_addr < max_addr) {
5129                 u64 end;
5130
5131                 /* check if minimum agaw is sufficient for mapped address */
5132                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5133                 if (end < max_addr) {
5134                         pr_err("%s: iommu width (%d) is not "
5135                                "sufficient for the mapped address (%llx)\n",
5136                                __func__, dmar_domain->gaw, max_addr);
5137                         return -EFAULT;
5138                 }
5139                 dmar_domain->max_addr = max_addr;
5140         }
5141         /* Round up size to the next multiple of VTD_PAGE_SIZE if it and
5142            the low bits of hpa would take us onto the next page. */
5143         size = aligned_nrpages(hpa, size);
5144         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5145                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5146         return ret;
5147 }
5148
5149 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5150                                 unsigned long iova, size_t size)
5151 {
5152         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5153         struct page *freelist = NULL;
5154         unsigned long start_pfn, last_pfn;
5155         unsigned int npages;
5156         int iommu_id, level = 0;
5157
5158         /* Cope with horrid API which requires us to unmap more than the
5159            size argument if it happens to be a large-page mapping. */
5160         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5161         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5162                 return 0;
5163
5164         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5165                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5166
5167         start_pfn = iova >> VTD_PAGE_SHIFT;
5168         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5169
5170         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5171
5172         npages = last_pfn - start_pfn + 1;
5173
5174         for_each_domain_iommu(iommu_id, dmar_domain)
5175                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5176                                       start_pfn, npages, !freelist, 0);
5177
5178         dma_free_pagelist(freelist);
5179
5180         if (dmar_domain->max_addr == iova + size)
5181                 dmar_domain->max_addr = iova;
5182
5183         return size;
5184 }
5185
5186 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5187                                             dma_addr_t iova)
5188 {
5189         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5190         struct dma_pte *pte;
5191         int level = 0;
5192         u64 phys = 0;
5193
5194         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5195                 return 0;
5196
5197         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5198         if (pte)
5199                 phys = dma_pte_addr(pte);
5200
5201         return phys;
5202 }
5203
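/*
 * Helpers used by the device-feature (aux-domain) paths below: scalable mode
 * and PASID support are only usable if every active IOMMU advertises them.
 */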
5204 static inline bool scalable_mode_support(void)
5205 {
5206         struct dmar_drhd_unit *drhd;
5207         struct intel_iommu *iommu;
5208         bool ret = true;
5209
5210         rcu_read_lock();
5211         for_each_active_iommu(iommu, drhd) {
5212                 if (!sm_supported(iommu)) {
5213                         ret = false;
5214                         break;
5215                 }
5216         }
5217         rcu_read_unlock();
5218
5219         return ret;
5220 }
5221
5222 static inline bool iommu_pasid_support(void)
5223 {
5224         struct dmar_drhd_unit *drhd;
5225         struct intel_iommu *iommu;
5226         bool ret = true;
5227
5228         rcu_read_lock();
5229         for_each_active_iommu(iommu, drhd) {
5230                 if (!pasid_supported(iommu)) {
5231                         ret = false;
5232                         break;
5233                 }
5234         }
5235         rcu_read_unlock();
5236
5237         return ret;
5238 }
5239
5240 static bool intel_iommu_capable(enum iommu_cap cap)
5241 {
5242         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5243                 return domain_update_iommu_snooping(NULL) == 1;
5244         if (cap == IOMMU_CAP_INTR_REMAP)
5245                 return irq_remapping_enabled == 1;
5246
5247         return false;
5248 }
5249
5250 static int intel_iommu_add_device(struct device *dev)
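/*
 * iommu_ops->add_device: link the device to its IOMMU in sysfs, put it in an
 * IOMMU group, and reconcile its default domain with what
 * device_def_domain_type() wants; if the group's default domain cannot be
 * changed, fall back to a private identity or DMA domain for this device.
 */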
5251 {
5252         struct dmar_domain *dmar_domain;
5253         struct iommu_domain *domain;
5254         struct intel_iommu *iommu;
5255         struct iommu_group *group;
5256         u8 bus, devfn;
5257         int ret;
5258
5259         iommu = device_to_iommu(dev, &bus, &devfn);
5260         if (!iommu)
5261                 return -ENODEV;
5262
5263         iommu_device_link(&iommu->iommu, dev);
5264
5265         if (translation_pre_enabled(iommu))
5266                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5267
5268         group = iommu_group_get_for_dev(dev);
5269
5270         if (IS_ERR(group))
5271                 return PTR_ERR(group);
5272
5273         iommu_group_put(group);
5274
5275         domain = iommu_get_domain_for_dev(dev);
5276         dmar_domain = to_dmar_domain(domain);
5277         if (domain->type == IOMMU_DOMAIN_DMA) {
5278                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5279                         ret = iommu_request_dm_for_dev(dev);
5280                         if (ret) {
5281                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5282                                 domain_add_dev_info(si_domain, dev);
5283                                 dev_info(dev,
5284                                          "Device uses a private identity domain.\n");
5285                         }
5286                 }
5287         } else {
5288                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5289                         ret = iommu_request_dma_domain_for_dev(dev);
5290                         if (ret) {
5291                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5292                                 if (!get_private_domain_for_dev(dev)) {
5293                                         dev_warn(dev,
5294                                                  "Failed to get a private domain.\n");
5295                                         return -ENOMEM;
5296                                 }
5297
5298                                 dev_info(dev,
5299                                          "Device uses a private dma domain.\n");
5300                         }
5301                 }
5302         }
5303
5304         return 0;
5305 }
5306
5307 static void intel_iommu_remove_device(struct device *dev)
5308 {
5309         struct intel_iommu *iommu;
5310         u8 bus, devfn;
5311
5312         iommu = device_to_iommu(dev, &bus, &devfn);
5313         if (!iommu)
5314                 return;
5315
5316         iommu_group_remove_device(dev);
5317
5318         iommu_device_unlink(&iommu->iommu, dev);
5319 }
5320
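/*
 * Report reserved regions for a device: any RMRR that targets it (direct or
 * relaxable, depending on the device type), the legacy ISA range for ISA
 * bridges when CONFIG_INTEL_IOMMU_FLOPPY_WA is set, and the IOAPIC MSI
 * window.
 */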
5321 static void intel_iommu_get_resv_regions(struct device *device,
5322                                          struct list_head *head)
5323 {
5324         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5325         struct iommu_resv_region *reg;
5326         struct dmar_rmrr_unit *rmrr;
5327         struct device *i_dev;
5328         int i;
5329
5330         down_read(&dmar_global_lock);
5331         for_each_rmrr_units(rmrr) {
5332                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5333                                           i, i_dev) {
5334                         struct iommu_resv_region *resv;
5335                         enum iommu_resv_type type;
5336                         size_t length;
5337
5338                         if (i_dev != device &&
5339                             !is_downstream_to_pci_bridge(device, i_dev))
5340                                 continue;
5341
5342                         length = rmrr->end_address - rmrr->base_address + 1;
5343
5344                         type = device_rmrr_is_relaxable(device) ?
5345                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5346
5347                         resv = iommu_alloc_resv_region(rmrr->base_address,
5348                                                        length, prot, type);
5349                         if (!resv)
5350                                 break;
5351
5352                         list_add_tail(&resv->list, head);
5353                 }
5354         }
5355         up_read(&dmar_global_lock);
5356
5357 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5358         if (dev_is_pci(device)) {
5359                 struct pci_dev *pdev = to_pci_dev(device);
5360
5361                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5362                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5363                                                       IOMMU_RESV_DIRECT);
5364                         if (reg)
5365                                 list_add_tail(&reg->list, head);
5366                 }
5367         }
5368 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5369
5370         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5371                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5372                                       0, IOMMU_RESV_MSI);
5373         if (!reg)
5374                 return;
5375         list_add_tail(&reg->list, head);
5376 }
5377
5378 static void intel_iommu_put_resv_regions(struct device *dev,
5379                                          struct list_head *head)
5380 {
5381         struct iommu_resv_region *entry, *next;
5382
5383         list_for_each_entry_safe(entry, next, head, list)
5384                 kfree(entry);
5385 }
5386
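/*
 * Enable PASID support for a device: mark its context entry as PASID enabled
 * (flushing the context cache if the bit was clear) and, if not already done,
 * enable the device-side PASID/ATS capabilities via iommu_enable_dev_iotlb().
 */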
5387 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5388 {
5389         struct device_domain_info *info;
5390         struct context_entry *context;
5391         struct dmar_domain *domain;
5392         unsigned long flags;
5393         u64 ctx_lo;
5394         int ret;
5395
5396         domain = find_domain(dev);
5397         if (!domain)
5398                 return -EINVAL;
5399
5400         spin_lock_irqsave(&device_domain_lock, flags);
5401         spin_lock(&iommu->lock);
5402
5403         ret = -EINVAL;
5404         info = dev->archdata.iommu;
5405         if (!info || !info->pasid_supported)
5406                 goto out;
5407
5408         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5409         if (WARN_ON(!context))
5410                 goto out;
5411
5412         ctx_lo = context[0].lo;
5413
5414         if (!(ctx_lo & CONTEXT_PASIDE)) {
5415                 ctx_lo |= CONTEXT_PASIDE;
5416                 context[0].lo = ctx_lo;
5417                 wmb();
5418                 iommu->flush.flush_context(iommu,
5419                                            domain->iommu_did[iommu->seq_id],
5420                                            PCI_DEVID(info->bus, info->devfn),
5421                                            DMA_CCMD_MASK_NOBIT,
5422                                            DMA_CCMD_DEVICE_INVL);
5423         }
5424
5425         /* Enable PASID support in the device, if it wasn't already */
5426         if (!info->pasid_enabled)
5427                 iommu_enable_dev_iotlb(info);
5428
5429         ret = 0;
5430
5431  out:
5432         spin_unlock(&iommu->lock);
5433         spin_unlock_irqrestore(&device_domain_lock, flags);
5434
5435         return ret;
5436 }
5437
5438 static void intel_iommu_apply_resv_region(struct device *dev,
5439                                           struct iommu_domain *domain,
5440                                           struct iommu_resv_region *region)
5441 {
5442         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5443         unsigned long start, end;
5444
5445         start = IOVA_PFN(region->start);
5446         end   = IOVA_PFN(region->start + region->length - 1);
5447
5448         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5449 }
5450
5451 #ifdef CONFIG_INTEL_IOMMU_SVM
5452 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5453 {
5454         struct intel_iommu *iommu;
5455         u8 bus, devfn;
5456
5457         if (iommu_dummy(dev)) {
5458                 dev_warn(dev,
5459                          "No IOMMU translation for device; cannot enable SVM\n");
5460                 return NULL;
5461         }
5462
5463         iommu = device_to_iommu(dev, &bus, &devfn);
5464         if (!iommu) {
5465                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5466                 return NULL;
5467         }
5468
5469         return iommu;
5470 }
5471 #endif /* CONFIG_INTEL_IOMMU_SVM */
5472
5473 static int intel_iommu_enable_auxd(struct device *dev)
5474 {
5475         struct device_domain_info *info;
5476         struct intel_iommu *iommu;
5477         unsigned long flags;
5478         u8 bus, devfn;
5479         int ret;
5480
5481         iommu = device_to_iommu(dev, &bus, &devfn);
5482         if (!iommu || dmar_disabled)
5483                 return -EINVAL;
5484
5485         if (!sm_supported(iommu) || !pasid_supported(iommu))
5486                 return -EINVAL;
5487
5488         ret = intel_iommu_enable_pasid(iommu, dev);
5489         if (ret)
5490                 return -ENODEV;
5491
5492         spin_lock_irqsave(&device_domain_lock, flags);
5493         info = dev->archdata.iommu;
5494         info->auxd_enabled = 1;
5495         spin_unlock_irqrestore(&device_domain_lock, flags);
5496
5497         return 0;
5498 }
5499
5500 static int intel_iommu_disable_auxd(struct device *dev)
5501 {
5502         struct device_domain_info *info;
5503         unsigned long flags;
5504
5505         spin_lock_irqsave(&device_domain_lock, flags);
5506         info = dev->archdata.iommu;
5507         if (!WARN_ON(!info))
5508                 info->auxd_enabled = 0;
5509         spin_unlock_irqrestore(&device_domain_lock, flags);
5510
5511         return 0;
5512 }
5513
5514 /*
5515  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5516  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5517  * spec so that system software and tools can detect endpoint devices that
5518  * support Intel Scalable I/O Virtualization without a host driver dependency.
5519  *
5520  * Returns the address of the matching extended capability structure within
5521  * the device's PCI configuration space or 0 if the device does not support
5522  * it.
5523  */
5524 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5525 {
5526         int pos;
5527         u16 vendor, id;
5528
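
        /*
         * Walk the DVSEC extended capabilities (extended capability ID 0x23).
         * In the DVSEC layout the config word at offset 4 holds the vendor ID
         * and the word at offset 8 holds the vendor-defined DVSEC ID; per the
         * Intel Scalable I/O Virtualization spec referenced above, DVSEC ID 5
         * with the Intel vendor ID identifies SIOV support.
         */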
5529         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5530         while (pos) {
5531                 pci_read_config_word(pdev, pos + 4, &vendor);
5532                 pci_read_config_word(pdev, pos + 8, &id);
5533                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5534                         return pos;
5535
5536                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5537         }
5538
5539         return 0;
5540 }
5541
5542 static bool
5543 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5544 {
5545         if (feat == IOMMU_DEV_FEAT_AUX) {
5546                 int ret;
5547
5548                 if (!dev_is_pci(dev) || dmar_disabled ||
5549                     !scalable_mode_support() || !iommu_pasid_support())
5550                         return false;
5551
5552                 ret = pci_pasid_features(to_pci_dev(dev));
5553                 if (ret < 0)
5554                         return false;
5555
5556                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5557         }
5558
5559         return false;
5560 }
5561
5562 static int
5563 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5564 {
5565         if (feat == IOMMU_DEV_FEAT_AUX)
5566                 return intel_iommu_enable_auxd(dev);
5567
5568         return -ENODEV;
5569 }
5570
5571 static int
5572 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5573 {
5574         if (feat == IOMMU_DEV_FEAT_AUX)
5575                 return intel_iommu_disable_auxd(dev);
5576
5577         return -ENODEV;
5578 }
5579
5580 static bool
5581 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5582 {
5583         struct device_domain_info *info = dev->archdata.iommu;
5584
5585         if (feat == IOMMU_DEV_FEAT_AUX)
5586                 return scalable_mode_support() && info && info->auxd_enabled;
5587
5588         return false;
5589 }
5590
5591 static int
5592 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5593 {
5594         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5595
5596         return dmar_domain->default_pasid > 0 ?
5597                         dmar_domain->default_pasid : -EINVAL;
5598 }
5599
5600 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5601                                            struct device *dev)
5602 {
5603         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5604 }
5605
5606 const struct iommu_ops intel_iommu_ops = {
5607         .capable                = intel_iommu_capable,
5608         .domain_alloc           = intel_iommu_domain_alloc,
5609         .domain_free            = intel_iommu_domain_free,
5610         .attach_dev             = intel_iommu_attach_device,
5611         .detach_dev             = intel_iommu_detach_device,
5612         .aux_attach_dev         = intel_iommu_aux_attach_device,
5613         .aux_detach_dev         = intel_iommu_aux_detach_device,
5614         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5615         .map                    = intel_iommu_map,
5616         .unmap                  = intel_iommu_unmap,
5617         .iova_to_phys           = intel_iommu_iova_to_phys,
5618         .add_device             = intel_iommu_add_device,
5619         .remove_device          = intel_iommu_remove_device,
5620         .get_resv_regions       = intel_iommu_get_resv_regions,
5621         .put_resv_regions       = intel_iommu_put_resv_regions,
5622         .apply_resv_region      = intel_iommu_apply_resv_region,
5623         .device_group           = pci_device_group,
5624         .dev_has_feat           = intel_iommu_dev_has_feat,
5625         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5626         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5627         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5628         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5629         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5630 };
5631
5632 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5633 {
5634         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5635         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5636         dmar_map_gfx = 0;
5637 }
5638
5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5646
5647 static void quirk_iommu_rwbf(struct pci_dev *dev)
5648 {
5649         /*
5650          * Mobile 4 Series Chipset neglects to set RWBF capability,
5651          * but needs it. Same seems to hold for the desktop versions.
5652          */
5653         pci_info(dev, "Forcing write-buffer flush capability\n");
5654         rwbf_quirk = 1;
5655 }
5656
5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5664
5665 #define GGC 0x52
5666 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5667 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5668 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5669 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5670 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5671 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5672 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5673 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5674
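/*
 * GGC is the Graphics Control register in the host bridge's config space on
 * these Ironlake-era parts; the *_VT_* fields above describe how much stolen
 * memory the BIOS reserved for the graphics VT-d (shadow GTT) engine. If none
 * was reserved, graphics DMA remapping cannot work.
 */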
5675 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5676 {
5677         unsigned short ggc;
5678
5679         if (pci_read_config_word(dev, GGC, &ggc))
5680                 return;
5681
5682         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5683                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5684                 dmar_map_gfx = 0;
5685         } else if (dmar_map_gfx) {
5686                 /* we have to ensure the gfx device is idle before we flush */
5687                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5688                 intel_iommu_strict = 1;
5689         }
5690 }
5691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5695
5696 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5697    ISOCH DMAR unit for the Azalia sound device, but not give it any
5698    TLB entries, which causes it to deadlock. Check for that.  We do
5699    this in a function called from init_dmars(), instead of in a PCI
5700    quirk, because we don't want to print the obnoxious "BIOS broken"
5701    message if VT-d is actually disabled.
5702 */
5703 static void __init check_tylersburg_isoch(void)
5704 {
5705         struct pci_dev *pdev;
5706         uint32_t vtisochctrl;
5707
5708         /* If there's no Azalia in the system anyway, forget it. */
5709         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5710         if (!pdev)
5711                 return;
5712         pci_dev_put(pdev);
5713
5714         /* System Management Registers. Might be hidden, in which case
5715            we can't do the sanity check. But that's OK, because the
5716            known-broken BIOSes _don't_ actually hide it, so far. */
5717         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5718         if (!pdev)
5719                 return;
5720
5721         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5722                 pci_dev_put(pdev);
5723                 return;
5724         }
5725
5726         pci_dev_put(pdev);
5727
5728         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5729         if (vtisochctrl & 1)
5730                 return;
5731
5732         /* Drop all bits other than the number of TLB entries */
5733         vtisochctrl &= 0x1c;
5734
5735         /* If we have the recommended number of TLB entries (16), fine. */
5736         if (vtisochctrl == 0x10)
5737                 return;
5738
5739         /* Zero TLB entries? You get to ride the short bus to school. */
5740         if (!vtisochctrl) {
5741                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5742                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5743                      dmi_get_system_info(DMI_BIOS_VENDOR),
5744                      dmi_get_system_info(DMI_BIOS_VERSION),
5745                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5746                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5747                 return;
5748         }
5749
5750         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5751                 vtisochctrl);
5752 }