Merge tag 'iommu-updates-v6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/joro...
[sfrench/cifs-2.6.git] / drivers / iommu / intel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
/*
 * Message prefixes: pr_fmt() prepends "DMAR: " to the format string, and
 * dev_fmt() is defined in terms of pr_fmt() so it expands to the same prefix.
 * They are defined before the #include lines below.
 */
#define pr_fmt(fmt)     "DMAR: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
/* The root table and a context table are each sized as one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers. @pdev is any expression yielding a
 * pointer with ->class (or ->vendor / ->device) members. The argument is
 * parenthesized in every helper so that expressions such as "p + 1" or
 * "cond ? a : b" expand as intended.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

/* Largest page frame number / address representable with @gaw address bits. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
/* Each page-table level decodes LEVEL_STRIDE bits of the page frame number. */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
69
/* Convert an AGAW encoding to a page-table level count (agaw + 2). */
static inline int agaw_to_level(int agaw)
{
	const int level = 2 + agaw;

	return level;
}
74
75 static inline int agaw_to_width(int agaw)
76 {
77         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79
80 static inline int width_to_agaw(int width)
81 {
82         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87         return (level - 1) * LEVEL_STRIDE;
88 }
89
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94
95 static inline u64 level_mask(int level)
96 {
97         return -1ULL << level_to_offset_bits(level);
98 }
99
100 static inline u64 level_size(int level)
101 {
102         return 1ULL << level_to_offset_bits(level);
103 }
104
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107         return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123         return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
/* Return the first VT-d page frame number backing struct page @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn_start(mm_pfn);
}
/* Return the first VT-d page frame number of the page holding address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
133
/* Forward declaration; the definition is not in this part of the file. */
static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set to 1 by the "intel_iommu=tboot_noforce" boot option. */
static int intel_iommu_tboot_noforce;
/* Set to 1 by the "intel_iommu=off" boot option. */
static int no_platform_optin;

/* Number of struct root_entry elements that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153         if (!(re->lo & 1))
154                 return 0;
155
156         return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165         if (!(re->hi & 1))
166                 return 0;
167
168         return re->hi & VTD_PAGE_MASK;
169 }
170
171 static inline void context_set_present(struct context_entry *context)
172 {
173         context->lo |= 1;
174 }
175
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178         context->lo &= (((u64)-1) << 2) | 1;
179 }
180
181 static inline void context_set_translation_type(struct context_entry *context,
182                                                 unsigned long value)
183 {
184         context->lo &= (((u64)-1) << 4) | 3;
185         context->lo |= (value & 3) << 2;
186 }
187
188 static inline void context_set_address_root(struct context_entry *context,
189                                             unsigned long value)
190 {
191         context->lo &= ~VTD_PAGE_MASK;
192         context->lo |= value & VTD_PAGE_MASK;
193 }
194
195 static inline void context_set_address_width(struct context_entry *context,
196                                              unsigned long value)
197 {
198         context->hi |= value & 7;
199 }
200
201 static inline void context_set_domain_id(struct context_entry *context,
202                                          unsigned long value)
203 {
204         context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209         context->lo |= CONTEXT_PASIDE;
210 }
211
212 static inline int context_domain_id(struct context_entry *c)
213 {
214         return((c->hi >> 8) & 0xffff);
215 }
216
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219         context->lo = 0;
220         context->hi = 0;
221 }
222
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225         if (!iommu->copied_tables)
226                 return false;
227
228         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
242
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
251
/*
 * One RMRR unit: the reserved address range [base_address, end_address]
 * and the device scope it applies to.
 * NOTE(review): presumably linked on dmar_rmrr_units through @list - confirm
 * against the code that registers these units.
 */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
260
/*
 * One ATSR unit and the device scope it applies to.
 * NOTE(review): presumably linked on dmar_atsr_units through @list - confirm.
 */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
268
/*
 * One SATC unit, the IOMMU it belongs to and the device scope it applies to.
 * NOTE(review): presumably linked on dmar_satc_units through @list - confirm.
 */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
277
/* List heads for the ATSR, RMRR and SATC units. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate @rmrr over every entry on dmar_rmrr_units (linked via ->list). */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* Forward declaration; the definition is not in this part of the file. */
static void intel_iommu_domain_free(struct iommu_domain *domain);

/*
 * Build-time defaults, overridable on the command line:
 * "intel_iommu=on"/"off" clear/set dmar_disabled and
 * "intel_iommu=sm_on"/"sm_off" set/clear intel_iommu_sm.
 */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Cleared by "intel_iommu=igfx_off". */
static int dmar_map_gfx = 1;
/* Cleared by "intel_iommu=sp_off"; when 0, superpage support reports 0. */
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

/* NOTE(review): these look like flag bits for iommu_identity_mapping - confirm. */
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

/* Tentative definitions; the initialized definitions are not in this part of the file. */
const struct iommu_ops intel_iommu_ops;
const struct iommu_dirty_ops intel_dirty_ops;
303
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316         u32 gsts;
317
318         gsts = readl(iommu->reg + DMAR_GSTS_REG);
319         if (gsts & DMA_GSTS_TES)
320                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322
323 static int __init intel_iommu_setup(char *str)
324 {
325         if (!str)
326                 return -EINVAL;
327
328         while (*str) {
329                 if (!strncmp(str, "on", 2)) {
330                         dmar_disabled = 0;
331                         pr_info("IOMMU enabled\n");
332                 } else if (!strncmp(str, "off", 3)) {
333                         dmar_disabled = 1;
334                         no_platform_optin = 1;
335                         pr_info("IOMMU disabled\n");
336                 } else if (!strncmp(str, "igfx_off", 8)) {
337                         dmar_map_gfx = 0;
338                         pr_info("Disable GFX device mapping\n");
339                 } else if (!strncmp(str, "forcedac", 8)) {
340                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341                         iommu_dma_forcedac = true;
342                 } else if (!strncmp(str, "strict", 6)) {
343                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344                         iommu_set_dma_strict();
345                 } else if (!strncmp(str, "sp_off", 6)) {
346                         pr_info("Disable supported super page\n");
347                         intel_iommu_superpage = 0;
348                 } else if (!strncmp(str, "sm_on", 5)) {
349                         pr_info("Enable scalable mode if hardware supports\n");
350                         intel_iommu_sm = 1;
351                 } else if (!strncmp(str, "sm_off", 6)) {
352                         pr_info("Scalable mode is disallowed\n");
353                         intel_iommu_sm = 0;
354                 } else if (!strncmp(str, "tboot_noforce", 13)) {
355                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356                         intel_iommu_tboot_noforce = 1;
357                 } else {
358                         pr_notice("Unknown option - '%s'\n", str);
359                 }
360
361                 str += strcspn(str, ",");
362                 while (*str == ',')
363                         str++;
364         }
365
366         return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
369
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372         struct page *page;
373         void *vaddr = NULL;
374
375         page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376         if (page)
377                 vaddr = page_address(page);
378         return vaddr;
379 }
380
/* Release the page at virtual address @vaddr with free_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
385
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392                                        unsigned long pfn)
393 {
394         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406         unsigned long fl_sagaw, sl_sagaw;
407
408         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409         sl_sagaw = cap_sagaw(iommu->cap);
410
411         /* Second level only. */
412         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413                 return sl_sagaw;
414
415         /* First level only. */
416         if (!ecap_slts(iommu->ecap))
417                 return fl_sagaw;
418
419         return fl_sagaw & sl_sagaw;
420 }
421
/*
 * Find the largest AGAW not exceeding width_to_agaw(@max_gaw) whose bit is
 * set in the IOMMU's SAGAW. Returns -1 when no such AGAW exists.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
435
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443
444 /*
445  * calculate agaw for each iommu.
446  * "SAGAW" may be different across iommus, use a default agaw, and
447  * get a supported less agaw for iommus that don't support the default agaw.
448  */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456         return sm_supported(iommu) ?
457                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462         struct iommu_domain_info *info;
463         struct dmar_drhd_unit *drhd;
464         struct intel_iommu *iommu;
465         bool found = false;
466         unsigned long i;
467
468         domain->iommu_coherency = true;
469         xa_for_each(&domain->iommu_array, i, info) {
470                 found = true;
471                 if (!iommu_paging_structure_coherency(info->iommu)) {
472                         domain->iommu_coherency = false;
473                         break;
474                 }
475         }
476         if (found)
477                 return;
478
479         /* No hardware attached; use lowest common denominator */
480         rcu_read_lock();
481         for_each_active_iommu(iommu, drhd) {
482                 if (!iommu_paging_structure_coherency(iommu)) {
483                         domain->iommu_coherency = false;
484                         break;
485                 }
486         }
487         rcu_read_unlock();
488 }
489
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491                                          struct intel_iommu *skip)
492 {
493         struct dmar_drhd_unit *drhd;
494         struct intel_iommu *iommu;
495         int mask = 0x3;
496
497         if (!intel_iommu_superpage)
498                 return 0;
499
500         /* set iommu_superpage to the smallest common denominator */
501         rcu_read_lock();
502         for_each_active_iommu(iommu, drhd) {
503                 if (iommu != skip) {
504                         if (domain && domain->use_first_level) {
505                                 if (!cap_fl1gp_support(iommu->cap))
506                                         mask = 0x1;
507                         } else {
508                                 mask &= cap_super_page_val(iommu->cap);
509                         }
510
511                         if (!mask)
512                                 break;
513                 }
514         }
515         rcu_read_unlock();
516
517         return fls(mask);
518 }
519
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522         struct device_domain_info *info;
523         int nid = NUMA_NO_NODE;
524         unsigned long flags;
525
526         spin_lock_irqsave(&domain->lock, flags);
527         list_for_each_entry(info, &domain->devices, link) {
528                 /*
529                  * There could possibly be multiple device numa nodes as devices
530                  * within the same domain may sit behind different IOMMUs. There
531                  * isn't perfect answer in such situation, so we select first
532                  * come first served policy.
533                  */
534                 nid = dev_to_node(info->dev);
535                 if (nid != NUMA_NO_NODE)
536                         break;
537         }
538         spin_unlock_irqrestore(&domain->lock, flags);
539
540         return nid;
541 }
542
/* Forward declaration; the definition is not in this part of the file. */
static void domain_update_iotlb(struct dmar_domain *domain);
544
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548         unsigned long bitmap = 0;
549
550         /*
551          * 1-level super page supports page size of 2MiB, 2-level super page
552          * supports page size of both 2MiB and 1GiB.
553          */
554         if (domain->iommu_superpage == 1)
555                 bitmap |= SZ_2M;
556         else if (domain->iommu_superpage == 2)
557                 bitmap |= SZ_2M | SZ_1G;
558
559         return bitmap;
560 }
561
562 /* Some capabilities may be different across iommus */
563 void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565         domain_update_iommu_coherency(domain);
566         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567
568         /*
569          * If RHSA is missing, we should default to the device numa domain
570          * as fall back.
571          */
572         if (domain->nid == NUMA_NO_NODE)
573                 domain->nid = domain_update_device_node(domain);
574
575         /*
576          * First-level translation restricts the input-address to a
577          * canonical address (i.e., address bits 63:N have the same
578          * value as address bit [N-1], where N is 48-bits with 4-level
579          * paging and 57-bits with 5-level paging). Hence, skip bit
580          * [N-1].
581          */
582         if (domain->use_first_level)
583                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584         else
585                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586
587         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588         domain_update_iotlb(domain);
589 }
590
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592                                          u8 devfn, int alloc)
593 {
594         struct root_entry *root = &iommu->root_entry[bus];
595         struct context_entry *context;
596         u64 *entry;
597
598         /*
599          * Except that the caller requested to allocate a new entry,
600          * returning a copied context entry makes no sense.
601          */
602         if (!alloc && context_copied(iommu, bus, devfn))
603                 return NULL;
604
605         entry = &root->lo;
606         if (sm_supported(iommu)) {
607                 if (devfn >= 0x80) {
608                         devfn -= 0x80;
609                         entry = &root->hi;
610                 }
611                 devfn *= 2;
612         }
613         if (*entry & 1)
614                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
615         else {
616                 unsigned long phy_addr;
617                 if (!alloc)
618                         return NULL;
619
620                 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621                 if (!context)
622                         return NULL;
623
624                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625                 phy_addr = virt_to_phys((void *)context);
626                 *entry = phy_addr | 1;
627                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
628         }
629         return &context[devfn];
630 }
631
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *                               sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643         struct pci_dev *pdev, *pbridge;
644
645         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646                 return false;
647
648         pdev = to_pci_dev(dev);
649         pbridge = to_pci_dev(bridge);
650
651         if (pbridge->subordinate &&
652             pbridge->subordinate->number <= pdev->bus->number &&
653             pbridge->subordinate->busn_res.end >= pdev->bus->number)
654                 return true;
655
656         return false;
657 }
658
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661         struct dmar_drhd_unit *drhd;
662         u32 vtbar;
663         int rc;
664
665         /* We know that this device on this chipset has its own IOMMU.
666          * If we find it under a different IOMMU, then the BIOS is lying
667          * to us. Hope that the IOMMU for this device is actually
668          * disabled, and it needs no translation...
669          */
670         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671         if (rc) {
672                 /* "can't" happen */
673                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674                 return false;
675         }
676         vtbar &= 0xffff0000;
677
678         /* we know that the this iommu should be at offset 0xa000 from vtbar */
679         drhd = dmar_find_matched_drhd_unit(pdev);
680         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683                 return true;
684         }
685
686         return false;
687 }
688
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691         if (!iommu || iommu->drhd->ignored)
692                 return true;
693
694         if (dev_is_pci(dev)) {
695                 struct pci_dev *pdev = to_pci_dev(dev);
696
697                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699                     quirk_ioat_snb_local_iommu(pdev))
700                         return true;
701         }
702
703         return false;
704 }
705
/*
 * Find the IOMMU that covers @dev and, optionally, the bus/devfn to use
 * for it.
 *
 * For a PCI device the lookup uses the real DMA device's physical function
 * and is restricted to DRHD units in the same PCI segment; for a non-PCI
 * device with an ACPI companion, the companion's struct device is used.
 * Each unit's device scope is searched for an exact match or for a bridge
 * that @dev sits below; a unit with include_all set matches any PCI device
 * in its segment.
 *
 * @bus and @devfn are written only when both are non-NULL. The values come
 * from the scope table for an exact non-VF match and from the PCI device
 * itself otherwise.
 *
 * Returns the IOMMU, or NULL when @dev is NULL, when nothing matches, or
 * when iommu_is_dummy() rejects the match. The walk runs under
 * rcu_read_lock().
 */
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* PCI devices can only match units in their own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
			/* Also the jump target for VF and behind-bridge matches above. */
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	/* Loop ran to completion without a match. */
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
775
776 static void domain_flush_cache(struct dmar_domain *domain,
777                                void *addr, int size)
778 {
779         if (!domain->iommu_coherency)
780                 clflush_cache_range(addr, size);
781 }
782
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785         struct context_entry *context;
786         int i;
787
788         if (!iommu->root_entry)
789                 return;
790
791         for (i = 0; i < ROOT_ENTRY_NR; i++) {
792                 context = iommu_context_addr(iommu, i, 0, 0);
793                 if (context)
794                         free_pgtable_page(context);
795
796                 if (!sm_supported(iommu))
797                         continue;
798
799                 context = iommu_context_addr(iommu, i, 0x80, 0);
800                 if (context)
801                         free_pgtable_page(context);
802         }
803
804         free_pgtable_page(iommu->root_entry);
805         iommu->root_entry = NULL;
806 }
807
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812         struct dma_pte *pte;
813         int offset;
814
815         while (1) {
816                 offset = pfn_level_offset(pfn, level);
817                 pte = &parent[offset];
818                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819                         pr_info("PTE not present at level %d\n", level);
820                         break;
821                 }
822
823                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824
825                 if (level == 1)
826                         break;
827
828                 parent = phys_to_virt(dma_pte_addr(pte));
829                 level--;
830         }
831 }
832
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834                           unsigned long long addr, u32 pasid)
835 {
836         struct pasid_dir_entry *dir, *pde;
837         struct pasid_entry *entries, *pte;
838         struct context_entry *ctx_entry;
839         struct root_entry *rt_entry;
840         int i, dir_index, index, level;
841         u8 devfn = source_id & 0xff;
842         u8 bus = source_id >> 8;
843         struct dma_pte *pgtable;
844
845         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846
847         /* root entry dump */
848         rt_entry = &iommu->root_entry[bus];
849         if (!rt_entry) {
850                 pr_info("root table entry is not present\n");
851                 return;
852         }
853
854         if (sm_supported(iommu))
855                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856                         rt_entry->hi, rt_entry->lo);
857         else
858                 pr_info("root entry: 0x%016llx", rt_entry->lo);
859
860         /* context entry dump */
861         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862         if (!ctx_entry) {
863                 pr_info("context table entry is not present\n");
864                 return;
865         }
866
867         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868                 ctx_entry->hi, ctx_entry->lo);
869
870         /* legacy mode does not require PASID entries */
871         if (!sm_supported(iommu)) {
872                 level = agaw_to_level(ctx_entry->hi & 7);
873                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874                 goto pgtable_walk;
875         }
876
877         /* get the pointer to pasid directory entry */
878         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879         if (!dir) {
880                 pr_info("pasid directory entry is not present\n");
881                 return;
882         }
883         /* For request-without-pasid, get the pasid from context entry */
884         if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885                 pasid = IOMMU_NO_PASID;
886
887         dir_index = pasid >> PASID_PDE_SHIFT;
888         pde = &dir[dir_index];
889         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890
891         /* get the pointer to the pasid table entry */
892         entries = get_pasid_table_from_pde(pde);
893         if (!entries) {
894                 pr_info("pasid table entry is not present\n");
895                 return;
896         }
897         index = pasid & PASID_PTE_MASK;
898         pte = &entries[index];
899         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901
902         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905         } else {
906                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908         }
909
910 pgtable_walk:
911         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914
915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916                                       unsigned long pfn, int *target_level,
917                                       gfp_t gfp)
918 {
919         struct dma_pte *parent, *pte;
920         int level = agaw_to_level(domain->agaw);
921         int offset;
922
923         if (!domain_pfn_supported(domain, pfn))
924                 /* Address beyond IOMMU's addressing capabilities. */
925                 return NULL;
926
927         parent = domain->pgd;
928
929         while (1) {
930                 void *tmp_page;
931
932                 offset = pfn_level_offset(pfn, level);
933                 pte = &parent[offset];
934                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
935                         break;
936                 if (level == *target_level)
937                         break;
938
939                 if (!dma_pte_present(pte)) {
940                         uint64_t pteval;
941
942                         tmp_page = alloc_pgtable_page(domain->nid, gfp);
943
944                         if (!tmp_page)
945                                 return NULL;
946
947                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949                         if (domain->use_first_level)
950                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951
952                         if (cmpxchg64(&pte->val, 0ULL, pteval))
953                                 /* Someone else set it while we were thinking; use theirs. */
954                                 free_pgtable_page(tmp_page);
955                         else
956                                 domain_flush_cache(domain, pte, sizeof(*pte));
957                 }
958                 if (level == 1)
959                         break;
960
961                 parent = phys_to_virt(dma_pte_addr(pte));
962                 level--;
963         }
964
965         if (!*target_level)
966                 *target_level = level;
967
968         return pte;
969 }
970
971 /* return address's pte at specific level */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973                                          unsigned long pfn,
974                                          int level, int *large_page)
975 {
976         struct dma_pte *parent, *pte;
977         int total = agaw_to_level(domain->agaw);
978         int offset;
979
980         parent = domain->pgd;
981         while (level <= total) {
982                 offset = pfn_level_offset(pfn, total);
983                 pte = &parent[offset];
984                 if (level == total)
985                         return pte;
986
987                 if (!dma_pte_present(pte)) {
988                         *large_page = total;
989                         break;
990                 }
991
992                 if (dma_pte_superpage(pte)) {
993                         *large_page = total;
994                         return pte;
995                 }
996
997                 parent = phys_to_virt(dma_pte_addr(pte));
998                 total--;
999         }
1000         return NULL;
1001 }
1002
1003 /* clear last level pte, a tlb flush should be followed */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005                                 unsigned long start_pfn,
1006                                 unsigned long last_pfn)
1007 {
1008         unsigned int large_page;
1009         struct dma_pte *first_pte, *pte;
1010
1011         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012             WARN_ON(start_pfn > last_pfn))
1013                 return;
1014
1015         /* we don't need lock here; nobody else touches the iova range */
1016         do {
1017                 large_page = 1;
1018                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019                 if (!pte) {
1020                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021                         continue;
1022                 }
1023                 do {
1024                         dma_clear_pte(pte);
1025                         start_pfn += lvl_to_nr_pages(large_page);
1026                         pte++;
1027                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
1029                 domain_flush_cache(domain, first_pte,
1030                                    (void *)pte - (void *)first_pte);
1031
1032         } while (start_pfn && start_pfn <= last_pfn);
1033 }
1034
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036                                int retain_level, struct dma_pte *pte,
1037                                unsigned long pfn, unsigned long start_pfn,
1038                                unsigned long last_pfn)
1039 {
1040         pfn = max(start_pfn, pfn);
1041         pte = &pte[pfn_level_offset(pfn, level)];
1042
1043         do {
1044                 unsigned long level_pfn;
1045                 struct dma_pte *level_pte;
1046
1047                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048                         goto next;
1049
1050                 level_pfn = pfn & level_mask(level);
1051                 level_pte = phys_to_virt(dma_pte_addr(pte));
1052
1053                 if (level > 2) {
1054                         dma_pte_free_level(domain, level - 1, retain_level,
1055                                            level_pte, level_pfn, start_pfn,
1056                                            last_pfn);
1057                 }
1058
1059                 /*
1060                  * Free the page table if we're below the level we want to
1061                  * retain and the range covers the entire table.
1062                  */
1063                 if (level < retain_level && !(start_pfn > level_pfn ||
1064                       last_pfn < level_pfn + level_size(level) - 1)) {
1065                         dma_clear_pte(pte);
1066                         domain_flush_cache(domain, pte, sizeof(*pte));
1067                         free_pgtable_page(level_pte);
1068                 }
1069 next:
1070                 pfn += level_size(level);
1071         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 }
1073
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079                                    unsigned long start_pfn,
1080                                    unsigned long last_pfn,
1081                                    int retain_level)
1082 {
1083         dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085         /* We don't need lock here; nobody else touches the iova range */
1086         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087                            domain->pgd, 0, start_pfn, last_pfn);
1088
1089         /* free pgd */
1090         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091                 free_pgtable_page(domain->pgd);
1092                 domain->pgd = NULL;
1093         }
1094 }
1095
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103                                     int level, struct dma_pte *pte,
1104                                     struct list_head *freelist)
1105 {
1106         struct page *pg;
1107
1108         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109         list_add_tail(&pg->lru, freelist);
1110
1111         if (level == 1)
1112                 return;
1113
1114         pte = page_address(pg);
1115         do {
1116                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118                 pte++;
1119         } while (!first_pte_in_page(pte));
1120 }
1121
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123                                 struct dma_pte *pte, unsigned long pfn,
1124                                 unsigned long start_pfn, unsigned long last_pfn,
1125                                 struct list_head *freelist)
1126 {
1127         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129         pfn = max(start_pfn, pfn);
1130         pte = &pte[pfn_level_offset(pfn, level)];
1131
1132         do {
1133                 unsigned long level_pfn = pfn & level_mask(level);
1134
1135                 if (!dma_pte_present(pte))
1136                         goto next;
1137
1138                 /* If range covers entire pagetable, free it */
1139                 if (start_pfn <= level_pfn &&
1140                     last_pfn >= level_pfn + level_size(level) - 1) {
1141                         /* These suborbinate page tables are going away entirely. Don't
1142                            bother to clear them; we're just going to *free* them. */
1143                         if (level > 1 && !dma_pte_superpage(pte))
1144                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146                         dma_clear_pte(pte);
1147                         if (!first_pte)
1148                                 first_pte = pte;
1149                         last_pte = pte;
1150                 } else if (level > 1) {
1151                         /* Recurse down into a level that isn't *entirely* obsolete */
1152                         dma_pte_clear_level(domain, level - 1,
1153                                             phys_to_virt(dma_pte_addr(pte)),
1154                                             level_pfn, start_pfn, last_pfn,
1155                                             freelist);
1156                 }
1157 next:
1158                 pfn = level_pfn + level_size(level);
1159         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161         if (first_pte)
1162                 domain_flush_cache(domain, first_pte,
1163                                    (void *)++last_pte - (void *)first_pte);
1164 }
1165
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170                          unsigned long last_pfn, struct list_head *freelist)
1171 {
1172         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173             WARN_ON(start_pfn > last_pfn))
1174                 return;
1175
1176         /* we don't need lock here; nobody else touches the iova range */
1177         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1179
1180         /* free pgd */
1181         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182                 struct page *pgd_page = virt_to_page(domain->pgd);
1183                 list_add_tail(&pgd_page->lru, freelist);
1184                 domain->pgd = NULL;
1185         }
1186 }
1187
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191         struct root_entry *root;
1192
1193         root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194         if (!root) {
1195                 pr_err("Allocating root entry for %s failed\n",
1196                         iommu->name);
1197                 return -ENOMEM;
1198         }
1199
1200         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1201         iommu->root_entry = root;
1202
1203         return 0;
1204 }
1205
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208         u64 addr;
1209         u32 sts;
1210         unsigned long flag;
1211
1212         addr = virt_to_phys(iommu->root_entry);
1213         if (sm_supported(iommu))
1214                 addr |= DMA_RTADDR_SMT;
1215
1216         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218
1219         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure hardware complete it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_RTPS), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226
1227         /*
1228          * Hardware invalidates all DMA remapping hardware translation
1229          * caches as part of SRTP flow.
1230          */
1231         if (cap_esrtps(iommu->cap))
1232                 return;
1233
1234         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235         if (sm_supported(iommu))
1236                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242         u32 val;
1243         unsigned long flag;
1244
1245         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246                 return;
1247
1248         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250
1251         /* Make sure hardware complete it */
1252         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253                       readl, (!(val & DMA_GSTS_WBFS)), val);
1254
1255         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257
1258 /* return value determine if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260                                   u16 did, u16 source_id, u8 function_mask,
1261                                   u64 type)
1262 {
1263         u64 val = 0;
1264         unsigned long flag;
1265
1266         switch (type) {
1267         case DMA_CCMD_GLOBAL_INVL:
1268                 val = DMA_CCMD_GLOBAL_INVL;
1269                 break;
1270         case DMA_CCMD_DOMAIN_INVL:
1271                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272                 break;
1273         case DMA_CCMD_DEVICE_INVL:
1274                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276                 break;
1277         default:
1278                 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279                         iommu->name, type);
1280                 return;
1281         }
1282         val |= DMA_CCMD_ICC;
1283
1284         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287         /* Make sure hardware complete it */
1288         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 /* return value determine if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296                                 u64 addr, unsigned int size_order, u64 type)
1297 {
1298         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299         u64 val = 0, val_iva = 0;
1300         unsigned long flag;
1301
1302         switch (type) {
1303         case DMA_TLB_GLOBAL_FLUSH:
1304                 /* global flush doesn't need set IVA_REG */
1305                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306                 break;
1307         case DMA_TLB_DSI_FLUSH:
1308                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309                 break;
1310         case DMA_TLB_PSI_FLUSH:
1311                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312                 /* IH bit is passed in as part of address */
1313                 val_iva = size_order | addr;
1314                 break;
1315         default:
1316                 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317                         iommu->name, type);
1318                 return;
1319         }
1320
1321         if (cap_write_drain(iommu->cap))
1322                 val |= DMA_TLB_WRITE_DRAIN;
1323
1324         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325         /* Note: Only uses first TLB reg currently */
1326         if (val_iva)
1327                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330         /* Make sure hardware complete it */
1331         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336         /* check IOTLB invalidation granularity */
1337         if (DMA_TLB_IAIG(val) == 0)
1338                 pr_err("Flush IOTLB failed\n");
1339         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341                         (unsigned long long)DMA_TLB_IIRG(type),
1342                         (unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349         struct device_domain_info *info;
1350         unsigned long flags;
1351
1352         spin_lock_irqsave(&domain->lock, flags);
1353         list_for_each_entry(info, &domain->devices, link) {
1354                 if (info->iommu == iommu && info->bus == bus &&
1355                     info->devfn == devfn) {
1356                         spin_unlock_irqrestore(&domain->lock, flags);
1357                         return info;
1358                 }
1359         }
1360         spin_unlock_irqrestore(&domain->lock, flags);
1361
1362         return NULL;
1363 }
1364
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367         struct dev_pasid_info *dev_pasid;
1368         struct device_domain_info *info;
1369         bool has_iotlb_device = false;
1370         unsigned long flags;
1371
1372         spin_lock_irqsave(&domain->lock, flags);
1373         list_for_each_entry(info, &domain->devices, link) {
1374                 if (info->ats_enabled) {
1375                         has_iotlb_device = true;
1376                         break;
1377                 }
1378         }
1379
1380         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381                 info = dev_iommu_priv_get(dev_pasid->dev);
1382                 if (info->ats_enabled) {
1383                         has_iotlb_device = true;
1384                         break;
1385                 }
1386         }
1387         domain->has_iotlb_device = has_iotlb_device;
1388         spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400         if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401                 return false;
1402
1403         if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404                 return false;
1405
1406         return true;
1407 }
1408
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411         struct pci_dev *pdev;
1412
1413         if (!dev_is_pci(info->dev))
1414                 return;
1415
1416         pdev = to_pci_dev(info->dev);
1417
1418         /* The PCIe spec, in its wisdom, declares that the behaviour of
1419            the device if you enable PASID support after ATS support is
1420            undefined. So always enable PASID support on devices which
1421            have it, even if we can't yet know if we're ever going to
1422            use it. */
1423         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424                 info->pasid_enabled = 1;
1425
1426         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428                 info->ats_enabled = 1;
1429                 domain_update_iotlb(info->domain);
1430         }
1431 }
1432
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435         struct pci_dev *pdev;
1436
1437         if (!dev_is_pci(info->dev))
1438                 return;
1439
1440         pdev = to_pci_dev(info->dev);
1441
1442         if (info->ats_enabled) {
1443                 pci_disable_ats(pdev);
1444                 info->ats_enabled = 0;
1445                 domain_update_iotlb(info->domain);
1446         }
1447
1448         if (info->pasid_enabled) {
1449                 pci_disable_pasid(pdev);
1450                 info->pasid_enabled = 0;
1451         }
1452 }
1453
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455                                     u64 addr, unsigned int mask)
1456 {
1457         u16 sid, qdep;
1458
1459         if (!info || !info->ats_enabled)
1460                 return;
1461
1462         sid = info->bus << 8 | info->devfn;
1463         qdep = info->ats_qdep;
1464         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465                            qdep, addr, mask);
1466         quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470                                   u64 addr, unsigned mask)
1471 {
1472         struct dev_pasid_info *dev_pasid;
1473         struct device_domain_info *info;
1474         unsigned long flags;
1475
1476         if (!domain->has_iotlb_device)
1477                 return;
1478
1479         spin_lock_irqsave(&domain->lock, flags);
1480         list_for_each_entry(info, &domain->devices, link)
1481                 __iommu_flush_dev_iotlb(info, addr, mask);
1482
1483         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484                 info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486                 if (!info->ats_enabled)
1487                         continue;
1488
1489                 qi_flush_dev_iotlb_pasid(info->iommu,
1490                                          PCI_DEVID(info->bus, info->devfn),
1491                                          info->pfsid, dev_pasid->pasid,
1492                                          info->ats_qdep, addr,
1493                                          mask);
1494         }
1495         spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499                                      struct dmar_domain *domain, u64 addr,
1500                                      unsigned long npages, bool ih)
1501 {
1502         u16 did = domain_id_iommu(domain, iommu);
1503         struct dev_pasid_info *dev_pasid;
1504         unsigned long flags;
1505
1506         spin_lock_irqsave(&domain->lock, flags);
1507         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508                 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
1510         if (!list_empty(&domain->devices))
1511                 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512         spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516                                   struct dmar_domain *domain,
1517                                   unsigned long pfn, unsigned int pages,
1518                                   int ih, int map)
1519 {
1520         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521         unsigned int mask = ilog2(aligned_pages);
1522         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523         u16 did = domain_id_iommu(domain, iommu);
1524
1525         if (WARN_ON(!pages))
1526                 return;
1527
1528         if (ih)
1529                 ih = 1 << 6;
1530
1531         if (domain->use_first_level) {
1532                 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533         } else {
1534                 unsigned long bitmask = aligned_pages - 1;
1535
1536                 /*
1537                  * PSI masks the low order bits of the base address. If the
1538                  * address isn't aligned to the mask, then compute a mask value
1539                  * needed to ensure the target range is flushed.
1540                  */
1541                 if (unlikely(bitmask & pfn)) {
1542                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544                         /*
1545                          * Since end_pfn <= pfn + bitmask, the only way bits
1546                          * higher than bitmask can differ in pfn and end_pfn is
1547                          * by carrying. This means after masking out bitmask,
1548                          * high bits starting with the first set bit in
1549                          * shared_bits are all equal in both pfn and end_pfn.
1550                          */
1551                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553                 }
1554
1555                 /*
1556                  * Fallback to domain selective flush if no PSI support or
1557                  * the size is too big.
1558                  */
1559                 if (!cap_pgsel_inv(iommu->cap) ||
1560                     mask > cap_max_amask_val(iommu->cap))
1561                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562                                                         DMA_TLB_DSI_FLUSH);
1563                 else
1564                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565                                                         DMA_TLB_PSI_FLUSH);
1566         }
1567
1568         /*
1569          * In caching mode, changes of pages from non-present to present require
1570          * flush. However, device IOTLB doesn't need to be flushed in this case.
1571          */
1572         if (!cap_caching_mode(iommu->cap) || !map)
1573                 iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578                                         struct dmar_domain *domain,
1579                                         unsigned long pfn, unsigned int pages)
1580 {
1581         /*
1582          * It's a non-present to present mapping. Only flush if caching mode
1583          * and second level.
1584          */
1585         if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587         else
1588                 iommu_flush_write_buffer(iommu);
1589 }
1590
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594         struct iommu_domain_info *info;
1595         unsigned long idx;
1596
1597         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598                 struct intel_iommu *iommu = info->iommu;
1599                 u16 did = domain_id_iommu(dmar_domain, iommu);
1600
1601                 if (dmar_domain->use_first_level)
1602                         domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603                 else
1604                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605                                                  DMA_TLB_DSI_FLUSH);
1606
1607                 if (!cap_caching_mode(iommu->cap))
1608                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609         }
1610 }
1611
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614         u32 pmen;
1615         unsigned long flags;
1616
1617         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618                 return;
1619
1620         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622         pmen &= ~DMA_PMEN_EPM;
1623         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625         /* wait for the protected region status bit to clear */
1626         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flags;
1636
1637         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638         iommu->gcmd |= DMA_GCMD_TE;
1639         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
1641         /* Make sure hardware complete it */
1642         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643                       readl, (sts & DMA_GSTS_TES), sts);
1644
1645         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650         u32 sts;
1651         unsigned long flag;
1652
1653         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655                 return;
1656
1657         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658         iommu->gcmd &= ~DMA_GCMD_TE;
1659         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661         /* Make sure hardware complete it */
1662         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663                       readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670         u32 ndomains;
1671
1672         ndomains = cap_ndoms(iommu->cap);
1673         pr_debug("%s: Number of Domains supported <%d>\n",
1674                  iommu->name, ndomains);
1675
1676         spin_lock_init(&iommu->lock);
1677
1678         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679         if (!iommu->domain_ids)
1680                 return -ENOMEM;
1681
1682         /*
1683          * If Caching mode is set, then invalid translations are tagged
1684          * with domain-id 0, hence we need to pre-allocate it. We also
1685          * use domain-id 0 as a marker for non-allocated domain-id, so
1686          * make sure it is not used for a real domain.
1687          */
1688         set_bit(0, iommu->domain_ids);
1689
1690         /*
1691          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692          * entry for first-level or pass-through translation modes should
1693          * be programmed with a domain id different from those used for
1694          * second-level or nested translation. We reserve a domain id for
1695          * this purpose.
1696          */
1697         if (sm_supported(iommu))
1698                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
1700         return 0;
1701 }
1702
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705         if (!iommu->domain_ids)
1706                 return;
1707
1708         /*
1709          * All iommu domains must have been detached from the devices,
1710          * hence there should be no domain IDs in use.
1711          */
1712         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713                     > NUM_RESERVED_DID))
1714                 return;
1715
1716         if (iommu->gcmd & DMA_GCMD_TE)
1717                 iommu_disable_translation(iommu);
1718 }
1719
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         if (iommu->domain_ids) {
1723                 bitmap_free(iommu->domain_ids);
1724                 iommu->domain_ids = NULL;
1725         }
1726
1727         if (iommu->copied_tables) {
1728                 bitmap_free(iommu->copied_tables);
1729                 iommu->copied_tables = NULL;
1730         }
1731
1732         /* free context mapping */
1733         free_context_table(iommu);
1734
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736         if (pasid_supported(iommu)) {
1737                 if (ecap_prs(iommu->ecap))
1738                         intel_svm_finish_prq(iommu);
1739         }
1740 #endif
1741 }
1742
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749         /* Only SL is available in legacy mode */
1750         if (!scalable_mode_support())
1751                 return false;
1752
1753         /* Only level (either FL or SL) is available, just use it */
1754         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755                 return intel_cap_flts_sanity();
1756
1757         /* Both levels are available, decide it based on domain type */
1758         return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763         struct dmar_domain *domain;
1764
1765         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766         if (!domain)
1767                 return NULL;
1768
1769         domain->nid = NUMA_NO_NODE;
1770         if (first_level_by_default(type))
1771                 domain->use_first_level = true;
1772         domain->has_iotlb_device = false;
1773         INIT_LIST_HEAD(&domain->devices);
1774         INIT_LIST_HEAD(&domain->dev_pasids);
1775         spin_lock_init(&domain->lock);
1776         xa_init(&domain->iommu_array);
1777
1778         return domain;
1779 }
1780
1781 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1782 {
1783         struct iommu_domain_info *info, *curr;
1784         unsigned long ndomains;
1785         int num, ret = -ENOSPC;
1786
1787         info = kzalloc(sizeof(*info), GFP_KERNEL);
1788         if (!info)
1789                 return -ENOMEM;
1790
1791         spin_lock(&iommu->lock);
1792         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1793         if (curr) {
1794                 curr->refcnt++;
1795                 spin_unlock(&iommu->lock);
1796                 kfree(info);
1797                 return 0;
1798         }
1799
1800         ndomains = cap_ndoms(iommu->cap);
1801         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1802         if (num >= ndomains) {
1803                 pr_err("%s: No free domain ids\n", iommu->name);
1804                 goto err_unlock;
1805         }
1806
1807         set_bit(num, iommu->domain_ids);
1808         info->refcnt    = 1;
1809         info->did       = num;
1810         info->iommu     = iommu;
1811         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1812                           NULL, info, GFP_ATOMIC);
1813         if (curr) {
1814                 ret = xa_err(curr) ? : -EBUSY;
1815                 goto err_clear;
1816         }
1817         domain_update_iommu_cap(domain);
1818
1819         spin_unlock(&iommu->lock);
1820         return 0;
1821
1822 err_clear:
1823         clear_bit(info->did, iommu->domain_ids);
1824 err_unlock:
1825         spin_unlock(&iommu->lock);
1826         kfree(info);
1827         return ret;
1828 }
1829
1830 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1831 {
1832         struct iommu_domain_info *info;
1833
1834         spin_lock(&iommu->lock);
1835         info = xa_load(&domain->iommu_array, iommu->seq_id);
1836         if (--info->refcnt == 0) {
1837                 clear_bit(info->did, iommu->domain_ids);
1838                 xa_erase(&domain->iommu_array, iommu->seq_id);
1839                 domain->nid = NUMA_NO_NODE;
1840                 domain_update_iommu_cap(domain);
1841                 kfree(info);
1842         }
1843         spin_unlock(&iommu->lock);
1844 }
1845
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * k, capped at 64.
 *
 * Return: the adjusted width.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1859
1860 static void domain_exit(struct dmar_domain *domain)
1861 {
1862         if (domain->pgd) {
1863                 LIST_HEAD(freelist);
1864
1865                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1866                 put_pages_list(&freelist);
1867         }
1868
1869         if (WARN_ON(!list_empty(&domain->devices)))
1870                 return;
1871
1872         kfree(domain);
1873 }
1874
1875 /*
1876  * Get the PASID directory size for scalable mode context entry.
1877  * Value of X in the PDTS field of a scalable mode context entry
1878  * indicates PASID directory with 2^(X + 7) entries.
1879  */
1880 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1881 {
1882         unsigned long pds, max_pde;
1883
1884         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1885         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1886         if (pds < 7)
1887                 return 0;
1888
1889         return pds - 7;
1890 }
1891
1892 /*
1893  * Set the RID_PASID field of a scalable mode context entry. The
1894  * IOMMU hardware will use the PASID value set in this field for
1895  * DMA translations of DMA requests without PASID.
1896  */
1897 static inline void
1898 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1899 {
1900         context->hi |= pasid & ((1 << 20) - 1);
1901 }
1902
1903 /*
1904  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1905  * entry.
1906  */
1907 static inline void context_set_sm_dte(struct context_entry *context)
1908 {
1909         context->lo |= BIT_ULL(2);
1910 }
1911
1912 /*
1913  * Set the PRE(Page Request Enable) field of a scalable mode context
1914  * entry.
1915  */
1916 static inline void context_set_sm_pre(struct context_entry *context)
1917 {
1918         context->lo |= BIT_ULL(4);
1919 }
1920
/*
 * Encode a PASID directory size value for the context entry: keep the
 * low three bits of @pds and place them at bits 11:9.
 */
#define context_pdts(pds)	(((pds) & 7) << 9)
1923
1924 static int domain_context_mapping_one(struct dmar_domain *domain,
1925                                       struct intel_iommu *iommu,
1926                                       struct pasid_table *table,
1927                                       u8 bus, u8 devfn)
1928 {
1929         struct device_domain_info *info =
1930                         domain_lookup_dev_info(domain, iommu, bus, devfn);
1931         u16 did = domain_id_iommu(domain, iommu);
1932         int translation = CONTEXT_TT_MULTI_LEVEL;
1933         struct context_entry *context;
1934         int ret;
1935
1936         if (hw_pass_through && domain_type_is_si(domain))
1937                 translation = CONTEXT_TT_PASS_THROUGH;
1938
1939         pr_debug("Set context mapping for %02x:%02x.%d\n",
1940                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1941
1942         spin_lock(&iommu->lock);
1943         ret = -ENOMEM;
1944         context = iommu_context_addr(iommu, bus, devfn, 1);
1945         if (!context)
1946                 goto out_unlock;
1947
1948         ret = 0;
1949         if (context_present(context) && !context_copied(iommu, bus, devfn))
1950                 goto out_unlock;
1951
1952         /*
1953          * For kdump cases, old valid entries may be cached due to the
1954          * in-flight DMA and copied pgtable, but there is no unmapping
1955          * behaviour for them, thus we need an explicit cache flush for
1956          * the newly-mapped device. For kdump, at this point, the device
1957          * is supposed to finish reset at its driver probe stage, so no
1958          * in-flight DMA will exist, and we don't need to worry anymore
1959          * hereafter.
1960          */
1961         if (context_copied(iommu, bus, devfn)) {
1962                 u16 did_old = context_domain_id(context);
1963
1964                 if (did_old < cap_ndoms(iommu->cap)) {
1965                         iommu->flush.flush_context(iommu, did_old,
1966                                                    (((u16)bus) << 8) | devfn,
1967                                                    DMA_CCMD_MASK_NOBIT,
1968                                                    DMA_CCMD_DEVICE_INVL);
1969                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1970                                                  DMA_TLB_DSI_FLUSH);
1971                 }
1972
1973                 clear_context_copied(iommu, bus, devfn);
1974         }
1975
1976         context_clear_entry(context);
1977
1978         if (sm_supported(iommu)) {
1979                 unsigned long pds;
1980
1981                 /* Setup the PASID DIR pointer: */
1982                 pds = context_get_sm_pds(table);
1983                 context->lo = (u64)virt_to_phys(table->table) |
1984                                 context_pdts(pds);
1985
1986                 /* Setup the RID_PASID field: */
1987                 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1988
1989                 /*
1990                  * Setup the Device-TLB enable bit and Page request
1991                  * Enable bit:
1992                  */
1993                 if (info && info->ats_supported)
1994                         context_set_sm_dte(context);
1995                 if (info && info->pri_supported)
1996                         context_set_sm_pre(context);
1997                 if (info && info->pasid_supported)
1998                         context_set_pasid(context);
1999         } else {
2000                 struct dma_pte *pgd = domain->pgd;
2001                 int agaw;
2002
2003                 context_set_domain_id(context, did);
2004
2005                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2006                         /*
2007                          * Skip top levels of page tables for iommu which has
2008                          * less agaw than default. Unnecessary for PT mode.
2009                          */
2010                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2011                                 ret = -ENOMEM;
2012                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2013                                 if (!dma_pte_present(pgd))
2014                                         goto out_unlock;
2015                         }
2016
2017                         if (info && info->ats_supported)
2018                                 translation = CONTEXT_TT_DEV_IOTLB;
2019                         else
2020                                 translation = CONTEXT_TT_MULTI_LEVEL;
2021
2022                         context_set_address_root(context, virt_to_phys(pgd));
2023                         context_set_address_width(context, agaw);
2024                 } else {
2025                         /*
2026                          * In pass through mode, AW must be programmed to
2027                          * indicate the largest AGAW value supported by
2028                          * hardware. And ASR is ignored by hardware.
2029                          */
2030                         context_set_address_width(context, iommu->msagaw);
2031                 }
2032
2033                 context_set_translation_type(context, translation);
2034         }
2035
2036         context_set_fault_enable(context);
2037         context_set_present(context);
2038         if (!ecap_coherent(iommu->ecap))
2039                 clflush_cache_range(context, sizeof(*context));
2040
2041         /*
2042          * It's a non-present to present mapping. If hardware doesn't cache
2043          * non-present entry we only need to flush the write-buffer. If the
2044          * _does_ cache non-present entries, then it does so in the special
2045          * domain #0, which we have to flush:
2046          */
2047         if (cap_caching_mode(iommu->cap)) {
2048                 iommu->flush.flush_context(iommu, 0,
2049                                            (((u16)bus) << 8) | devfn,
2050                                            DMA_CCMD_MASK_NOBIT,
2051                                            DMA_CCMD_DEVICE_INVL);
2052                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2053         } else {
2054                 iommu_flush_write_buffer(iommu);
2055         }
2056
2057         ret = 0;
2058
2059 out_unlock:
2060         spin_unlock(&iommu->lock);
2061
2062         return ret;
2063 }
2064
/*
 * Arguments handed through pci_for_each_dma_alias() to
 * domain_context_mapping_cb(), which forwards them to
 * domain_context_mapping_one() for every DMA alias.
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;	/* domain being attached */
	struct intel_iommu *iommu;	/* IOMMU whose context table is programmed */
	struct pasid_table *table;	/* PASID table passed along for the device */
};
2070
2071 static int domain_context_mapping_cb(struct pci_dev *pdev,
2072                                      u16 alias, void *opaque)
2073 {
2074         struct domain_context_mapping_data *data = opaque;
2075
2076         return domain_context_mapping_one(data->domain, data->iommu,
2077                                           data->table, PCI_BUS_NUM(alias),
2078                                           alias & 0xff);
2079 }
2080
2081 static int
2082 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2083 {
2084         struct domain_context_mapping_data data;
2085         struct pasid_table *table;
2086         struct intel_iommu *iommu;
2087         u8 bus, devfn;
2088
2089         iommu = device_to_iommu(dev, &bus, &devfn);
2090         if (!iommu)
2091                 return -ENODEV;
2092
2093         table = intel_pasid_get_table(dev);
2094
2095         if (!dev_is_pci(dev))
2096                 return domain_context_mapping_one(domain, iommu, table,
2097                                                   bus, devfn);
2098
2099         data.domain = domain;
2100         data.iommu = iommu;
2101         data.table = table;
2102
2103         return pci_for_each_dma_alias(to_pci_dev(dev),
2104                                       &domain_context_mapping_cb, &data);
2105 }
2106
2107 /* Returns a number of VTD pages, but aligned to MM page size */
2108 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2109                                             size_t size)
2110 {
2111         host_addr &= ~PAGE_MASK;
2112         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2113 }
2114
2115 /* Return largest possible superpage level for a given mapping */
2116 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2117                                           unsigned long iov_pfn,
2118                                           unsigned long phy_pfn,
2119                                           unsigned long pages)
2120 {
2121         int support, level = 1;
2122         unsigned long pfnmerge;
2123
2124         support = domain->iommu_superpage;
2125
2126         /* To use a large page, the virtual *and* physical addresses
2127            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2128            of them will mean we have to use smaller pages. So just
2129            merge them and check both at once. */
2130         pfnmerge = iov_pfn | phy_pfn;
2131
2132         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2133                 pages >>= VTD_STRIDE_SHIFT;
2134                 if (!pages)
2135                         break;
2136                 pfnmerge >>= VTD_STRIDE_SHIFT;
2137                 level++;
2138                 support--;
2139         }
2140         return level;
2141 }
2142
2143 /*
2144  * Ensure that old small page tables are removed to make room for superpage(s).
2145  * We're going to add new large pages, so make sure we don't remove their parent
2146  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2147  */
2148 static void switch_to_super_page(struct dmar_domain *domain,
2149                                  unsigned long start_pfn,
2150                                  unsigned long end_pfn, int level)
2151 {
2152         unsigned long lvl_pages = lvl_to_nr_pages(level);
2153         struct iommu_domain_info *info;
2154         struct dma_pte *pte = NULL;
2155         unsigned long i;
2156
2157         while (start_pfn <= end_pfn) {
2158                 if (!pte)
2159                         pte = pfn_to_dma_pte(domain, start_pfn, &level,
2160                                              GFP_ATOMIC);
2161
2162                 if (dma_pte_present(pte)) {
2163                         dma_pte_free_pagetable(domain, start_pfn,
2164                                                start_pfn + lvl_pages - 1,
2165                                                level + 1);
2166
2167                         xa_for_each(&domain->iommu_array, i, info)
2168                                 iommu_flush_iotlb_psi(info->iommu, domain,
2169                                                       start_pfn, lvl_pages,
2170                                                       0, 0);
2171                 }
2172
2173                 pte++;
2174                 start_pfn += lvl_pages;
2175                 if (first_pte_in_page(pte))
2176                         pte = NULL;
2177         }
2178 }
2179
2180 static int
2181 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2182                  unsigned long phys_pfn, unsigned long nr_pages, int prot,
2183                  gfp_t gfp)
2184 {
2185         struct dma_pte *first_pte = NULL, *pte = NULL;
2186         unsigned int largepage_lvl = 0;
2187         unsigned long lvl_pages = 0;
2188         phys_addr_t pteval;
2189         u64 attr;
2190
2191         if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2192                 return -EINVAL;
2193
2194         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2195                 return -EINVAL;
2196
2197         if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2198                 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2199                 return -EINVAL;
2200         }
2201
2202         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2203         attr |= DMA_FL_PTE_PRESENT;
2204         if (domain->use_first_level) {
2205                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2206                 if (prot & DMA_PTE_WRITE)
2207                         attr |= DMA_FL_PTE_DIRTY;
2208         }
2209
2210         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2211
2212         while (nr_pages > 0) {
2213                 uint64_t tmp;
2214
2215                 if (!pte) {
2216                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2217                                         phys_pfn, nr_pages);
2218
2219                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2220                                              gfp);
2221                         if (!pte)
2222                                 return -ENOMEM;
2223                         first_pte = pte;
2224
2225                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2226
2227                         /* It is large page*/
2228                         if (largepage_lvl > 1) {
2229                                 unsigned long end_pfn;
2230                                 unsigned long pages_to_remove;
2231
2232                                 pteval |= DMA_PTE_LARGE_PAGE;
2233                                 pages_to_remove = min_t(unsigned long, nr_pages,
2234                                                         nr_pte_to_next_page(pte) * lvl_pages);
2235                                 end_pfn = iov_pfn + pages_to_remove - 1;
2236                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2237                         } else {
2238                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2239                         }
2240
2241                 }
2242                 /* We don't need lock here, nobody else
2243                  * touches the iova range
2244                  */
2245                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2246                 if (tmp) {
2247                         static int dumps = 5;
2248                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2249                                 iov_pfn, tmp, (unsigned long long)pteval);
2250                         if (dumps) {
2251                                 dumps--;
2252                                 debug_dma_dump_mappings(NULL);
2253                         }
2254                         WARN_ON(1);
2255                 }
2256
2257                 nr_pages -= lvl_pages;
2258                 iov_pfn += lvl_pages;
2259                 phys_pfn += lvl_pages;
2260                 pteval += lvl_pages * VTD_PAGE_SIZE;
2261
2262                 /* If the next PTE would be the first in a new page, then we
2263                  * need to flush the cache on the entries we've just written.
2264                  * And then we'll need to recalculate 'pte', so clear it and
2265                  * let it get set again in the if (!pte) block above.
2266                  *
2267                  * If we're done (!nr_pages) we need to flush the cache too.
2268                  *
2269                  * Also if we've been setting superpages, we may need to
2270                  * recalculate 'pte' and switch back to smaller pages for the
2271                  * end of the mapping, if the trailing size is not enough to
2272                  * use another superpage (i.e. nr_pages < lvl_pages).
2273                  */
2274                 pte++;
2275                 if (!nr_pages || first_pte_in_page(pte) ||
2276                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2277                         domain_flush_cache(domain, first_pte,
2278                                            (void *)pte - (void *)first_pte);
2279                         pte = NULL;
2280                 }
2281         }
2282
2283         return 0;
2284 }
2285
2286 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2287 {
2288         struct intel_iommu *iommu = info->iommu;
2289         struct context_entry *context;
2290         u16 did_old;
2291
2292         if (!iommu)
2293                 return;
2294
2295         spin_lock(&iommu->lock);
2296         context = iommu_context_addr(iommu, bus, devfn, 0);
2297         if (!context) {
2298                 spin_unlock(&iommu->lock);
2299                 return;
2300         }
2301
2302         if (sm_supported(iommu)) {
2303                 if (hw_pass_through && domain_type_is_si(info->domain))
2304                         did_old = FLPT_DEFAULT_DID;
2305                 else
2306                         did_old = domain_id_iommu(info->domain, iommu);
2307         } else {
2308                 did_old = context_domain_id(context);
2309         }
2310
2311         context_clear_entry(context);
2312         __iommu_flush_cache(iommu, context, sizeof(*context));
2313         spin_unlock(&iommu->lock);
2314         iommu->flush.flush_context(iommu,
2315                                    did_old,
2316                                    (((u16)bus) << 8) | devfn,
2317                                    DMA_CCMD_MASK_NOBIT,
2318                                    DMA_CCMD_DEVICE_INVL);
2319
2320         if (sm_supported(iommu))
2321                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2322
2323         iommu->flush.flush_iotlb(iommu,
2324                                  did_old,
2325                                  0,
2326                                  0,
2327                                  DMA_TLB_DSI_FLUSH);
2328
2329         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2330 }
2331
2332 static int domain_setup_first_level(struct intel_iommu *iommu,
2333                                     struct dmar_domain *domain,
2334                                     struct device *dev,
2335                                     u32 pasid)
2336 {
2337         struct dma_pte *pgd = domain->pgd;
2338         int agaw, level;
2339         int flags = 0;
2340
2341         /*
2342          * Skip top levels of page tables for iommu which has
2343          * less agaw than default. Unnecessary for PT mode.
2344          */
2345         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2346                 pgd = phys_to_virt(dma_pte_addr(pgd));
2347                 if (!dma_pte_present(pgd))
2348                         return -ENOMEM;
2349         }
2350
2351         level = agaw_to_level(agaw);
2352         if (level != 4 && level != 5)
2353                 return -EINVAL;
2354
2355         if (level == 5)
2356                 flags |= PASID_FLAG_FL5LP;
2357
2358         if (domain->force_snooping)
2359                 flags |= PASID_FLAG_PAGE_SNOOP;
2360
2361         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2362                                              domain_id_iommu(domain, iommu),
2363                                              flags);
2364 }
2365
2366 static bool dev_is_real_dma_subdevice(struct device *dev)
2367 {
2368         return dev && dev_is_pci(dev) &&
2369                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2370 }
2371
2372 static int iommu_domain_identity_map(struct dmar_domain *domain,
2373                                      unsigned long first_vpfn,
2374                                      unsigned long last_vpfn)
2375 {
2376         /*
2377          * RMRR range might have overlap with physical memory range,
2378          * clear it first
2379          */
2380         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2381
2382         return __domain_mapping(domain, first_vpfn,
2383                                 first_vpfn, last_vpfn - first_vpfn + 1,
2384                                 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2385 }
2386
2387 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2388
2389 static int __init si_domain_init(int hw)
2390 {
2391         struct dmar_rmrr_unit *rmrr;
2392         struct device *dev;
2393         int i, nid, ret;
2394
2395         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2396         if (!si_domain)
2397                 return -EFAULT;
2398
2399         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2400                 domain_exit(si_domain);
2401                 si_domain = NULL;
2402                 return -EFAULT;
2403         }
2404
2405         if (hw)
2406                 return 0;
2407
2408         for_each_online_node(nid) {
2409                 unsigned long start_pfn, end_pfn;
2410                 int i;
2411
2412                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2413                         ret = iommu_domain_identity_map(si_domain,
2414                                         mm_to_dma_pfn_start(start_pfn),
2415                                         mm_to_dma_pfn_end(end_pfn));
2416                         if (ret)
2417                                 return ret;
2418                 }
2419         }
2420
2421         /*
2422          * Identity map the RMRRs so that devices with RMRRs could also use
2423          * the si_domain.
2424          */
2425         for_each_rmrr_units(rmrr) {
2426                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2427                                           i, dev) {
2428                         unsigned long long start = rmrr->base_address;
2429                         unsigned long long end = rmrr->end_address;
2430
2431                         if (WARN_ON(end < start ||
2432                                     end >> agaw_to_width(si_domain->agaw)))
2433                                 continue;
2434
2435                         ret = iommu_domain_identity_map(si_domain,
2436                                         mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2437                                         mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2438                         if (ret)
2439                                 return ret;
2440                 }
2441         }
2442
2443         return 0;
2444 }
2445
2446 static int dmar_domain_attach_device(struct dmar_domain *domain,
2447                                      struct device *dev)
2448 {
2449         struct device_domain_info *info = dev_iommu_priv_get(dev);
2450         struct intel_iommu *iommu;
2451         unsigned long flags;
2452         u8 bus, devfn;
2453         int ret;
2454
2455         iommu = device_to_iommu(dev, &bus, &devfn);
2456         if (!iommu)
2457                 return -ENODEV;
2458
2459         ret = domain_attach_iommu(domain, iommu);
2460         if (ret)
2461                 return ret;
2462         info->domain = domain;
2463         spin_lock_irqsave(&domain->lock, flags);
2464         list_add(&info->link, &domain->devices);
2465         spin_unlock_irqrestore(&domain->lock, flags);
2466
2467         /* PASID table is mandatory for a PCI device in scalable mode. */
2468         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2469                 /* Setup the PASID entry for requests without PASID: */
2470                 if (hw_pass_through && domain_type_is_si(domain))
2471                         ret = intel_pasid_setup_pass_through(iommu, domain,
2472                                         dev, IOMMU_NO_PASID);
2473                 else if (domain->use_first_level)
2474                         ret = domain_setup_first_level(iommu, domain, dev,
2475                                         IOMMU_NO_PASID);
2476                 else
2477                         ret = intel_pasid_setup_second_level(iommu, domain,
2478                                         dev, IOMMU_NO_PASID);
2479                 if (ret) {
2480                         dev_err(dev, "Setup RID2PASID failed\n");
2481                         device_block_translation(dev);
2482                         return ret;
2483                 }
2484         }
2485
2486         ret = domain_context_mapping(domain, dev);
2487         if (ret) {
2488                 dev_err(dev, "Domain context map failed\n");
2489                 device_block_translation(dev);
2490                 return ret;
2491         }
2492
2493         iommu_enable_pci_caps(info);
2494
2495         return 0;
2496 }
2497
2498 /**
2499  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2500  * is relaxable (ie. is allowed to be not enforced under some conditions)
2501  * @dev: device handle
2502  *
2503  * We assume that PCI USB devices with RMRRs have them largely
2504  * for historical reasons and that the RMRR space is not actively used post
2505  * boot.  This exclusion may change if vendors begin to abuse it.
2506  *
2507  * The same exception is made for graphics devices, with the requirement that
2508  * any use of the RMRR regions will be torn down before assigning the device
2509  * to a guest.
2510  *
2511  * Return: true if the RMRR is relaxable, false otherwise
2512  */
2513 static bool device_rmrr_is_relaxable(struct device *dev)
2514 {
2515         struct pci_dev *pdev;
2516
2517         if (!dev_is_pci(dev))
2518                 return false;
2519
2520         pdev = to_pci_dev(dev);
2521         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2522                 return true;
2523         else
2524                 return false;
2525 }
2526
2527 /*
2528  * Return the required default domain type for a specific device.
2529  *
2530  * @dev: the device in query
2531  * @startup: true if this is during early boot
2532  *
2533  * Returns:
2534  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2535  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2536  *  - 0: both identity and dynamic domains work for this device
2537  */
2538 static int device_def_domain_type(struct device *dev)
2539 {
2540         if (dev_is_pci(dev)) {
2541                 struct pci_dev *pdev = to_pci_dev(dev);
2542
2543                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2544                         return IOMMU_DOMAIN_IDENTITY;
2545
2546                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2547                         return IOMMU_DOMAIN_IDENTITY;
2548         }
2549
2550         return 0;
2551 }
2552
2553 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2554 {
2555         /*
2556          * Start from the sane iommu hardware state.
2557          * If the queued invalidation is already initialized by us
2558          * (for example, while enabling interrupt-remapping) then
2559          * we got the things already rolling from a sane state.
2560          */
2561         if (!iommu->qi) {
2562                 /*
2563                  * Clear any previous faults.
2564                  */
2565                 dmar_fault(-1, iommu);
2566                 /*
2567                  * Disable queued invalidation if supported and already enabled
2568                  * before OS handover.
2569                  */
2570                 dmar_disable_qi(iommu);
2571         }
2572
2573         if (dmar_enable_qi(iommu)) {
2574                 /*
2575                  * Queued Invalidate not enabled, use Register Based Invalidate
2576                  */
2577                 iommu->flush.flush_context = __iommu_flush_context;
2578                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2579                 pr_info("%s: Using Register based invalidation\n",
2580                         iommu->name);
2581         } else {
2582                 iommu->flush.flush_context = qi_flush_context;
2583                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2584                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2585         }
2586 }
2587
2588 static int copy_context_table(struct intel_iommu *iommu,
2589                               struct root_entry *old_re,
2590                               struct context_entry **tbl,
2591                               int bus, bool ext)
2592 {
2593         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2594         struct context_entry *new_ce = NULL, ce;
2595         struct context_entry *old_ce = NULL;
2596         struct root_entry re;
2597         phys_addr_t old_ce_phys;
2598
2599         tbl_idx = ext ? bus * 2 : bus;
2600         memcpy(&re, old_re, sizeof(re));
2601
2602         for (devfn = 0; devfn < 256; devfn++) {
2603                 /* First calculate the correct index */
2604                 idx = (ext ? devfn * 2 : devfn) % 256;
2605
2606                 if (idx == 0) {
2607                         /* First save what we may have and clean up */
2608                         if (new_ce) {
2609                                 tbl[tbl_idx] = new_ce;
2610                                 __iommu_flush_cache(iommu, new_ce,
2611                                                     VTD_PAGE_SIZE);
2612                                 pos = 1;
2613                         }
2614
2615                         if (old_ce)
2616                                 memunmap(old_ce);
2617
2618                         ret = 0;
2619                         if (devfn < 0x80)
2620                                 old_ce_phys = root_entry_lctp(&re);
2621                         else
2622                                 old_ce_phys = root_entry_uctp(&re);
2623
2624                         if (!old_ce_phys) {
2625                                 if (ext && devfn == 0) {
2626                                         /* No LCTP, try UCTP */
2627                                         devfn = 0x7f;
2628                                         continue;
2629                                 } else {
2630                                         goto out;
2631                                 }
2632                         }
2633
2634                         ret = -ENOMEM;
2635                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2636                                         MEMREMAP_WB);
2637                         if (!old_ce)
2638                                 goto out;
2639
2640                         new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2641                         if (!new_ce)
2642                                 goto out_unmap;
2643
2644                         ret = 0;
2645                 }
2646
2647                 /* Now copy the context entry */
2648                 memcpy(&ce, old_ce + idx, sizeof(ce));
2649
2650                 if (!context_present(&ce))
2651                         continue;
2652
2653                 did = context_domain_id(&ce);
2654                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2655                         set_bit(did, iommu->domain_ids);
2656
2657                 set_context_copied(iommu, bus, devfn);
2658                 new_ce[idx] = ce;
2659         }
2660
2661         tbl[tbl_idx + pos] = new_ce;
2662
2663         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2664
2665 out_unmap:
2666         memunmap(old_ce);
2667
2668 out:
2669         return ret;
2670 }
2671
2672 static int copy_translation_tables(struct intel_iommu *iommu)
2673 {
2674         struct context_entry **ctxt_tbls;
2675         struct root_entry *old_rt;
2676         phys_addr_t old_rt_phys;
2677         int ctxt_table_entries;
2678         u64 rtaddr_reg;
2679         int bus, ret;
2680         bool new_ext, ext;
2681
2682         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2683         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2684         new_ext    = !!sm_supported(iommu);
2685
2686         /*
2687          * The RTT bit can only be changed when translation is disabled,
2688          * but disabling translation means to open a window for data
2689          * corruption. So bail out and don't copy anything if we would
2690          * have to change the bit.
2691          */
2692         if (new_ext != ext)
2693                 return -EINVAL;
2694
2695         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2696         if (!iommu->copied_tables)
2697                 return -ENOMEM;
2698
2699         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2700         if (!old_rt_phys)
2701                 return -EINVAL;
2702
2703         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2704         if (!old_rt)
2705                 return -ENOMEM;
2706
2707         /* This is too big for the stack - allocate it from slab */
2708         ctxt_table_entries = ext ? 512 : 256;
2709         ret = -ENOMEM;
2710         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2711         if (!ctxt_tbls)
2712                 goto out_unmap;
2713
2714         for (bus = 0; bus < 256; bus++) {
2715                 ret = copy_context_table(iommu, &old_rt[bus],
2716                                          ctxt_tbls, bus, ext);
2717                 if (ret) {
2718                         pr_err("%s: Failed to copy context table for bus %d\n",
2719                                 iommu->name, bus);
2720                         continue;
2721                 }
2722         }
2723
2724         spin_lock(&iommu->lock);
2725
2726         /* Context tables are copied, now write them to the root_entry table */
2727         for (bus = 0; bus < 256; bus++) {
2728                 int idx = ext ? bus * 2 : bus;
2729                 u64 val;
2730
2731                 if (ctxt_tbls[idx]) {
2732                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2733                         iommu->root_entry[bus].lo = val;
2734                 }
2735
2736                 if (!ext || !ctxt_tbls[idx + 1])
2737                         continue;
2738
2739                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2740                 iommu->root_entry[bus].hi = val;
2741         }
2742
2743         spin_unlock(&iommu->lock);
2744
2745         kfree(ctxt_tbls);
2746
2747         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2748
2749         ret = 0;
2750
2751 out_unmap:
2752         memunmap(old_rt);
2753
2754         return ret;
2755 }
2756
2757 static int __init init_dmars(void)
2758 {
2759         struct dmar_drhd_unit *drhd;
2760         struct intel_iommu *iommu;
2761         int ret;
2762
2763         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2764         if (ret)
2765                 goto free_iommu;
2766
2767         for_each_iommu(iommu, drhd) {
2768                 if (drhd->ignored) {
2769                         iommu_disable_translation(iommu);
2770                         continue;
2771                 }
2772
2773                 /*
2774                  * Find the max pasid size of all IOMMU's in the system.
2775                  * We need to ensure the system pasid table is no bigger
2776                  * than the smallest supported.
2777                  */
2778                 if (pasid_supported(iommu)) {
2779                         u32 temp = 2 << ecap_pss(iommu->ecap);
2780
2781                         intel_pasid_max_id = min_t(u32, temp,
2782                                                    intel_pasid_max_id);
2783                 }
2784
2785                 intel_iommu_init_qi(iommu);
2786
2787                 ret = iommu_init_domains(iommu);
2788                 if (ret)
2789                         goto free_iommu;
2790
2791                 init_translation_status(iommu);
2792
2793                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2794                         iommu_disable_translation(iommu);
2795                         clear_translation_pre_enabled(iommu);
2796                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2797                                 iommu->name);
2798                 }
2799
2800                 /*
2801                  * TBD:
2802                  * we could share the same root & context tables
2803                  * among all IOMMU's. Need to Split it later.
2804                  */
2805                 ret = iommu_alloc_root_entry(iommu);
2806                 if (ret)
2807                         goto free_iommu;
2808
2809                 if (translation_pre_enabled(iommu)) {
2810                         pr_info("Translation already enabled - trying to copy translation structures\n");
2811
2812                         ret = copy_translation_tables(iommu);
2813                         if (ret) {
2814                                 /*
2815                                  * We found the IOMMU with translation
2816                                  * enabled - but failed to copy over the
2817                                  * old root-entry table. Try to proceed
2818                                  * by disabling translation now and
2819                                  * allocating a clean root-entry table.
2820                                  * This might cause DMAR faults, but
2821                                  * probably the dump will still succeed.
2822                                  */
2823                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2824                                        iommu->name);
2825                                 iommu_disable_translation(iommu);
2826                                 clear_translation_pre_enabled(iommu);
2827                         } else {
2828                                 pr_info("Copied translation tables from previous kernel for %s\n",
2829                                         iommu->name);
2830                         }
2831                 }
2832
2833                 if (!ecap_pass_through(iommu->ecap))
2834                         hw_pass_through = 0;
2835                 intel_svm_check(iommu);
2836         }
2837
2838         /*
2839          * Now that qi is enabled on all iommus, set the root entry and flush
2840          * caches. This is required on some Intel X58 chipsets, otherwise the
2841          * flush_context function will loop forever and the boot hangs.
2842          */
2843         for_each_active_iommu(iommu, drhd) {
2844                 iommu_flush_write_buffer(iommu);
2845                 iommu_set_root_entry(iommu);
2846         }
2847
2848 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2849         dmar_map_gfx = 0;
2850 #endif
2851
2852         if (!dmar_map_gfx)
2853                 iommu_identity_mapping |= IDENTMAP_GFX;
2854
2855         check_tylersburg_isoch();
2856
2857         ret = si_domain_init(hw_pass_through);
2858         if (ret)
2859                 goto free_iommu;
2860
2861         /*
2862          * for each drhd
2863          *   enable fault log
2864          *   global invalidate context cache
2865          *   global invalidate iotlb
2866          *   enable translation
2867          */
2868         for_each_iommu(iommu, drhd) {
2869                 if (drhd->ignored) {
2870                         /*
2871                          * we always have to disable PMRs or DMA may fail on
2872                          * this device
2873                          */
2874                         if (force_on)
2875                                 iommu_disable_protect_mem_regions(iommu);
2876                         continue;
2877                 }
2878
2879                 iommu_flush_write_buffer(iommu);
2880
2881 #ifdef CONFIG_INTEL_IOMMU_SVM
2882                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2883                         /*
2884                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
2885                          * could cause possible lock race condition.
2886                          */
2887                         up_write(&dmar_global_lock);
2888                         ret = intel_svm_enable_prq(iommu);
2889                         down_write(&dmar_global_lock);
2890                         if (ret)
2891                                 goto free_iommu;
2892                 }
2893 #endif
2894                 ret = dmar_set_interrupt(iommu);
2895                 if (ret)
2896                         goto free_iommu;
2897         }
2898
2899         return 0;
2900
2901 free_iommu:
2902         for_each_active_iommu(iommu, drhd) {
2903                 disable_dmar_iommu(iommu);
2904                 free_dmar_iommu(iommu);
2905         }
2906         if (si_domain) {
2907                 domain_exit(si_domain);
2908                 si_domain = NULL;
2909         }
2910
2911         return ret;
2912 }
2913
2914 static void __init init_no_remapping_devices(void)
2915 {
2916         struct dmar_drhd_unit *drhd;
2917         struct device *dev;
2918         int i;
2919
2920         for_each_drhd_unit(drhd) {
2921                 if (!drhd->include_all) {
2922                         for_each_active_dev_scope(drhd->devices,
2923                                                   drhd->devices_cnt, i, dev)
2924                                 break;
2925                         /* ignore DMAR unit if no devices exist */
2926                         if (i == drhd->devices_cnt)
2927                                 drhd->ignored = 1;
2928                 }
2929         }
2930
2931         for_each_active_drhd_unit(drhd) {
2932                 if (drhd->include_all)
2933                         continue;
2934
2935                 for_each_active_dev_scope(drhd->devices,
2936                                           drhd->devices_cnt, i, dev)
2937                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2938                                 break;
2939                 if (i < drhd->devices_cnt)
2940                         continue;
2941
2942                 /* This IOMMU has *only* gfx devices. Either bypass it or
2943                    set the gfx_mapped flag, as appropriate */
2944                 drhd->gfx_dedicated = 1;
2945                 if (!dmar_map_gfx)
2946                         drhd->ignored = 1;
2947         }
2948 }
2949
2950 #ifdef CONFIG_SUSPEND
2951 static int init_iommu_hw(void)
2952 {
2953         struct dmar_drhd_unit *drhd;
2954         struct intel_iommu *iommu = NULL;
2955         int ret;
2956
2957         for_each_active_iommu(iommu, drhd) {
2958                 if (iommu->qi) {
2959                         ret = dmar_reenable_qi(iommu);
2960                         if (ret)
2961                                 return ret;
2962                 }
2963         }
2964
2965         for_each_iommu(iommu, drhd) {
2966                 if (drhd->ignored) {
2967                         /*
2968                          * we always have to disable PMRs or DMA may fail on
2969                          * this device
2970                          */
2971                         if (force_on)
2972                                 iommu_disable_protect_mem_regions(iommu);
2973                         continue;
2974                 }
2975
2976                 iommu_flush_write_buffer(iommu);
2977                 iommu_set_root_entry(iommu);
2978                 iommu_enable_translation(iommu);
2979                 iommu_disable_protect_mem_regions(iommu);
2980         }
2981
2982         return 0;
2983 }
2984
2985 static void iommu_flush_all(void)
2986 {
2987         struct dmar_drhd_unit *drhd;
2988         struct intel_iommu *iommu;
2989
2990         for_each_active_iommu(iommu, drhd) {
2991                 iommu->flush.flush_context(iommu, 0, 0, 0,
2992                                            DMA_CCMD_GLOBAL_INVL);
2993                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2994                                          DMA_TLB_GLOBAL_FLUSH);
2995         }
2996 }
2997
2998 static int iommu_suspend(void)
2999 {
3000         struct dmar_drhd_unit *drhd;
3001         struct intel_iommu *iommu = NULL;
3002         unsigned long flag;
3003
3004         iommu_flush_all();
3005
3006         for_each_active_iommu(iommu, drhd) {
3007                 iommu_disable_translation(iommu);
3008
3009                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3010
3011                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3012                         readl(iommu->reg + DMAR_FECTL_REG);
3013                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3014                         readl(iommu->reg + DMAR_FEDATA_REG);
3015                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3016                         readl(iommu->reg + DMAR_FEADDR_REG);
3017                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3018                         readl(iommu->reg + DMAR_FEUADDR_REG);
3019
3020                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3021         }
3022         return 0;
3023 }
3024
3025 static void iommu_resume(void)
3026 {
3027         struct dmar_drhd_unit *drhd;
3028         struct intel_iommu *iommu = NULL;
3029         unsigned long flag;
3030
3031         if (init_iommu_hw()) {
3032                 if (force_on)
3033                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3034                 else
3035                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3036                 return;
3037         }
3038
3039         for_each_active_iommu(iommu, drhd) {
3040
3041                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3042
3043                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3044                         iommu->reg + DMAR_FECTL_REG);
3045                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3046                         iommu->reg + DMAR_FEDATA_REG);
3047                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3048                         iommu->reg + DMAR_FEADDR_REG);
3049                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3050                         iommu->reg + DMAR_FEUADDR_REG);
3051
3052                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3053         }
3054 }
3055
3056 static struct syscore_ops iommu_syscore_ops = {
3057         .resume         = iommu_resume,
3058         .suspend        = iommu_suspend,
3059 };
3060
3061 static void __init init_iommu_pm_ops(void)
3062 {
3063         register_syscore_ops(&iommu_syscore_ops);
3064 }
3065
3066 #else
3067 static inline void init_iommu_pm_ops(void) {}
3068 #endif  /* CONFIG_PM */
3069
3070 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3071 {
3072         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3073             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3074             rmrr->end_address <= rmrr->base_address ||
3075             arch_rmrr_sanity_check(rmrr))
3076                 return -EINVAL;
3077
3078         return 0;
3079 }
3080
3081 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3082 {
3083         struct acpi_dmar_reserved_memory *rmrr;
3084         struct dmar_rmrr_unit *rmrru;
3085
3086         rmrr = (struct acpi_dmar_reserved_memory *)header;
3087         if (rmrr_sanity_check(rmrr)) {
3088                 pr_warn(FW_BUG
3089                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3090                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3091                            rmrr->base_address, rmrr->end_address,
3092                            dmi_get_system_info(DMI_BIOS_VENDOR),
3093                            dmi_get_system_info(DMI_BIOS_VERSION),
3094                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3095                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3096         }
3097
3098         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3099         if (!rmrru)
3100                 goto out;
3101
3102         rmrru->hdr = header;
3103
3104         rmrru->base_address = rmrr->base_address;
3105         rmrru->end_address = rmrr->end_address;
3106
3107         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3108                                 ((void *)rmrr) + rmrr->header.length,
3109                                 &rmrru->devices_cnt);
3110         if (rmrru->devices_cnt && rmrru->devices == NULL)
3111                 goto free_rmrru;
3112
3113         list_add(&rmrru->list, &dmar_rmrr_units);
3114
3115         return 0;
3116 free_rmrru:
3117         kfree(rmrru);
3118 out:
3119         return -ENOMEM;
3120 }
3121
3122 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3123 {
3124         struct dmar_atsr_unit *atsru;
3125         struct acpi_dmar_atsr *tmp;
3126
3127         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3128                                 dmar_rcu_check()) {
3129                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3130                 if (atsr->segment != tmp->segment)
3131                         continue;
3132                 if (atsr->header.length != tmp->header.length)
3133                         continue;
3134                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3135                         return atsru;
3136         }
3137
3138         return NULL;
3139 }
3140
3141 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3142 {
3143         struct acpi_dmar_atsr *atsr;
3144         struct dmar_atsr_unit *atsru;
3145
3146         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3147                 return 0;
3148
3149         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3150         atsru = dmar_find_atsr(atsr);
3151         if (atsru)
3152                 return 0;
3153
3154         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3155         if (!atsru)
3156                 return -ENOMEM;
3157
3158         /*
3159          * If memory is allocated from slab by ACPI _DSM method, we need to
3160          * copy the memory content because the memory buffer will be freed
3161          * on return.
3162          */
3163         atsru->hdr = (void *)(atsru + 1);
3164         memcpy(atsru->hdr, hdr, hdr->length);
3165         atsru->include_all = atsr->flags & 0x1;
3166         if (!atsru->include_all) {
3167                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3168                                 (void *)atsr + atsr->header.length,
3169                                 &atsru->devices_cnt);
3170                 if (atsru->devices_cnt && atsru->devices == NULL) {
3171                         kfree(atsru);
3172                         return -ENOMEM;
3173                 }
3174         }
3175
3176         list_add_rcu(&atsru->list, &dmar_atsr_units);
3177
3178         return 0;
3179 }
3180
3181 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3182 {
3183         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3184         kfree(atsru);
3185 }
3186
3187 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3188 {
3189         struct acpi_dmar_atsr *atsr;
3190         struct dmar_atsr_unit *atsru;
3191
3192         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3193         atsru = dmar_find_atsr(atsr);
3194         if (atsru) {
3195                 list_del_rcu(&atsru->list);
3196                 synchronize_rcu();
3197                 intel_iommu_free_atsr(atsru);
3198         }
3199
3200         return 0;
3201 }
3202
3203 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3204 {
3205         int i;
3206         struct device *dev;
3207         struct acpi_dmar_atsr *atsr;
3208         struct dmar_atsr_unit *atsru;
3209
3210         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3211         atsru = dmar_find_atsr(atsr);
3212         if (!atsru)
3213                 return 0;
3214
3215         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3216                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3217                                           i, dev)
3218                         return -EBUSY;
3219         }
3220
3221         return 0;
3222 }
3223
3224 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3225 {
3226         struct dmar_satc_unit *satcu;
3227         struct acpi_dmar_satc *tmp;
3228
3229         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3230                                 dmar_rcu_check()) {
3231                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3232                 if (satc->segment != tmp->segment)
3233                         continue;
3234                 if (satc->header.length != tmp->header.length)
3235                         continue;
3236                 if (memcmp(satc, tmp, satc->header.length) == 0)
3237                         return satcu;
3238         }
3239
3240         return NULL;
3241 }
3242
3243 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3244 {
3245         struct acpi_dmar_satc *satc;
3246         struct dmar_satc_unit *satcu;
3247
3248         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3249                 return 0;
3250
3251         satc = container_of(hdr, struct acpi_dmar_satc, header);
3252         satcu = dmar_find_satc(satc);
3253         if (satcu)
3254                 return 0;
3255
3256         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3257         if (!satcu)
3258                 return -ENOMEM;
3259
3260         satcu->hdr = (void *)(satcu + 1);
3261         memcpy(satcu->hdr, hdr, hdr->length);
3262         satcu->atc_required = satc->flags & 0x1;
3263         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3264                                               (void *)satc + satc->header.length,
3265                                               &satcu->devices_cnt);
3266         if (satcu->devices_cnt && !satcu->devices) {
3267                 kfree(satcu);
3268                 return -ENOMEM;
3269         }
3270         list_add_rcu(&satcu->list, &dmar_satc_units);
3271
3272         return 0;
3273 }
3274
3275 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3276 {
3277         int sp, ret;
3278         struct intel_iommu *iommu = dmaru->iommu;
3279
3280         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3281         if (ret)
3282                 goto out;
3283
3284         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3285                 pr_warn("%s: Doesn't support hardware pass through.\n",
3286                         iommu->name);
3287                 return -ENXIO;
3288         }
3289
3290         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3291         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3292                 pr_warn("%s: Doesn't support large page.\n",
3293                         iommu->name);
3294                 return -ENXIO;
3295         }
3296
3297         /*
3298          * Disable translation if already enabled prior to OS handover.
3299          */
3300         if (iommu->gcmd & DMA_GCMD_TE)
3301                 iommu_disable_translation(iommu);
3302
3303         ret = iommu_init_domains(iommu);
3304         if (ret == 0)
3305                 ret = iommu_alloc_root_entry(iommu);
3306         if (ret)
3307                 goto out;
3308
3309         intel_svm_check(iommu);
3310
3311         if (dmaru->ignored) {
3312                 /*
3313                  * we always have to disable PMRs or DMA may fail on this device
3314                  */
3315                 if (force_on)
3316                         iommu_disable_protect_mem_regions(iommu);
3317                 return 0;
3318         }
3319
3320         intel_iommu_init_qi(iommu);
3321         iommu_flush_write_buffer(iommu);
3322
3323 #ifdef CONFIG_INTEL_IOMMU_SVM
3324         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3325                 ret = intel_svm_enable_prq(iommu);
3326                 if (ret)
3327                         goto disable_iommu;
3328         }
3329 #endif
3330         ret = dmar_set_interrupt(iommu);
3331         if (ret)
3332                 goto disable_iommu;
3333
3334         iommu_set_root_entry(iommu);
3335         iommu_enable_translation(iommu);
3336
3337         iommu_disable_protect_mem_regions(iommu);
3338         return 0;
3339
3340 disable_iommu:
3341         disable_dmar_iommu(iommu);
3342 out:
3343         free_dmar_iommu(iommu);
3344         return ret;
3345 }
3346
3347 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3348 {
3349         int ret = 0;
3350         struct intel_iommu *iommu = dmaru->iommu;
3351
3352         if (!intel_iommu_enabled)
3353                 return 0;
3354         if (iommu == NULL)
3355                 return -EINVAL;
3356
3357         if (insert) {
3358                 ret = intel_iommu_add(dmaru);
3359         } else {
3360                 disable_dmar_iommu(iommu);
3361                 free_dmar_iommu(iommu);
3362         }
3363
3364         return ret;
3365 }
3366
3367 static void intel_iommu_free_dmars(void)
3368 {
3369         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3370         struct dmar_atsr_unit *atsru, *atsr_n;
3371         struct dmar_satc_unit *satcu, *satc_n;
3372
3373         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3374                 list_del(&rmrru->list);
3375                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3376                 kfree(rmrru);
3377         }
3378
3379         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3380                 list_del(&atsru->list);
3381                 intel_iommu_free_atsr(atsru);
3382         }
3383         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3384                 list_del(&satcu->list);
3385                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3386                 kfree(satcu);
3387         }
3388 }
3389
3390 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3391 {
3392         struct dmar_satc_unit *satcu;
3393         struct acpi_dmar_satc *satc;
3394         struct device *tmp;
3395         int i;
3396
3397         dev = pci_physfn(dev);
3398         rcu_read_lock();
3399
3400         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3401                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3402                 if (satc->segment != pci_domain_nr(dev->bus))
3403                         continue;
3404                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3405                         if (to_pci_dev(tmp) == dev)
3406                                 goto out;
3407         }
3408         satcu = NULL;
3409 out:
3410         rcu_read_unlock();
3411         return satcu;
3412 }
3413
3414 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3415 {
3416         int i, ret = 1;
3417         struct pci_bus *bus;
3418         struct pci_dev *bridge = NULL;
3419         struct device *tmp;
3420         struct acpi_dmar_atsr *atsr;
3421         struct dmar_atsr_unit *atsru;
3422         struct dmar_satc_unit *satcu;
3423
3424         dev = pci_physfn(dev);
3425         satcu = dmar_find_matched_satc_unit(dev);
3426         if (satcu)
3427                 /*
3428                  * This device supports ATS as it is in SATC table.
3429                  * When IOMMU is in legacy mode, enabling ATS is done
3430                  * automatically by HW for the device that requires
3431                  * ATS, hence OS should not enable this device ATS
3432                  * to avoid duplicated TLB invalidation.
3433                  */
3434                 return !(satcu->atc_required && !sm_supported(iommu));
3435
3436         for (bus = dev->bus; bus; bus = bus->parent) {
3437                 bridge = bus->self;
3438                 /* If it's an integrated device, allow ATS */
3439                 if (!bridge)
3440                         return 1;
3441                 /* Connected via non-PCIe: no ATS */
3442                 if (!pci_is_pcie(bridge) ||
3443                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3444                         return 0;
3445                 /* If we found the root port, look it up in the ATSR */
3446                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3447                         break;
3448         }
3449
3450         rcu_read_lock();
3451         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3452                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3453                 if (atsr->segment != pci_domain_nr(dev->bus))
3454                         continue;
3455
3456                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3457                         if (tmp == &bridge->dev)
3458                                 goto out;
3459
3460                 if (atsru->include_all)
3461                         goto out;
3462         }
3463         ret = 0;
3464 out:
3465         rcu_read_unlock();
3466
3467         return ret;
3468 }
3469
3470 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3471 {
3472         int ret;
3473         struct dmar_rmrr_unit *rmrru;
3474         struct dmar_atsr_unit *atsru;
3475         struct dmar_satc_unit *satcu;
3476         struct acpi_dmar_atsr *atsr;
3477         struct acpi_dmar_reserved_memory *rmrr;
3478         struct acpi_dmar_satc *satc;
3479
3480         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3481                 return 0;
3482
3483         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3484                 rmrr = container_of(rmrru->hdr,
3485                                     struct acpi_dmar_reserved_memory, header);
3486                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3487                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3488                                 ((void *)rmrr) + rmrr->header.length,
3489                                 rmrr->segment, rmrru->devices,
3490                                 rmrru->devices_cnt);
3491                         if (ret < 0)
3492                                 return ret;
3493                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3494                         dmar_remove_dev_scope(info, rmrr->segment,
3495                                 rmrru->devices, rmrru->devices_cnt);
3496                 }
3497         }
3498
3499         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3500                 if (atsru->include_all)
3501                         continue;
3502
3503                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3504                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3505                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3506                                         (void *)atsr + atsr->header.length,
3507                                         atsr->segment, atsru->devices,
3508                                         atsru->devices_cnt);
3509                         if (ret > 0)
3510                                 break;
3511                         else if (ret < 0)
3512                                 return ret;
3513                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3514                         if (dmar_remove_dev_scope(info, atsr->segment,
3515                                         atsru->devices, atsru->devices_cnt))
3516                                 break;
3517                 }
3518         }
3519         list_for_each_entry(satcu, &dmar_satc_units, list) {
3520                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3521                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3522                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3523                                         (void *)satc + satc->header.length,
3524                                         satc->segment, satcu->devices,
3525                                         satcu->devices_cnt);
3526                         if (ret > 0)
3527                                 break;
3528                         else if (ret < 0)
3529                                 return ret;
3530                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3531                         if (dmar_remove_dev_scope(info, satc->segment,
3532                                         satcu->devices, satcu->devices_cnt))
3533                                 break;
3534                 }
3535         }
3536
3537         return 0;
3538 }
3539
3540 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3541                                        unsigned long val, void *v)
3542 {
3543         struct memory_notify *mhp = v;
3544         unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3545         unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3546                         mhp->nr_pages - 1);
3547
3548         switch (val) {
3549         case MEM_GOING_ONLINE:
3550                 if (iommu_domain_identity_map(si_domain,
3551                                               start_vpfn, last_vpfn)) {
3552                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3553                                 start_vpfn, last_vpfn);
3554                         return NOTIFY_BAD;
3555                 }
3556                 break;
3557
3558         case MEM_OFFLINE:
3559         case MEM_CANCEL_ONLINE:
3560                 {
3561                         struct dmar_drhd_unit *drhd;
3562                         struct intel_iommu *iommu;
3563                         LIST_HEAD(freelist);
3564
3565                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3566
3567                         rcu_read_lock();
3568                         for_each_active_iommu(iommu, drhd)
3569                                 iommu_flush_iotlb_psi(iommu, si_domain,
3570                                         start_vpfn, mhp->nr_pages,
3571                                         list_empty(&freelist), 0);
3572                         rcu_read_unlock();
3573                         put_pages_list(&freelist);
3574                 }
3575                 break;
3576         }
3577
3578         return NOTIFY_OK;
3579 }
3580
3581 static struct notifier_block intel_iommu_memory_nb = {
3582         .notifier_call = intel_iommu_memory_notifier,
3583         .priority = 0
3584 };
3585
3586 static void intel_disable_iommus(void)
3587 {
3588         struct intel_iommu *iommu = NULL;
3589         struct dmar_drhd_unit *drhd;
3590
3591         for_each_iommu(iommu, drhd)
3592                 iommu_disable_translation(iommu);
3593 }
3594
3595 void intel_iommu_shutdown(void)
3596 {
3597         struct dmar_drhd_unit *drhd;
3598         struct intel_iommu *iommu = NULL;
3599
3600         if (no_iommu || dmar_disabled)
3601                 return;
3602
3603         down_write(&dmar_global_lock);
3604
3605         /* Disable PMRs explicitly here. */
3606         for_each_iommu(iommu, drhd)
3607                 iommu_disable_protect_mem_regions(iommu);
3608
3609         /* Make sure the IOMMUs are switched off */
3610         intel_disable_iommus();
3611
3612         up_write(&dmar_global_lock);
3613 }
3614
3615 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3616 {
3617         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3618
3619         return container_of(iommu_dev, struct intel_iommu, iommu);
3620 }
3621
3622 static ssize_t version_show(struct device *dev,
3623                             struct device_attribute *attr, char *buf)
3624 {
3625         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3626         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3627         return sysfs_emit(buf, "%d:%d\n",
3628                           DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3629 }
3630 static DEVICE_ATTR_RO(version);
3631
3632 static ssize_t address_show(struct device *dev,
3633                             struct device_attribute *attr, char *buf)
3634 {
3635         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3636         return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3637 }
3638 static DEVICE_ATTR_RO(address);
3639
3640 static ssize_t cap_show(struct device *dev,
3641                         struct device_attribute *attr, char *buf)
3642 {
3643         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3644         return sysfs_emit(buf, "%llx\n", iommu->cap);
3645 }
3646 static DEVICE_ATTR_RO(cap);
3647
3648 static ssize_t ecap_show(struct device *dev,
3649                          struct device_attribute *attr, char *buf)
3650 {
3651         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3652         return sysfs_emit(buf, "%llx\n", iommu->ecap);
3653 }
3654 static DEVICE_ATTR_RO(ecap);
3655
3656 static ssize_t domains_supported_show(struct device *dev,
3657                                       struct device_attribute *attr, char *buf)
3658 {
3659         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3660         return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3661 }
3662 static DEVICE_ATTR_RO(domains_supported);
3663
3664 static ssize_t domains_used_show(struct device *dev,
3665                                  struct device_attribute *attr, char *buf)
3666 {
3667         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3668         return sysfs_emit(buf, "%d\n",
3669                           bitmap_weight(iommu->domain_ids,
3670                                         cap_ndoms(iommu->cap)));
3671 }
3672 static DEVICE_ATTR_RO(domains_used);
3673
3674 static struct attribute *intel_iommu_attrs[] = {
3675         &dev_attr_version.attr,
3676         &dev_attr_address.attr,
3677         &dev_attr_cap.attr,
3678         &dev_attr_ecap.attr,
3679         &dev_attr_domains_supported.attr,
3680         &dev_attr_domains_used.attr,
3681         NULL,
3682 };
3683
3684 static struct attribute_group intel_iommu_group = {
3685         .name = "intel-iommu",
3686         .attrs = intel_iommu_attrs,
3687 };
3688
3689 const struct attribute_group *intel_iommu_groups[] = {
3690         &intel_iommu_group,
3691         NULL,
3692 };
3693
3694 static inline bool has_external_pci(void)
3695 {
3696         struct pci_dev *pdev = NULL;
3697
3698         for_each_pci_dev(pdev)
3699                 if (pdev->external_facing) {
3700                         pci_dev_put(pdev);
3701                         return true;
3702                 }
3703
3704         return false;
3705 }
3706
3707 static int __init platform_optin_force_iommu(void)
3708 {
3709         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3710                 return 0;
3711
3712         if (no_iommu || dmar_disabled)
3713                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3714
3715         /*
3716          * If Intel-IOMMU is disabled by default, we will apply identity
3717          * map for all devices except those marked as being untrusted.
3718          */
3719         if (dmar_disabled)
3720                 iommu_set_default_passthrough(false);
3721
3722         dmar_disabled = 0;
3723         no_iommu = 0;
3724
3725         return 1;
3726 }
3727
3728 static int __init probe_acpi_namespace_devices(void)
3729 {
3730         struct dmar_drhd_unit *drhd;
3731         /* To avoid a -Wunused-but-set-variable warning. */
3732         struct intel_iommu *iommu __maybe_unused;
3733         struct device *dev;
3734         int i, ret = 0;
3735
3736         for_each_active_iommu(iommu, drhd) {
3737                 for_each_active_dev_scope(drhd->devices,
3738                                           drhd->devices_cnt, i, dev) {
3739                         struct acpi_device_physical_node *pn;
3740                         struct acpi_device *adev;
3741
3742                         if (dev->bus != &acpi_bus_type)
3743                                 continue;
3744
3745                         adev = to_acpi_device(dev);
3746                         mutex_lock(&adev->physical_node_lock);
3747                         list_for_each_entry(pn,
3748                                             &adev->physical_node_list, node) {
3749                                 ret = iommu_probe_device(pn->dev);
3750                                 if (ret)
3751                                         break;
3752                         }
3753                         mutex_unlock(&adev->physical_node_lock);
3754
3755                         if (ret)
3756                                 return ret;
3757                 }
3758         }
3759
3760         return 0;
3761 }
3762
3763 static __init int tboot_force_iommu(void)
3764 {
3765         if (!tboot_enabled())
3766                 return 0;
3767
3768         if (no_iommu || dmar_disabled)
3769                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3770
3771         dmar_disabled = 0;
3772         no_iommu = 0;
3773
3774         return 1;
3775 }
3776
3777 int __init intel_iommu_init(void)
3778 {
3779         int ret = -ENODEV;
3780         struct dmar_drhd_unit *drhd;
3781         struct intel_iommu *iommu;
3782
3783         /*
3784          * Intel IOMMU is required for a TXT/tboot launch or platform
3785          * opt in, so enforce that.
3786          */
3787         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3788                     platform_optin_force_iommu();
3789
3790         down_write(&dmar_global_lock);
3791         if (dmar_table_init()) {
3792                 if (force_on)
3793                         panic("tboot: Failed to initialize DMAR table\n");
3794                 goto out_free_dmar;
3795         }
3796
3797         if (dmar_dev_scope_init() < 0) {
3798                 if (force_on)
3799                         panic("tboot: Failed to initialize DMAR device scope\n");
3800                 goto out_free_dmar;
3801         }
3802
3803         up_write(&dmar_global_lock);
3804
3805         /*
3806          * The bus notifier takes the dmar_global_lock, so lockdep will
3807          * complain later when we register it under the lock.
3808          */
3809         dmar_register_bus_notifier();
3810
3811         down_write(&dmar_global_lock);
3812
3813         if (!no_iommu)
3814                 intel_iommu_debugfs_init();
3815
3816         if (no_iommu || dmar_disabled) {
3817                 /*
3818                  * We exit the function here to ensure IOMMU's remapping and
3819                  * mempool aren't setup, which means that the IOMMU's PMRs
3820                  * won't be disabled via the call to init_dmars(). So disable
3821                  * it explicitly here. The PMRs were setup by tboot prior to
3822                  * calling SENTER, but the kernel is expected to reset/tear
3823                  * down the PMRs.
3824                  */
3825                 if (intel_iommu_tboot_noforce) {
3826                         for_each_iommu(iommu, drhd)
3827                                 iommu_disable_protect_mem_regions(iommu);
3828                 }
3829
3830                 /*
3831                  * Make sure the IOMMUs are switched off, even when we
3832                  * boot into a kexec kernel and the previous kernel left
3833                  * them enabled
3834                  */
3835                 intel_disable_iommus();
3836                 goto out_free_dmar;
3837         }
3838
3839         if (list_empty(&dmar_rmrr_units))
3840                 pr_info("No RMRR found\n");
3841
3842         if (list_empty(&dmar_atsr_units))
3843                 pr_info("No ATSR found\n");
3844
3845         if (list_empty(&dmar_satc_units))
3846                 pr_info("No SATC found\n");
3847
3848         init_no_remapping_devices();
3849
3850         ret = init_dmars();
3851         if (ret) {
3852                 if (force_on)
3853                         panic("tboot: Failed to initialize DMARs\n");
3854                 pr_err("Initialization failed\n");
3855                 goto out_free_dmar;
3856         }
3857         up_write(&dmar_global_lock);
3858
3859         init_iommu_pm_ops();
3860
3861         down_read(&dmar_global_lock);
3862         for_each_active_iommu(iommu, drhd) {
3863                 /*
3864                  * The flush queue implementation does not perform
3865                  * page-selective invalidations that are required for efficient
3866                  * TLB flushes in virtual environments.  The benefit of batching
3867                  * is likely to be much lower than the overhead of synchronizing
3868                  * the virtual and physical IOMMU page-tables.
3869                  */
3870                 if (cap_caching_mode(iommu->cap) &&
3871                     !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3872                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
3873                         iommu_set_dma_strict();
3874                 }
3875                 iommu_device_sysfs_add(&iommu->iommu, NULL,
3876                                        intel_iommu_groups,
3877                                        "%s", iommu->name);
3878                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3879
3880                 iommu_pmu_register(iommu);
3881         }
3882         up_read(&dmar_global_lock);
3883
3884         if (si_domain && !hw_pass_through)
3885                 register_memory_notifier(&intel_iommu_memory_nb);
3886
3887         down_read(&dmar_global_lock);
3888         if (probe_acpi_namespace_devices())
3889                 pr_warn("ACPI name space devices didn't probe correctly\n");
3890
3891         /* Finally, we enable the DMA remapping hardware. */
3892         for_each_iommu(iommu, drhd) {
3893                 if (!drhd->ignored && !translation_pre_enabled(iommu))
3894                         iommu_enable_translation(iommu);
3895
3896                 iommu_disable_protect_mem_regions(iommu);
3897         }
3898         up_read(&dmar_global_lock);
3899
3900         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3901
3902         intel_iommu_enabled = 1;
3903
3904         return 0;
3905
3906 out_free_dmar:
3907         intel_iommu_free_dmars();
3908         up_write(&dmar_global_lock);
3909         return ret;
3910 }
3911
3912 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3913 {
3914         struct device_domain_info *info = opaque;
3915
3916         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3917         return 0;
3918 }
3919
3920 /*
3921  * NB - intel-iommu lacks any sort of reference counting for the users of
3922  * dependent devices.  If multiple endpoints have intersecting dependent
3923  * devices, unbinding the driver from any one of them will possibly leave
3924  * the others unable to operate.
3925  */
3926 static void domain_context_clear(struct device_domain_info *info)
3927 {
3928         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3929                 return;
3930
3931         pci_for_each_dma_alias(to_pci_dev(info->dev),
3932                                &domain_context_clear_one_cb, info);
3933 }
3934
3935 static void dmar_remove_one_dev_info(struct device *dev)
3936 {
3937         struct device_domain_info *info = dev_iommu_priv_get(dev);
3938         struct dmar_domain *domain = info->domain;
3939         struct intel_iommu *iommu = info->iommu;
3940         unsigned long flags;
3941
3942         if (!dev_is_real_dma_subdevice(info->dev)) {
3943                 if (dev_is_pci(info->dev) && sm_supported(iommu))
3944                         intel_pasid_tear_down_entry(iommu, info->dev,
3945                                         IOMMU_NO_PASID, false);
3946
3947                 iommu_disable_pci_caps(info);
3948                 domain_context_clear(info);
3949         }
3950
3951         spin_lock_irqsave(&domain->lock, flags);
3952         list_del(&info->link);
3953         spin_unlock_irqrestore(&domain->lock, flags);
3954
3955         domain_detach_iommu(domain, iommu);
3956         info->domain = NULL;
3957 }
3958
3959 /*
3960  * Clear the page table pointer in context or pasid table entries so that
3961  * all DMA requests without PASID from the device are blocked. If the page
3962  * table has been set, clean up the data structures.
3963  */
3964 void device_block_translation(struct device *dev)
3965 {
3966         struct device_domain_info *info = dev_iommu_priv_get(dev);
3967         struct intel_iommu *iommu = info->iommu;
3968         unsigned long flags;
3969
3970         iommu_disable_pci_caps(info);
3971         if (!dev_is_real_dma_subdevice(dev)) {
3972                 if (sm_supported(iommu))
3973                         intel_pasid_tear_down_entry(iommu, dev,
3974                                                     IOMMU_NO_PASID, false);
3975                 else
3976                         domain_context_clear(info);
3977         }
3978
3979         if (!info->domain)
3980                 return;
3981
3982         spin_lock_irqsave(&info->domain->lock, flags);
3983         list_del(&info->link);
3984         spin_unlock_irqrestore(&info->domain->lock, flags);
3985
3986         domain_detach_iommu(info->domain, iommu);
3987         info->domain = NULL;
3988 }
3989
3990 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3991 {
3992         int adjust_width;
3993
3994         /* calculate AGAW */
3995         domain->gaw = guest_width;
3996         adjust_width = guestwidth_to_adjustwidth(guest_width);
3997         domain->agaw = width_to_agaw(adjust_width);
3998
3999         domain->iommu_coherency = false;
4000         domain->iommu_superpage = 0;
4001         domain->max_addr = 0;
4002
4003         /* always allocate the top pgd */
4004         domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4005         if (!domain->pgd)
4006                 return -ENOMEM;
4007         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4008         return 0;
4009 }
4010
4011 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4012                                       struct device *dev)
4013 {
4014         device_block_translation(dev);
4015         return 0;
4016 }
4017
4018 static struct iommu_domain blocking_domain = {
4019         .type = IOMMU_DOMAIN_BLOCKED,
4020         .ops = &(const struct iommu_domain_ops) {
4021                 .attach_dev     = blocking_domain_attach_dev,
4022         }
4023 };
4024
4025 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4026 {
4027         struct dmar_domain *dmar_domain;
4028         struct iommu_domain *domain;
4029
4030         switch (type) {
4031         case IOMMU_DOMAIN_DMA:
4032         case IOMMU_DOMAIN_UNMANAGED:
4033                 dmar_domain = alloc_domain(type);
4034                 if (!dmar_domain) {
4035                         pr_err("Can't allocate dmar_domain\n");
4036                         return NULL;
4037                 }
4038                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4039                         pr_err("Domain initialization failed\n");
4040                         domain_exit(dmar_domain);
4041                         return NULL;
4042                 }
4043
4044                 domain = &dmar_domain->domain;
4045                 domain->geometry.aperture_start = 0;
4046                 domain->geometry.aperture_end   =
4047                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4048                 domain->geometry.force_aperture = true;
4049
4050                 return domain;
4051         case IOMMU_DOMAIN_IDENTITY:
4052                 return &si_domain->domain;
4053         case IOMMU_DOMAIN_SVA:
4054                 return intel_svm_domain_alloc();
4055         default:
4056                 return NULL;
4057         }
4058
4059         return NULL;
4060 }
4061
4062 static struct iommu_domain *
4063 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
4064                               struct iommu_domain *parent,
4065                               const struct iommu_user_data *user_data)
4066 {
4067         struct device_domain_info *info = dev_iommu_priv_get(dev);
4068         bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
4069         bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
4070         struct intel_iommu *iommu = info->iommu;
4071         struct iommu_domain *domain;
4072
4073         /* Must be NESTING domain */
4074         if (parent) {
4075                 if (!nested_supported(iommu) || flags)
4076                         return ERR_PTR(-EOPNOTSUPP);
4077                 return intel_nested_domain_alloc(parent, user_data);
4078         }
4079
4080         if (flags &
4081             (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
4082                 return ERR_PTR(-EOPNOTSUPP);
4083         if (nested_parent && !nested_supported(iommu))
4084                 return ERR_PTR(-EOPNOTSUPP);
4085         if (user_data || (dirty_tracking && !ssads_supported(iommu)))
4086                 return ERR_PTR(-EOPNOTSUPP);
4087
4088         /*
4089          * domain_alloc_user op needs to fully initialize a domain before
4090          * return, so uses iommu_domain_alloc() here for simple.
4091          */
4092         domain = iommu_domain_alloc(dev->bus);
4093         if (!domain)
4094                 return ERR_PTR(-ENOMEM);
4095
4096         if (nested_parent)
4097                 to_dmar_domain(domain)->nested_parent = true;
4098
4099         if (dirty_tracking) {
4100                 if (to_dmar_domain(domain)->use_first_level) {
4101                         iommu_domain_free(domain);
4102                         return ERR_PTR(-EOPNOTSUPP);
4103                 }
4104                 domain->dirty_ops = &intel_dirty_ops;
4105         }
4106
4107         return domain;
4108 }
4109
4110 static void intel_iommu_domain_free(struct iommu_domain *domain)
4111 {
4112         if (domain != &si_domain->domain)
4113                 domain_exit(to_dmar_domain(domain));
4114 }
4115
4116 int prepare_domain_attach_device(struct iommu_domain *domain,
4117                                  struct device *dev)
4118 {
4119         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4120         struct intel_iommu *iommu;
4121         int addr_width;
4122
4123         iommu = device_to_iommu(dev, NULL, NULL);
4124         if (!iommu)
4125                 return -ENODEV;
4126
4127         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4128                 return -EINVAL;
4129
4130         if (domain->dirty_ops && !ssads_supported(iommu))
4131                 return -EINVAL;
4132
4133         /* check if this iommu agaw is sufficient for max mapped address */
4134         addr_width = agaw_to_width(iommu->agaw);
4135         if (addr_width > cap_mgaw(iommu->cap))
4136                 addr_width = cap_mgaw(iommu->cap);
4137
4138         if (dmar_domain->max_addr > (1LL << addr_width))
4139                 return -EINVAL;
4140         dmar_domain->gaw = addr_width;
4141
4142         /*
4143          * Knock out extra levels of page tables if necessary
4144          */
4145         while (iommu->agaw < dmar_domain->agaw) {
4146                 struct dma_pte *pte;
4147
4148                 pte = dmar_domain->pgd;
4149                 if (dma_pte_present(pte)) {
4150                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4151                         free_pgtable_page(pte);
4152                 }
4153                 dmar_domain->agaw--;
4154         }
4155
4156         return 0;
4157 }
4158
4159 static int intel_iommu_attach_device(struct iommu_domain *domain,
4160                                      struct device *dev)
4161 {
4162         struct device_domain_info *info = dev_iommu_priv_get(dev);
4163         int ret;
4164
4165         if (info->domain)
4166                 device_block_translation(dev);
4167
4168         ret = prepare_domain_attach_device(domain, dev);
4169         if (ret)
4170                 return ret;
4171
4172         return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4173 }
4174
4175 static int intel_iommu_map(struct iommu_domain *domain,
4176                            unsigned long iova, phys_addr_t hpa,
4177                            size_t size, int iommu_prot, gfp_t gfp)
4178 {
4179         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4180         u64 max_addr;
4181         int prot = 0;
4182
4183         if (iommu_prot & IOMMU_READ)
4184                 prot |= DMA_PTE_READ;
4185         if (iommu_prot & IOMMU_WRITE)
4186                 prot |= DMA_PTE_WRITE;
4187         if (dmar_domain->set_pte_snp)
4188                 prot |= DMA_PTE_SNP;
4189
4190         max_addr = iova + size;
4191         if (dmar_domain->max_addr < max_addr) {
4192                 u64 end;
4193
4194                 /* check if minimum agaw is sufficient for mapped address */
4195                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4196                 if (end < max_addr) {
4197                         pr_err("%s: iommu width (%d) is not "
4198                                "sufficient for the mapped address (%llx)\n",
4199                                __func__, dmar_domain->gaw, max_addr);
4200                         return -EFAULT;
4201                 }
4202                 dmar_domain->max_addr = max_addr;
4203         }
4204         /* Round up size to next multiple of PAGE_SIZE, if it and
4205            the low bits of hpa would take us onto the next page */
4206         size = aligned_nrpages(hpa, size);
4207         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4208                                 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4209 }
4210
4211 static int intel_iommu_map_pages(struct iommu_domain *domain,
4212                                  unsigned long iova, phys_addr_t paddr,
4213                                  size_t pgsize, size_t pgcount,
4214                                  int prot, gfp_t gfp, size_t *mapped)
4215 {
4216         unsigned long pgshift = __ffs(pgsize);
4217         size_t size = pgcount << pgshift;
4218         int ret;
4219
4220         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4221                 return -EINVAL;
4222
4223         if (!IS_ALIGNED(iova | paddr, pgsize))
4224                 return -EINVAL;
4225
4226         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4227         if (!ret && mapped)
4228                 *mapped = size;
4229
4230         return ret;
4231 }
4232
4233 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4234                                 unsigned long iova, size_t size,
4235                                 struct iommu_iotlb_gather *gather)
4236 {
4237         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4238         unsigned long start_pfn, last_pfn;
4239         int level = 0;
4240
4241         /* Cope with horrid API which requires us to unmap more than the
4242            size argument if it happens to be a large-page mapping. */
4243         if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4244                                      &level, GFP_ATOMIC)))
4245                 return 0;
4246
4247         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4248                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4249
4250         start_pfn = iova >> VTD_PAGE_SHIFT;
4251         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4252
4253         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4254
4255         if (dmar_domain->max_addr == iova + size)
4256                 dmar_domain->max_addr = iova;
4257
4258         /*
4259          * We do not use page-selective IOTLB invalidation in flush queue,
4260          * so there is no need to track page and sync iotlb.
4261          */
4262         if (!iommu_iotlb_gather_queued(gather))
4263                 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4264
4265         return size;
4266 }
4267
4268 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4269                                       unsigned long iova,
4270                                       size_t pgsize, size_t pgcount,
4271                                       struct iommu_iotlb_gather *gather)
4272 {
4273         unsigned long pgshift = __ffs(pgsize);
4274         size_t size = pgcount << pgshift;
4275
4276         return intel_iommu_unmap(domain, iova, size, gather);
4277 }
4278
4279 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4280                                  struct iommu_iotlb_gather *gather)
4281 {
4282         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4283         unsigned long iova_pfn = IOVA_PFN(gather->start);
4284         size_t size = gather->end - gather->start;
4285         struct iommu_domain_info *info;
4286         unsigned long start_pfn;
4287         unsigned long nrpages;
4288         unsigned long i;
4289
4290         nrpages = aligned_nrpages(gather->start, size);
4291         start_pfn = mm_to_dma_pfn_start(iova_pfn);
4292
4293         xa_for_each(&dmar_domain->iommu_array, i, info)
4294                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4295                                       start_pfn, nrpages,
4296                                       list_empty(&gather->freelist), 0);
4297
4298         put_pages_list(&gather->freelist);
4299 }
4300
4301 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4302                                             dma_addr_t iova)
4303 {
4304         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4305         struct dma_pte *pte;
4306         int level = 0;
4307         u64 phys = 0;
4308
4309         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4310                              GFP_ATOMIC);
4311         if (pte && dma_pte_present(pte))
4312                 phys = dma_pte_addr(pte) +
4313                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4314                                                 VTD_PAGE_SHIFT) - 1));
4315
4316         return phys;
4317 }
4318
4319 static bool domain_support_force_snooping(struct dmar_domain *domain)
4320 {
4321         struct device_domain_info *info;
4322         bool support = true;
4323
4324         assert_spin_locked(&domain->lock);
4325         list_for_each_entry(info, &domain->devices, link) {
4326                 if (!ecap_sc_support(info->iommu->ecap)) {
4327                         support = false;
4328                         break;
4329                 }
4330         }
4331
4332         return support;
4333 }
4334
4335 static void domain_set_force_snooping(struct dmar_domain *domain)
4336 {
4337         struct device_domain_info *info;
4338
4339         assert_spin_locked(&domain->lock);
4340         /*
4341          * Second level page table supports per-PTE snoop control. The
4342          * iommu_map() interface will handle this by setting SNP bit.
4343          */
4344         if (!domain->use_first_level) {
4345                 domain->set_pte_snp = true;
4346                 return;
4347         }
4348
4349         list_for_each_entry(info, &domain->devices, link)
4350                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4351                                                      IOMMU_NO_PASID);
4352 }
4353
4354 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4355 {
4356         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4357         unsigned long flags;
4358
4359         if (dmar_domain->force_snooping)
4360                 return true;
4361
4362         spin_lock_irqsave(&dmar_domain->lock, flags);
4363         if (!domain_support_force_snooping(dmar_domain)) {
4364                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4365                 return false;
4366         }
4367
4368         domain_set_force_snooping(dmar_domain);
4369         dmar_domain->force_snooping = true;
4370         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4371
4372         return true;
4373 }
4374
4375 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4376 {
4377         struct device_domain_info *info = dev_iommu_priv_get(dev);
4378
4379         switch (cap) {
4380         case IOMMU_CAP_CACHE_COHERENCY:
4381         case IOMMU_CAP_DEFERRED_FLUSH:
4382                 return true;
4383         case IOMMU_CAP_PRE_BOOT_PROTECTION:
4384                 return dmar_platform_optin();
4385         case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4386                 return ecap_sc_support(info->iommu->ecap);
4387         case IOMMU_CAP_DIRTY_TRACKING:
4388                 return ssads_supported(info->iommu);
4389         default:
4390                 return false;
4391         }
4392 }
4393
4394 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4395 {
4396         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4397         struct device_domain_info *info;
4398         struct intel_iommu *iommu;
4399         u8 bus, devfn;
4400         int ret;
4401
4402         iommu = device_to_iommu(dev, &bus, &devfn);
4403         if (!iommu || !iommu->iommu.ops)
4404                 return ERR_PTR(-ENODEV);
4405
4406         info = kzalloc(sizeof(*info), GFP_KERNEL);
4407         if (!info)
4408                 return ERR_PTR(-ENOMEM);
4409
4410         if (dev_is_real_dma_subdevice(dev)) {
4411                 info->bus = pdev->bus->number;
4412                 info->devfn = pdev->devfn;
4413                 info->segment = pci_domain_nr(pdev->bus);
4414         } else {
4415                 info->bus = bus;
4416                 info->devfn = devfn;
4417                 info->segment = iommu->segment;
4418         }
4419
4420         info->dev = dev;
4421         info->iommu = iommu;
4422         if (dev_is_pci(dev)) {
4423                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4424                     pci_ats_supported(pdev) &&
4425                     dmar_ats_supported(pdev, iommu)) {
4426                         info->ats_supported = 1;
4427                         info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4428
4429                         /*
4430                          * For IOMMU that supports device IOTLB throttling
4431                          * (DIT), we assign PFSID to the invalidation desc
4432                          * of a VF such that IOMMU HW can gauge queue depth
4433                          * at PF level. If DIT is not set, PFSID will be
4434                          * treated as reserved, which should be set to 0.
4435                          */
4436                         if (ecap_dit(iommu->ecap))
4437                                 info->pfsid = pci_dev_id(pci_physfn(pdev));
4438                         info->ats_qdep = pci_ats_queue_depth(pdev);
4439                 }
4440                 if (sm_supported(iommu)) {
4441                         if (pasid_supported(iommu)) {
4442                                 int features = pci_pasid_features(pdev);
4443
4444                                 if (features >= 0)
4445                                         info->pasid_supported = features | 1;
4446                         }
4447
4448                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4449                             pci_pri_supported(pdev))
4450                                 info->pri_supported = 1;
4451                 }
4452         }
4453
4454         dev_iommu_priv_set(dev, info);
4455
4456         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4457                 ret = intel_pasid_alloc_table(dev);
4458                 if (ret) {
4459                         dev_err(dev, "PASID table allocation failed\n");
4460                         dev_iommu_priv_set(dev, NULL);
4461                         kfree(info);
4462                         return ERR_PTR(ret);
4463                 }
4464         }
4465
4466         intel_iommu_debugfs_create_dev(info);
4467
4468         return &iommu->iommu;
4469 }
4470
4471 static void intel_iommu_release_device(struct device *dev)
4472 {
4473         struct device_domain_info *info = dev_iommu_priv_get(dev);
4474
4475         dmar_remove_one_dev_info(dev);
4476         intel_pasid_free_table(dev);
4477         intel_iommu_debugfs_remove_dev(info);
4478         dev_iommu_priv_set(dev, NULL);
4479         kfree(info);
4480         set_dma_ops(dev, NULL);
4481 }
4482
4483 static void intel_iommu_probe_finalize(struct device *dev)
4484 {
4485         set_dma_ops(dev, NULL);
4486         iommu_setup_dma_ops(dev, 0, U64_MAX);
4487 }
4488
4489 static void intel_iommu_get_resv_regions(struct device *device,
4490                                          struct list_head *head)
4491 {
4492         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4493         struct iommu_resv_region *reg;
4494         struct dmar_rmrr_unit *rmrr;
4495         struct device *i_dev;
4496         int i;
4497
4498         rcu_read_lock();
4499         for_each_rmrr_units(rmrr) {
4500                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4501                                           i, i_dev) {
4502                         struct iommu_resv_region *resv;
4503                         enum iommu_resv_type type;
4504                         size_t length;
4505
4506                         if (i_dev != device &&
4507                             !is_downstream_to_pci_bridge(device, i_dev))
4508                                 continue;
4509
4510                         length = rmrr->end_address - rmrr->base_address + 1;
4511
4512                         type = device_rmrr_is_relaxable(device) ?
4513                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4514
4515                         resv = iommu_alloc_resv_region(rmrr->base_address,
4516                                                        length, prot, type,
4517                                                        GFP_ATOMIC);
4518                         if (!resv)
4519                                 break;
4520
4521                         list_add_tail(&resv->list, head);
4522                 }
4523         }
4524         rcu_read_unlock();
4525
4526 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4527         if (dev_is_pci(device)) {
4528                 struct pci_dev *pdev = to_pci_dev(device);
4529
4530                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4531                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4532                                         IOMMU_RESV_DIRECT_RELAXABLE,
4533                                         GFP_KERNEL);
4534                         if (reg)
4535                                 list_add_tail(&reg->list, head);
4536                 }
4537         }
4538 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4539
4540         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4541                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4542                                       0, IOMMU_RESV_MSI, GFP_KERNEL);
4543         if (!reg)
4544                 return;
4545         list_add_tail(&reg->list, head);
4546 }
4547
/*
 * device_group callback: PCI devices use pci_device_group(), everything
 * else falls back to generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev)
			       : generic_device_group(dev);
}
4554
4555 static int intel_iommu_enable_sva(struct device *dev)
4556 {
4557         struct device_domain_info *info = dev_iommu_priv_get(dev);
4558         struct intel_iommu *iommu;
4559
4560         if (!info || dmar_disabled)
4561                 return -EINVAL;
4562
4563         iommu = info->iommu;
4564         if (!iommu)
4565                 return -EINVAL;
4566
4567         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4568                 return -ENODEV;
4569
4570         if (!info->pasid_enabled || !info->ats_enabled)
4571                 return -EINVAL;
4572
4573         /*
4574          * Devices having device-specific I/O fault handling should not
4575          * support PCI/PRI. The IOMMU side has no means to check the
4576          * capability of device-specific IOPF.  Therefore, IOMMU can only
4577          * default that if the device driver enables SVA on a non-PRI
4578          * device, it will handle IOPF in its own way.
4579          */
4580         if (!info->pri_supported)
4581                 return 0;
4582
4583         /* Devices supporting PRI should have it enabled. */
4584         if (!info->pri_enabled)
4585                 return -EINVAL;
4586
4587         return 0;
4588 }
4589
4590 static int intel_iommu_enable_iopf(struct device *dev)
4591 {
4592         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4593         struct device_domain_info *info = dev_iommu_priv_get(dev);
4594         struct intel_iommu *iommu;
4595         int ret;
4596
4597         if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4598                 return -ENODEV;
4599
4600         if (info->pri_enabled)
4601                 return -EBUSY;
4602
4603         iommu = info->iommu;
4604         if (!iommu)
4605                 return -EINVAL;
4606
4607         /* PASID is required in PRG Response Message. */
4608         if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4609                 return -EINVAL;
4610
4611         ret = pci_reset_pri(pdev);
4612         if (ret)
4613                 return ret;
4614
4615         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4616         if (ret)
4617                 return ret;
4618
4619         ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4620         if (ret)
4621                 goto iopf_remove_device;
4622
4623         ret = pci_enable_pri(pdev, PRQ_DEPTH);
4624         if (ret)
4625                 goto iopf_unregister_handler;
4626         info->pri_enabled = 1;
4627
4628         return 0;
4629
4630 iopf_unregister_handler:
4631         iommu_unregister_device_fault_handler(dev);
4632 iopf_remove_device:
4633         iopf_queue_remove_device(iommu->iopf_queue, dev);
4634
4635         return ret;
4636 }
4637
4638 static int intel_iommu_disable_iopf(struct device *dev)
4639 {
4640         struct device_domain_info *info = dev_iommu_priv_get(dev);
4641         struct intel_iommu *iommu = info->iommu;
4642
4643         if (!info->pri_enabled)
4644                 return -EINVAL;
4645
4646         /*
4647          * PCIe spec states that by clearing PRI enable bit, the Page
4648          * Request Interface will not issue new page requests, but has
4649          * outstanding page requests that have been transmitted or are
4650          * queued for transmission. This is supposed to be called after
4651          * the device driver has stopped DMA, all PASIDs have been
4652          * unbound and the outstanding PRQs have been drained.
4653          */
4654         pci_disable_pri(to_pci_dev(dev));
4655         info->pri_enabled = 0;
4656
4657         /*
4658          * With PRI disabled and outstanding PRQs drained, unregistering
4659          * fault handler and removing device from iopf queue should never
4660          * fail.
4661          */
4662         WARN_ON(iommu_unregister_device_fault_handler(dev));
4663         WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4664
4665         return 0;
4666 }
4667
4668 static int
4669 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4670 {
4671         switch (feat) {
4672         case IOMMU_DEV_FEAT_IOPF:
4673                 return intel_iommu_enable_iopf(dev);
4674
4675         case IOMMU_DEV_FEAT_SVA:
4676                 return intel_iommu_enable_sva(dev);
4677
4678         default:
4679                 return -ENODEV;
4680         }
4681 }
4682
4683 static int
4684 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4685 {
4686         switch (feat) {
4687         case IOMMU_DEV_FEAT_IOPF:
4688                 return intel_iommu_disable_iopf(dev);
4689
4690         case IOMMU_DEV_FEAT_SVA:
4691                 return 0;
4692
4693         default:
4694                 return -ENODEV;
4695         }
4696 }
4697
4698 static bool intel_iommu_is_attach_deferred(struct device *dev)
4699 {
4700         struct device_domain_info *info = dev_iommu_priv_get(dev);
4701
4702         return translation_pre_enabled(info->iommu) && !info->domain;
4703 }
4704
4705 /*
4706  * Check that the device does not live on an external facing PCI port that is
4707  * marked as untrusted. Such devices should not be able to apply quirks and
4708  * thus not be able to bypass the IOMMU restrictions.
4709  */
4710 static bool risky_device(struct pci_dev *pdev)
4711 {
4712         if (pdev->untrusted) {
4713                 pci_info(pdev,
4714                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4715                          pdev->vendor, pdev->device);
4716                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4717                 return true;
4718         }
4719         return false;
4720 }
4721
4722 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4723                                       unsigned long iova, size_t size)
4724 {
4725         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4726         unsigned long pages = aligned_nrpages(iova, size);
4727         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4728         struct iommu_domain_info *info;
4729         unsigned long i;
4730
4731         xa_for_each(&dmar_domain->iommu_array, i, info)
4732                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4733         return 0;
4734 }
4735
4736 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4737 {
4738         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4739         struct dev_pasid_info *curr, *dev_pasid = NULL;
4740         struct dmar_domain *dmar_domain;
4741         struct iommu_domain *domain;
4742         unsigned long flags;
4743
4744         domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4745         if (WARN_ON_ONCE(!domain))
4746                 goto out_tear_down;
4747
4748         /*
4749          * The SVA implementation needs to handle its own stuffs like the mm
4750          * notification. Before consolidating that code into iommu core, let
4751          * the intel sva code handle it.
4752          */
4753         if (domain->type == IOMMU_DOMAIN_SVA) {
4754                 intel_svm_remove_dev_pasid(dev, pasid);
4755                 goto out_tear_down;
4756         }
4757
4758         dmar_domain = to_dmar_domain(domain);
4759         spin_lock_irqsave(&dmar_domain->lock, flags);
4760         list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4761                 if (curr->dev == dev && curr->pasid == pasid) {
4762                         list_del(&curr->link_domain);
4763                         dev_pasid = curr;
4764                         break;
4765                 }
4766         }
4767         WARN_ON_ONCE(!dev_pasid);
4768         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4769
4770         domain_detach_iommu(dmar_domain, iommu);
4771         intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4772         kfree(dev_pasid);
4773 out_tear_down:
4774         intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4775         intel_drain_pasid_prq(dev, pasid);
4776 }
4777
4778 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4779                                      struct device *dev, ioasid_t pasid)
4780 {
4781         struct device_domain_info *info = dev_iommu_priv_get(dev);
4782         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4783         struct intel_iommu *iommu = info->iommu;
4784         struct dev_pasid_info *dev_pasid;
4785         unsigned long flags;
4786         int ret;
4787
4788         if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4789                 return -EOPNOTSUPP;
4790
4791         if (domain->dirty_ops)
4792                 return -EINVAL;
4793
4794         if (context_copied(iommu, info->bus, info->devfn))
4795                 return -EBUSY;
4796
4797         ret = prepare_domain_attach_device(domain, dev);
4798         if (ret)
4799                 return ret;
4800
4801         dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4802         if (!dev_pasid)
4803                 return -ENOMEM;
4804
4805         ret = domain_attach_iommu(dmar_domain, iommu);
4806         if (ret)
4807                 goto out_free;
4808
4809         if (domain_type_is_si(dmar_domain))
4810                 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4811                                                      dev, pasid);
4812         else if (dmar_domain->use_first_level)
4813                 ret = domain_setup_first_level(iommu, dmar_domain,
4814                                                dev, pasid);
4815         else
4816                 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4817                                                      dev, pasid);
4818         if (ret)
4819                 goto out_detach_iommu;
4820
4821         dev_pasid->dev = dev;
4822         dev_pasid->pasid = pasid;
4823         spin_lock_irqsave(&dmar_domain->lock, flags);
4824         list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4825         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4826
4827         if (domain->type & __IOMMU_DOMAIN_PAGING)
4828                 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4829
4830         return 0;
4831 out_detach_iommu:
4832         domain_detach_iommu(dmar_domain, iommu);
4833 out_free:
4834         kfree(dev_pasid);
4835         return ret;
4836 }
4837
4838 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4839 {
4840         struct device_domain_info *info = dev_iommu_priv_get(dev);
4841         struct intel_iommu *iommu = info->iommu;
4842         struct iommu_hw_info_vtd *vtd;
4843
4844         vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4845         if (!vtd)
4846                 return ERR_PTR(-ENOMEM);
4847
4848         vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4849         vtd->cap_reg = iommu->cap;
4850         vtd->ecap_reg = iommu->ecap;
4851         *length = sizeof(*vtd);
4852         *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4853         return vtd;
4854 }
4855
4856 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4857                                           bool enable)
4858 {
4859         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4860         struct device_domain_info *info;
4861         int ret;
4862
4863         spin_lock(&dmar_domain->lock);
4864         if (dmar_domain->dirty_tracking == enable)
4865                 goto out_unlock;
4866
4867         list_for_each_entry(info, &dmar_domain->devices, link) {
4868                 ret = intel_pasid_setup_dirty_tracking(info->iommu,
4869                                                        info->domain, info->dev,
4870                                                        IOMMU_NO_PASID, enable);
4871                 if (ret)
4872                         goto err_unwind;
4873         }
4874
4875         dmar_domain->dirty_tracking = enable;
4876 out_unlock:
4877         spin_unlock(&dmar_domain->lock);
4878
4879         return 0;
4880
4881 err_unwind:
4882         list_for_each_entry(info, &dmar_domain->devices, link)
4883                 intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
4884                                                  info->dev, IOMMU_NO_PASID,
4885                                                  dmar_domain->dirty_tracking);
4886         spin_unlock(&dmar_domain->lock);
4887         return ret;
4888 }
4889
4890 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4891                                             unsigned long iova, size_t size,
4892                                             unsigned long flags,
4893                                             struct iommu_dirty_bitmap *dirty)
4894 {
4895         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4896         unsigned long end = iova + size - 1;
4897         unsigned long pgsize;
4898
4899         /*
4900          * IOMMUFD core calls into a dirty tracking disabled domain without an
4901          * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4902          * have occurred when we stopped dirty tracking. This ensures that we
4903          * never inherit dirtied bits from a previous cycle.
4904          */
4905         if (!dmar_domain->dirty_tracking && dirty->bitmap)
4906                 return -EINVAL;
4907
4908         do {
4909                 struct dma_pte *pte;
4910                 int lvl = 0;
4911
4912                 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4913                                      GFP_ATOMIC);
4914                 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4915                 if (!pte || !dma_pte_present(pte)) {
4916                         iova += pgsize;
4917                         continue;
4918                 }
4919
4920                 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4921                         iommu_dirty_bitmap_record(dirty, iova, pgsize);
4922                 iova += pgsize;
4923         } while (iova < end);
4924
4925         return 0;
4926 }
4927
4928 const struct iommu_dirty_ops intel_dirty_ops = {
4929         .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4930         .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4931 };
4932
/*
 * Driver-wide struct iommu_ops table for the Intel IOMMU driver.
 *
 * The top-level members are the device/driver-level callbacks and
 * properties. .default_domain_ops points at an anonymous constant
 * struct iommu_domain_ops that carries the per-domain callbacks
 * (attach, PASID setup, map/unmap, IOTLB maintenance, IOVA lookup,
 * teardown and cache-coherency enforcement).
 */
const struct iommu_ops intel_iommu_ops = {
        .blocked_domain         = &blocking_domain,
        .capable                = intel_iommu_capable,
        .hw_info                = intel_iommu_hw_info,
        .domain_alloc           = intel_iommu_domain_alloc,
        .domain_alloc_user      = intel_iommu_domain_alloc_user,
        .probe_device           = intel_iommu_probe_device,
        .probe_finalize         = intel_iommu_probe_finalize,
        .release_device         = intel_iommu_release_device,
        .get_resv_regions       = intel_iommu_get_resv_regions,
        .device_group           = intel_iommu_device_group,
        .dev_enable_feat        = intel_iommu_dev_enable_feat,
        .dev_disable_feat       = intel_iommu_dev_disable_feat,
        .is_attach_deferred     = intel_iommu_is_attach_deferred,
        .def_domain_type        = device_def_domain_type,
        .remove_dev_pasid       = intel_iommu_remove_dev_pasid,
        /* NOTE(review): only SZ_4K is listed here - confirm where larger page sizes are advertised. */
        .pgsize_bitmap          = SZ_4K,
        /* The page-response handler is only wired up when SVM support is built in. */
#ifdef CONFIG_INTEL_IOMMU_SVM
        .page_response          = intel_svm_page_response,
#endif
        /* Per-domain callbacks, kept in an unnamed constant object. */
        .default_domain_ops = &(const struct iommu_domain_ops) {
                .attach_dev             = intel_iommu_attach_device,
                .set_dev_pasid          = intel_iommu_set_dev_pasid,
                .map_pages              = intel_iommu_map_pages,
                .unmap_pages            = intel_iommu_unmap_pages,
                .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
                .flush_iotlb_all        = intel_flush_iotlb_all,
                .iotlb_sync             = intel_iommu_tlb_sync,
                .iova_to_phys           = intel_iommu_iova_to_phys,
                .free                   = intel_iommu_domain_free,
                .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
        }
};
4966
4967 static void quirk_iommu_igfx(struct pci_dev *dev)
4968 {
4969         if (risky_device(dev))
4970                 return;
4971
4972         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4973         dmar_map_gfx = 0;
4974 }
4975
/*
 * G4x/GM45: DMA remapping for the integrated graphics is broken, so
 * quirk_iommu_igfx() is registered as a header fixup for these Intel
 * device IDs.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell: the integrated graphics malfunctions with DMA remapping enabled. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

5010
5011 static void quirk_iommu_rwbf(struct pci_dev *dev)
5012 {
5013         if (risky_device(dev))
5014                 return;
5015
5016         /*
5017          * Mobile 4 Series Chipset neglects to set RWBF capability,
5018          * but needs it. Same seems to hold for the desktop versions.
5019          */
5020         pci_info(dev, "Forcing write-buffer flush capability\n");
5021         rwbf_quirk = 1;
5022 }
5023
/*
 * Intel device IDs that get the forced write-buffer flush. The same IDs
 * are also registered for quirk_iommu_igfx().
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

5031
/*
 * GGC is the PCI config-space offset of the 16-bit word read by
 * quirk_calpella_no_shadow_gtt(). The GGC_MEMORY_* values describe the
 * field in bits 11:8 of that word; the quirk itself only tests
 * GGC_MEMORY_VT_ENABLED.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)

5041
5042 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5043 {
5044         unsigned short ggc;
5045
5046         if (risky_device(dev))
5047                 return;
5048
5049         if (pci_read_config_word(dev, GGC, &ggc))
5050                 return;
5051
5052         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5053                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5054                 dmar_map_gfx = 0;
5055         } else if (dmar_map_gfx) {
5056                 /* we have to ensure the gfx device is idle before we flush */
5057                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5058                 iommu_set_dma_strict();
5059         }
5060 }
/* Run the shadow-GTT check as a header fixup for these Intel device IDs. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5065
5066 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5067 {
5068         unsigned short ver;
5069
5070         if (!IS_GFX_DEVICE(dev))
5071                 return;
5072
5073         ver = (dev->device >> 8) & 0xff;
5074         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5075             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5076             ver != 0x9a && ver != 0xa7)
5077                 return;
5078
5079         if (risky_device(dev))
5080                 return;
5081
5082         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5083         iommu_skip_te_disable = 1;
5084 }
5085 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5086
5087 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5088    ISOCH DMAR unit for the Azalia sound device, but not give it any
5089    TLB entries, which causes it to deadlock. Check for that.  We do
5090    this in a function called from init_dmars(), instead of in a PCI
5091    quirk, because we don't want to print the obnoxious "BIOS broken"
5092    message if VT-d is actually disabled.
5093 */
5094 static void __init check_tylersburg_isoch(void)
5095 {
5096         struct pci_dev *pdev;
5097         uint32_t vtisochctrl;
5098
5099         /* If there's no Azalia in the system anyway, forget it. */
5100         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5101         if (!pdev)
5102                 return;
5103
5104         if (risky_device(pdev)) {
5105                 pci_dev_put(pdev);
5106                 return;
5107         }
5108
5109         pci_dev_put(pdev);
5110
5111         /* System Management Registers. Might be hidden, in which case
5112            we can't do the sanity check. But that's OK, because the
5113            known-broken BIOSes _don't_ actually hide it, so far. */
5114         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5115         if (!pdev)
5116                 return;
5117
5118         if (risky_device(pdev)) {
5119                 pci_dev_put(pdev);
5120                 return;
5121         }
5122
5123         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5124                 pci_dev_put(pdev);
5125                 return;
5126         }
5127
5128         pci_dev_put(pdev);
5129
5130         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5131         if (vtisochctrl & 1)
5132                 return;
5133
5134         /* Drop all bits other than the number of TLB entries */
5135         vtisochctrl &= 0x1c;
5136
5137         /* If we have the recommended number of TLB entries (16), fine. */
5138         if (vtisochctrl == 0x10)
5139                 return;
5140
5141         /* Zero TLB entries? You get to ride the short bus to school. */
5142         if (!vtisochctrl) {
5143                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5144                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5145                      dmi_get_system_info(DMI_BIOS_VENDOR),
5146                      dmi_get_system_info(DMI_BIOS_VERSION),
5147                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5148                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5149                 return;
5150         }
5151
5152         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5153                vtisochctrl);
5154 }
5155
5156 /*
5157  * Here we deal with a device TLB defect where device may inadvertently issue ATS
5158  * invalidation completion before posted writes initiated with translated address
5159  * that utilized translations matching the invalidation address range, violating
5160  * the invalidation completion ordering.
5161  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5162  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5163  * under the control of the trusted/privileged host device driver must use this
5164  * quirk.
5165  * Device TLBs are invalidated under the following six conditions:
5166  * 1. Device driver does DMA API unmap IOVA
5167  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5168  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5169  *    exit_mmap() due to crash
5170  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5171  *    VM has to free pages that were unmapped
5172  * 5. Userspace driver unmaps a DMA buffer
5173  * 6. Cache invalidation in vSVA usage (upcoming)
5174  *
5175  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5176  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5177  * invalidate TLB the same way as normal user unmap which will use this quirk.
5178  * The dTLB invalidation after PASID cache flush does not need this quirk.
5179  *
5180  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5181  */
5182 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5183                                unsigned long address, unsigned long mask,
5184                                u32 pasid, u16 qdep)
5185 {
5186         u16 sid;
5187
5188         if (likely(!info->dtlb_extra_inval))
5189                 return;
5190
5191         sid = PCI_DEVID(info->bus, info->devfn);
5192         if (pasid == IOMMU_NO_PASID) {
5193                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5194                                    qdep, address, mask);
5195         } else {
5196                 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5197                                          pasid, qdep, address, mask);
5198         }
5199 }
5200
5201 #define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)
5202
5203 /*
5204  * Function to submit a command to the enhanced command interface. The
5205  * valid enhanced command descriptions are defined in Table 47 of the
5206  * VT-d spec. The VT-d hardware implementation may support some but not
5207  * all commands, which can be determined by checking the Enhanced
5208  * Command Capability Register.
5209  *
5210  * Return values:
5211  *  - 0: Command successful without any error;
5212  *  - Negative: software error value;
5213  *  - Nonzero positive: failure status code defined in Table 48.
5214  */
5215 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5216 {
5217         unsigned long flags;
5218         u64 res;
5219         int ret;
5220
5221         if (!cap_ecmds(iommu->cap))
5222                 return -ENODEV;
5223
5224         raw_spin_lock_irqsave(&iommu->register_lock, flags);
5225
5226         res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5227         if (res & DMA_ECMD_ECRSP_IP) {
5228                 ret = -EBUSY;
5229                 goto err;
5230         }
5231
5232         /*
5233          * Unconditionally write the operand B, because
5234          * - There is no side effect if an ecmd doesn't require an
5235          *   operand B, but we set the register to some value.
5236          * - It's not invoked in any critical path. The extra MMIO
5237          *   write doesn't bring any performance concerns.
5238          */
5239         dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5240         dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5241
5242         IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5243                       !(res & DMA_ECMD_ECRSP_IP), res);
5244
5245         if (res & DMA_ECMD_ECRSP_IP) {
5246                 ret = -ETIMEDOUT;
5247                 goto err;
5248         }
5249
5250         ret = ecmd_get_status_code(res);
5251 err:
5252         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5253
5254         return ret;
5255 }