/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
21 #define pr_fmt(fmt) "DMAR: " fmt
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/dma-direct.h>
49 #include <linux/crash_dump.h>
50 #include <asm/irq_remapping.h>
51 #include <asm/cacheflush.h>
52 #include <asm/iommu.h>
54 #include "irq_remapping.h"
56 #define ROOT_SIZE VTD_PAGE_SIZE
57 #define CONTEXT_SIZE VTD_PAGE_SIZE
59 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
60 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
61 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
62 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
64 #define IOAPIC_RANGE_START (0xfee00000)
65 #define IOAPIC_RANGE_END (0xfeefffff)
66 #define IOVA_START_ADDR (0x1000)
68 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
70 #define MAX_AGAW_WIDTH 64
71 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
73 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
74 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
76 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
77 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
78 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
79 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
80 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
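/*
 * Worked example (illustrative, not used by the driver): for a 48-bit
 * guest address width and VTD_PAGE_SHIFT == 12,
 *
 *	__DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xfffffffff
 *	DOMAIN_MAX_PFN(48)   == 0xfffffffff   (fits in unsigned long on 64-bit)
 *	DOMAIN_MAX_ADDR(48)  == 0xfffffffff000
 *
 * i.e. the highest mappable IOVA page frame and the page-aligned top of
 * the 48-bit IOVA space.
 */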
82 /* IO virtual address start page frame number */
83 #define IOVA_START_PFN (1)
85 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
87 /* page table handling */
88 #define LEVEL_STRIDE (9)
89 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
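/*
 * Illustrative reading of the bitmap above: bit N set means "a page of
 * size 2^N bytes is supported". ~0xFFFUL therefore sets every bit from
 * 12 upwards, i.e. 4KiB, 8KiB, 16KiB, ... so the IOMMU core never has
 * to split a naturally aligned power-of-two region before handing it
 * down; e.g. a 64KiB aligned region arrives as one map request rather
 * than sixteen 4KiB requests.
 */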
109 static inline int agaw_to_level(int agaw)
114 static inline int agaw_to_width(int agaw)
116 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
119 static inline int width_to_agaw(int width)
121 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
124 static inline unsigned int level_to_offset_bits(int level)
126 return (level - 1) * LEVEL_STRIDE;
129 static inline int pfn_level_offset(unsigned long pfn, int level)
131 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
134 static inline unsigned long level_mask(int level)
136 return -1UL << level_to_offset_bits(level);
139 static inline unsigned long level_size(int level)
141 return 1UL << level_to_offset_bits(level);
144 static inline unsigned long align_to_level(unsigned long pfn, int level)
146 return (pfn + level_size(level) - 1) & level_mask(level);
149 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
151 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
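/*
 * Worked example of the level arithmetic above (illustrative only),
 * assuming a 4-level table (agaw 2, 48-bit width):
 *
 *	level_to_offset_bits(1..4)   ==  0, 9, 18, 27
 *	pfn_level_offset(0x12345, 1) == 0x145
 *	pfn_level_offset(0x12345, 2) == 0x091
 *	pfn_level_offset(0x12345, 3) == 0x000
 *	lvl_to_nr_pages(2)           == 512  (one 2MiB superpage of 4KiB pages)
 *
 * so each level consumes LEVEL_STRIDE (9) bits of the DMA pfn, much like
 * the x86 long-mode page tables.
 */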
154 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
155 are never going to work. */
156 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
158 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
163 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
165 static inline unsigned long page_to_dma_pfn(struct page *pg)
167 return mm_to_dma_pfn(page_to_pfn(pg));
169 static inline unsigned long virt_to_dma_pfn(void *p)
171 return page_to_dma_pfn(virt_to_page(p));
174 /* global iommu list, set NULL for ignored DMAR units */
175 static struct intel_iommu **g_iommus;
177 static void __init check_tylersburg_isoch(void);
178 static int rwbf_quirk;
 * set to 1 to panic the kernel if we can't successfully enable VT-d
 * (used when the kernel is launched with TXT)
184 static int force_on = 0;
185 int intel_iommu_tboot_noforce;
/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64 lo;
	u64 hi;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
200 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
203 static phys_addr_t root_entry_lctp(struct root_entry *re)
208 return re->lo & VTD_PAGE_MASK;
212 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
215 static phys_addr_t root_entry_uctp(struct root_entry *re)
220 return re->hi & VTD_PAGE_MASK;
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};
238 static inline void context_clear_pasid_enable(struct context_entry *context)
240 context->lo &= ~(1ULL << 11);
243 static inline bool context_pasid_enabled(struct context_entry *context)
245 return !!(context->lo & (1ULL << 11));
248 static inline void context_set_copied(struct context_entry *context)
250 context->hi |= (1ull << 3);
253 static inline bool context_copied(struct context_entry *context)
255 return !!(context->hi & (1ULL << 3));
258 static inline bool __context_present(struct context_entry *context)
260 return (context->lo & 1);
263 static inline bool context_present(struct context_entry *context)
265 return context_pasid_enabled(context) ?
266 __context_present(context) :
267 __context_present(context) && !context_copied(context);
270 static inline void context_set_present(struct context_entry *context)
275 static inline void context_set_fault_enable(struct context_entry *context)
277 context->lo &= (((u64)-1) << 2) | 1;
280 static inline void context_set_translation_type(struct context_entry *context,
283 context->lo &= (((u64)-1) << 4) | 3;
284 context->lo |= (value & 3) << 2;
287 static inline void context_set_address_root(struct context_entry *context,
290 context->lo &= ~VTD_PAGE_MASK;
291 context->lo |= value & VTD_PAGE_MASK;
294 static inline void context_set_address_width(struct context_entry *context,
297 context->hi |= value & 7;
300 static inline void context_set_domain_id(struct context_entry *context,
303 context->hi |= (value & ((1 << 16) - 1)) << 8;
306 static inline int context_domain_id(struct context_entry *c)
308 return((c->hi >> 8) & 0xffff);
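/*
 * Taken together, the helpers above program a context entry roughly like
 * this (an illustrative sketch of what domain_context_mapping_one() does
 * further below, not an additional code path):
 *
 *	context_clear_entry(context);
 *	context_set_domain_id(context, did);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *
 * with the present bit written last so hardware never sees a
 * half-initialised entry.
 */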
311 static inline void context_clear_entry(struct context_entry *context)
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};
330 static inline void dma_clear_pte(struct dma_pte *pte)
335 static inline u64 dma_pte_addr(struct dma_pte *pte)
338 return pte->val & VTD_PAGE_MASK;
340 /* Must have a full atomic 64-bit read */
341 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
345 static inline bool dma_pte_present(struct dma_pte *pte)
347 return (pte->val & 3) != 0;
350 static inline bool dma_pte_superpage(struct dma_pte *pte)
352 return (pte->val & DMA_PTE_LARGE_PAGE);
355 static inline int first_pte_in_page(struct dma_pte *pte)
357 return !((unsigned long)pte & ~VTD_PAGE_MASK);
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. a kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
378 #define for_each_domain_iommu(idx, domain) \
379 for (idx = 0; idx < g_num_of_iommus; idx++) \
380 if (domain->iommu_refcnt[idx])
struct dmar_domain {
	int nid;			/* node id */
385 unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED];
386 /* Refcount of devices per iommu */
389 u16 iommu_did[DMAR_UNITS_SUPPORTED];
390 /* Domain ids per IOMMU. Use u16 since
391 * domain ids are 16 bit wide according
392 * to VT-d spec, section 9.3 */
394 bool has_iotlb_device;
395 struct list_head devices; /* all devices' list */
396 struct iova_domain iovad; /* iova's that belong to this domain */
398 struct dma_pte *pgd; /* virtual address */
399 int gaw; /* max guest address width */
401 /* adjusted guest address width, 0 is level 2 30-bit */
404 int flags; /* flags to find out type of domain */
406 int iommu_coherency;/* indicate coherency of iommu access */
407 int iommu_snooping; /* indicate snooping control feature*/
408 int iommu_count; /* reference count of iommu */
409 int iommu_superpage;/* Level of superpages supported:
410 0 == 4KiB (no superpages), 1 == 2MiB,
411 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
412 u64 max_addr; /* maximum mapped address */
	struct iommu_domain domain;	/* generic domain data structure for
					   iommu core */
};
418 /* PCI domain-device relationship */
419 struct device_domain_info {
420 struct list_head link; /* link to domain siblings */
421 struct list_head global; /* link to global list */
422 u8 bus; /* PCI bus number */
423 u8 devfn; /* PCI devfn number */
424 u8 pasid_supported:3;
431 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
432 struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain;	/* pointer to domain */
};
436 struct dmar_rmrr_unit {
437 struct list_head list; /* list of rmrr units */
438 struct acpi_dmar_header *hdr; /* ACPI header */
439 u64 base_address; /* reserved base address*/
440 u64 end_address; /* reserved end address */
441 struct dmar_dev_scope *devices; /* target devices */
442 int devices_cnt; /* target device count */
	struct iommu_resv_region *resv;	/* reserved region handle */
};
446 struct dmar_atsr_unit {
447 struct list_head list; /* list of ATSR units */
448 struct acpi_dmar_header *hdr; /* ACPI header */
449 struct dmar_dev_scope *devices; /* target devices */
450 int devices_cnt; /* target device count */
	u8 include_all:1;		/* include all ports */
};
454 static LIST_HEAD(dmar_atsr_units);
455 static LIST_HEAD(dmar_rmrr_units);
457 #define for_each_rmrr_units(rmrr) \
458 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
460 /* bitmap for indexing intel_iommus */
461 static int g_num_of_iommus;
463 static void domain_exit(struct dmar_domain *domain);
464 static void domain_remove_dev_info(struct dmar_domain *domain);
465 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
467 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
468 static void domain_context_clear(struct intel_iommu *iommu,
470 static int domain_detach_iommu(struct dmar_domain *domain,
471 struct intel_iommu *iommu);
473 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
474 int dmar_disabled = 0;
476 int dmar_disabled = 1;
477 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
479 int intel_iommu_enabled = 0;
480 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
482 static int dmar_map_gfx = 1;
483 static int dmar_forcedac;
484 static int intel_iommu_strict;
485 static int intel_iommu_superpage = 1;
486 static int intel_iommu_ecs = 1;
487 static int intel_iommu_pasid28;
488 static int iommu_identity_mapping;
490 #define IDENTMAP_ALL 1
491 #define IDENTMAP_GFX 2
492 #define IDENTMAP_AZALIA 4
494 /* Broadwell and Skylake have broken ECS support — normal so-called "second
495 * level" translation of DMA requests-without-PASID doesn't actually happen
496 * unless you also set the NESTE bit in an extended context-entry. Which of
497 * course means that SVM doesn't work because it's trying to do nested
498 * translation of the physical addresses it finds in the process page tables,
499 * through the IOVA->phys mapping found in the "second level" page tables.
501 * The VT-d specification was retroactively changed to change the definition
502 * of the capability bits and pretend that Broadwell/Skylake never happened...
503 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
504 * for some reason it was the PASID capability bit which was redefined (from
505 * bit 28 on BDW/SKL to bit 40 in future).
507 * So our test for ECS needs to eschew those implementations which set the old
 * PASID capability bit 28, since those are the ones on which ECS is broken.
509 * Unless we are working around the 'pasid28' limitations, that is, by putting
510 * the device into passthrough mode for normal DMA and thus masking the bug.
512 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
513 (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
514 /* PASID support is thus enabled if ECS is enabled and *either* of the old
515 * or new capability bits are set. */
516 #define pasid_enabled(iommu) (ecs_enabled(iommu) && \
517 (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
519 int intel_iommu_gfx_mapped;
520 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
522 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
523 static DEFINE_SPINLOCK(device_domain_lock);
524 static LIST_HEAD(device_domain_list);
526 const struct iommu_ops intel_iommu_ops;
528 static bool translation_pre_enabled(struct intel_iommu *iommu)
530 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
533 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
535 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
538 static void init_translation_status(struct intel_iommu *iommu)
542 gsts = readl(iommu->reg + DMAR_GSTS_REG);
543 if (gsts & DMA_GSTS_TES)
544 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
548 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
550 return container_of(dom, struct dmar_domain, domain);
553 static int __init intel_iommu_setup(char *str)
558 if (!strncmp(str, "on", 2)) {
560 pr_info("IOMMU enabled\n");
561 } else if (!strncmp(str, "off", 3)) {
563 pr_info("IOMMU disabled\n");
564 } else if (!strncmp(str, "igfx_off", 8)) {
566 pr_info("Disable GFX device mapping\n");
567 } else if (!strncmp(str, "forcedac", 8)) {
568 pr_info("Forcing DAC for PCI devices\n");
570 } else if (!strncmp(str, "strict", 6)) {
571 pr_info("Disable batched IOTLB flush\n");
572 intel_iommu_strict = 1;
573 } else if (!strncmp(str, "sp_off", 6)) {
574 pr_info("Disable supported super page\n");
575 intel_iommu_superpage = 0;
576 } else if (!strncmp(str, "ecs_off", 7)) {
578 "Intel-IOMMU: disable extended context table support\n");
580 } else if (!strncmp(str, "pasid28", 7)) {
582 "Intel-IOMMU: enable pre-production PASID support\n");
583 intel_iommu_pasid28 = 1;
584 iommu_identity_mapping |= IDENTMAP_GFX;
585 } else if (!strncmp(str, "tboot_noforce", 13)) {
587 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
588 intel_iommu_tboot_noforce = 1;
591 str += strcspn(str, ",");
597 __setup("intel_iommu=", intel_iommu_setup);
599 static struct kmem_cache *iommu_domain_cache;
600 static struct kmem_cache *iommu_devinfo_cache;
602 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
604 struct dmar_domain **domains;
607 domains = iommu->domains[idx];
611 return domains[did & 0xff];
614 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
615 struct dmar_domain *domain)
617 struct dmar_domain **domains;
620 if (!iommu->domains[idx]) {
621 size_t size = 256 * sizeof(struct dmar_domain *);
622 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
625 domains = iommu->domains[idx];
626 if (WARN_ON(!domains))
629 domains[did & 0xff] = domain;
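/*
 * Illustrative note on the two-level domain array used above: a 16-bit
 * domain id is split into a page index (did >> 8 in the full driver) and
 * a slot within that 256-entry page (did & 0xff). For example, domain id
 * 0x1234 lives in iommu->domains[0x12][0x34], and a page is only
 * allocated once a domain id in its range is actually used.
 */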
632 static inline void *alloc_pgtable_page(int node)
637 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
639 vaddr = page_address(page);
643 static inline void free_pgtable_page(void *vaddr)
645 free_page((unsigned long)vaddr);
648 static inline void *alloc_domain_mem(void)
650 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
653 static void free_domain_mem(void *vaddr)
655 kmem_cache_free(iommu_domain_cache, vaddr);
658 static inline void * alloc_devinfo_mem(void)
660 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
663 static inline void free_devinfo_mem(void *vaddr)
665 kmem_cache_free(iommu_devinfo_cache, vaddr);
668 static inline int domain_type_is_vm(struct dmar_domain *domain)
670 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
673 static inline int domain_type_is_si(struct dmar_domain *domain)
675 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
678 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
680 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
681 DOMAIN_FLAG_STATIC_IDENTITY);
684 static inline int domain_pfn_supported(struct dmar_domain *domain,
687 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
689 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
692 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
697 sagaw = cap_sagaw(iommu->cap);
698 for (agaw = width_to_agaw(max_gaw);
700 if (test_bit(agaw, &sagaw))
708 * Calculate max SAGAW for each iommu.
710 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
712 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
716 * calculate agaw for each iommu.
717 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a smaller supported agaw for iommus that don't support the default agaw.
720 int iommu_calculate_agaw(struct intel_iommu *iommu)
722 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
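/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH the
 * search starts at width_to_agaw(57) == 3, i.e. a 57-bit width. If the
 * IOMMU only reports 4-level support in cap_sagaw() (bit 2 set), the
 * loop in __iommu_calculate_agaw() steps down until test_bit() succeeds
 * and returns agaw 2, i.e. a 48-bit adjusted guest address width.
 */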
/* This function only returns a single iommu in a domain */
726 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
730 /* si_domain and vm domain should not get here. */
731 BUG_ON(domain_type_is_vm_or_si(domain));
732 for_each_domain_iommu(iommu_id, domain)
735 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
738 return g_iommus[iommu_id];
741 static void domain_update_iommu_coherency(struct dmar_domain *domain)
743 struct dmar_drhd_unit *drhd;
744 struct intel_iommu *iommu;
748 domain->iommu_coherency = 1;
750 for_each_domain_iommu(i, domain) {
752 if (!ecap_coherent(g_iommus[i]->ecap)) {
753 domain->iommu_coherency = 0;
760 /* No hardware attached; use lowest common denominator */
762 for_each_active_iommu(iommu, drhd) {
763 if (!ecap_coherent(iommu->ecap)) {
764 domain->iommu_coherency = 0;
771 static int domain_update_iommu_snooping(struct intel_iommu *skip)
773 struct dmar_drhd_unit *drhd;
774 struct intel_iommu *iommu;
778 for_each_active_iommu(iommu, drhd) {
780 if (!ecap_sc_support(iommu->ecap)) {
791 static int domain_update_iommu_superpage(struct intel_iommu *skip)
793 struct dmar_drhd_unit *drhd;
794 struct intel_iommu *iommu;
797 if (!intel_iommu_superpage) {
801 /* set iommu_superpage to the smallest common denominator */
803 for_each_active_iommu(iommu, drhd) {
805 mask &= cap_super_page_val(iommu->cap);
815 /* Some capabilities may be different across iommus */
816 static void domain_update_iommu_cap(struct dmar_domain *domain)
818 domain_update_iommu_coherency(domain);
819 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
820 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
823 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
824 u8 bus, u8 devfn, int alloc)
826 struct root_entry *root = &iommu->root_entry[bus];
827 struct context_entry *context;
831 if (ecs_enabled(iommu)) {
839 context = phys_to_virt(*entry & VTD_PAGE_MASK);
841 unsigned long phy_addr;
845 context = alloc_pgtable_page(iommu->node);
849 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
850 phy_addr = virt_to_phys((void *)context);
851 *entry = phy_addr | 1;
852 __iommu_flush_cache(iommu, entry, sizeof(*entry));
854 return &context[devfn];
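/*
 * Example lookup (illustrative pseudo-code): for a request from 03:08.2
 * (bus 0x03, devfn 0x42) the walk above is roughly
 *
 *	root_entry = iommu->root_entry[0x03];
 *	context    = context_table_of(root_entry)[0x42];
 *
 * In extended (ECS) mode a root entry holds two context-table pointers
 * (see root_entry_lctp()/root_entry_uctp() above): devfns 0x00-0x7f go
 * through the lower pointer and devfns 0x80-0xff through the upper one.
 */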
857 static int iommu_dummy(struct device *dev)
859 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
862 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
864 struct dmar_drhd_unit *drhd = NULL;
865 struct intel_iommu *iommu;
867 struct pci_dev *ptmp, *pdev = NULL;
871 if (iommu_dummy(dev))
874 if (dev_is_pci(dev)) {
875 struct pci_dev *pf_pdev;
877 pdev = to_pci_dev(dev);
880 /* VMD child devices currently cannot be handled individually */
881 if (is_vmd(pdev->bus))
885 /* VFs aren't listed in scope tables; we need to look up
886 * the PF instead to find the IOMMU. */
887 pf_pdev = pci_physfn(pdev);
889 segment = pci_domain_nr(pdev->bus);
890 } else if (has_acpi_companion(dev))
891 dev = &ACPI_COMPANION(dev)->dev;
894 for_each_active_iommu(iommu, drhd) {
895 if (pdev && segment != drhd->segment)
898 for_each_active_dev_scope(drhd->devices,
899 drhd->devices_cnt, i, tmp) {
901 /* For a VF use its original BDF# not that of the PF
902 * which we used for the IOMMU lookup. Strictly speaking
903 * we could do this for all PCI devices; we only need to
904 * get the BDF# from the scope table for ACPI matches. */
905 if (pdev && pdev->is_virtfn)
908 *bus = drhd->devices[i].bus;
909 *devfn = drhd->devices[i].devfn;
913 if (!pdev || !dev_is_pci(tmp))
916 ptmp = to_pci_dev(tmp);
917 if (ptmp->subordinate &&
918 ptmp->subordinate->number <= pdev->bus->number &&
919 ptmp->subordinate->busn_res.end >= pdev->bus->number)
923 if (pdev && drhd->include_all) {
925 *bus = pdev->bus->number;
926 *devfn = pdev->devfn;
937 static void domain_flush_cache(struct dmar_domain *domain,
938 void *addr, int size)
940 if (!domain->iommu_coherency)
941 clflush_cache_range(addr, size);
944 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
946 struct context_entry *context;
950 spin_lock_irqsave(&iommu->lock, flags);
951 context = iommu_context_addr(iommu, bus, devfn, 0);
953 ret = context_present(context);
954 spin_unlock_irqrestore(&iommu->lock, flags);
958 static void free_context_table(struct intel_iommu *iommu)
962 struct context_entry *context;
964 spin_lock_irqsave(&iommu->lock, flags);
965 if (!iommu->root_entry) {
968 for (i = 0; i < ROOT_ENTRY_NR; i++) {
969 context = iommu_context_addr(iommu, i, 0, 0);
971 free_pgtable_page(context);
973 if (!ecs_enabled(iommu))
976 context = iommu_context_addr(iommu, i, 0x80, 0);
978 free_pgtable_page(context);
981 free_pgtable_page(iommu->root_entry);
982 iommu->root_entry = NULL;
984 spin_unlock_irqrestore(&iommu->lock, flags);
987 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
988 unsigned long pfn, int *target_level)
990 struct dma_pte *parent, *pte = NULL;
991 int level = agaw_to_level(domain->agaw);
994 BUG_ON(!domain->pgd);
996 if (!domain_pfn_supported(domain, pfn))
997 /* Address beyond IOMMU's addressing capabilities. */
1000 parent = domain->pgd;
1005 offset = pfn_level_offset(pfn, level);
1006 pte = &parent[offset];
1007 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1009 if (level == *target_level)
1012 if (!dma_pte_present(pte)) {
1015 tmp_page = alloc_pgtable_page(domain->nid);
1020 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1021 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1022 if (cmpxchg64(&pte->val, 0ULL, pteval))
1023 /* Someone else set it while we were thinking; use theirs. */
1024 free_pgtable_page(tmp_page);
1026 domain_flush_cache(domain, pte, sizeof(*pte));
1031 parent = phys_to_virt(dma_pte_addr(pte));
1036 *target_level = level;
/* return the pte for an address at a specific level */
1043 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1045 int level, int *large_page)
1047 struct dma_pte *parent, *pte = NULL;
1048 int total = agaw_to_level(domain->agaw);
1051 parent = domain->pgd;
1052 while (level <= total) {
1053 offset = pfn_level_offset(pfn, total);
1054 pte = &parent[offset];
1058 if (!dma_pte_present(pte)) {
1059 *large_page = total;
1063 if (dma_pte_superpage(pte)) {
1064 *large_page = total;
1068 parent = phys_to_virt(dma_pte_addr(pte));
/* clear last level pte, a tlb flush should follow */
1075 static void dma_pte_clear_range(struct dmar_domain *domain,
1076 unsigned long start_pfn,
1077 unsigned long last_pfn)
1079 unsigned int large_page = 1;
1080 struct dma_pte *first_pte, *pte;
1082 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1083 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1084 BUG_ON(start_pfn > last_pfn);
1086 /* we don't need lock here; nobody else touches the iova range */
1089 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1091 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1096 start_pfn += lvl_to_nr_pages(large_page);
1098 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1100 domain_flush_cache(domain, first_pte,
1101 (void *)pte - (void *)first_pte);
1103 } while (start_pfn && start_pfn <= last_pfn);
1106 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1107 int retain_level, struct dma_pte *pte,
1108 unsigned long pfn, unsigned long start_pfn,
1109 unsigned long last_pfn)
1111 pfn = max(start_pfn, pfn);
1112 pte = &pte[pfn_level_offset(pfn, level)];
1115 unsigned long level_pfn;
1116 struct dma_pte *level_pte;
1118 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1121 level_pfn = pfn & level_mask(level);
1122 level_pte = phys_to_virt(dma_pte_addr(pte));
1125 dma_pte_free_level(domain, level - 1, retain_level,
1126 level_pte, level_pfn, start_pfn,
1131 * Free the page table if we're below the level we want to
1132 * retain and the range covers the entire table.
1134 if (level < retain_level && !(start_pfn > level_pfn ||
1135 last_pfn < level_pfn + level_size(level) - 1)) {
1137 domain_flush_cache(domain, pte, sizeof(*pte));
1138 free_pgtable_page(level_pte);
1141 pfn += level_size(level);
1142 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1146 * clear last level (leaf) ptes and free page table pages below the
1147 * level we wish to keep intact.
1149 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1150 unsigned long start_pfn,
1151 unsigned long last_pfn,
1154 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1155 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1156 BUG_ON(start_pfn > last_pfn);
1158 dma_pte_clear_range(domain, start_pfn, last_pfn);
1160 /* We don't need lock here; nobody else touches the iova range */
1161 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1162 domain->pgd, 0, start_pfn, last_pfn);
1165 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1166 free_pgtable_page(domain->pgd);
1171 /* When a page at a given level is being unlinked from its parent, we don't
1172 need to *modify* it at all. All we need to do is make a list of all the
1173 pages which can be freed just as soon as we've flushed the IOTLB and we
1174 know the hardware page-walk will no longer touch them.
1175 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1177 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1178 int level, struct dma_pte *pte,
1179 struct page *freelist)
1183 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1184 pg->freelist = freelist;
1190 pte = page_address(pg);
1192 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1193 freelist = dma_pte_list_pagetables(domain, level - 1,
1196 } while (!first_pte_in_page(pte));
1201 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1202 struct dma_pte *pte, unsigned long pfn,
1203 unsigned long start_pfn,
1204 unsigned long last_pfn,
1205 struct page *freelist)
1207 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1209 pfn = max(start_pfn, pfn);
1210 pte = &pte[pfn_level_offset(pfn, level)];
1213 unsigned long level_pfn;
1215 if (!dma_pte_present(pte))
1218 level_pfn = pfn & level_mask(level);
1220 /* If range covers entire pagetable, free it */
1221 if (start_pfn <= level_pfn &&
1222 last_pfn >= level_pfn + level_size(level) - 1) {
/* These subordinate page tables are going away entirely. Don't
1224 bother to clear them; we're just going to *free* them. */
1225 if (level > 1 && !dma_pte_superpage(pte))
1226 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1232 } else if (level > 1) {
1233 /* Recurse down into a level that isn't *entirely* obsolete */
1234 freelist = dma_pte_clear_level(domain, level - 1,
1235 phys_to_virt(dma_pte_addr(pte)),
1236 level_pfn, start_pfn, last_pfn,
1240 pfn += level_size(level);
1241 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1244 domain_flush_cache(domain, first_pte,
1245 (void *)++last_pte - (void *)first_pte);
1250 /* We can't just free the pages because the IOMMU may still be walking
1251 the page tables, and may have cached the intermediate levels. The
1252 pages can only be freed after the IOTLB flush has been done. */
1253 static struct page *domain_unmap(struct dmar_domain *domain,
1254 unsigned long start_pfn,
1255 unsigned long last_pfn)
1257 struct page *freelist = NULL;
1259 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1260 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1261 BUG_ON(start_pfn > last_pfn);
1263 /* we don't need lock here; nobody else touches the iova range */
1264 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1265 domain->pgd, 0, start_pfn, last_pfn, NULL);
1268 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1269 struct page *pgd_page = virt_to_page(domain->pgd);
1270 pgd_page->freelist = freelist;
1271 freelist = pgd_page;
1279 static void dma_free_pagelist(struct page *freelist)
1283 while ((pg = freelist)) {
1284 freelist = pg->freelist;
1285 free_pgtable_page(page_address(pg));
1289 static void iova_entry_free(unsigned long data)
1291 struct page *freelist = (struct page *)data;
1293 dma_free_pagelist(freelist);
1296 /* iommu handling */
1297 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1299 struct root_entry *root;
1300 unsigned long flags;
1302 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1304 pr_err("Allocating root entry for %s failed\n",
1309 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1311 spin_lock_irqsave(&iommu->lock, flags);
1312 iommu->root_entry = root;
1313 spin_unlock_irqrestore(&iommu->lock, flags);
1318 static void iommu_set_root_entry(struct intel_iommu *iommu)
1324 addr = virt_to_phys(iommu->root_entry);
1325 if (ecs_enabled(iommu))
1326 addr |= DMA_RTADDR_RTT;
1328 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1329 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1331 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1333 /* Make sure hardware complete it */
1334 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1335 readl, (sts & DMA_GSTS_RTPS), sts);
1337 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1340 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1345 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1348 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1349 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1351 /* Make sure hardware complete it */
1352 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1353 readl, (!(val & DMA_GSTS_WBFS)), val);
1355 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1359 static void __iommu_flush_context(struct intel_iommu *iommu,
1360 u16 did, u16 source_id, u8 function_mask,
1367 case DMA_CCMD_GLOBAL_INVL:
1368 val = DMA_CCMD_GLOBAL_INVL;
1370 case DMA_CCMD_DOMAIN_INVL:
1371 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1373 case DMA_CCMD_DEVICE_INVL:
1374 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1375 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1380 val |= DMA_CCMD_ICC;
1382 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1383 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1385 /* Make sure hardware complete it */
1386 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1387 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1389 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1393 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1394 u64 addr, unsigned int size_order, u64 type)
1396 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1397 u64 val = 0, val_iva = 0;
1401 case DMA_TLB_GLOBAL_FLUSH:
/* global flush doesn't need to set IVA_REG */
1403 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1405 case DMA_TLB_DSI_FLUSH:
1406 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1408 case DMA_TLB_PSI_FLUSH:
1409 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1410 /* IH bit is passed in as part of address */
1411 val_iva = size_order | addr;
1416 /* Note: set drain read/write */
1419 * This is probably to be super secure.. Looks like we can
1420 * ignore it without any impact.
1422 if (cap_read_drain(iommu->cap))
1423 val |= DMA_TLB_READ_DRAIN;
1425 if (cap_write_drain(iommu->cap))
1426 val |= DMA_TLB_WRITE_DRAIN;
1428 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1429 /* Note: Only uses first TLB reg currently */
1431 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1432 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1434 /* Make sure hardware complete it */
1435 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1436 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1438 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1440 /* check IOTLB invalidation granularity */
1441 if (DMA_TLB_IAIG(val) == 0)
1442 pr_err("Flush IOTLB failed\n");
1443 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1444 pr_debug("TLB flush request %Lx, actual %Lx\n",
1445 (unsigned long long)DMA_TLB_IIRG(type),
1446 (unsigned long long)DMA_TLB_IAIG(val));
1449 static struct device_domain_info *
1450 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1453 struct device_domain_info *info;
1455 assert_spin_locked(&device_domain_lock);
1460 list_for_each_entry(info, &domain->devices, link)
1461 if (info->iommu == iommu && info->bus == bus &&
1462 info->devfn == devfn) {
1463 if (info->ats_supported && info->dev)
1471 static void domain_update_iotlb(struct dmar_domain *domain)
1473 struct device_domain_info *info;
1474 bool has_iotlb_device = false;
1476 assert_spin_locked(&device_domain_lock);
1478 list_for_each_entry(info, &domain->devices, link) {
1479 struct pci_dev *pdev;
1481 if (!info->dev || !dev_is_pci(info->dev))
1484 pdev = to_pci_dev(info->dev);
1485 if (pdev->ats_enabled) {
1486 has_iotlb_device = true;
1491 domain->has_iotlb_device = has_iotlb_device;
1494 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1496 struct pci_dev *pdev;
1498 assert_spin_locked(&device_domain_lock);
1500 if (!info || !dev_is_pci(info->dev))
1503 pdev = to_pci_dev(info->dev);
1505 #ifdef CONFIG_INTEL_IOMMU_SVM
1506 /* The PCIe spec, in its wisdom, declares that the behaviour of
1507 the device if you enable PASID support after ATS support is
1508 undefined. So always enable PASID support on devices which
   have it, even if we can't yet know if we're ever going to
   use it. */
1511 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1512 info->pasid_enabled = 1;
1514 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1515 info->pri_enabled = 1;
1517 if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1518 info->ats_enabled = 1;
1519 domain_update_iotlb(info->domain);
1520 info->ats_qdep = pci_ats_queue_depth(pdev);
1524 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1526 struct pci_dev *pdev;
1528 assert_spin_locked(&device_domain_lock);
1530 if (!dev_is_pci(info->dev))
1533 pdev = to_pci_dev(info->dev);
1535 if (info->ats_enabled) {
1536 pci_disable_ats(pdev);
1537 info->ats_enabled = 0;
1538 domain_update_iotlb(info->domain);
1540 #ifdef CONFIG_INTEL_IOMMU_SVM
1541 if (info->pri_enabled) {
1542 pci_disable_pri(pdev);
1543 info->pri_enabled = 0;
1545 if (info->pasid_enabled) {
1546 pci_disable_pasid(pdev);
1547 info->pasid_enabled = 0;
1552 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1553 u64 addr, unsigned mask)
1556 unsigned long flags;
1557 struct device_domain_info *info;
1559 if (!domain->has_iotlb_device)
1562 spin_lock_irqsave(&device_domain_lock, flags);
1563 list_for_each_entry(info, &domain->devices, link) {
1564 if (!info->ats_enabled)
1567 sid = info->bus << 8 | info->devfn;
1568 qdep = info->ats_qdep;
1569 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1571 spin_unlock_irqrestore(&device_domain_lock, flags);
1574 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1575 struct dmar_domain *domain,
1576 unsigned long pfn, unsigned int pages,
1579 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1580 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1581 u16 did = domain->iommu_did[iommu->seq_id];
/*
 * Fallback to domain selective flush if no PSI support or the size is
 * too big.
 * PSI requires the page size to be 2 ^ x, and the base address to be
 * naturally aligned to the size.
 */
1593 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1594 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1597 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1601 * In caching mode, changes of pages from non-present to present require
1602 * flush. However, device IOTLB doesn't need to be flushed in this case.
1604 if (!cap_caching_mode(iommu->cap) || !map)
1605 iommu_flush_dev_iotlb(domain, addr, mask);
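/*
 * Worked example for the PSI path above (illustrative): flushing 5 pages
 * at pfn 0x1000 gives mask = ilog2(__roundup_pow_of_two(5)) = 3, i.e. an
 * invalidation of 2^3 = 8 pages naturally aligned at addr 0x1000000.
 * If mask exceeded cap_max_amask_val(), the code falls back to the
 * domain-selective (DSI) flush instead.
 */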
1608 /* Notification for newly created mappings */
1609 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1610 struct dmar_domain *domain,
1611 unsigned long pfn, unsigned int pages)
1613 /* It's a non-present to present mapping. Only flush if caching mode */
1614 if (cap_caching_mode(iommu->cap))
1615 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1617 iommu_flush_write_buffer(iommu);
1620 static void iommu_flush_iova(struct iova_domain *iovad)
1622 struct dmar_domain *domain;
1625 domain = container_of(iovad, struct dmar_domain, iovad);
1627 for_each_domain_iommu(idx, domain) {
1628 struct intel_iommu *iommu = g_iommus[idx];
1629 u16 did = domain->iommu_did[iommu->seq_id];
1631 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1633 if (!cap_caching_mode(iommu->cap))
1634 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1635 0, MAX_AGAW_PFN_WIDTH);
1639 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1642 unsigned long flags;
1644 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1645 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1646 pmen &= ~DMA_PMEN_EPM;
1647 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1649 /* wait for the protected region status bit to clear */
1650 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1651 readl, !(pmen & DMA_PMEN_PRS), pmen);
1653 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1656 static void iommu_enable_translation(struct intel_iommu *iommu)
1659 unsigned long flags;
1661 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1662 iommu->gcmd |= DMA_GCMD_TE;
1663 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1665 /* Make sure hardware complete it */
1666 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1667 readl, (sts & DMA_GSTS_TES), sts);
1669 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1672 static void iommu_disable_translation(struct intel_iommu *iommu)
1677 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1678 iommu->gcmd &= ~DMA_GCMD_TE;
1679 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1681 /* Make sure hardware complete it */
1682 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1683 readl, (!(sts & DMA_GSTS_TES)), sts);
1685 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1689 static int iommu_init_domains(struct intel_iommu *iommu)
1691 u32 ndomains, nlongs;
1694 ndomains = cap_ndoms(iommu->cap);
1695 pr_debug("%s: Number of Domains supported <%d>\n",
1696 iommu->name, ndomains);
1697 nlongs = BITS_TO_LONGS(ndomains);
1699 spin_lock_init(&iommu->lock);
1701 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1702 if (!iommu->domain_ids) {
1703 pr_err("%s: Allocating domain id array failed\n",
1708 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1709 iommu->domains = kzalloc(size, GFP_KERNEL);
1711 if (iommu->domains) {
1712 size = 256 * sizeof(struct dmar_domain *);
1713 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1716 if (!iommu->domains || !iommu->domains[0]) {
1717 pr_err("%s: Allocating domain array failed\n",
1719 kfree(iommu->domain_ids);
1720 kfree(iommu->domains);
1721 iommu->domain_ids = NULL;
1722 iommu->domains = NULL;
1729 * If Caching mode is set, then invalid translations are tagged
1730 * with domain-id 0, hence we need to pre-allocate it. We also
1731 * use domain-id 0 as a marker for non-allocated domain-id, so
1732 * make sure it is not used for a real domain.
1734 set_bit(0, iommu->domain_ids);
1739 static void disable_dmar_iommu(struct intel_iommu *iommu)
1741 struct device_domain_info *info, *tmp;
1742 unsigned long flags;
1744 if (!iommu->domains || !iommu->domain_ids)
1748 spin_lock_irqsave(&device_domain_lock, flags);
1749 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1750 struct dmar_domain *domain;
1752 if (info->iommu != iommu)
1755 if (!info->dev || !info->domain)
1758 domain = info->domain;
1760 __dmar_remove_one_dev_info(info);
1762 if (!domain_type_is_vm_or_si(domain)) {
1764 * The domain_exit() function can't be called under
1765 * device_domain_lock, as it takes this lock itself.
1766 * So release the lock here and re-run the loop
1769 spin_unlock_irqrestore(&device_domain_lock, flags);
1770 domain_exit(domain);
1774 spin_unlock_irqrestore(&device_domain_lock, flags);
1776 if (iommu->gcmd & DMA_GCMD_TE)
1777 iommu_disable_translation(iommu);
1780 static void free_dmar_iommu(struct intel_iommu *iommu)
1782 if ((iommu->domains) && (iommu->domain_ids)) {
1783 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1786 for (i = 0; i < elems; i++)
1787 kfree(iommu->domains[i]);
1788 kfree(iommu->domains);
1789 kfree(iommu->domain_ids);
1790 iommu->domains = NULL;
1791 iommu->domain_ids = NULL;
1794 g_iommus[iommu->seq_id] = NULL;
1796 /* free context mapping */
1797 free_context_table(iommu);
1799 #ifdef CONFIG_INTEL_IOMMU_SVM
1800 if (pasid_enabled(iommu)) {
1801 if (ecap_prs(iommu->ecap))
1802 intel_svm_finish_prq(iommu);
1803 intel_svm_free_pasid_tables(iommu);
1808 static struct dmar_domain *alloc_domain(int flags)
1810 struct dmar_domain *domain;
1812 domain = alloc_domain_mem();
1816 memset(domain, 0, sizeof(*domain));
1818 domain->flags = flags;
1819 domain->has_iotlb_device = false;
1820 INIT_LIST_HEAD(&domain->devices);
1825 /* Must be called with iommu->lock */
1826 static int domain_attach_iommu(struct dmar_domain *domain,
1827 struct intel_iommu *iommu)
1829 unsigned long ndomains;
1832 assert_spin_locked(&device_domain_lock);
1833 assert_spin_locked(&iommu->lock);
1835 domain->iommu_refcnt[iommu->seq_id] += 1;
1836 domain->iommu_count += 1;
1837 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1838 ndomains = cap_ndoms(iommu->cap);
1839 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1841 if (num >= ndomains) {
1842 pr_err("%s: No free domain ids\n", iommu->name);
1843 domain->iommu_refcnt[iommu->seq_id] -= 1;
1844 domain->iommu_count -= 1;
1848 set_bit(num, iommu->domain_ids);
1849 set_iommu_domain(iommu, num, domain);
1851 domain->iommu_did[iommu->seq_id] = num;
1852 domain->nid = iommu->node;
1854 domain_update_iommu_cap(domain);
1860 static int domain_detach_iommu(struct dmar_domain *domain,
1861 struct intel_iommu *iommu)
1863 int num, count = INT_MAX;
1865 assert_spin_locked(&device_domain_lock);
1866 assert_spin_locked(&iommu->lock);
1868 domain->iommu_refcnt[iommu->seq_id] -= 1;
1869 count = --domain->iommu_count;
1870 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1871 num = domain->iommu_did[iommu->seq_id];
1872 clear_bit(num, iommu->domain_ids);
1873 set_iommu_domain(iommu, num, NULL);
1875 domain_update_iommu_cap(domain);
1876 domain->iommu_did[iommu->seq_id] = 0;
1882 static struct iova_domain reserved_iova_list;
1883 static struct lock_class_key reserved_rbtree_key;
1885 static int dmar_init_reserved_ranges(void)
1887 struct pci_dev *pdev = NULL;
1891 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1893 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1894 &reserved_rbtree_key);
1896 /* IOAPIC ranges shouldn't be accessed by DMA */
1897 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1898 IOVA_PFN(IOAPIC_RANGE_END));
1900 pr_err("Reserve IOAPIC range failed\n");
1904 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1905 for_each_pci_dev(pdev) {
1908 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1909 r = &pdev->resource[i];
1910 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1912 iova = reserve_iova(&reserved_iova_list,
1916 pr_err("Reserve iova failed\n");
1924 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1926 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1929 static inline int guestwidth_to_adjustwidth(int gaw)
1932 int r = (gaw - 12) % 9;
1943 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1946 int adjust_width, agaw;
1947 unsigned long sagaw;
1950 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1952 err = init_iova_flush_queue(&domain->iovad,
1953 iommu_flush_iova, iova_entry_free);
1957 domain_reserve_special_ranges(domain);
1959 /* calculate AGAW */
1960 if (guest_width > cap_mgaw(iommu->cap))
1961 guest_width = cap_mgaw(iommu->cap);
1962 domain->gaw = guest_width;
1963 adjust_width = guestwidth_to_adjustwidth(guest_width);
1964 agaw = width_to_agaw(adjust_width);
1965 sagaw = cap_sagaw(iommu->cap);
1966 if (!test_bit(agaw, &sagaw)) {
1967 /* hardware doesn't support it, choose a bigger one */
1968 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1969 agaw = find_next_bit(&sagaw, 5, agaw);
1973 domain->agaw = agaw;
1975 if (ecap_coherent(iommu->ecap))
1976 domain->iommu_coherency = 1;
1978 domain->iommu_coherency = 0;
1980 if (ecap_sc_support(iommu->ecap))
1981 domain->iommu_snooping = 1;
1983 domain->iommu_snooping = 0;
1985 if (intel_iommu_superpage)
1986 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1988 domain->iommu_superpage = 0;
1990 domain->nid = iommu->node;
1992 /* always allocate the top pgd */
1993 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1996 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
2000 static void domain_exit(struct dmar_domain *domain)
2002 struct page *freelist = NULL;
/* Domain 0 is reserved, so don't process it */
2008 /* Remove associated devices and clear attached or cached domains */
2010 domain_remove_dev_info(domain);
2014 put_iova_domain(&domain->iovad);
2016 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2018 dma_free_pagelist(freelist);
2020 free_domain_mem(domain);
2023 static int domain_context_mapping_one(struct dmar_domain *domain,
2024 struct intel_iommu *iommu,
2027 u16 did = domain->iommu_did[iommu->seq_id];
2028 int translation = CONTEXT_TT_MULTI_LEVEL;
2029 struct device_domain_info *info = NULL;
2030 struct context_entry *context;
2031 unsigned long flags;
2032 struct dma_pte *pgd;
2037 if (hw_pass_through && domain_type_is_si(domain))
2038 translation = CONTEXT_TT_PASS_THROUGH;
2040 pr_debug("Set context mapping for %02x:%02x.%d\n",
2041 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2043 BUG_ON(!domain->pgd);
2045 spin_lock_irqsave(&device_domain_lock, flags);
2046 spin_lock(&iommu->lock);
2049 context = iommu_context_addr(iommu, bus, devfn, 1);
2054 if (context_present(context))
2058 * For kdump cases, old valid entries may be cached due to the
2059 * in-flight DMA and copied pgtable, but there is no unmapping
2060 * behaviour for them, thus we need an explicit cache flush for
2061 * the newly-mapped device. For kdump, at this point, the device
2062 * is supposed to finish reset at its driver probe stage, so no
 * in-flight DMA will exist, and we don't need to worry anymore
 * hereafter.
 */
2066 if (context_copied(context)) {
2067 u16 did_old = context_domain_id(context);
2069 if (did_old < cap_ndoms(iommu->cap)) {
2070 iommu->flush.flush_context(iommu, did_old,
2071 (((u16)bus) << 8) | devfn,
2072 DMA_CCMD_MASK_NOBIT,
2073 DMA_CCMD_DEVICE_INVL);
2074 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2081 context_clear_entry(context);
2082 context_set_domain_id(context, did);
2085 * Skip top levels of page tables for iommu which has less agaw
2086 * than default. Unnecessary for PT mode.
2088 if (translation != CONTEXT_TT_PASS_THROUGH) {
2089 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2091 pgd = phys_to_virt(dma_pte_addr(pgd));
2092 if (!dma_pte_present(pgd))
2096 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2097 if (info && info->ats_supported)
2098 translation = CONTEXT_TT_DEV_IOTLB;
2100 translation = CONTEXT_TT_MULTI_LEVEL;
2102 context_set_address_root(context, virt_to_phys(pgd));
2103 context_set_address_width(context, iommu->agaw);
2106 * In pass through mode, AW must be programmed to
2107 * indicate the largest AGAW value supported by
2108 * hardware. And ASR is ignored by hardware.
2110 context_set_address_width(context, iommu->msagaw);
2113 context_set_translation_type(context, translation);
2114 context_set_fault_enable(context);
2115 context_set_present(context);
2116 domain_flush_cache(domain, context, sizeof(*context));
2119 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries we only need to flush the write-buffer. If it
2121 * _does_ cache non-present entries, then it does so in the special
2122 * domain #0, which we have to flush:
2124 if (cap_caching_mode(iommu->cap)) {
2125 iommu->flush.flush_context(iommu, 0,
2126 (((u16)bus) << 8) | devfn,
2127 DMA_CCMD_MASK_NOBIT,
2128 DMA_CCMD_DEVICE_INVL);
2129 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2131 iommu_flush_write_buffer(iommu);
2133 iommu_enable_dev_iotlb(info);
2138 spin_unlock(&iommu->lock);
2139 spin_unlock_irqrestore(&device_domain_lock, flags);
2144 struct domain_context_mapping_data {
2145 struct dmar_domain *domain;
2146 struct intel_iommu *iommu;
2149 static int domain_context_mapping_cb(struct pci_dev *pdev,
2150 u16 alias, void *opaque)
2152 struct domain_context_mapping_data *data = opaque;
2154 return domain_context_mapping_one(data->domain, data->iommu,
2155 PCI_BUS_NUM(alias), alias & 0xff);
2159 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2161 struct intel_iommu *iommu;
2163 struct domain_context_mapping_data data;
2165 iommu = device_to_iommu(dev, &bus, &devfn);
2169 if (!dev_is_pci(dev))
2170 return domain_context_mapping_one(domain, iommu, bus, devfn);
2172 data.domain = domain;
2175 return pci_for_each_dma_alias(to_pci_dev(dev),
2176 &domain_context_mapping_cb, &data);
2179 static int domain_context_mapped_cb(struct pci_dev *pdev,
2180 u16 alias, void *opaque)
2182 struct intel_iommu *iommu = opaque;
2184 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2187 static int domain_context_mapped(struct device *dev)
2189 struct intel_iommu *iommu;
2192 iommu = device_to_iommu(dev, &bus, &devfn);
2196 if (!dev_is_pci(dev))
2197 return device_context_mapped(iommu, bus, devfn);
2199 return !pci_for_each_dma_alias(to_pci_dev(dev),
2200 domain_context_mapped_cb, iommu);
2203 /* Returns a number of VTD pages, but aligned to MM page size */
2204 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2207 host_addr &= ~PAGE_MASK;
2208 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
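/*
 * Example (illustrative): with 4KiB MM pages, aligned_nrpages(0x2800, 0x1000)
 * keeps only the in-page offset (0x800), rounds 0x800 + 0x1000 up to
 * 0x2000 and returns 2 VT-d pages, because the buffer straddles a page
 * boundary even though it is only one page long.
 */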
2211 /* Return largest possible superpage level for a given mapping */
2212 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2213 unsigned long iov_pfn,
2214 unsigned long phy_pfn,
2215 unsigned long pages)
2217 int support, level = 1;
2218 unsigned long pfnmerge;
2220 support = domain->iommu_superpage;
2222 /* To use a large page, the virtual *and* physical addresses
2223 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2224 of them will mean we have to use smaller pages. So just
2225 merge them and check both at once. */
2226 pfnmerge = iov_pfn | phy_pfn;
2228 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2229 pages >>= VTD_STRIDE_SHIFT;
2232 pfnmerge >>= VTD_STRIDE_SHIFT;
2239 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2240 struct scatterlist *sg, unsigned long phys_pfn,
2241 unsigned long nr_pages, int prot)
2243 struct dma_pte *first_pte = NULL, *pte = NULL;
2244 phys_addr_t uninitialized_var(pteval);
2245 unsigned long sg_res = 0;
2246 unsigned int largepage_lvl = 0;
2247 unsigned long lvl_pages = 0;
2249 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2251 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2254 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2258 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2261 while (nr_pages > 0) {
2265 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2267 sg_res = aligned_nrpages(sg->offset, sg->length);
2268 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2269 sg->dma_length = sg->length;
2270 pteval = (sg_phys(sg) - pgoff) | prot;
2271 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2275 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2277 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
/* It is a large page */
2281 if (largepage_lvl > 1) {
2282 unsigned long nr_superpages, end_pfn;
2284 pteval |= DMA_PTE_LARGE_PAGE;
2285 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2287 nr_superpages = sg_res / lvl_pages;
2288 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2291 * Ensure that old small page tables are
2292 * removed to make room for superpage(s).
2293 * We're adding new large pages, so make sure
2294 * we don't remove their parent tables.
2296 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2299 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2303 /* We don't need lock here, nobody else
2304 * touches the iova range
2306 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2308 static int dumps = 5;
2309 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2310 iov_pfn, tmp, (unsigned long long)pteval);
2313 debug_dma_dump_mappings(NULL);
2318 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2320 BUG_ON(nr_pages < lvl_pages);
2321 BUG_ON(sg_res < lvl_pages);
2323 nr_pages -= lvl_pages;
2324 iov_pfn += lvl_pages;
2325 phys_pfn += lvl_pages;
2326 pteval += lvl_pages * VTD_PAGE_SIZE;
2327 sg_res -= lvl_pages;
2329 /* If the next PTE would be the first in a new page, then we
2330 need to flush the cache on the entries we've just written.
2331 And then we'll need to recalculate 'pte', so clear it and
2332 let it get set again in the if (!pte) block above.
2334 If we're done (!nr_pages) we need to flush the cache too.
2336 Also if we've been setting superpages, we may need to
2337 recalculate 'pte' and switch back to smaller pages for the
2338 end of the mapping, if the trailing size is not enough to
2339 use another superpage (i.e. sg_res < lvl_pages). */
2341 if (!nr_pages || first_pte_in_page(pte) ||
2342 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2343 domain_flush_cache(domain, first_pte,
2344 (void *)pte - (void *)first_pte);
2348 if (!sg_res && nr_pages)
2354 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2355 struct scatterlist *sg, unsigned long phys_pfn,
2356 unsigned long nr_pages, int prot)
2359 struct intel_iommu *iommu;
2361 /* Do the real mapping first */
2362 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2366 /* Notify about the new mapping */
2367 if (domain_type_is_vm(domain)) {
2368 /* VM typed domains can have more than one IOMMU */
2370 for_each_domain_iommu(iommu_id, domain) {
2371 iommu = g_iommus[iommu_id];
2372 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2375 /* General domains only have one IOMMU */
2376 iommu = domain_get_iommu(domain);
2377 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2383 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2384 struct scatterlist *sg, unsigned long nr_pages,
2387 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2390 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2391 unsigned long phys_pfn, unsigned long nr_pages,
2394 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2397 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2399 unsigned long flags;
2400 struct context_entry *context;
2406 spin_lock_irqsave(&iommu->lock, flags);
2407 context = iommu_context_addr(iommu, bus, devfn, 0);
2409 spin_unlock_irqrestore(&iommu->lock, flags);
2412 did_old = context_domain_id(context);
2413 context_clear_entry(context);
2414 __iommu_flush_cache(iommu, context, sizeof(*context));
2415 spin_unlock_irqrestore(&iommu->lock, flags);
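/* Invalidate the cached context entry for this bus/devfn source-id, then
 * flush the IOTLB entries tagged with the old domain-id. */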
2416 iommu->flush.flush_context(iommu,
2418 (((u16)bus) << 8) | devfn,
2419 DMA_CCMD_MASK_NOBIT,
2420 DMA_CCMD_DEVICE_INVL);
2421 iommu->flush.flush_iotlb(iommu,
2428 static inline void unlink_domain_info(struct device_domain_info *info)
2430 assert_spin_locked(&device_domain_lock);
2431 list_del(&info->link);
2432 list_del(&info->global);
2434 info->dev->archdata.iommu = NULL;
2437 static void domain_remove_dev_info(struct dmar_domain *domain)
2439 struct device_domain_info *info, *tmp;
2440 unsigned long flags;
2442 spin_lock_irqsave(&device_domain_lock, flags);
2443 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2444 __dmar_remove_one_dev_info(info);
2445 spin_unlock_irqrestore(&device_domain_lock, flags);
2450 * Note: we use struct device->archdata.iommu to store the info
2452 static struct dmar_domain *find_domain(struct device *dev)
2454 struct device_domain_info *info;
2456 /* No lock here, assumes no domain exit in normal case */
2457 info = dev->archdata.iommu;
2459 return info->domain;
2463 static inline struct device_domain_info *
2464 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2466 struct device_domain_info *info;
2468 list_for_each_entry(info, &device_domain_list, global)
2469 if (info->iommu->segment == segment && info->bus == bus &&
2470 info->devfn == devfn)
2476 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2479 struct dmar_domain *domain)
2481 struct dmar_domain *found = NULL;
2482 struct device_domain_info *info;
2483 unsigned long flags;
2486 info = alloc_devinfo_mem();
2491 info->devfn = devfn;
2492 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2493 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2496 info->domain = domain;
2497 info->iommu = iommu;
2499 if (dev && dev_is_pci(dev)) {
2500 struct pci_dev *pdev = to_pci_dev(info->dev);
2502 if (!pci_ats_disabled() &&
2503 ecap_dev_iotlb_support(iommu->ecap) &&
2504 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2505 dmar_find_matched_atsr_unit(pdev))
2506 info->ats_supported = 1;
2508 if (ecs_enabled(iommu)) {
2509 if (pasid_enabled(iommu)) {
2510 int features = pci_pasid_features(pdev);
2512 info->pasid_supported = features | 1;
2515 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2516 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2517 info->pri_supported = 1;
2521 spin_lock_irqsave(&device_domain_lock, flags);
2523 found = find_domain(dev);
2526 struct device_domain_info *info2;
2527 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2529 found = info2->domain;
2535 spin_unlock_irqrestore(&device_domain_lock, flags);
2536 free_devinfo_mem(info);
2537 /* Caller must free the original domain */
2541 spin_lock(&iommu->lock);
2542 ret = domain_attach_iommu(domain, iommu);
2543 spin_unlock(&iommu->lock);
2546 spin_unlock_irqrestore(&device_domain_lock, flags);
2547 free_devinfo_mem(info);
2551 list_add(&info->link, &domain->devices);
2552 list_add(&info->global, &device_domain_list);
2554 dev->archdata.iommu = info;
2555 spin_unlock_irqrestore(&device_domain_lock, flags);
2557 if (dev && domain_context_mapping(domain, dev)) {
2558 pr_err("Domain context map for %s failed\n", dev_name(dev));
2559 dmar_remove_one_dev_info(domain, dev);
2566 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2568 *(u16 *)opaque = alias;
2572 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2574 struct device_domain_info *info = NULL;
2575 struct dmar_domain *domain = NULL;
2576 struct intel_iommu *iommu;
2578 unsigned long flags;
2581 iommu = device_to_iommu(dev, &bus, &devfn);
2585 if (dev_is_pci(dev)) {
2586 struct pci_dev *pdev = to_pci_dev(dev);
2588 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2590 spin_lock_irqsave(&device_domain_lock, flags);
2591 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2592 PCI_BUS_NUM(dma_alias),
2595 iommu = info->iommu;
2596 domain = info->domain;
2598 spin_unlock_irqrestore(&device_domain_lock, flags);
2600 /* DMA alias already has a domain, use it */
2605 /* Allocate and initialize new domain for the device */
2606 domain = alloc_domain(0);
2609 if (domain_init(domain, iommu, gaw)) {
2610 domain_exit(domain);
2619 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2620 struct dmar_domain *domain)
2622 struct intel_iommu *iommu;
2623 struct dmar_domain *tmp;
2624 u16 req_id, dma_alias;
2627 iommu = device_to_iommu(dev, &bus, &devfn);
2631 req_id = ((u16)bus << 8) | devfn;
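/* req_id is the PCI requester ID as the IOMMU sees it: bus number in
 * bits 15:8, devfn in bits 7:0. */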
2633 if (dev_is_pci(dev)) {
2634 struct pci_dev *pdev = to_pci_dev(dev);
2636 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2638 /* register PCI DMA alias device */
2639 if (req_id != dma_alias) {
2640 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2641 dma_alias & 0xff, NULL, domain);
2643 if (!tmp || tmp != domain)
2648 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2649 if (!tmp || tmp != domain)
2655 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2657 struct dmar_domain *domain, *tmp;
2659 domain = find_domain(dev);
2663 domain = find_or_alloc_domain(dev, gaw);
2667 tmp = set_domain_for_dev(dev, domain);
2668 if (!tmp || domain != tmp) {
2669 domain_exit(domain);
2678 static int iommu_domain_identity_map(struct dmar_domain *domain,
2679 unsigned long long start,
2680 unsigned long long end)
2682 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2683 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2685 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2686 dma_to_mm_pfn(last_vpfn))) {
2687 pr_err("Reserving iova failed\n");
2691 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2693 * RMRR range might overlap with a physical memory range, so clear any existing mapping first
2696 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2698 return __domain_mapping(domain, first_vpfn, NULL,
2699 first_vpfn, last_vpfn - first_vpfn + 1,
2700 DMA_PTE_READ|DMA_PTE_WRITE);
2703 static int domain_prepare_identity_map(struct device *dev,
2704 struct dmar_domain *domain,
2705 unsigned long long start,
2706 unsigned long long end)
2708 /* For _hardware_ passthrough, don't bother. But for software
2709 passthrough, we do it anyway -- it may indicate a memory
2710 range which is reserved in E820 and which therefore didn't
2711 get set up in si_domain to start with */
2712 if (domain == si_domain && hw_pass_through) {
2713 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2714 dev_name(dev), start, end);
2718 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2719 dev_name(dev), start, end);
2722 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2723 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2724 dmi_get_system_info(DMI_BIOS_VENDOR),
2725 dmi_get_system_info(DMI_BIOS_VERSION),
2726 dmi_get_system_info(DMI_PRODUCT_VERSION));
2730 if (end >> agaw_to_width(domain->agaw)) {
2731 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2732 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2733 agaw_to_width(domain->agaw),
2734 dmi_get_system_info(DMI_BIOS_VENDOR),
2735 dmi_get_system_info(DMI_BIOS_VERSION),
2736 dmi_get_system_info(DMI_PRODUCT_VERSION));
2740 return iommu_domain_identity_map(domain, start, end);
2743 static int iommu_prepare_identity_map(struct device *dev,
2744 unsigned long long start,
2745 unsigned long long end)
2747 struct dmar_domain *domain;
2750 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2754 ret = domain_prepare_identity_map(dev, domain, start, end);
2756 domain_exit(domain);
2761 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2764 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2766 return iommu_prepare_identity_map(dev, rmrr->base_address,
2770 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2771 static inline void iommu_prepare_isa(void)
2773 struct pci_dev *pdev;
2776 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2780 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2781 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2784 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2789 static inline void iommu_prepare_isa(void)
2793 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2795 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2797 static int __init si_domain_init(int hw)
2801 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2805 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2806 domain_exit(si_domain);
2810 pr_debug("Identity mapping domain allocated\n");
2815 for_each_online_node(nid) {
2816 unsigned long start_pfn, end_pfn;
2819 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2820 ret = iommu_domain_identity_map(si_domain,
2821 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2830 static int identity_mapping(struct device *dev)
2832 struct device_domain_info *info;
2834 if (likely(!iommu_identity_mapping))
2837 info = dev->archdata.iommu;
2838 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2839 return (info->domain == si_domain);
2844 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2846 struct dmar_domain *ndomain;
2847 struct intel_iommu *iommu;
2850 iommu = device_to_iommu(dev, &bus, &devfn);
2854 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2855 if (ndomain != domain)
2861 static bool device_has_rmrr(struct device *dev)
2863 struct dmar_rmrr_unit *rmrr;
2868 for_each_rmrr_units(rmrr) {
2870 * Return TRUE if this RMRR contains the device we are looking for.
2873 for_each_active_dev_scope(rmrr->devices,
2874 rmrr->devices_cnt, i, tmp)
2885 * There are a couple of cases where we need to restrict the functionality of
2886 * devices associated with RMRRs. The first is when evaluating a device for
2887 * identity mapping because problems exist when devices are moved in and out
2888 * of domains and their respective RMRR information is lost. This means that
2889 * a device with associated RMRRs will never be in a "passthrough" domain.
2890 * The second is use of the device through the IOMMU API. This interface
2891 * expects to have full control of the IOVA space for the device. We cannot
2892 * satisfy both the requirement that RMRR access is maintained and have an
2893 * unencumbered IOVA space. We also have no ability to quiesce the device's
2894 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2895 * We therefore prevent devices associated with an RMRR from participating in
2896 * the IOMMU API, which eliminates them from device assignment.
2898 * In both cases we assume that PCI USB devices with RMRRs have them largely
2899 * for historical reasons and that the RMRR space is not actively used post
2900 * boot. This exclusion may change if vendors begin to abuse it.
2902 * The same exception is made for graphics devices, with the requirement that
2903 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2906 static bool device_is_rmrr_locked(struct device *dev)
2908 if (!device_has_rmrr(dev))
2911 if (dev_is_pci(dev)) {
2912 struct pci_dev *pdev = to_pci_dev(dev);
2914 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2921 static int iommu_should_identity_map(struct device *dev, int startup)
2924 if (dev_is_pci(dev)) {
2925 struct pci_dev *pdev = to_pci_dev(dev);
2927 if (device_is_rmrr_locked(dev))
2930 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2933 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2936 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2940 * We want to start off with all devices in the 1:1 domain, and
2941 * take them out later if we find they can't access all of memory.
2943 * However, we can't do this for PCI devices behind bridges,
2944 * because all PCI devices behind the same bridge will end up
2945 * with the same source-id on their transactions.
2947 * Practically speaking, we can't change things around for these
2948 * devices at run-time, because we can't be sure there'll be no
2949 * DMA transactions in flight for any of their siblings.
2951 * So PCI devices (unless they're on the root bus) as well as
2952 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2953 * the 1:1 domain, just in _case_ one of their siblings turns out
2954 * not to be able to map all of memory.
2956 if (!pci_is_pcie(pdev)) {
2957 if (!pci_is_root_bus(pdev->bus))
2959 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2961 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2964 if (device_has_rmrr(dev))
2969 * At boot time, we don't yet know if devices will be 64-bit capable.
2970 * Assume that they will — if they turn out not to be, then we can
2971 * take them out of the 1:1 domain later.
2975 * If the device's dma_mask is less than the system's memory
2976 * size then this is not a candidate for identity mapping.
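* (e.g. a device limited to DMA_BIT_MASK(32) on a machine with more than
* 4GiB of RAM fails this check and is left out of the 1:1 domain)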
2978 u64 dma_mask = *dev->dma_mask;
2980 if (dev->coherent_dma_mask &&
2981 dev->coherent_dma_mask < dma_mask)
2982 dma_mask = dev->coherent_dma_mask;
2984 return dma_mask >= dma_get_required_mask(dev);
2990 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2994 if (!iommu_should_identity_map(dev, 1))
2997 ret = domain_add_dev_info(si_domain, dev);
2999 pr_info("%s identity mapping for device %s\n",
3000 hw ? "Hardware" : "Software", dev_name(dev));
3001 else if (ret == -ENODEV)
3002 /* device not associated with an iommu */
3009 static int __init iommu_prepare_static_identity_mapping(int hw)
3011 struct pci_dev *pdev = NULL;
3012 struct dmar_drhd_unit *drhd;
3013 struct intel_iommu *iommu;
3018 for_each_pci_dev(pdev) {
3019 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3024 for_each_active_iommu(iommu, drhd)
3025 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3026 struct acpi_device_physical_node *pn;
3027 struct acpi_device *adev;
3029 if (dev->bus != &acpi_bus_type)
3032 adev = to_acpi_device(dev);
3033 mutex_lock(&adev->physical_node_lock);
3034 list_for_each_entry(pn, &adev->physical_node_list, node) {
3035 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3039 mutex_unlock(&adev->physical_node_lock);
3047 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3050 * Start from a sane IOMMU hardware state.
3051 * If queued invalidation was already initialized by us
3052 * (for example, while enabling interrupt remapping) then
3053 * things are already rolling from a sane state.
3057 * Clear any previous faults.
3059 dmar_fault(-1, iommu);
3061 * Disable queued invalidation if supported and already enabled
3062 * before OS handover.
3064 dmar_disable_qi(iommu);
3067 if (dmar_enable_qi(iommu)) {
3069 * Queued Invalidate not enabled, use Register Based Invalidate
3071 iommu->flush.flush_context = __iommu_flush_context;
3072 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3073 pr_info("%s: Using Register based invalidation\n",
3076 iommu->flush.flush_context = qi_flush_context;
3077 iommu->flush.flush_iotlb = qi_flush_iotlb;
3078 pr_info("%s: Using Queued invalidation\n", iommu->name);
3082 static int copy_context_table(struct intel_iommu *iommu,
3083 struct root_entry *old_re,
3084 struct context_entry **tbl,
3087 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3088 struct context_entry *new_ce = NULL, ce;
3089 struct context_entry *old_ce = NULL;
3090 struct root_entry re;
3091 phys_addr_t old_ce_phys;
3093 tbl_idx = ext ? bus * 2 : bus;
3094 memcpy(&re, old_re, sizeof(re));
3096 for (devfn = 0; devfn < 256; devfn++) {
3097 /* First calculate the correct index */
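/* In extended mode each context entry is twice the legacy size, so a
 * 4KiB table page holds only 128 devfns: index in legacy-entry units and
 * spill the upper half of the bus into a second table. */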
3098 idx = (ext ? devfn * 2 : devfn) % 256;
3101 /* First save what we may have and clean up */
3103 tbl[tbl_idx] = new_ce;
3104 __iommu_flush_cache(iommu, new_ce,
3114 old_ce_phys = root_entry_lctp(&re);
3116 old_ce_phys = root_entry_uctp(&re);
3119 if (ext && devfn == 0) {
3120 /* No LCTP, try UCTP */
3129 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3134 new_ce = alloc_pgtable_page(iommu->node);
3141 /* Now copy the context entry */
3142 memcpy(&ce, old_ce + idx, sizeof(ce));
3144 if (!__context_present(&ce))
3147 did = context_domain_id(&ce);
3148 if (did >= 0 && did < cap_ndoms(iommu->cap))
3149 set_bit(did, iommu->domain_ids);
3152 * We need a marker for copied context entries. This
3153 * marker needs to work for the old format as well as
3154 * for extended context entries.
3156 * Bit 67 of the context entry is used. In the old
3157 * format this bit is available to software, in the
3158 * extended format it is the PGE bit, but PGE is ignored
3159 * by HW if PASIDs are disabled (and thus still available).
3162 * So disable PASIDs first and then mark the entry
3163 * copied. This means that we don't copy PASID
3164 * translations from the old kernel, but this is fine as
3165 * faults there are not fatal.
3167 context_clear_pasid_enable(&ce);
3168 context_set_copied(&ce);
3173 tbl[tbl_idx + pos] = new_ce;
3175 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3184 static int copy_translation_tables(struct intel_iommu *iommu)
3186 struct context_entry **ctxt_tbls;
3187 struct root_entry *old_rt;
3188 phys_addr_t old_rt_phys;
3189 int ctxt_table_entries;
3190 unsigned long flags;
3195 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3196 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3197 new_ext = !!ecap_ecs(iommu->ecap);
3200 * The RTT bit can only be changed when translation is disabled,
3201 * but disabling translation would open a window for data
3202 * corruption. So bail out and don't copy anything if we would
3203 * have to change the bit.
3208 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3212 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3216 /* This is too big for the stack - allocate it from slab */
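/* With extended context entries each bus needs two table pages (lower and
 * upper 128 devfns), hence 512 pointers instead of 256. */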
3217 ctxt_table_entries = ext ? 512 : 256;
3219 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3223 for (bus = 0; bus < 256; bus++) {
3224 ret = copy_context_table(iommu, &old_rt[bus],
3225 ctxt_tbls, bus, ext);
3227 pr_err("%s: Failed to copy context table for bus %d\n",
3233 spin_lock_irqsave(&iommu->lock, flags);
3235 /* Context tables are copied, now write them to the root_entry table */
3236 for (bus = 0; bus < 256; bus++) {
3237 int idx = ext ? bus * 2 : bus;
3240 if (ctxt_tbls[idx]) {
3241 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3242 iommu->root_entry[bus].lo = val;
3245 if (!ext || !ctxt_tbls[idx + 1])
3248 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3249 iommu->root_entry[bus].hi = val;
3252 spin_unlock_irqrestore(&iommu->lock, flags);
3256 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3266 static int __init init_dmars(void)
3268 struct dmar_drhd_unit *drhd;
3269 struct dmar_rmrr_unit *rmrr;
3270 bool copied_tables = false;
3272 struct intel_iommu *iommu;
3278 * initialize and program root entry to not present
3281 for_each_drhd_unit(drhd) {
3283 * lock not needed as this is only incremented in the
3284 * single-threaded kernel __init code path; all other accesses are read-only
3287 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3291 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3294 /* Preallocate enough resources for IOMMU hot-addition */
3295 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3296 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3298 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3301 pr_err("Allocating global iommu array failed\n");
3306 for_each_active_iommu(iommu, drhd) {
3307 g_iommus[iommu->seq_id] = iommu;
3309 intel_iommu_init_qi(iommu);
3311 ret = iommu_init_domains(iommu);
3315 init_translation_status(iommu);
3317 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3318 iommu_disable_translation(iommu);
3319 clear_translation_pre_enabled(iommu);
3320 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3326 * we could share the same root & context tables
3327 * among all IOMMUs. Need to split them later.
3329 ret = iommu_alloc_root_entry(iommu);
3333 if (translation_pre_enabled(iommu)) {
3334 pr_info("Translation already enabled - trying to copy translation structures\n");
3336 ret = copy_translation_tables(iommu);
3339 * We found the IOMMU with translation
3340 * enabled - but failed to copy over the
3341 * old root-entry table. Try to proceed
3342 * by disabling translation now and
3343 * allocating a clean root-entry table.
3344 * This might cause DMAR faults, but
3345 * probably the dump will still succeed.
3347 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3349 iommu_disable_translation(iommu);
3350 clear_translation_pre_enabled(iommu);
3352 pr_info("Copied translation tables from previous kernel for %s\n",
3354 copied_tables = true;
3358 if (!ecap_pass_through(iommu->ecap))
3359 hw_pass_through = 0;
3360 #ifdef CONFIG_INTEL_IOMMU_SVM
3361 if (pasid_enabled(iommu))
3362 intel_svm_alloc_pasid_tables(iommu);
3367 * Now that qi is enabled on all iommus, set the root entry and flush
3368 * caches. This is required on some Intel X58 chipsets, otherwise the
3369 * flush_context function will loop forever and the boot hangs.
3371 for_each_active_iommu(iommu, drhd) {
3372 iommu_flush_write_buffer(iommu);
3373 iommu_set_root_entry(iommu);
3374 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3375 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3378 if (iommu_pass_through)
3379 iommu_identity_mapping |= IDENTMAP_ALL;
3381 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3382 iommu_identity_mapping |= IDENTMAP_GFX;
3385 check_tylersburg_isoch();
3387 if (iommu_identity_mapping) {
3388 ret = si_domain_init(hw_pass_through);
3395 * If we copied translations from a previous kernel in the kdump
3396 * case, we can not assign the devices to domains now, as that
3397 * would eliminate the old mappings. So skip this part and defer
3398 * the assignment to device driver initialization time.
3404 * If pass-through is not set or not enabled, set up context entries for
3405 * identity mappings for RMRR, GFX, and ISA, and possibly fall back to static
3406 * identity mapping if iommu_identity_mapping is set.
3408 if (iommu_identity_mapping) {
3409 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3411 pr_crit("Failed to setup IOMMU pass-through\n");
3417 * for each dev attached to rmrr
3419 * locate drhd for dev, alloc domain for dev
3420 * allocate free domain
3421 * allocate page table entries for rmrr
3422 * if context not allocated for bus
3423 * allocate and init context
3424 * set present in root table for this bus
3425 * init context with domain, translation etc
3429 pr_info("Setting RMRR:\n");
3430 for_each_rmrr_units(rmrr) {
3431 /* some BIOSes list non-existent devices in the DMAR table. */
3432 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3434 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3436 pr_err("Mapping reserved region failed\n");
3440 iommu_prepare_isa();
3447 * global invalidate context cache
3448 * global invalidate iotlb
3449 * enable translation
3451 for_each_iommu(iommu, drhd) {
3452 if (drhd->ignored) {
3454 * we always have to disable PMRs or DMA may fail on this device
3458 iommu_disable_protect_mem_regions(iommu);
3462 iommu_flush_write_buffer(iommu);
3464 #ifdef CONFIG_INTEL_IOMMU_SVM
3465 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3466 ret = intel_svm_enable_prq(iommu);
3471 ret = dmar_set_interrupt(iommu);
3475 if (!translation_pre_enabled(iommu))
3476 iommu_enable_translation(iommu);
3478 iommu_disable_protect_mem_regions(iommu);
3484 for_each_active_iommu(iommu, drhd) {
3485 disable_dmar_iommu(iommu);
3486 free_dmar_iommu(iommu);
3495 /* This takes a number of _MM_ pages, not VTD pages */
3496 static unsigned long intel_alloc_iova(struct device *dev,
3497 struct dmar_domain *domain,
3498 unsigned long nrpages, uint64_t dma_mask)
3500 unsigned long iova_pfn = 0;
3502 /* Restrict dma_mask to the width that the iommu can handle */
3503 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3504 /* Ensure we reserve the whole size-aligned region */
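/* e.g. a 5-page request is rounded up to 8 pages so the allocator can
 * return a naturally size-aligned region. */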
3505 nrpages = __roundup_pow_of_two(nrpages);
3507 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3509 * First try to allocate an IO virtual address in
3510 * DMA_BIT_MASK(32) and if that fails then try allocating from the 64-bit range.
3513 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3514 IOVA_PFN(DMA_BIT_MASK(32)), false);
3518 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519 IOVA_PFN(dma_mask), true);
3520 if (unlikely(!iova_pfn)) {
3521 pr_err("Allocating %ld-page iova for %s failed\n",
3522 nrpages, dev_name(dev));
3529 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3531 struct dmar_domain *domain, *tmp;
3532 struct dmar_rmrr_unit *rmrr;
3533 struct device *i_dev;
3536 domain = find_domain(dev);
3540 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3544 /* We have a new domain - set up possible RMRRs for the device */
3546 for_each_rmrr_units(rmrr) {
3547 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3552 ret = domain_prepare_identity_map(dev, domain,
3556 dev_err(dev, "Mapping reserved region failed\n");
3561 tmp = set_domain_for_dev(dev, domain);
3562 if (!tmp || domain != tmp) {
3563 domain_exit(domain);
3570 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3576 /* Check if the device needs to go through the non-identity map and unmap process. */
3577 static int iommu_no_mapping(struct device *dev)
3581 if (iommu_dummy(dev))
3584 if (!iommu_identity_mapping)
3587 found = identity_mapping(dev);
3589 if (iommu_should_identity_map(dev, 0))
3593 * 32-bit DMA device is removed from si_domain and falls back
3594 * to non-identity mapping.
3596 dmar_remove_one_dev_info(si_domain, dev);
3597 pr_info("32bit %s uses non-identity mapping\n",
3603 * If a 64-bit DMA device was detached from a VM, the device
3604 * is put back into si_domain for identity mapping.
3606 if (iommu_should_identity_map(dev, 0)) {
3608 ret = domain_add_dev_info(si_domain, dev);
3610 pr_info("64bit %s uses identity mapping\n",
3620 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3621 size_t size, int dir, u64 dma_mask)
3623 struct dmar_domain *domain;
3624 phys_addr_t start_paddr;
3625 unsigned long iova_pfn;
3628 struct intel_iommu *iommu;
3629 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3631 BUG_ON(dir == DMA_NONE);
3633 if (iommu_no_mapping(dev))
3636 domain = get_valid_domain_for_dev(dev);
3640 iommu = domain_get_iommu(domain);
3641 size = aligned_nrpages(paddr, size);
3643 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3648 * Check if DMAR supports zero-length reads on write-only mappings.
3651 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3652 !cap_zlr(iommu->cap))
3653 prot |= DMA_PTE_READ;
3654 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3655 prot |= DMA_PTE_WRITE;
3657 * paddr to (paddr + size) might span a partial page, so we should map the whole
3658 * page.  Note: if two parts of one page are separately mapped, we
3659 * might have two guest addresses mapping to the same host paddr, but this
3660 * is not a big problem
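* (e.g. paddr 0x1234 with size 0x100 still maps the whole 4KiB page that
* contains it, and the returned DMA address keeps the 0x234 page offset)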
3662 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3663 mm_to_dma_pfn(paddr_pfn), size, prot);
3667 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3668 start_paddr += paddr & ~PAGE_MASK;
3673 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3674 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3675 dev_name(dev), size, (unsigned long long)paddr, dir);
3679 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3680 unsigned long offset, size_t size,
3681 enum dma_data_direction dir,
3682 unsigned long attrs)
3684 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3685 dir, *dev->dma_mask);
3688 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3690 struct dmar_domain *domain;
3691 unsigned long start_pfn, last_pfn;
3692 unsigned long nrpages;
3693 unsigned long iova_pfn;
3694 struct intel_iommu *iommu;
3695 struct page *freelist;
3697 if (iommu_no_mapping(dev))
3700 domain = find_domain(dev);
3703 iommu = domain_get_iommu(domain);
3705 iova_pfn = IOVA_PFN(dev_addr);
3707 nrpages = aligned_nrpages(dev_addr, size);
3708 start_pfn = mm_to_dma_pfn(iova_pfn);
3709 last_pfn = start_pfn + nrpages - 1;
3711 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3712 dev_name(dev), start_pfn, last_pfn);
3714 freelist = domain_unmap(domain, start_pfn, last_pfn);
3716 if (intel_iommu_strict) {
3717 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3718 nrpages, !freelist, 0);
3720 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3721 dma_free_pagelist(freelist);
3723 queue_iova(&domain->iovad, iova_pfn, nrpages,
3724 (unsigned long)freelist);
3726 * queue up the release of the unmap to save the 1/6th of the
3727 * cpu used up by the iotlb flush operation...
3732 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3733 size_t size, enum dma_data_direction dir,
3734 unsigned long attrs)
3736 intel_unmap(dev, dev_addr, size);
3739 static void *intel_alloc_coherent(struct device *dev, size_t size,
3740 dma_addr_t *dma_handle, gfp_t flags,
3741 unsigned long attrs)
3743 struct page *page = NULL;
3746 size = PAGE_ALIGN(size);
3747 order = get_order(size);
3749 if (!iommu_no_mapping(dev))
3750 flags &= ~(GFP_DMA | GFP_DMA32);
3751 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3752 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3758 if (gfpflags_allow_blocking(flags)) {
3759 unsigned int count = size >> PAGE_SHIFT;
3761 page = dma_alloc_from_contiguous(dev, count, order, flags);
3762 if (page && iommu_no_mapping(dev) &&
3763 page_to_phys(page) + size > dev->coherent_dma_mask) {
3764 dma_release_from_contiguous(dev, page, count);
3770 page = alloc_pages(flags, order);
3773 memset(page_address(page), 0, size);
3775 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3777 dev->coherent_dma_mask);
3779 return page_address(page);
3780 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3781 __free_pages(page, order);
3786 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3787 dma_addr_t dma_handle, unsigned long attrs)
3790 struct page *page = virt_to_page(vaddr);
3792 size = PAGE_ALIGN(size);
3793 order = get_order(size);
3795 intel_unmap(dev, dma_handle, size);
3796 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3797 __free_pages(page, order);
3800 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3801 int nelems, enum dma_data_direction dir,
3802 unsigned long attrs)
3804 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3805 unsigned long nrpages = 0;
3806 struct scatterlist *sg;
3809 for_each_sg(sglist, sg, nelems, i) {
3810 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3813 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3816 static int intel_nontranslate_map_sg(struct device *hddev,
3817 struct scatterlist *sglist, int nelems, int dir)
3820 struct scatterlist *sg;
3822 for_each_sg(sglist, sg, nelems, i) {
3823 BUG_ON(!sg_page(sg));
3824 sg->dma_address = sg_phys(sg);
3825 sg->dma_length = sg->length;
3830 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3831 enum dma_data_direction dir, unsigned long attrs)
3834 struct dmar_domain *domain;
3837 unsigned long iova_pfn;
3839 struct scatterlist *sg;
3840 unsigned long start_vpfn;
3841 struct intel_iommu *iommu;
3843 BUG_ON(dir == DMA_NONE);
3844 if (iommu_no_mapping(dev))
3845 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3847 domain = get_valid_domain_for_dev(dev);
3851 iommu = domain_get_iommu(domain);
3853 for_each_sg(sglist, sg, nelems, i)
3854 size += aligned_nrpages(sg->offset, sg->length);
3856 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3859 sglist->dma_length = 0;
3864 * Check if DMAR supports zero-length reads on write-only mappings.
3867 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3868 !cap_zlr(iommu->cap))
3869 prot |= DMA_PTE_READ;
3870 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3871 prot |= DMA_PTE_WRITE;
3873 start_vpfn = mm_to_dma_pfn(iova_pfn);
3875 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3876 if (unlikely(ret)) {
3877 dma_pte_free_pagetable(domain, start_vpfn,
3878 start_vpfn + size - 1,
3879 agaw_to_level(domain->agaw) + 1);
3880 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3887 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3892 const struct dma_map_ops intel_dma_ops = {
3893 .alloc = intel_alloc_coherent,
3894 .free = intel_free_coherent,
3895 .map_sg = intel_map_sg,
3896 .unmap_sg = intel_unmap_sg,
3897 .map_page = intel_map_page,
3898 .unmap_page = intel_unmap_page,
3899 .mapping_error = intel_mapping_error,
3901 .dma_supported = dma_direct_supported,
3905 static inline int iommu_domain_cache_init(void)
3909 iommu_domain_cache = kmem_cache_create("iommu_domain",
3910 sizeof(struct dmar_domain),
3915 if (!iommu_domain_cache) {
3916 pr_err("Couldn't create iommu_domain cache\n");
3923 static inline int iommu_devinfo_cache_init(void)
3927 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3928 sizeof(struct device_domain_info),
3932 if (!iommu_devinfo_cache) {
3933 pr_err("Couldn't create devinfo cache\n");
3940 static int __init iommu_init_mempool(void)
3943 ret = iova_cache_get();
3947 ret = iommu_domain_cache_init();
3951 ret = iommu_devinfo_cache_init();
3955 kmem_cache_destroy(iommu_domain_cache);
3962 static void __init iommu_exit_mempool(void)
3964 kmem_cache_destroy(iommu_devinfo_cache);
3965 kmem_cache_destroy(iommu_domain_cache);
3969 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3971 struct dmar_drhd_unit *drhd;
3975 /* We know that this device on this chipset has its own IOMMU.
3976 * If we find it under a different IOMMU, then the BIOS is lying
3977 * to us. Hope that the IOMMU for this device is actually
3978 * disabled, and it needs no translation...
3980 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3982 /* "can't" happen */
3983 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3986 vtbar &= 0xffff0000;
3988 /* we know that this iommu should be at offset 0xa000 from vtbar */
3989 drhd = dmar_find_matched_drhd_unit(pdev);
3990 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3991 TAINT_FIRMWARE_WORKAROUND,
3992 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3993 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3995 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3997 static void __init init_no_remapping_devices(void)
3999 struct dmar_drhd_unit *drhd;
4003 for_each_drhd_unit(drhd) {
4004 if (!drhd->include_all) {
4005 for_each_active_dev_scope(drhd->devices,
4006 drhd->devices_cnt, i, dev)
4008 /* ignore DMAR unit if no devices exist */
4009 if (i == drhd->devices_cnt)
4014 for_each_active_drhd_unit(drhd) {
4015 if (drhd->include_all)
4018 for_each_active_dev_scope(drhd->devices,
4019 drhd->devices_cnt, i, dev)
4020 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4022 if (i < drhd->devices_cnt)
4025 /* This IOMMU has *only* gfx devices. Either bypass it or
4026 set the gfx_mapped flag, as appropriate */
4028 intel_iommu_gfx_mapped = 1;
4031 for_each_active_dev_scope(drhd->devices,
4032 drhd->devices_cnt, i, dev)
4033 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4038 #ifdef CONFIG_SUSPEND
4039 static int init_iommu_hw(void)
4041 struct dmar_drhd_unit *drhd;
4042 struct intel_iommu *iommu = NULL;
4044 for_each_active_iommu(iommu, drhd)
4046 dmar_reenable_qi(iommu);
4048 for_each_iommu(iommu, drhd) {
4049 if (drhd->ignored) {
4051 * we always have to disable PMRs or DMA may fail on this device
4055 iommu_disable_protect_mem_regions(iommu);
4059 iommu_flush_write_buffer(iommu);
4061 iommu_set_root_entry(iommu);
4063 iommu->flush.flush_context(iommu, 0, 0, 0,
4064 DMA_CCMD_GLOBAL_INVL);
4065 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4066 iommu_enable_translation(iommu);
4067 iommu_disable_protect_mem_regions(iommu);
4073 static void iommu_flush_all(void)
4075 struct dmar_drhd_unit *drhd;
4076 struct intel_iommu *iommu;
4078 for_each_active_iommu(iommu, drhd) {
4079 iommu->flush.flush_context(iommu, 0, 0, 0,
4080 DMA_CCMD_GLOBAL_INVL);
4081 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4082 DMA_TLB_GLOBAL_FLUSH);
4086 static int iommu_suspend(void)
4088 struct dmar_drhd_unit *drhd;
4089 struct intel_iommu *iommu = NULL;
4092 for_each_active_iommu(iommu, drhd) {
4093 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4095 if (!iommu->iommu_state)
4101 for_each_active_iommu(iommu, drhd) {
4102 iommu_disable_translation(iommu);
4104 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4106 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4107 readl(iommu->reg + DMAR_FECTL_REG);
4108 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4109 readl(iommu->reg + DMAR_FEDATA_REG);
4110 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4111 readl(iommu->reg + DMAR_FEADDR_REG);
4112 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4113 readl(iommu->reg + DMAR_FEUADDR_REG);
4115 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4120 for_each_active_iommu(iommu, drhd)
4121 kfree(iommu->iommu_state);
4126 static void iommu_resume(void)
4128 struct dmar_drhd_unit *drhd;
4129 struct intel_iommu *iommu = NULL;
4132 if (init_iommu_hw()) {
4134 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4136 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4140 for_each_active_iommu(iommu, drhd) {
4142 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4144 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4145 iommu->reg + DMAR_FECTL_REG);
4146 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4147 iommu->reg + DMAR_FEDATA_REG);
4148 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4149 iommu->reg + DMAR_FEADDR_REG);
4150 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4151 iommu->reg + DMAR_FEUADDR_REG);
4153 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4156 for_each_active_iommu(iommu, drhd)
4157 kfree(iommu->iommu_state);
4160 static struct syscore_ops iommu_syscore_ops = {
4161 .resume = iommu_resume,
4162 .suspend = iommu_suspend,
4165 static void __init init_iommu_pm_ops(void)
4167 register_syscore_ops(&iommu_syscore_ops);
4171 static inline void init_iommu_pm_ops(void) {}
4172 #endif /* CONFIG_PM */
4175 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4177 struct acpi_dmar_reserved_memory *rmrr;
4178 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4179 struct dmar_rmrr_unit *rmrru;
4182 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4186 rmrru->hdr = header;
4187 rmrr = (struct acpi_dmar_reserved_memory *)header;
4188 rmrru->base_address = rmrr->base_address;
4189 rmrru->end_address = rmrr->end_address;
4191 length = rmrr->end_address - rmrr->base_address + 1;
4192 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4197 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4198 ((void *)rmrr) + rmrr->header.length,
4199 &rmrru->devices_cnt);
4200 if (rmrru->devices_cnt && rmrru->devices == NULL)
4203 list_add(&rmrru->list, &dmar_rmrr_units);
4214 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4216 struct dmar_atsr_unit *atsru;
4217 struct acpi_dmar_atsr *tmp;
4219 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4220 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4221 if (atsr->segment != tmp->segment)
4223 if (atsr->header.length != tmp->header.length)
4225 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4232 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4234 struct acpi_dmar_atsr *atsr;
4235 struct dmar_atsr_unit *atsru;
4237 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4240 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4241 atsru = dmar_find_atsr(atsr);
4245 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4250 * If memory is allocated from slab by the ACPI _DSM method, we need to
4251 * copy the memory content because the memory buffer will be freed on return.
4254 atsru->hdr = (void *)(atsru + 1);
4255 memcpy(atsru->hdr, hdr, hdr->length);
4256 atsru->include_all = atsr->flags & 0x1;
4257 if (!atsru->include_all) {
4258 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4259 (void *)atsr + atsr->header.length,
4260 &atsru->devices_cnt);
4261 if (atsru->devices_cnt && atsru->devices == NULL) {
4267 list_add_rcu(&atsru->list, &dmar_atsr_units);
4272 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4274 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4278 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4280 struct acpi_dmar_atsr *atsr;
4281 struct dmar_atsr_unit *atsru;
4283 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4284 atsru = dmar_find_atsr(atsr);
4286 list_del_rcu(&atsru->list);
4288 intel_iommu_free_atsr(atsru);
4294 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4298 struct acpi_dmar_atsr *atsr;
4299 struct dmar_atsr_unit *atsru;
4301 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4302 atsru = dmar_find_atsr(atsr);
4306 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4307 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4315 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4318 struct intel_iommu *iommu = dmaru->iommu;
4320 if (g_iommus[iommu->seq_id])
4323 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4324 pr_warn("%s: Doesn't support hardware pass through.\n",
4328 if (!ecap_sc_support(iommu->ecap) &&
4329 domain_update_iommu_snooping(iommu)) {
4330 pr_warn("%s: Doesn't support snooping.\n",
4334 sp = domain_update_iommu_superpage(iommu) - 1;
4335 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4336 pr_warn("%s: Doesn't support large page.\n",
4342 * Disable translation if already enabled prior to OS handover.
4344 if (iommu->gcmd & DMA_GCMD_TE)
4345 iommu_disable_translation(iommu);
4347 g_iommus[iommu->seq_id] = iommu;
4348 ret = iommu_init_domains(iommu);
4350 ret = iommu_alloc_root_entry(iommu);
4354 #ifdef CONFIG_INTEL_IOMMU_SVM
4355 if (pasid_enabled(iommu))
4356 intel_svm_alloc_pasid_tables(iommu);
4359 if (dmaru->ignored) {
4361 * we always have to disable PMRs or DMA may fail on this device
4364 iommu_disable_protect_mem_regions(iommu);
4368 intel_iommu_init_qi(iommu);
4369 iommu_flush_write_buffer(iommu);
4371 #ifdef CONFIG_INTEL_IOMMU_SVM
4372 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4373 ret = intel_svm_enable_prq(iommu);
4378 ret = dmar_set_interrupt(iommu);
4382 iommu_set_root_entry(iommu);
4383 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4384 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4385 iommu_enable_translation(iommu);
4387 iommu_disable_protect_mem_regions(iommu);
4391 disable_dmar_iommu(iommu);
4393 free_dmar_iommu(iommu);
4397 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4400 struct intel_iommu *iommu = dmaru->iommu;
4402 if (!intel_iommu_enabled)
4408 ret = intel_iommu_add(dmaru);
4410 disable_dmar_iommu(iommu);
4411 free_dmar_iommu(iommu);
4417 static void intel_iommu_free_dmars(void)
4419 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4420 struct dmar_atsr_unit *atsru, *atsr_n;
4422 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4423 list_del(&rmrru->list);
4424 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4429 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4430 list_del(&atsru->list);
4431 intel_iommu_free_atsr(atsru);
4435 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4438 struct pci_bus *bus;
4439 struct pci_dev *bridge = NULL;
4441 struct acpi_dmar_atsr *atsr;
4442 struct dmar_atsr_unit *atsru;
4444 dev = pci_physfn(dev);
4445 for (bus = dev->bus; bus; bus = bus->parent) {
4447 /* If it's an integrated device, allow ATS */
4450 /* Connected via non-PCIe: no ATS */
4451 if (!pci_is_pcie(bridge) ||
4452 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4454 /* If we found the root port, look it up in the ATSR */
4455 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4460 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4461 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4462 if (atsr->segment != pci_domain_nr(dev->bus))
4465 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4466 if (tmp == &bridge->dev)
4469 if (atsru->include_all)
4479 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4482 struct dmar_rmrr_unit *rmrru;
4483 struct dmar_atsr_unit *atsru;
4484 struct acpi_dmar_atsr *atsr;
4485 struct acpi_dmar_reserved_memory *rmrr;
4487 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4490 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4491 rmrr = container_of(rmrru->hdr,
4492 struct acpi_dmar_reserved_memory, header);
4493 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4494 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4495 ((void *)rmrr) + rmrr->header.length,
4496 rmrr->segment, rmrru->devices,
4497 rmrru->devices_cnt);
4500 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4501 dmar_remove_dev_scope(info, rmrr->segment,
4502 rmrru->devices, rmrru->devices_cnt);
4506 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4507 if (atsru->include_all)
4510 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4511 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4512 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4513 (void *)atsr + atsr->header.length,
4514 atsr->segment, atsru->devices,
4515 atsru->devices_cnt);
4520 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4521 if (dmar_remove_dev_scope(info, atsr->segment,
4522 atsru->devices, atsru->devices_cnt))
4531 * Here we only respond to the action of a device being unbound from its driver.
4533 * A newly added device is not attached to its DMAR domain here yet. That will
4534 * happen when the device is mapped to an iova.
4536 static int device_notifier(struct notifier_block *nb,
4537 unsigned long action, void *data)
4539 struct device *dev = data;
4540 struct dmar_domain *domain;
4542 if (iommu_dummy(dev))
4545 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4548 domain = find_domain(dev);
4552 dmar_remove_one_dev_info(domain, dev);
4553 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4554 domain_exit(domain);
4559 static struct notifier_block device_nb = {
4560 .notifier_call = device_notifier,
4563 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4564 unsigned long val, void *v)
4566 struct memory_notify *mhp = v;
4567 unsigned long long start, end;
4568 unsigned long start_vpfn, last_vpfn;
4571 case MEM_GOING_ONLINE:
4572 start = mhp->start_pfn << PAGE_SHIFT;
4573 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4574 if (iommu_domain_identity_map(si_domain, start, end)) {
4575 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4582 case MEM_CANCEL_ONLINE:
4583 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4584 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4585 while (start_vpfn <= last_vpfn) {
4587 struct dmar_drhd_unit *drhd;
4588 struct intel_iommu *iommu;
4589 struct page *freelist;
4591 iova = find_iova(&si_domain->iovad, start_vpfn);
4593 pr_debug("Failed to get IOVA for PFN %lx\n",
4598 iova = split_and_remove_iova(&si_domain->iovad, iova,
4599 start_vpfn, last_vpfn);
4601 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4602 start_vpfn, last_vpfn);
4606 freelist = domain_unmap(si_domain, iova->pfn_lo,
4610 for_each_active_iommu(iommu, drhd)
4611 iommu_flush_iotlb_psi(iommu, si_domain,
4612 iova->pfn_lo, iova_size(iova),
4615 dma_free_pagelist(freelist);
4617 start_vpfn = iova->pfn_hi + 1;
4618 free_iova_mem(iova);
4626 static struct notifier_block intel_iommu_memory_nb = {
4627 .notifier_call = intel_iommu_memory_notifier,
4631 static void free_all_cpu_cached_iovas(unsigned int cpu)
4635 for (i = 0; i < g_num_of_iommus; i++) {
4636 struct intel_iommu *iommu = g_iommus[i];
4637 struct dmar_domain *domain;
4643 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4644 domain = get_iommu_domain(iommu, (u16)did);
4648 free_cpu_cached_iovas(cpu, &domain->iovad);
4653 static int intel_iommu_cpu_dead(unsigned int cpu)
4655 free_all_cpu_cached_iovas(cpu);
4659 static void intel_disable_iommus(void)
4661 struct intel_iommu *iommu = NULL;
4662 struct dmar_drhd_unit *drhd;
4664 for_each_iommu(iommu, drhd)
4665 iommu_disable_translation(iommu);
4668 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4670 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4672 return container_of(iommu_dev, struct intel_iommu, iommu);
4675 static ssize_t intel_iommu_show_version(struct device *dev,
4676 struct device_attribute *attr,
4679 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4680 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4681 return sprintf(buf, "%d:%d\n",
4682 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4684 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4686 static ssize_t intel_iommu_show_address(struct device *dev,
4687 struct device_attribute *attr,
4690 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4691 return sprintf(buf, "%llx\n", iommu->reg_phys);
4693 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4695 static ssize_t intel_iommu_show_cap(struct device *dev,
4696 struct device_attribute *attr,
4699 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4700 return sprintf(buf, "%llx\n", iommu->cap);
4702 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4704 static ssize_t intel_iommu_show_ecap(struct device *dev,
4705 struct device_attribute *attr,
4708 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4709 return sprintf(buf, "%llx\n", iommu->ecap);
4711 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4713 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4714 struct device_attribute *attr,
4717 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4720 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4722 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4723 struct device_attribute *attr,
4726 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4728 cap_ndoms(iommu->cap)));
4730 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4732 static struct attribute *intel_iommu_attrs[] = {
4733 &dev_attr_version.attr,
4734 &dev_attr_address.attr,
4736 &dev_attr_ecap.attr,
4737 &dev_attr_domains_supported.attr,
4738 &dev_attr_domains_used.attr,
4742 static struct attribute_group intel_iommu_group = {
4743 .name = "intel-iommu",
4744 .attrs = intel_iommu_attrs,
4747 const struct attribute_group *intel_iommu_groups[] = {
4752 int __init intel_iommu_init(void)
4755 struct dmar_drhd_unit *drhd;
4756 struct intel_iommu *iommu;
4758 /* VT-d is required for a TXT/tboot launch, so enforce that */
4759 force_on = tboot_force_iommu();
4761 if (iommu_init_mempool()) {
4763 panic("tboot: Failed to initialize iommu memory\n");
4767 down_write(&dmar_global_lock);
4768 if (dmar_table_init()) {
4770 panic("tboot: Failed to initialize DMAR table\n");
4774 if (dmar_dev_scope_init() < 0) {
4776 panic("tboot: Failed to initialize DMAR device scope\n");
4780 up_write(&dmar_global_lock);
4783 * The bus notifier takes the dmar_global_lock, so lockdep will
4784 * complain later when we register it under the lock.
4786 dmar_register_bus_notifier();
4788 down_write(&dmar_global_lock);
4790 if (no_iommu || dmar_disabled) {
4792 * We exit the function here to ensure the IOMMU's remapping and
4793 * mempool aren't set up, which means that the IOMMU's PMRs
4794 * won't be disabled via the call to init_dmars(). So disable
4795 * them explicitly here. The PMRs were set up by tboot prior to
4796 * calling SENTER, but the kernel is expected to reset/tear them down.
4799 if (intel_iommu_tboot_noforce) {
4800 for_each_iommu(iommu, drhd)
4801 iommu_disable_protect_mem_regions(iommu);
4805 * Make sure the IOMMUs are switched off, even when we
4806 * boot into a kexec kernel and the previous kernel left
4809 intel_disable_iommus();
4813 if (list_empty(&dmar_rmrr_units))
4814 pr_info("No RMRR found\n");
4816 if (list_empty(&dmar_atsr_units))
4817 pr_info("No ATSR found\n");
4819 if (dmar_init_reserved_ranges()) {
4821 panic("tboot: Failed to reserve iommu ranges\n");
4822 goto out_free_reserved_range;
4825 init_no_remapping_devices();
4830 panic("tboot: Failed to initialize DMARs\n");
4831 pr_err("Initialization failed\n");
4832 goto out_free_reserved_range;
4834 up_write(&dmar_global_lock);
4835 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4837 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4840 dma_ops = &intel_dma_ops;
4842 init_iommu_pm_ops();
4844 for_each_active_iommu(iommu, drhd) {
4845 iommu_device_sysfs_add(&iommu->iommu, NULL,
4848 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4849 iommu_device_register(&iommu->iommu);
4852 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4853 bus_register_notifier(&pci_bus_type, &device_nb);
4854 if (si_domain && !hw_pass_through)
4855 register_memory_notifier(&intel_iommu_memory_nb);
4856 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4857 intel_iommu_cpu_dead);
4858 intel_iommu_enabled = 1;
4862 out_free_reserved_range:
4863 put_iova_domain(&reserved_iova_list);
4865 intel_iommu_free_dmars();
4866 up_write(&dmar_global_lock);
4867 iommu_exit_mempool();
4871 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4873 struct intel_iommu *iommu = opaque;
4875 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4880 * NB - intel-iommu lacks any sort of reference counting for the users of
4881 * dependent devices. If multiple endpoints have intersecting dependent
4882 * devices, unbinding the driver from any one of them will possibly leave
4883 * the others unable to operate.
4885 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4887 if (!iommu || !dev || !dev_is_pci(dev))
4890 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4893 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4895 struct intel_iommu *iommu;
4896 unsigned long flags;
4898 assert_spin_locked(&device_domain_lock);
4903 iommu = info->iommu;
4906 iommu_disable_dev_iotlb(info);
4907 domain_context_clear(iommu, info->dev);
4910 unlink_domain_info(info);
4912 spin_lock_irqsave(&iommu->lock, flags);
4913 domain_detach_iommu(info->domain, iommu);
4914 spin_unlock_irqrestore(&iommu->lock, flags);
4916 free_devinfo_mem(info);
4919 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4922 struct device_domain_info *info;
4923 unsigned long flags;
4925 spin_lock_irqsave(&device_domain_lock, flags);
4926 info = dev->archdata.iommu;
4927 __dmar_remove_one_dev_info(info);
4928 spin_unlock_irqrestore(&device_domain_lock, flags);
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
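
/* Allocate and initialize a VM domain on behalf of the generic IOMMU API. */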
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	if (type != IOMMU_DOMAIN_UNMANAGED)
		return NULL;

	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
	if (!dmar_domain) {
		pr_err("Can't allocate dmar_domain\n");
		return NULL;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		pr_err("Domain initialization failed\n");
		domain_exit(dmar_domain);
		return NULL;
	}
	domain_update_iommu_cap(dmar_domain);

	domain = &dmar_domain->domain;
	domain->geometry.aperture_start = 0;
	domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
	domain->geometry.force_aperture = true;

	return domain;
}

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	domain_exit(to_dmar_domain(domain));
}
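
/*
 * Attach @dev to an API-allocated domain: detach it from any previous
 * domain, clamp the domain's address width to what this IOMMU supports,
 * and install the device's context entries.
 */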
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
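
/*
 * iommu_ops->map: convert IOMMU_READ/WRITE/CACHE flags into DMA PTE bits
 * and install the mapping, growing the domain's max_addr as needed but
 * refusing anything beyond the domain's address width.
 */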
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
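
/*
 * iommu_ops->unmap: unmap at least @size bytes at @iova (more if the range
 * is covered by a large page), then flush the IOTLB on every IOMMU the
 * domain is attached to and free the page-table pages that became unused.
 */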
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
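
/* Walk the domain's page table and return the physical address behind @iova. */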
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
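
/*
 * add_device/remove_device callbacks: link the device to its IOMMU in sysfs
 * and place it in (or remove it from) an IOMMU group.
 */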
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}

static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);
	iommu_device_unlink(&iommu->iommu, dev);
}
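
/*
 * Report the reserved regions for a device: any RMRR ranges that target it,
 * plus the IOAPIC/MSI window which must never be used for ordinary DMA.
 */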
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}

static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
	/*
	 * Convert ecap_pss to extend context entry pts encoding, also
	 * respect the soft pasid_max value set by the iommu.
	 * - number of PASID bits = ecap_pss + 1
	 * - number of PASID table entries = 2^(pts + 5)
	 * Therefore, pts = ecap_pss - 4
	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
	 */
	if (ecap_pss(iommu->ecap) < 5)
		return 0;

	/* pasid_max is encoded as actual number of entries not the bits */
	return find_first_bit((unsigned long *)&iommu->pasid_max,
			      MAX_NR_PASID_BITS) - 5;
}
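
/*
 * Enable requests-with-PASID for @sdev's device: point its extended context
 * entry at the IOMMU's PASID (and PASID-state) tables, set PASIDE, flush the
 * context cache, and turn on PASID support in the device itself if it was
 * not already enabled.
 */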
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		if (iommu->pasid_state_table)
			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
			intel_iommu_get_pts(iommu);

		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked. Which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
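
/*
 * SVM helper: map a device to its IOMMU, warning when the device has no
 * usable translation unit or the unit has no PASID table.
 */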
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	if (!iommu->pasid_table) {
		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.map_sg			= default_iommu_map_sg,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
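
/* GGC (graphics control) register fields used by the Calpella quirk below. */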
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}
	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}