/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/crash_dump.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

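/*
 * Worked example: with the default 48-bit address width and 4KiB VT-d
 * pages (VTD_PAGE_SHIFT == 12), __DOMAIN_MAX_PFN(48) is 2^36 - 1 and
 * __DOMAIN_MAX_ADDR(48) is 2^48 - 1. Both fit in an unsigned long on
 * 64-bit builds, so the min_t() clamp above only bites on 32-bit.
 */
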
/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)

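/*
 * ~0xFFFUL leaves bits 0-11 clear and sets every bit from bit 12 up,
 * i.e. it advertises every power-of-two size that is a multiple of
 * 4KiB (4KiB, 8KiB, 16KiB, ...), matching the "all page sizes that
 * are an order of 4KiB" policy described above.
 */
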
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

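/*
 * Worked example: each level adds LEVEL_STRIDE == 9 bits of address,
 * so a level-1 PTE maps a 4KiB page, level 2 maps 2MiB (level_size(2)
 * == 512 pages) and level 3 maps 1GiB. An agaw of 2 therefore gives
 * agaw_to_level() == 4, the familiar 4-level 48-bit page table.
 */
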
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

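/*
 * On x86 with 4KiB kernel pages, PAGE_SHIFT == VTD_PAGE_SHIFT == 12
 * and the conversions above are identity operations; the shifts only
 * matter when the MM page is larger than the 4KiB VT-d page.
 */
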
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched w/ TXT)
 */
static int force_on = 0;

/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64	lo;
	u64	hi;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return ((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

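/*
 * A minimal sketch of how the helpers above compose (mirroring what
 * domain_context_mapping_one() does further down) when pointing a
 * device's context entry at a domain's page tables:
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */
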
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & DMA_PTE_LARGE_PAGE);
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}

/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* domain represents a virtual machine: more than one device
 * across IOMMUs may be owned by one domain, e.g. a kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)

struct dmar_domain {
	int	id;			/* domain id */
	int	nid;			/* node id */
	DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
					/* bitmap of iommus this domain uses */

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature */
	int		iommu_count;	/* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */

	struct iommu_domain domain;	/* generic domain data structure for
					   iommu core */
};

/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct device *dev;	/* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address */
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void flush_unmaps_timeout(unsigned long data);

static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
	struct page *freelist[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static long list_size;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void domain_remove_one_dev_info(struct dmar_domain *domain,
				       struct device *dev);
static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
					   struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int intel_iommu_ecs = 1;

/* We only actually use ECS when PASID support (on the new bit 40)
 * is also advertised. Some early implementations — the ones with
 * PASID support on bit 28 — have issues even when we *only* use
 * extended root/context tables. */
#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
			    ecap_pasid(iommu->ecap))

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

/* Convert a generic struct iommu_domain to the private struct dmar_domain */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct dmar_domain, domain);
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "ecs_off", 7)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable extended context table support\n");
			intel_iommu_ecs = 0;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static inline void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_vm(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
				DOMAIN_FLAG_STATIC_IDENTITY);
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}

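/*
 * Worked example: for the default 48-bit width, width_to_agaw(48) ==
 * DIV_ROUND_UP(48 - 30, 9) == 2 (a 4-level table), so the loop starts
 * at agaw 2 and walks downward until cap_sagaw() reports support.
 */
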
/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain_type_is_vm_or_si(domain));

	iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
		found = true;
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!ecap_coherent(iommu->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}

static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0xf;

	if (!intel_iommu_superpage) {
		return 0;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			mask &= cap_super_page_val(iommu->cap);
			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
}

static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
						       u8 bus, u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (ecs_enabled(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;

		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}

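/*
 * With extended context support, each 128-bit root entry carries two
 * context-table pointers: root_entry_lctp() covers devfns 0x00-0x7f
 * and root_entry_uctp() covers 0x80-0xff. That is why devfn is folded
 * into the low half and doubled above: extended context entries are
 * twice the size of legacy ones, so a table holds only 128 of them.
 */
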
static int iommu_dummy(struct device *dev)
{
	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}

static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	struct pci_dev *ptmp, *pdev = NULL;
	u16 segment = 0;
	int i;

	if (iommu_dummy(dev))
		return NULL;

	if (dev_is_pci(dev)) {
		pdev = to_pci_dev(dev);
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				*bus = drhd->devices[i].bus;
				*devfn = drhd->devices[i].devfn;
				goto out;
			}

			if (!pdev || !dev_is_pci(tmp))
				continue;

			ptmp = to_pci_dev(tmp);
			if (ptmp->subordinate &&
			    ptmp->subordinate->number <= pdev->bus->number &&
			    ptmp->subordinate->busn_res.end >= pdev->bus->number)
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
		got_pdev:
			*bus = pdev->bus->number;
			*devfn = pdev->devfn;
			goto out;
		}
	}
	iommu = NULL;
out:
	rcu_read_unlock();

	return iommu;
}

static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context) {
		context_clear_entry(context);
		__iommu_flush_cache(iommu, context, sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!ecs_enabled(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}

/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}

/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page = 1;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}

static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       struct dma_pte *pte, unsigned long pfn,
			       unsigned long start_pfn, unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level - 1);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2)
			dma_pte_free_level(domain, level - 1, level_pte,
					   level_pfn, start_pfn, last_pfn);

		/* If range covers entire pagetable, free it */
		if (!(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}

/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw),
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}

static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
struct page *domain_unmap(struct dmar_domain *domain,
			  unsigned long start_pfn,
			  unsigned long last_pfn)
{
	struct page *freelist = NULL;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}

void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (ecs_enabled(iommu))
		addr |= DMA_RTADDR_RTT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}

static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	bool found = false;
	unsigned long flags;
	struct device_domain_info *info;
	struct pci_dev *pdev;

	if (!ecap_dev_iotlb_support(iommu->ecap))
		return NULL;

	if (!iommu->qi)
		return NULL;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			found = true;
			break;
		}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (!found || !info->dev || !dev_is_pci(info->dev))
		return NULL;

	pdev = to_pci_dev(info->dev);

	if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
		return NULL;

	if (!dmar_find_matched_atsr_unit(pdev))
		return NULL;

	return info;
}

static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	if (!info || !dev_is_pci(info->dev))
		return;

	pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
}

static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	if (!info->dev || !dev_is_pci(info->dev) ||
	    !pci_ats_enabled(to_pci_dev(info->dev)))
		return;

	pci_disable_ats(to_pci_dev(info->dev));
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;
		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (!pci_ats_enabled(pdev))
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = pci_ats_queue_depth(pdev);
		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages, int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;
	/*
	 * Fall back to domain selective flush if no PSI support or the size
	 * is too big.
	 * PSI requires page size to be 2 ^ x, and the base address is
	 * naturally aligned to the size.
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}

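/*
 * Worked example: flushing pages == 5 gives mask == ilog2(8) == 3, so
 * the PSI invalidates a naturally aligned 8-page (32KiB) region that
 * covers the request; hardware only accepts power-of-two sizes.
 */
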
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static int iommu_init_domains(struct intel_iommu *iommu)
{
	unsigned long ndomains;
	unsigned long nlongs;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%ld>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	/* TBD: there might be 64K domains,
	 * consider other allocation for future chip
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
			GFP_KERNEL);
	if (!iommu->domains) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		iommu->domain_ids = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain id 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);
	return 0;
}

static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;

	if ((iommu->domains) && (iommu->domain_ids)) {
		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
			/*
			 * Domain id 0 is reserved for invalid translation
			 * if hardware supports caching mode.
			 */
			if (cap_caching_mode(iommu->cap) && i == 0)
				continue;

			domain = iommu->domains[i];
			clear_bit(i, iommu->domain_ids);
			if (domain_detach_iommu(domain, iommu) == 0 &&
			    !domain_type_is_vm(domain))
				domain_exit(domain);
		}
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}

static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);
}

static struct dmar_domain *alloc_domain(int flags)
{
	/* domain id for virtual machine, it won't be set in context */
	static atomic_t vm_domid = ATOMIC_INIT(0);
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = -1;
	domain->flags = flags;
	spin_lock_init(&domain->iommu_lock);
	INIT_LIST_HEAD(&domain->devices);
	if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
		domain->id = atomic_inc_return(&vm_domid);

	return domain;
}

static int __iommu_attach_domain(struct dmar_domain *domain,
				 struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;

	ndomains = cap_ndoms(iommu->cap);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num < ndomains) {
		set_bit(num, iommu->domain_ids);
		iommu->domains[num] = domain;
	} else {
		num = -ENOSPC;
	}

	return num;
}

static int iommu_attach_domain(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	num = __iommu_attach_domain(domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);
	if (num < 0)
		pr_err("%s: No free domain ids\n", iommu->name);

	return num;
}

static int iommu_attach_vm_domain(struct dmar_domain *domain,
				  struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;

	ndomains = cap_ndoms(iommu->cap);
	for_each_set_bit(num, iommu->domain_ids, ndomains)
		if (iommu->domains[num] == domain)
			return num;

	return __iommu_attach_domain(domain, iommu);
}

static void iommu_detach_domain(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	unsigned long flags;
	int num, ndomains;

	spin_lock_irqsave(&iommu->lock, flags);
	if (domain_type_is_vm_or_si(domain)) {
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == domain) {
				clear_bit(num, iommu->domain_ids);
				iommu->domains[num] = NULL;
				break;
			}
		}
	} else {
		clear_bit(domain->id, iommu->domain_ids);
		iommu->domains[domain->id] = NULL;
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static void domain_attach_iommu(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	unsigned long flags;

	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
		domain->iommu_count++;
		if (domain->iommu_count == 1)
			domain->nid = iommu->node;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
}

static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long flags;
	int count = INT_MAX;

	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
		count = --domain->iommu_count;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);

	return count;
}

static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
			DMA_32BIT_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		pr_err("Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				pr_err("Reserve iova failed\n");
				return -ENODEV;
			}
		}
	}
	return 0;
}

static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}

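/*
 * Worked example: gaw == 48 gives r == (48 - 12) % 9 == 0, already a
 * whole number of 9-bit levels above the 12-bit page offset; gaw == 40
 * gives r == 1 and is rounded up to the next level boundary, 48.
 */
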
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
			DMA_32BIT_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("Hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	if (intel_iommu_superpage)
		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	else
		domain->iommu_superpage = 0;

	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}

static void domain_exit(struct dmar_domain *domain)
{
	struct page *freelist = NULL;
	int i;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	/* Flush any lazy unmaps that may reference this domain */
	if (!intel_iommu_strict)
		flush_unmaps_timeout(0);

	/* remove associated devices */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* clear attached or cached domains */
	rcu_read_lock();
	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus)
		iommu_detach_domain(domain, g_iommus[i]);
	rcu_read_unlock();

	dma_free_pagelist(freelist);

	free_domain_mem(domain);
}

static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      u8 bus, u8 devfn, int translation)
{
	struct context_entry *context;
	unsigned long flags;
	struct dma_pte *pgd;
	int id;
	int agaw;
	struct device_domain_info *info = NULL;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);
	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
	       translation != CONTEXT_TT_MULTI_LEVEL);

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 1);
	spin_unlock_irqrestore(&iommu->lock, flags);
	if (!context)
		return -ENOMEM;

	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	if (domain_type_is_vm_or_si(domain)) {
		if (domain_type_is_vm(domain)) {
			id = iommu_attach_vm_domain(domain, iommu);
			if (id < 0) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				pr_err("%s: No free domain ids\n", iommu->name);
				return -EFAULT;
			}
		}

		/* Skip top levels of page tables for
		 * iommu which has less agaw than default.
		 * Unnecessary for PT mode.
		 */
		if (translation != CONTEXT_TT_PASS_THROUGH) {
			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd)) {
					spin_unlock_irqrestore(&iommu->lock, flags);
					return -ENOMEM;
				}
			}
		}
	}

	context_set_domain_id(context, id);

	if (translation != CONTEXT_TT_PASS_THROUGH) {
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		translation = info ? CONTEXT_TT_DEV_IOTLB :
				     CONTEXT_TT_MULTI_LEVEL;
	}
	/*
	 * In pass through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware. And ASR is ignored by hardware.
	 */
	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
		context_set_address_width(context, iommu->msagaw);
	else {
		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, iommu->agaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);
	spin_unlock_irqrestore(&iommu->lock, flags);

	domain_attach_iommu(domain, iommu);

	return 0;
}

struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	int translation;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct domain_context_mapping_data *data = opaque;

	return domain_context_mapping_one(data->domain, data->iommu,
					  PCI_BUS_NUM(alias), alias & 0xff,
					  data->translation);
}

static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev,
		       int translation)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;
	struct domain_context_mapping_data data;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, bus, devfn,
						  translation);

	data.domain = domain;
	data.iommu = iommu;
	data.translation = translation;

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      &domain_context_mapping_cb, &data);
}

static int domain_context_mapped_cb(struct pci_dev *pdev,
				    u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!dev_is_pci(dev))
		return device_context_mapped(iommu, bus, devfn);

	return !pci_for_each_dma_alias(to_pci_dev(dev),
				       domain_context_mapped_cb, iommu);
}

/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}

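/*
 * Worked example (4KiB pages): host_addr == 0x1234 and size == 0x2000
 * keeps the 0x234 intra-page offset, rounds 0x2234 up to 0x3000 and
 * returns 3 VT-d pages, covering a buffer that straddles page
 * boundaries.
 */
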
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}

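/*
 * Worked example: if iov_pfn and phy_pfn are both 2MiB-aligned (their
 * low 9 bits clear) and pages >= 512, the loop above promotes the
 * mapping to level 2; a misaligned address in either space keeps it
 * at 4KiB level 1.
 */
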
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page */
			if (largepage_lvl > 1) {
				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);
				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage,
				 * if they exist.
				 */
				dma_pte_free_pagetable(domain, iov_pfn,
						       iov_pfn + lvl_pages - 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}

static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}

static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}

static inline void unlink_domain_info(struct device_domain_info *info)
{
	assert_spin_locked(&device_domain_lock);
	list_del(&info->link);
	list_del(&info->global);
	if (info->dev)
		info->dev->archdata.iommu = NULL;
}

static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &domain->devices, link) {
		unlink_domain_info(info);
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu_detach_dev(info->iommu, info->bus, info->devfn);

		if (domain_type_is_vm(domain)) {
			iommu_detach_dependent_devices(info->iommu, info->dev);
			domain_detach_iommu(domain, info->iommu);
		}

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

/*
 * find_domain
 * Note: we use struct device->archdata.iommu to store the domain info
 */
static struct dmar_domain *find_domain(struct device *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->archdata.iommu;
	if (info)
		return info->domain;
	return NULL;
}

static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
{
	struct device_domain_info *info;

	list_for_each_entry(info, &device_domain_list, global)
		if (info->iommu->segment == segment && info->bus == bus &&
		    info->devfn == devfn)
			return info;

	return NULL;
}

static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
						int bus, int devfn,
						struct device *dev,
						struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	info->bus = bus;
	info->devfn = devfn;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);
	else {
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
		if (info2)
			found = info2->domain;
	}
	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return domain;
}

static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
{
	*(u16 *)opaque = alias;
	return 0;
}

/* domain is initialized */
static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
{
	struct dmar_domain *domain, *tmp;
	struct intel_iommu *iommu;
	struct device_domain_info *info;
	u16 dma_alias;
	unsigned long flags;
	u8 bus, devfn;

	domain = find_domain(dev);
	if (domain)
		return domain;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		spin_lock_irqsave(&device_domain_lock, flags);
		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
						      PCI_BUS_NUM(dma_alias),
						      dma_alias & 0xff);
		if (info) {
			iommu = info->iommu;
			domain = info->domain;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);

		/* DMA alias already has a domain, use it */
		if (info)
			goto found_domain;
	}

	/* Allocate and initialize new domain for the device */
	domain = alloc_domain(0);
	if (!domain)
		return NULL;
	domain->id = iommu_attach_domain(domain, iommu);
	if (domain->id < 0) {
		free_domain_mem(domain);
		return NULL;
	}
	domain_attach_iommu(domain, iommu);
	if (domain_init(domain, gaw)) {
		domain_exit(domain);
		return NULL;
	}

	/* register PCI DMA alias device */
	if (dev_is_pci(dev)) {
		tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
					   dma_alias & 0xff, NULL, domain);

		if (!tmp || tmp != domain) {
			domain_exit(domain);
			domain = tmp;
		}

		if (!domain)
			return NULL;
	}

found_domain:
	tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);

	if (!tmp || tmp != domain) {
		domain_exit(domain);
		domain = tmp;
	}

	return domain;
}

2359 #define IDENTMAP_ALL 1
2360 #define IDENTMAP_GFX 2
2361 #define IDENTMAP_AZALIA 4
2363 static int iommu_domain_identity_map(struct dmar_domain *domain,
2364 unsigned long long start,
2365 unsigned long long end)
2367 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2368 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2370 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2371 dma_to_mm_pfn(last_vpfn))) {
2372 pr_err("Reserving iova failed\n");
2376 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2377 start, end, domain->id);
2379 * RMRR range might have overlap with physical memory range,
2382 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2384 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2385 last_vpfn - first_vpfn + 1,
2386 DMA_PTE_READ|DMA_PTE_WRITE);
2389 static int iommu_prepare_identity_map(struct device *dev,
2390 unsigned long long start,
2391 unsigned long long end)
2393 struct dmar_domain *domain;
2396 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2400 /* For _hardware_ passthrough, don't bother. But for software
2401 passthrough, we do it anyway -- it may indicate a memory
2402 range which is reserved in E820, and so didn't get set
2403 up to start with in si_domain */
2404 if (domain == si_domain && hw_pass_through) {
2405 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2406 dev_name(dev), start, end);
2410 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2411 dev_name(dev), start, end);
2414 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2415 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2416 dmi_get_system_info(DMI_BIOS_VENDOR),
2417 dmi_get_system_info(DMI_BIOS_VERSION),
2418 dmi_get_system_info(DMI_PRODUCT_VERSION));
2423 if (end >> agaw_to_width(domain->agaw)) {
2424 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2425 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2426 agaw_to_width(domain->agaw),
2427 dmi_get_system_info(DMI_BIOS_VENDOR),
2428 dmi_get_system_info(DMI_BIOS_VERSION),
2429 dmi_get_system_info(DMI_PRODUCT_VERSION));
2434 ret = iommu_domain_identity_map(domain, start, end);
2438 /* context entry init */
2439 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2446 domain_exit(domain);
2450 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2453 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2455 return iommu_prepare_identity_map(dev, rmrr->base_address,
2459 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2460 static inline void iommu_prepare_isa(void)
2462 struct pci_dev *pdev;
2465 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2469 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2470 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2473 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2478 static inline void iommu_prepare_isa(void)
2482 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2484 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2486 static int __init si_domain_init(int hw)
2488 struct dmar_drhd_unit *drhd;
2489 struct intel_iommu *iommu;
2493 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2497 for_each_active_iommu(iommu, drhd) {
2498 ret = iommu_attach_domain(si_domain, iommu);
2500 domain_exit(si_domain);
2503 si_domain->id = ret;
2505 } else if (si_domain->id != ret) {
2506 domain_exit(si_domain);
2509 domain_attach_iommu(si_domain, iommu);
2512 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2513 domain_exit(si_domain);
2517 pr_debug("Identity mapping domain is domain %d\n",
2523 for_each_online_node(nid) {
2524 unsigned long start_pfn, end_pfn;
2527 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2528 ret = iommu_domain_identity_map(si_domain,
2529 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2538 static int identity_mapping(struct device *dev)
2540 struct device_domain_info *info;
2542 if (likely(!iommu_identity_mapping))
2545 info = dev->archdata.iommu;
2546 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2547 return (info->domain == si_domain);
2552 static int domain_add_dev_info(struct dmar_domain *domain,
2553 struct device *dev, int translation)
2555 struct dmar_domain *ndomain;
2556 struct intel_iommu *iommu;
2560 iommu = device_to_iommu(dev, &bus, &devfn);
2564 ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2565 if (ndomain != domain)
2568 ret = domain_context_mapping(domain, dev, translation);
2570 domain_remove_one_dev_info(domain, dev);
2577 static bool device_has_rmrr(struct device *dev)
2579 struct dmar_rmrr_unit *rmrr;
2584 for_each_rmrr_units(rmrr) {
2586 * Return TRUE if this RMRR contains the device that is passed in.
2589 for_each_active_dev_scope(rmrr->devices,
2590 rmrr->devices_cnt, i, tmp)
2601 * There are a couple of cases where we need to restrict the functionality of
2602 * devices associated with RMRRs. The first is when evaluating a device for
2603 * identity mapping because problems exist when devices are moved in and out
2604 * of domains and their respective RMRR information is lost. This means that
2605 * a device with associated RMRRs will never be in a "passthrough" domain.
2606 * The second is use of the device through the IOMMU API. This interface
2607 * expects to have full control of the IOVA space for the device. We cannot
2608 * satisfy both the requirement that RMRR access is maintained and have an
2609 * unencumbered IOVA space. We also have no ability to quiesce the device's
2610 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2611 * We therefore prevent devices associated with an RMRR from participating in
2612 * the IOMMU API, which eliminates them from device assignment.
2614 * In both cases we assume that PCI USB devices with RMRRs have them largely
2615 * for historical reasons and that the RMRR space is not actively used post
2616 * boot. This exclusion may change if vendors begin to abuse it.
2618 * The same exception is made for graphics devices, with the requirement that
2619 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2622 static bool device_is_rmrr_locked(struct device *dev)
2624 if (!device_has_rmrr(dev))
2627 if (dev_is_pci(dev)) {
2628 struct pci_dev *pdev = to_pci_dev(dev);
2630 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2637 static int iommu_should_identity_map(struct device *dev, int startup)
2640 if (dev_is_pci(dev)) {
2641 struct pci_dev *pdev = to_pci_dev(dev);
2643 if (device_is_rmrr_locked(dev))
2646 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2649 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2652 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2656 * We want to start off with all devices in the 1:1 domain, and
2657 * take them out later if we find they can't access all of memory.
2659 * However, we can't do this for PCI devices behind bridges,
2660 * because all PCI devices behind the same bridge will end up
2661 * with the same source-id on their transactions.
2663 * Practically speaking, we can't change things around for these
2664 * devices at run-time, because we can't be sure there'll be no
2665 * DMA transactions in flight for any of their siblings.
2667 * So PCI devices (unless they're on the root bus) as well as
2668 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2669 * the 1:1 domain, just in _case_ one of their siblings turns out
2670 * not to be able to map all of memory.
2672 if (!pci_is_pcie(pdev)) {
2673 if (!pci_is_root_bus(pdev->bus))
2675 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2677 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2680 if (device_has_rmrr(dev))
2685 * At boot time, we don't yet know if devices will be 64-bit capable.
2686 * Assume that they will -- if they turn out not to be, then we can
2687 * take them out of the 1:1 domain later.
2691 * If the device's dma_mask is less than the system's memory
2692 * size then this is not a candidate for identity mapping.
2694 u64 dma_mask = *dev->dma_mask;
2696 if (dev->coherent_dma_mask &&
2697 dev->coherent_dma_mask < dma_mask)
2698 dma_mask = dev->coherent_dma_mask;
2700 return dma_mask >= dma_get_required_mask(dev);
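/*
 * Worked example (illustrative): on a box with 8GiB of RAM,
 * dma_get_required_mask() reports a 33-bit mask.  A device stuck with a
 * 32-bit dma_mask (or an even smaller coherent_dma_mask) fails the
 * comparison above and is therefore kept out of the 1:1 domain, since it
 * could never address all of memory through an identity map.
 */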
2706 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2710 if (!iommu_should_identity_map(dev, 1))
2713 ret = domain_add_dev_info(si_domain, dev,
2714 hw ? CONTEXT_TT_PASS_THROUGH :
2715 CONTEXT_TT_MULTI_LEVEL);
2717 pr_info("%s identity mapping for device %s\n",
2718 hw ? "Hardware" : "Software", dev_name(dev));
2719 else if (ret == -ENODEV)
2720 /* device not associated with an iommu */
2727 static int __init iommu_prepare_static_identity_mapping(int hw)
2729 struct pci_dev *pdev = NULL;
2730 struct dmar_drhd_unit *drhd;
2731 struct intel_iommu *iommu;
2736 ret = si_domain_init(hw);
2740 for_each_pci_dev(pdev) {
2741 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2746 for_each_active_iommu(iommu, drhd)
2747 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2748 struct acpi_device_physical_node *pn;
2749 struct acpi_device *adev;
2751 if (dev->bus != &acpi_bus_type)
2754 adev = to_acpi_device(dev);
2755 mutex_lock(&adev->physical_node_lock);
2756 list_for_each_entry(pn, &adev->physical_node_list, node) {
2757 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2761 mutex_unlock(&adev->physical_node_lock);
2769 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2772 * Start from a sane iommu hardware state.
2773 * If the queued invalidation is already initialized by us
2774 * (for example, while enabling interrupt-remapping) then
2775 * things are already rolling from a sane state.
2779 * Clear any previous faults.
2781 dmar_fault(-1, iommu);
2783 * Disable queued invalidation if supported and already enabled
2784 * before OS handover.
2786 dmar_disable_qi(iommu);
2789 if (dmar_enable_qi(iommu)) {
2791 * Queued invalidation not enabled; use register-based invalidation
2793 iommu->flush.flush_context = __iommu_flush_context;
2794 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2795 pr_info("%s: Using Register based invalidation\n",
2798 iommu->flush.flush_context = qi_flush_context;
2799 iommu->flush.flush_iotlb = qi_flush_iotlb;
2800 pr_info("%s: Using Queued invalidation\n", iommu->name);
2804 static int copy_context_table(struct intel_iommu *iommu,
2805 struct root_entry *old_re,
2806 struct context_entry **tbl,
2809 struct context_entry *old_ce = NULL, *new_ce = NULL, ce;
2810 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2811 phys_addr_t old_ce_phys;
2813 tbl_idx = ext ? bus * 2 : bus;
2815 for (devfn = 0; devfn < 256; devfn++) {
2816 /* First calculate the correct index */
2817 idx = (ext ? devfn * 2 : devfn) % 256;
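/*
 * Index math, illustrated (not part of the original source): an extended
 * context entry is twice the size of a legacy one, so a 4KiB table holds
 * only 128 of them and each bus needs two tables, hence tbl_idx = bus * 2.
 * devfn 67 gives idx = 134 in the first table; devfn 200 gives
 * (400 % 256) = 144, which lands in the second table once idx has wrapped
 * to 0 and the destination offset (pos) has been bumped.
 */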
2820 /* First save what we may have and clean up */
2822 tbl[tbl_idx] = new_ce;
2823 __iommu_flush_cache(iommu, new_ce,
2833 old_ce_phys = root_entry_lctp(old_re);
2835 old_ce_phys = root_entry_uctp(old_re);
2838 if (ext && devfn == 0) {
2839 /* No LCTP, try UCTP */
2848 old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
2852 new_ce = alloc_pgtable_page(iommu->node);
2859 /* Now copy the context entry */
2862 if (!context_present(&ce))
2865 did = context_domain_id(&ce);
2866 if (did >= 0 && did < cap_ndoms(iommu->cap))
2867 set_bit(did, iommu->domain_ids);
2872 tbl[tbl_idx + pos] = new_ce;
2874 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2883 static int copy_translation_tables(struct intel_iommu *iommu)
2885 struct context_entry **ctxt_tbls;
2886 struct root_entry *old_rt;
2887 phys_addr_t old_rt_phys;
2888 int ctxt_table_entries;
2889 unsigned long flags;
2894 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2895 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2897 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2901 old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
2905 /* This is too big for the stack - allocate it from slab */
2906 ctxt_table_entries = ext ? 512 : 256;
2908 ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
2912 for (bus = 0; bus < 256; bus++) {
2913 ret = copy_context_table(iommu, &old_rt[bus],
2914 ctxt_tbls, bus, ext);
2916 pr_err("%s: Failed to copy context table for bus %d\n",
2922 spin_lock_irqsave(&iommu->lock, flags);
2924 /* Context tables are copied, now write them to the root_entry table */
2925 for (bus = 0; bus < 256; bus++) {
2926 int idx = ext ? bus * 2 : bus;
2929 if (ctxt_tbls[idx]) {
2930 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2931 iommu->root_entry[bus].lo = val;
2934 if (!ext || !ctxt_tbls[idx + 1])
2937 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2938 iommu->root_entry[bus].hi = val;
2941 spin_unlock_irqrestore(&iommu->lock, flags);
2945 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
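/*
 * Encoding note (illustrative): bit 0 of a root entry is its present bit,
 * so "virt_to_phys(ctxt_tbls[idx]) | 1" installs the page-aligned physical
 * address of a copied context table and marks it present in a single
 * store.  In extended mode the lo slot points at the lower context table
 * (devfn 0-127) and the hi slot at the upper one.
 */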
2955 static int __init init_dmars(void)
2957 struct dmar_drhd_unit *drhd;
2958 struct dmar_rmrr_unit *rmrr;
2960 struct intel_iommu *iommu;
2966 * initialize and program root entry to not present
2969 for_each_drhd_unit(drhd) {
2971 * lock not needed as this is only incremented in the single-
2972 * threaded kernel __init code path; all other accesses are read-only.
2975 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2979 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
2982 /* Preallocate enough resources for IOMMU hot-addition */
2983 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2984 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2986 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2989 pr_err("Allocating global iommu array failed\n");
2994 deferred_flush = kzalloc(g_num_of_iommus *
2995 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2996 if (!deferred_flush) {
3001 for_each_active_iommu(iommu, drhd) {
3002 g_iommus[iommu->seq_id] = iommu;
3004 intel_iommu_init_qi(iommu);
3006 ret = iommu_init_domains(iommu);
3010 init_translation_status(iommu);
3012 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3013 iommu_disable_translation(iommu);
3014 clear_translation_pre_enabled(iommu);
3015 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3021 * we could share the same root & context tables
3022 * among all IOMMUs; need to split it later.
3024 ret = iommu_alloc_root_entry(iommu);
3028 if (translation_pre_enabled(iommu)) {
3029 pr_info("Translation already enabled - trying to copy translation structures\n");
3031 ret = copy_translation_tables(iommu);
3034 * We found the IOMMU with translation
3035 * enabled - but failed to copy over the
3036 * old root-entry table. Try to proceed
3037 * by disabling translation now and
3038 * allocating a clean root-entry table.
3039 * This might cause DMAR faults, but
3040 * probably the dump will still succeed.
3042 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3044 iommu_disable_translation(iommu);
3045 clear_translation_pre_enabled(iommu);
3047 pr_info("Copied translation tables from previous kernel for %s\n",
3052 iommu_flush_write_buffer(iommu);
3053 iommu_set_root_entry(iommu);
3054 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3055 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3057 if (!ecap_pass_through(iommu->ecap))
3058 hw_pass_through = 0;
3061 if (iommu_pass_through)
3062 iommu_identity_mapping |= IDENTMAP_ALL;
3064 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3065 iommu_identity_mapping |= IDENTMAP_GFX;
3068 check_tylersburg_isoch();
3071 * If pass-through is not set or not enabled, set up context entries for
3072 * identity mappings for rmrr, gfx, and isa, possibly falling back to static
3073 * identity mapping if iommu_identity_mapping is set.
3075 if (iommu_identity_mapping) {
3076 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3078 pr_crit("Failed to setup IOMMU pass-through\n");
3084 * for each dev attached to rmrr
3086 * locate drhd for dev, alloc domain for dev
3087 * allocate free domain
3088 * allocate page table entries for rmrr
3089 * if context not allocated for bus
3090 * allocate and init context
3091 * set present in root table for this bus
3092 * init context with domain, translation etc
3096 pr_info("Setting RMRR:\n");
3097 for_each_rmrr_units(rmrr) {
3098 /* some BIOSes list non-existent devices in the DMAR table */
3099 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3101 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3103 pr_err("Mapping reserved region failed\n");
3107 iommu_prepare_isa();
3112 * global invalidate context cache
3113 * global invalidate iotlb
3114 * enable translation
3116 for_each_iommu(iommu, drhd) {
3117 if (drhd->ignored) {
3119 * we always have to disable PMRs or DMA may fail on
3123 iommu_disable_protect_mem_regions(iommu);
3127 iommu_flush_write_buffer(iommu);
3129 ret = dmar_set_interrupt(iommu);
3133 iommu_enable_translation(iommu);
3134 iommu_disable_protect_mem_regions(iommu);
3140 for_each_active_iommu(iommu, drhd) {
3141 disable_dmar_iommu(iommu);
3142 free_dmar_iommu(iommu);
3144 kfree(deferred_flush);
3151 /* This takes a number of _MM_ pages, not VTD pages */
3152 static struct iova *intel_alloc_iova(struct device *dev,
3153 struct dmar_domain *domain,
3154 unsigned long nrpages, uint64_t dma_mask)
3156 struct iova *iova = NULL;
3158 /* Restrict dma_mask to the width that the iommu can handle */
3159 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3161 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3163 * First try to allocate an io virtual address in
3164 * DMA_BIT_MASK(32), and if that fails then try allocating from the higher range.
3167 iova = alloc_iova(&domain->iovad, nrpages,
3168 IOVA_PFN(DMA_BIT_MASK(32)), 1);
3172 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3173 if (unlikely(!iova)) {
3174 pr_err("Allocating %ld-page iova for %s failed\n",
3175 nrpages, dev_name(dev));
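/*
 * Allocation strategy, illustrated (not in the original source): a device
 * advertising a 64-bit dma_mask first gets a try below 4GiB, which also
 * keeps the space above 4GiB free for devices that truly need it; only if
 * that range is exhausted does the second alloc_iova() call above fall
 * back to the device's full mask.  The trailing "1" requests a
 * size-aligned allocation in both calls.
 */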
3182 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3184 struct dmar_domain *domain;
3187 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3189 pr_err("Allocating domain for %s failed\n",
3194 /* make sure context mapping is ok */
3195 if (unlikely(!domain_context_mapped(dev))) {
3196 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
3198 pr_err("Domain context map for %s failed\n",
3207 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3209 struct device_domain_info *info;
3211 /* No lock here, assumes no domain exit in normal case */
3212 info = dev->archdata.iommu;
3214 return info->domain;
3216 return __get_valid_domain_for_dev(dev);
3219 /* Check if the dev needs to go through the non-identity map/unmap process. */
3220 static int iommu_no_mapping(struct device *dev)
3224 if (iommu_dummy(dev))
3227 if (!iommu_identity_mapping)
3230 found = identity_mapping(dev);
3232 if (iommu_should_identity_map(dev, 0))
3236 * a 32-bit DMA device is removed from si_domain and falls back
3237 * to non-identity mapping.
3239 domain_remove_one_dev_info(si_domain, dev);
3240 pr_info("32bit %s uses non-identity mapping\n",
3246 * When a 64-bit DMA device is detached from a VM, the device
3247 * is put into si_domain for identity mapping.
3249 if (iommu_should_identity_map(dev, 0)) {
3251 ret = domain_add_dev_info(si_domain, dev,
3253 CONTEXT_TT_PASS_THROUGH :
3254 CONTEXT_TT_MULTI_LEVEL);
3256 pr_info("64bit %s uses identity mapping\n",
3266 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3267 size_t size, int dir, u64 dma_mask)
3269 struct dmar_domain *domain;
3270 phys_addr_t start_paddr;
3274 struct intel_iommu *iommu;
3275 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3277 BUG_ON(dir == DMA_NONE);
3279 if (iommu_no_mapping(dev))
3282 domain = get_valid_domain_for_dev(dev);
3286 iommu = domain_get_iommu(domain);
3287 size = aligned_nrpages(paddr, size);
3289 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3294 * Check if DMAR supports zero-length reads on write only
3297 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3298 !cap_zlr(iommu->cap))
3299 prot |= DMA_PTE_READ;
3300 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3301 prot |= DMA_PTE_WRITE;
3303 * The range paddr .. paddr + size might cover a partial page, so map the
3304 * whole page. Note: if two parts of one page are mapped separately, we
3305 * might end up with two guest addresses mapping to the same host paddr,
3306 * but this is not a big problem
3308 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3309 mm_to_dma_pfn(paddr_pfn), size, prot);
3313 /* it's a non-present to present mapping; only flush in caching mode */
3314 if (cap_caching_mode(iommu->cap))
3315 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3317 iommu_flush_write_buffer(iommu);
3319 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3320 start_paddr += paddr & ~PAGE_MASK;
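/*
 * Worked example (illustrative, assuming 4KiB pages): for
 * paddr = 0x12345678 and a one-page mapping where the allocator handed
 * back pfn_lo = 0x80000, start_paddr = (0x80000 << PAGE_SHIFT) + 0x678 =
 * 0x80000678 -- the returned DMA address preserves the caller's sub-page
 * offset.
 */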
3325 __free_iova(&domain->iovad, iova);
3326 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3327 dev_name(dev), size, (unsigned long long)paddr, dir);
3331 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3332 unsigned long offset, size_t size,
3333 enum dma_data_direction dir,
3334 struct dma_attrs *attrs)
3336 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3337 dir, *dev->dma_mask);
3340 static void flush_unmaps(void)
3346 /* just flush them all */
3347 for (i = 0; i < g_num_of_iommus; i++) {
3348 struct intel_iommu *iommu = g_iommus[i];
3352 if (!deferred_flush[i].next)
3355 /* In caching mode, global flushes make emulation expensive */
3356 if (!cap_caching_mode(iommu->cap))
3357 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3358 DMA_TLB_GLOBAL_FLUSH);
3359 for (j = 0; j < deferred_flush[i].next; j++) {
3361 struct iova *iova = deferred_flush[i].iova[j];
3362 struct dmar_domain *domain = deferred_flush[i].domain[j];
3364 /* On real hardware multiple invalidations are expensive */
3365 if (cap_caching_mode(iommu->cap))
3366 iommu_flush_iotlb_psi(iommu, domain->id,
3367 iova->pfn_lo, iova_size(iova),
3368 !deferred_flush[i].freelist[j], 0);
3370 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3371 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3372 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3374 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3375 if (deferred_flush[i].freelist[j])
3376 dma_free_pagelist(deferred_flush[i].freelist[j]);
3378 deferred_flush[i].next = 0;
3384 static void flush_unmaps_timeout(unsigned long data)
3386 unsigned long flags;
3388 spin_lock_irqsave(&async_umap_flush_lock, flags);
3390 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3393 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3395 unsigned long flags;
3397 struct intel_iommu *iommu;
3399 spin_lock_irqsave(&async_umap_flush_lock, flags);
3400 if (list_size == HIGH_WATER_MARK)
3403 iommu = domain_get_iommu(dom);
3404 iommu_id = iommu->seq_id;
3406 next = deferred_flush[iommu_id].next;
3407 deferred_flush[iommu_id].domain[next] = dom;
3408 deferred_flush[iommu_id].iova[next] = iova;
3409 deferred_flush[iommu_id].freelist[next] = freelist;
3410 deferred_flush[iommu_id].next++;
3413 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3417 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
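/*
 * Batching sketch (illustrative, not part of the original source):
 * non-strict unmaps are parked in the per-IOMMU deferred_flush[] rings
 * above and drained either synchronously once HIGH_WATER_MARK entries
 * pile up, or lazily when the 10ms unmap_timer fires, trading a short
 * window of stale IOTLB entries for far fewer invalidations.
 */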
3420 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3422 struct dmar_domain *domain;
3423 unsigned long start_pfn, last_pfn;
3425 struct intel_iommu *iommu;
3426 struct page *freelist;
3428 if (iommu_no_mapping(dev))
3431 domain = find_domain(dev);
3434 iommu = domain_get_iommu(domain);
3436 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3437 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3438 (unsigned long long)dev_addr))
3441 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3442 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3444 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3445 dev_name(dev), start_pfn, last_pfn);
3447 freelist = domain_unmap(domain, start_pfn, last_pfn);
3449 if (intel_iommu_strict) {
3450 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3451 last_pfn - start_pfn + 1, !freelist, 0);
3453 __free_iova(&domain->iovad, iova);
3454 dma_free_pagelist(freelist);
3456 add_unmap(domain, iova, freelist);
3458 * queue up the release of the unmap to save 1/6th of the
3459 * CPU time used up by the iotlb flush operation...
3464 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3465 size_t size, enum dma_data_direction dir,
3466 struct dma_attrs *attrs)
3468 intel_unmap(dev, dev_addr);
3471 static void *intel_alloc_coherent(struct device *dev, size_t size,
3472 dma_addr_t *dma_handle, gfp_t flags,
3473 struct dma_attrs *attrs)
3475 struct page *page = NULL;
3478 size = PAGE_ALIGN(size);
3479 order = get_order(size);
3481 if (!iommu_no_mapping(dev))
3482 flags &= ~(GFP_DMA | GFP_DMA32);
3483 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3484 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3490 if (flags & __GFP_WAIT) {
3491 unsigned int count = size >> PAGE_SHIFT;
3493 page = dma_alloc_from_contiguous(dev, count, order);
3494 if (page && iommu_no_mapping(dev) &&
3495 page_to_phys(page) + size > dev->coherent_dma_mask) {
3496 dma_release_from_contiguous(dev, page, count);
3502 page = alloc_pages(flags, order);
3505 memset(page_address(page), 0, size);
3507 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3509 dev->coherent_dma_mask);
3511 return page_address(page);
3512 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3513 __free_pages(page, order);
3518 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3519 dma_addr_t dma_handle, struct dma_attrs *attrs)
3522 struct page *page = virt_to_page(vaddr);
3524 size = PAGE_ALIGN(size);
3525 order = get_order(size);
3527 intel_unmap(dev, dma_handle);
3528 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3529 __free_pages(page, order);
3532 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3533 int nelems, enum dma_data_direction dir,
3534 struct dma_attrs *attrs)
3536 intel_unmap(dev, sglist[0].dma_address);
3539 static int intel_nontranslate_map_sg(struct device *hddev,
3540 struct scatterlist *sglist, int nelems, int dir)
3543 struct scatterlist *sg;
3545 for_each_sg(sglist, sg, nelems, i) {
3546 BUG_ON(!sg_page(sg));
3547 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3548 sg->dma_length = sg->length;
3553 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3554 enum dma_data_direction dir, struct dma_attrs *attrs)
3557 struct dmar_domain *domain;
3560 struct iova *iova = NULL;
3562 struct scatterlist *sg;
3563 unsigned long start_vpfn;
3564 struct intel_iommu *iommu;
3566 BUG_ON(dir == DMA_NONE);
3567 if (iommu_no_mapping(dev))
3568 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3570 domain = get_valid_domain_for_dev(dev);
3574 iommu = domain_get_iommu(domain);
3576 for_each_sg(sglist, sg, nelems, i)
3577 size += aligned_nrpages(sg->offset, sg->length);
3579 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3582 sglist->dma_length = 0;
3587 * Check if DMAR supports zero-length reads on write only
3590 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3591 !cap_zlr(iommu->cap))
3592 prot |= DMA_PTE_READ;
3593 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3594 prot |= DMA_PTE_WRITE;
3596 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3598 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3599 if (unlikely(ret)) {
3600 dma_pte_free_pagetable(domain, start_vpfn,
3601 start_vpfn + size - 1);
3602 __free_iova(&domain->iovad, iova);
3606 /* it's a non-present to present mapping; only flush in caching mode */
3607 if (cap_caching_mode(iommu->cap))
3608 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3610 iommu_flush_write_buffer(iommu);
3615 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3620 struct dma_map_ops intel_dma_ops = {
3621 .alloc = intel_alloc_coherent,
3622 .free = intel_free_coherent,
3623 .map_sg = intel_map_sg,
3624 .unmap_sg = intel_unmap_sg,
3625 .map_page = intel_map_page,
3626 .unmap_page = intel_unmap_page,
3627 .mapping_error = intel_mapping_error,
3630 static inline int iommu_domain_cache_init(void)
3634 iommu_domain_cache = kmem_cache_create("iommu_domain",
3635 sizeof(struct dmar_domain),
3640 if (!iommu_domain_cache) {
3641 pr_err("Couldn't create iommu_domain cache\n");
3648 static inline int iommu_devinfo_cache_init(void)
3652 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3653 sizeof(struct device_domain_info),
3657 if (!iommu_devinfo_cache) {
3658 pr_err("Couldn't create devinfo cache\n");
3665 static int __init iommu_init_mempool(void)
3668 ret = iommu_iova_cache_init();
3672 ret = iommu_domain_cache_init();
3676 ret = iommu_devinfo_cache_init();
3680 kmem_cache_destroy(iommu_domain_cache);
3682 iommu_iova_cache_destroy();
3687 static void __init iommu_exit_mempool(void)
3689 kmem_cache_destroy(iommu_devinfo_cache);
3690 kmem_cache_destroy(iommu_domain_cache);
3691 iommu_iova_cache_destroy();
3694 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3696 struct dmar_drhd_unit *drhd;
3700 /* We know that this device on this chipset has its own IOMMU.
3701 * If we find it under a different IOMMU, then the BIOS is lying
3702 * to us. Hope that the IOMMU for this device is actually
3703 * disabled, and it needs no translation...
3705 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3707 /* "can't" happen */
3708 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3711 vtbar &= 0xffff0000;
3713 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3714 drhd = dmar_find_matched_drhd_unit(pdev);
3715 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3716 TAINT_FIRMWARE_WORKAROUND,
3717 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3718 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3720 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3722 static void __init init_no_remapping_devices(void)
3724 struct dmar_drhd_unit *drhd;
3728 for_each_drhd_unit(drhd) {
3729 if (!drhd->include_all) {
3730 for_each_active_dev_scope(drhd->devices,
3731 drhd->devices_cnt, i, dev)
3733 /* ignore DMAR unit if no devices exist */
3734 if (i == drhd->devices_cnt)
3739 for_each_active_drhd_unit(drhd) {
3740 if (drhd->include_all)
3743 for_each_active_dev_scope(drhd->devices,
3744 drhd->devices_cnt, i, dev)
3745 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3747 if (i < drhd->devices_cnt)
3750 /* This IOMMU has *only* gfx devices. Either bypass it or
3751 set the gfx_mapped flag, as appropriate */
3753 intel_iommu_gfx_mapped = 1;
3756 for_each_active_dev_scope(drhd->devices,
3757 drhd->devices_cnt, i, dev)
3758 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3763 #ifdef CONFIG_SUSPEND
3764 static int init_iommu_hw(void)
3766 struct dmar_drhd_unit *drhd;
3767 struct intel_iommu *iommu = NULL;
3769 for_each_active_iommu(iommu, drhd)
3771 dmar_reenable_qi(iommu);
3773 for_each_iommu(iommu, drhd) {
3774 if (drhd->ignored) {
3776 * we always have to disable PMRs or DMA may fail on
3780 iommu_disable_protect_mem_regions(iommu);
3784 iommu_flush_write_buffer(iommu);
3786 iommu_set_root_entry(iommu);
3788 iommu->flush.flush_context(iommu, 0, 0, 0,
3789 DMA_CCMD_GLOBAL_INVL);
3790 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3791 iommu_enable_translation(iommu);
3792 iommu_disable_protect_mem_regions(iommu);
3798 static void iommu_flush_all(void)
3800 struct dmar_drhd_unit *drhd;
3801 struct intel_iommu *iommu;
3803 for_each_active_iommu(iommu, drhd) {
3804 iommu->flush.flush_context(iommu, 0, 0, 0,
3805 DMA_CCMD_GLOBAL_INVL);
3806 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3807 DMA_TLB_GLOBAL_FLUSH);
3811 static int iommu_suspend(void)
3813 struct dmar_drhd_unit *drhd;
3814 struct intel_iommu *iommu = NULL;
3817 for_each_active_iommu(iommu, drhd) {
3818 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3820 if (!iommu->iommu_state)
3826 for_each_active_iommu(iommu, drhd) {
3827 iommu_disable_translation(iommu);
3829 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3831 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3832 readl(iommu->reg + DMAR_FECTL_REG);
3833 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3834 readl(iommu->reg + DMAR_FEDATA_REG);
3835 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3836 readl(iommu->reg + DMAR_FEADDR_REG);
3837 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3838 readl(iommu->reg + DMAR_FEUADDR_REG);
3840 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3845 for_each_active_iommu(iommu, drhd)
3846 kfree(iommu->iommu_state);
3851 static void iommu_resume(void)
3853 struct dmar_drhd_unit *drhd;
3854 struct intel_iommu *iommu = NULL;
3857 if (init_iommu_hw()) {
3859 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3861 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3865 for_each_active_iommu(iommu, drhd) {
3867 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3869 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3870 iommu->reg + DMAR_FECTL_REG);
3871 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3872 iommu->reg + DMAR_FEDATA_REG);
3873 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3874 iommu->reg + DMAR_FEADDR_REG);
3875 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3876 iommu->reg + DMAR_FEUADDR_REG);
3878 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3881 for_each_active_iommu(iommu, drhd)
3882 kfree(iommu->iommu_state);
3885 static struct syscore_ops iommu_syscore_ops = {
3886 .resume = iommu_resume,
3887 .suspend = iommu_suspend,
3890 static void __init init_iommu_pm_ops(void)
3892 register_syscore_ops(&iommu_syscore_ops);
3896 static inline void init_iommu_pm_ops(void) {}
3897 #endif /* CONFIG_PM */
3900 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3902 struct acpi_dmar_reserved_memory *rmrr;
3903 struct dmar_rmrr_unit *rmrru;
3905 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3909 rmrru->hdr = header;
3910 rmrr = (struct acpi_dmar_reserved_memory *)header;
3911 rmrru->base_address = rmrr->base_address;
3912 rmrru->end_address = rmrr->end_address;
3913 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3914 ((void *)rmrr) + rmrr->header.length,
3915 &rmrru->devices_cnt);
3916 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3921 list_add(&rmrru->list, &dmar_rmrr_units);
3926 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3928 struct dmar_atsr_unit *atsru;
3929 struct acpi_dmar_atsr *tmp;
3931 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3932 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3933 if (atsr->segment != tmp->segment)
3935 if (atsr->header.length != tmp->header.length)
3937 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3944 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3946 struct acpi_dmar_atsr *atsr;
3947 struct dmar_atsr_unit *atsru;
3949 if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3952 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3953 atsru = dmar_find_atsr(atsr);
3957 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3962 * If memory is allocated from slab by ACPI _DSM method, we need to
3963 * copy the memory content because the memory buffer will be freed on exit.
3966 atsru->hdr = (void *)(atsru + 1);
3967 memcpy(atsru->hdr, hdr, hdr->length);
3968 atsru->include_all = atsr->flags & 0x1;
3969 if (!atsru->include_all) {
3970 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3971 (void *)atsr + atsr->header.length,
3972 &atsru->devices_cnt);
3973 if (atsru->devices_cnt && atsru->devices == NULL) {
3979 list_add_rcu(&atsru->list, &dmar_atsr_units);
3984 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3986 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3990 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3992 struct acpi_dmar_atsr *atsr;
3993 struct dmar_atsr_unit *atsru;
3995 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3996 atsru = dmar_find_atsr(atsr);
3998 list_del_rcu(&atsru->list);
4000 intel_iommu_free_atsr(atsru);
4006 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4010 struct acpi_dmar_atsr *atsr;
4011 struct dmar_atsr_unit *atsru;
4013 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4014 atsru = dmar_find_atsr(atsr);
4018 if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4019 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4026 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4029 struct intel_iommu *iommu = dmaru->iommu;
4031 if (g_iommus[iommu->seq_id])
4034 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4035 pr_warn("%s: Doesn't support hardware pass through.\n",
4039 if (!ecap_sc_support(iommu->ecap) &&
4040 domain_update_iommu_snooping(iommu)) {
4041 pr_warn("%s: Doesn't support snooping.\n",
4045 sp = domain_update_iommu_superpage(iommu) - 1;
4046 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4047 pr_warn("%s: Doesn't support large page.\n",
4053 * Disable translation if already enabled prior to OS handover.
4055 if (iommu->gcmd & DMA_GCMD_TE)
4056 iommu_disable_translation(iommu);
4058 g_iommus[iommu->seq_id] = iommu;
4059 ret = iommu_init_domains(iommu);
4061 ret = iommu_alloc_root_entry(iommu);
4065 if (dmaru->ignored) {
4067 * we always have to disable PMRs or DMA may fail on this device
4070 iommu_disable_protect_mem_regions(iommu);
4074 intel_iommu_init_qi(iommu);
4075 iommu_flush_write_buffer(iommu);
4076 ret = dmar_set_interrupt(iommu);
4080 iommu_set_root_entry(iommu);
4081 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4082 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4083 iommu_enable_translation(iommu);
4086 ret = iommu_attach_domain(si_domain, iommu);
4087 if (ret < 0 || si_domain->id != ret)
4089 domain_attach_iommu(si_domain, iommu);
4092 iommu_disable_protect_mem_regions(iommu);
4096 disable_dmar_iommu(iommu);
4098 free_dmar_iommu(iommu);
4102 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4105 struct intel_iommu *iommu = dmaru->iommu;
4107 if (!intel_iommu_enabled)
4113 ret = intel_iommu_add(dmaru);
4115 disable_dmar_iommu(iommu);
4116 free_dmar_iommu(iommu);
4122 static void intel_iommu_free_dmars(void)
4124 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4125 struct dmar_atsr_unit *atsru, *atsr_n;
4127 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4128 list_del(&rmrru->list);
4129 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4133 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4134 list_del(&atsru->list);
4135 intel_iommu_free_atsr(atsru);
4139 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4142 struct pci_bus *bus;
4143 struct pci_dev *bridge = NULL;
4145 struct acpi_dmar_atsr *atsr;
4146 struct dmar_atsr_unit *atsru;
4148 dev = pci_physfn(dev);
4149 for (bus = dev->bus; bus; bus = bus->parent) {
4151 if (!bridge || !pci_is_pcie(bridge) ||
4152 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4154 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4161 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4162 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4163 if (atsr->segment != pci_domain_nr(dev->bus))
4166 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4167 if (tmp == &bridge->dev)
4170 if (atsru->include_all)
4180 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4183 struct dmar_rmrr_unit *rmrru;
4184 struct dmar_atsr_unit *atsru;
4185 struct acpi_dmar_atsr *atsr;
4186 struct acpi_dmar_reserved_memory *rmrr;
4188 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4191 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4192 rmrr = container_of(rmrru->hdr,
4193 struct acpi_dmar_reserved_memory, header);
4194 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4195 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4196 ((void *)rmrr) + rmrr->header.length,
4197 rmrr->segment, rmrru->devices,
4198 rmrru->devices_cnt);
4201 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4202 dmar_remove_dev_scope(info, rmrr->segment,
4203 rmrru->devices, rmrru->devices_cnt);
4207 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4208 if (atsru->include_all)
4211 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4212 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4213 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4214 (void *)atsr + atsr->header.length,
4215 atsr->segment, atsru->devices,
4216 atsru->devices_cnt);
4221 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4222 if (dmar_remove_dev_scope(info, atsr->segment,
4223 atsru->devices, atsru->devices_cnt))
4232 * Here we only respond to the action of unbinding a device from its driver.
4234 * Added device is not attached to its DMAR domain here yet. That will happen
4235 * when mapping the device to iova.
4237 static int device_notifier(struct notifier_block *nb,
4238 unsigned long action, void *data)
4240 struct device *dev = data;
4241 struct dmar_domain *domain;
4243 if (iommu_dummy(dev))
4246 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4249 domain = find_domain(dev);
4253 down_read(&dmar_global_lock);
4254 domain_remove_one_dev_info(domain, dev);
4255 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4256 domain_exit(domain);
4257 up_read(&dmar_global_lock);
4262 static struct notifier_block device_nb = {
4263 .notifier_call = device_notifier,
4266 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4267 unsigned long val, void *v)
4269 struct memory_notify *mhp = v;
4270 unsigned long long start, end;
4271 unsigned long start_vpfn, last_vpfn;
4274 case MEM_GOING_ONLINE:
4275 start = mhp->start_pfn << PAGE_SHIFT;
4276 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4277 if (iommu_domain_identity_map(si_domain, start, end)) {
4278 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4285 case MEM_CANCEL_ONLINE:
4286 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4287 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4288 while (start_vpfn <= last_vpfn) {
4290 struct dmar_drhd_unit *drhd;
4291 struct intel_iommu *iommu;
4292 struct page *freelist;
4294 iova = find_iova(&si_domain->iovad, start_vpfn);
4296 pr_debug("Failed to get IOVA for PFN %lx\n",
4301 iova = split_and_remove_iova(&si_domain->iovad, iova,
4302 start_vpfn, last_vpfn);
4304 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4305 start_vpfn, last_vpfn);
4309 freelist = domain_unmap(si_domain, iova->pfn_lo,
4313 for_each_active_iommu(iommu, drhd)
4314 iommu_flush_iotlb_psi(iommu, si_domain->id,
4315 iova->pfn_lo, iova_size(iova),
4318 dma_free_pagelist(freelist);
4320 start_vpfn = iova->pfn_hi + 1;
4321 free_iova_mem(iova);
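/*
 * Flow note (illustrative): memory hot-add extends the static identity
 * map before the new pages come online; hot-remove walks the affected
 * range, splits any IOVA that straddles it, unmaps the pages, flushes the
 * IOTLB on every active IOMMU, and only then frees the page-table pages
 * collected in "freelist".
 */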
4329 static struct notifier_block intel_iommu_memory_nb = {
4330 .notifier_call = intel_iommu_memory_notifier,
4335 static ssize_t intel_iommu_show_version(struct device *dev,
4336 struct device_attribute *attr,
4339 struct intel_iommu *iommu = dev_get_drvdata(dev);
4340 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4341 return sprintf(buf, "%d:%d\n",
4342 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4344 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4346 static ssize_t intel_iommu_show_address(struct device *dev,
4347 struct device_attribute *attr,
4350 struct intel_iommu *iommu = dev_get_drvdata(dev);
4351 return sprintf(buf, "%llx\n", iommu->reg_phys);
4353 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4355 static ssize_t intel_iommu_show_cap(struct device *dev,
4356 struct device_attribute *attr,
4359 struct intel_iommu *iommu = dev_get_drvdata(dev);
4360 return sprintf(buf, "%llx\n", iommu->cap);
4362 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4364 static ssize_t intel_iommu_show_ecap(struct device *dev,
4365 struct device_attribute *attr,
4368 struct intel_iommu *iommu = dev_get_drvdata(dev);
4369 return sprintf(buf, "%llx\n", iommu->ecap);
4371 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4373 static struct attribute *intel_iommu_attrs[] = {
4374 &dev_attr_version.attr,
4375 &dev_attr_address.attr,
4377 &dev_attr_ecap.attr,
4381 static struct attribute_group intel_iommu_group = {
4382 .name = "intel-iommu",
4383 .attrs = intel_iommu_attrs,
4386 const struct attribute_group *intel_iommu_groups[] = {
4391 int __init intel_iommu_init(void)
4394 struct dmar_drhd_unit *drhd;
4395 struct intel_iommu *iommu;
4397 /* VT-d is required for a TXT/tboot launch, so enforce that */
4398 force_on = tboot_force_iommu();
4400 if (iommu_init_mempool()) {
4402 panic("tboot: Failed to initialize iommu memory\n");
4406 down_write(&dmar_global_lock);
4407 if (dmar_table_init()) {
4409 panic("tboot: Failed to initialize DMAR table\n");
4414 * Disable translation if already enabled prior to OS handover.
4416 for_each_active_iommu(iommu, drhd)
4417 if (iommu->gcmd & DMA_GCMD_TE)
4418 iommu_disable_translation(iommu);
4420 if (dmar_dev_scope_init() < 0) {
4422 panic("tboot: Failed to initialize DMAR device scope\n");
4426 if (no_iommu || dmar_disabled)
4429 if (list_empty(&dmar_rmrr_units))
4430 pr_info("No RMRR found\n");
4432 if (list_empty(&dmar_atsr_units))
4433 pr_info("No ATSR found\n");
4435 if (dmar_init_reserved_ranges()) {
4437 panic("tboot: Failed to reserve iommu ranges\n");
4438 goto out_free_reserved_range;
4441 init_no_remapping_devices();
4446 panic("tboot: Failed to initialize DMARs\n");
4447 pr_err("Initialization failed\n");
4448 goto out_free_reserved_range;
4450 up_write(&dmar_global_lock);
4451 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4453 init_timer(&unmap_timer);
4454 #ifdef CONFIG_SWIOTLB
4457 dma_ops = &intel_dma_ops;
4459 init_iommu_pm_ops();
4461 for_each_active_iommu(iommu, drhd)
4462 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4466 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4467 bus_register_notifier(&pci_bus_type, &device_nb);
4468 if (si_domain && !hw_pass_through)
4469 register_memory_notifier(&intel_iommu_memory_nb);
4471 intel_iommu_enabled = 1;
4475 out_free_reserved_range:
4476 put_iova_domain(&reserved_iova_list);
4478 intel_iommu_free_dmars();
4479 up_write(&dmar_global_lock);
4480 iommu_exit_mempool();
4484 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4486 struct intel_iommu *iommu = opaque;
4488 iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4493 * NB - intel-iommu lacks any sort of reference counting for the users of
4494 * dependent devices. If multiple endpoints have intersecting dependent
4495 * devices, unbinding the driver from any one of them will possibly leave
4496 * the others unable to operate.
4498 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4501 if (!iommu || !dev || !dev_is_pci(dev))
4504 pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4507 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4510 struct device_domain_info *info, *tmp;
4511 struct intel_iommu *iommu;
4512 unsigned long flags;
4516 iommu = device_to_iommu(dev, &bus, &devfn);
4520 spin_lock_irqsave(&device_domain_lock, flags);
4521 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4522 if (info->iommu == iommu && info->bus == bus &&
4523 info->devfn == devfn) {
4524 unlink_domain_info(info);
4525 spin_unlock_irqrestore(&device_domain_lock, flags);
4527 iommu_disable_dev_iotlb(info);
4528 iommu_detach_dev(iommu, info->bus, info->devfn);
4529 iommu_detach_dependent_devices(iommu, dev);
4530 free_devinfo_mem(info);
4532 spin_lock_irqsave(&device_domain_lock, flags);
4540 /* if there are no other devices under the same iommu
4541 * owned by this domain, clear this iommu from the iommu_bmp and
4542 * update the iommu count and coherency
4544 if (info->iommu == iommu)
4548 spin_unlock_irqrestore(&device_domain_lock, flags);
4551 domain_detach_iommu(domain, iommu);
4552 if (!domain_type_is_vm_or_si(domain))
4553 iommu_detach_domain(domain, iommu);
4557 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4561 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4563 domain_reserve_special_ranges(domain);
4565 /* calculate AGAW */
4566 domain->gaw = guest_width;
4567 adjust_width = guestwidth_to_adjustwidth(guest_width);
4568 domain->agaw = width_to_agaw(adjust_width);
4570 domain->iommu_coherency = 0;
4571 domain->iommu_snooping = 0;
4572 domain->iommu_superpage = 0;
4573 domain->max_addr = 0;
4575 /* always allocate the top pgd */
4576 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4579 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4583 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4585 struct dmar_domain *dmar_domain;
4586 struct iommu_domain *domain;
4588 if (type != IOMMU_DOMAIN_UNMANAGED)
4591 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4593 pr_err("Can't allocate dmar_domain\n");
4596 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4597 pr_err("Domain initialization failed\n");
4598 domain_exit(dmar_domain);
4601 domain_update_iommu_cap(dmar_domain);
4603 domain = &dmar_domain->domain;
4604 domain->geometry.aperture_start = 0;
4605 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4606 domain->geometry.force_aperture = true;
4611 static void intel_iommu_domain_free(struct iommu_domain *domain)
4613 domain_exit(to_dmar_domain(domain));
4616 static int intel_iommu_attach_device(struct iommu_domain *domain,
4619 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4620 struct intel_iommu *iommu;
4624 if (device_is_rmrr_locked(dev)) {
4625 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4629 /* normally dev is not mapped */
4630 if (unlikely(domain_context_mapped(dev))) {
4631 struct dmar_domain *old_domain;
4633 old_domain = find_domain(dev);
4635 if (domain_type_is_vm_or_si(dmar_domain))
4636 domain_remove_one_dev_info(old_domain, dev);
4638 domain_remove_dev_info(old_domain);
4640 if (!domain_type_is_vm_or_si(old_domain) &&
4641 list_empty(&old_domain->devices))
4642 domain_exit(old_domain);
4646 iommu = device_to_iommu(dev, &bus, &devfn);
4650 /* check if this iommu agaw is sufficient for max mapped address */
4651 addr_width = agaw_to_width(iommu->agaw);
4652 if (addr_width > cap_mgaw(iommu->cap))
4653 addr_width = cap_mgaw(iommu->cap);
4655 if (dmar_domain->max_addr > (1LL << addr_width)) {
4656 pr_err("%s: iommu width (%d) is not "
4657 "sufficient for the mapped address (%llx)\n",
4658 __func__, addr_width, dmar_domain->max_addr);
4661 dmar_domain->gaw = addr_width;
4664 * Knock out extra levels of page tables if necessary
4666 while (iommu->agaw < dmar_domain->agaw) {
4667 struct dma_pte *pte;
4669 pte = dmar_domain->pgd;
4670 if (dma_pte_present(pte)) {
4671 dmar_domain->pgd = (struct dma_pte *)
4672 phys_to_virt(dma_pte_addr(pte));
4673 free_pgtable_page(pte);
4675 dmar_domain->agaw--;
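/*
 * Worked example (illustrative): attaching a domain built with a 4-level
 * table (48-bit AGAW) to an IOMMU that only walks 3 levels (39-bit) pops
 * exactly one top level here: the old top-level page is freed and the
 * table referenced by its entry 0 (the only slot that can be populated
 * once max_addr fits in the narrower width) becomes the new pgd, without
 * touching any leaf entries.
 */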
4678 return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4681 static void intel_iommu_detach_device(struct iommu_domain *domain,
4684 domain_remove_one_dev_info(to_dmar_domain(domain), dev);
4687 static int intel_iommu_map(struct iommu_domain *domain,
4688 unsigned long iova, phys_addr_t hpa,
4689 size_t size, int iommu_prot)
4691 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4696 if (iommu_prot & IOMMU_READ)
4697 prot |= DMA_PTE_READ;
4698 if (iommu_prot & IOMMU_WRITE)
4699 prot |= DMA_PTE_WRITE;
4700 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4701 prot |= DMA_PTE_SNP;
4703 max_addr = iova + size;
4704 if (dmar_domain->max_addr < max_addr) {
4707 /* check if minimum agaw is sufficient for mapped address */
4708 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4709 if (end < max_addr) {
4710 pr_err("%s: iommu width (%d) is not "
4711 "sufficient for the mapped address (%llx)\n",
4712 __func__, dmar_domain->gaw, max_addr);
4715 dmar_domain->max_addr = max_addr;
4717 /* Round up size to next multiple of PAGE_SIZE, if it and
4718 the low bits of hpa would take us onto the next page */
4719 size = aligned_nrpages(hpa, size);
4720 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4721 hpa >> VTD_PAGE_SHIFT, size, prot);
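/*
 * Worked example (illustrative): hpa = 0x1ff0 with size = 0x20 straddles
 * a page boundary, so aligned_nrpages() widens the request to two VT-d
 * pages and both are mapped -- exactly the "low bits of hpa take us onto
 * the next page" case called out in the comment above.
 */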
4725 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4726 unsigned long iova, size_t size)
4728 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4729 struct page *freelist = NULL;
4730 struct intel_iommu *iommu;
4731 unsigned long start_pfn, last_pfn;
4732 unsigned int npages;
4733 int iommu_id, num, ndomains, level = 0;
4735 /* Cope with horrid API which requires us to unmap more than the
4736 size argument if it happens to be a large-page mapping. */
4737 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4740 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4741 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
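/*
 * Worked example (illustrative, assuming the 9-bit LEVEL_STRIDE used
 * throughout this file): if the PTE found above sits at level 2 (a 2MiB
 * superpage), level_to_offset_bits(2) = 9, so a 4KiB unmap request is
 * silently widened to VTD_PAGE_SIZE << 9 = 2MiB and the whole superpage
 * goes away -- the "horrid API" behaviour the comment above apologizes
 * for.
 */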
4743 start_pfn = iova >> VTD_PAGE_SHIFT;
4744 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4746 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4748 npages = last_pfn - start_pfn + 1;
4750 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4751 iommu = g_iommus[iommu_id];
4754 * find bit position of dmar_domain
4756 ndomains = cap_ndoms(iommu->cap);
4757 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4758 if (iommu->domains[num] == dmar_domain)
4759 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4760 npages, !freelist, 0);
4765 dma_free_pagelist(freelist);
4767 if (dmar_domain->max_addr == iova + size)
4768 dmar_domain->max_addr = iova;
4773 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4776 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4777 struct dma_pte *pte;
4781 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4783 phys = dma_pte_addr(pte);
4788 static bool intel_iommu_capable(enum iommu_cap cap)
4790 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4791 return domain_update_iommu_snooping(NULL) == 1;
4792 if (cap == IOMMU_CAP_INTR_REMAP)
4793 return irq_remapping_enabled == 1;
4798 static int intel_iommu_add_device(struct device *dev)
4800 struct intel_iommu *iommu;
4801 struct iommu_group *group;
4804 iommu = device_to_iommu(dev, &bus, &devfn);
4808 iommu_device_link(iommu->iommu_dev, dev);
4810 group = iommu_group_get_for_dev(dev);
4813 return PTR_ERR(group);
4815 iommu_group_put(group);
4819 static void intel_iommu_remove_device(struct device *dev)
4821 struct intel_iommu *iommu;
4824 iommu = device_to_iommu(dev, &bus, &devfn);
4828 iommu_group_remove_device(dev);
4830 iommu_device_unlink(iommu->iommu_dev, dev);
4833 static const struct iommu_ops intel_iommu_ops = {
4834 .capable = intel_iommu_capable,
4835 .domain_alloc = intel_iommu_domain_alloc,
4836 .domain_free = intel_iommu_domain_free,
4837 .attach_dev = intel_iommu_attach_device,
4838 .detach_dev = intel_iommu_detach_device,
4839 .map = intel_iommu_map,
4840 .unmap = intel_iommu_unmap,
4841 .map_sg = default_iommu_map_sg,
4842 .iova_to_phys = intel_iommu_iova_to_phys,
4843 .add_device = intel_iommu_add_device,
4844 .remove_device = intel_iommu_remove_device,
4845 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4848 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4850 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4851 pr_info("Disabling IOMMU for graphics on this chipset\n");
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4863 static void quirk_iommu_rwbf(struct pci_dev *dev)
4866 * Mobile 4 Series Chipset neglects to set RWBF capability,
4867 * but needs it. Same seems to hold for the desktop versions.
4869 pr_info("Forcing write-buffer flush capability\n");
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4882 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4883 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4884 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4885 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4886 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4887 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4888 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4889 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4891 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4895 if (pci_read_config_word(dev, GGC, &ggc))
4898 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4899 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4901 } else if (dmar_map_gfx) {
4902 /* we have to ensure the gfx device is idle before we flush */
4903 pr_info("Disabling batched IOTLB flush on Ironlake\n");
4904 intel_iommu_strict = 1;
4907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4912 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4913 ISOCH DMAR unit for the Azalia sound device, but not give it any
4914 TLB entries, which causes it to deadlock. Check for that. We do
4915 this in a function called from init_dmars(), instead of in a PCI
4916 quirk, because we don't want to print the obnoxious "BIOS broken"
4917 message if VT-d is actually disabled.
4919 static void __init check_tylersburg_isoch(void)
4921 struct pci_dev *pdev;
4922 uint32_t vtisochctrl;
4924 /* If there's no Azalia in the system anyway, forget it. */
4925 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4930 /* System Management Registers. Might be hidden, in which case
4931 we can't do the sanity check. But that's OK, because the
4932 known-broken BIOSes _don't_ actually hide it, so far. */
4933 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4937 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4944 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4945 if (vtisochctrl & 1)
4948 /* Drop all bits other than the number of TLB entries */
4949 vtisochctrl &= 0x1c;
4951 /* If we have the recommended number of TLB entries (16), fine. */
4952 if (vtisochctrl == 0x10)
4955 /* Zero TLB entries? You get to ride the short bus to school. */
4957 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4958 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4959 dmi_get_system_info(DMI_BIOS_VENDOR),
4960 dmi_get_system_info(DMI_BIOS_VERSION),
4961 dmi_get_system_info(DMI_PRODUCT_VERSION));
4962 iommu_identity_mapping |= IDENTMAP_AZALIA;
4966 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",