/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
#define pr_fmt(fmt)     "DMAR: " fmt

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#define ROOT_SIZE               VTD_PAGE_SIZE
#define CONTEXT_SIZE            VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
#define IOVA_START_ADDR         (0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN          (1)

#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE            (9)
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size was a power-of-two multiple of 4KiB and
 * that the mapping had natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
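/*
 * Editor's illustration (not driver code): with VTD_PAGE_SHIFT == 12,
 * ~0xFFFUL has every bit from bit 12 upwards set, so it advertises 4KiB,
 * 8KiB, 16KiB, ... all the way up. Advertising only the real hardware
 * page sizes would instead look something like the hypothetical
 *
 *      #define INTEL_IOMMU_PGSIZES_REAL        (SZ_4K | SZ_2M | SZ_1G)
 *
 * at which point the IOMMU core would split regions on those
 * boundaries itself.
 */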
static inline int agaw_to_level(int agaw)
{
        return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
        return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
        return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
        return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
        return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
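/*
 * Worked example (editor's illustration): agaw 2 describes a 4-level
 * table, since agaw_to_level(2) = 4 and agaw_to_width(2) = 30 + 2*9 = 48
 * bits; width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2 round-trips. At
 * level 2, level_to_offset_bits() = 9, so pfn_level_offset() selects
 * bits 9..17 of the pfn, and level_size(2) = 512 pages, i.e. 2MiB.
 */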
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
        return page_to_dma_pfn(virt_to_page(p));
}
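/*
 * Worked example (editor's illustration): on x86 both PAGE_SHIFT and
 * VTD_PAGE_SHIFT are 12, so the conversions above shift by 0 and are
 * identity. With hypothetical 64KiB MM pages (PAGE_SHIFT == 16),
 * mm_to_dma_pfn() would multiply by 16: one MM page spans sixteen
 * 4KiB VT-d pages.
 */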
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
        u64 lo;
        u64 hi;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Take a root entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
        if (!(re->lo & 1))
                return 0;

        return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
        if (!(re->hi & 1))
                return 0;

        return re->hi & VTD_PAGE_MASK;
}
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
        u64 lo;
        u64 hi;
};
static inline void context_clear_pasid_enable(struct context_entry *context)
{
        context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
        return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
        context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
        return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
        return (context->lo & 1);
}

static inline bool context_present(struct context_entry *context)
{
        return context_pasid_enabled(context) ?
                __context_present(context) :
                __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
        context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
        context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
                                                unsigned long value)
{
        context->lo &= (((u64)-1) << 4) | 3;
        context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
                                            unsigned long value)
{
        context->lo &= ~VTD_PAGE_MASK;
        context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
                                             unsigned long value)
{
        context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
                                         unsigned long value)
{
        context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
        return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
        context->lo = 0;
        context->hi = 0;
}
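/*
 * Editor's sketch (illustrative, not driver code): programming a context
 * entry chains the helpers above, in the same order used later by
 * domain_context_mapping_one():
 *
 *      context_clear_entry(context);
 *      context_set_domain_id(context, did);
 *      context_set_address_root(context, virt_to_phys(pgd));
 *      context_set_address_width(context, iommu->agaw);
 *      context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_fault_enable(context);
 *      context_set_present(context);
 */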
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
        u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
        pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
        return pte->val & VTD_PAGE_MASK;
#else
        /* Must have a full atomic 64-bit read */
        return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
        return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
        return (pte->val & DMA_PTE_LARGE_PAGE);
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
        return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
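/*
 * Editor's illustration: a leaf PTE is simply the host-physical page
 * address ORed with permission bits, e.g.
 *
 *      pte->val = ((u64)phys_pfn << VTD_PAGE_SHIFT) |
 *                 DMA_PTE_READ | DMA_PTE_WRITE;
 *
 * which is the pteval form that __domain_mapping() builds further down.
 */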
/*
 * This domain is a statically identity mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
/*
 * Domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)

#define for_each_domain_iommu(idx, domain)                      \
        for (idx = 0; idx < g_num_of_iommus; idx++)             \
                if (domain->iommu_refcnt[idx])
struct dmar_domain {
        int     nid;                    /* node id */

        unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED];
                                        /* Refcount of devices per iommu */

        u16     iommu_did[DMAR_UNITS_SUPPORTED];
                                        /* Domain ids per IOMMU. Use u16 since
                                         * domain ids are 16 bit wide according
                                         * to VT-d spec, section 9.3 */

        bool has_iotlb_device;
        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* flags to find out type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature */
        int             iommu_count;    /* reference count of iommu */
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
        u64             max_addr;       /* maximum mapped address */

        struct iommu_domain domain;     /* generic domain data structure for
                                           iommu core */
};
/* PCI domain-device relationship */
struct device_domain_info {
        struct list_head link;  /* link to domain siblings */
        struct list_head global; /* link to global list */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
        u8 pasid_supported:3;
        u8 pasid_enabled:1;
        u8 pri_supported:1;
        u8 pri_enabled:1;
        u8 ats_supported:1;
        u8 ats_enabled:1;
        u8 ats_qdep;
        struct device *dev;     /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
};
struct dmar_rmrr_unit {
        struct list_head list;          /* list of rmrr units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        u64     base_address;           /* reserved base address */
        u64     end_address;            /* reserved end address */
        struct dmar_dev_scope *devices; /* target devices */
        int     devices_cnt;            /* target device count */
        struct iommu_resv_region *resv; /* reserved region handle */
};

struct dmar_atsr_unit {
        struct list_head list;          /* list of ATSR units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        struct dmar_dev_scope *devices; /* target devices */
        int devices_cnt;                /* target device count */
        u8 include_all:1;               /* include all ports */
};
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
        list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
                                     struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static void domain_context_clear(struct intel_iommu *iommu,
                                 struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu);
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int intel_iommu_ecs = 1;
static int iommu_identity_mapping;

#define IDENTMAP_ALL            1
#define IDENTMAP_GFX            2
#define IDENTMAP_AZALIA         4

#define ecs_enabled(iommu)      (intel_iommu_ecs && ecap_ecs(iommu->ecap))
#define pasid_enabled(iommu)    (ecs_enabled(iommu) && ecap_pasid(iommu->ecap))

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

const struct iommu_ops intel_iommu_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
        u32 gsts;

        gsts = readl(iommu->reg + DMAR_GSTS_REG);
        if (gsts & DMA_GSTS_TES)
                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

/* Convert a generic struct iommu_domain to the private struct dmar_domain */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
        return container_of(dom, struct dmar_domain, domain);
}
static int __init intel_iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;
        while (*str) {
                if (!strncmp(str, "on", 2)) {
                        dmar_disabled = 0;
                        pr_info("IOMMU enabled\n");
                } else if (!strncmp(str, "off", 3)) {
                        dmar_disabled = 1;
                        pr_info("IOMMU disabled\n");
                } else if (!strncmp(str, "igfx_off", 8)) {
                        dmar_map_gfx = 0;
                        pr_info("Disable GFX device mapping\n");
                } else if (!strncmp(str, "forcedac", 8)) {
                        pr_info("Forcing DAC for PCI devices\n");
                        dmar_forcedac = 1;
                } else if (!strncmp(str, "strict", 6)) {
                        pr_info("Disable batched IOTLB flush\n");
                        intel_iommu_strict = 1;
                } else if (!strncmp(str, "sp_off", 6)) {
                        pr_info("Disable supported super page\n");
                        intel_iommu_superpage = 0;
                } else if (!strncmp(str, "ecs_off", 7)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: disable extended context table support\n");
                        intel_iommu_ecs = 0;
                } else if (!strncmp(str, "tboot_noforce", 13)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
                        intel_iommu_tboot_noforce = 1;
                }

                str += strcspn(str, ",");
                while (*str == ',')
                        str++;
        }
        return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
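/*
 * Usage example (editor's note, implied by the parser above): options
 * are comma-separated on the kernel command line, e.g.
 *
 *      intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage use; the strcspn() walk above advances to the next option.
 */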
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
        struct dmar_domain **domains;
        int idx = did >> 8;

        domains = iommu->domains[idx];
        if (!domains)
                return NULL;

        return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
                             struct dmar_domain *domain)
{
        struct dmar_domain **domains;
        int idx = did >> 8;

        if (!iommu->domains[idx]) {
                size_t size = 256 * sizeof(struct dmar_domain *);

                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
        }

        domains = iommu->domains[idx];
        if (WARN_ON(!domains))
                return;
        else
                domains[did & 0xff] = domain;
}
static inline void *alloc_pgtable_page(int node)
{
        struct page *page;
        void *vaddr = NULL;

        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
        if (page)
                vaddr = page_address(page);
        return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
        free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
        kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
        kmem_cache_free(iommu_devinfo_cache, vaddr);
}
static inline int domain_type_is_vm(struct dmar_domain *domain)
{
        return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
        return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
        return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
                                DOMAIN_FLAG_STATIC_IDENTITY);
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
                                       unsigned long pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
        unsigned long sagaw;
        int agaw = -1;

        sagaw = cap_sagaw(iommu->cap);
        for (agaw = width_to_agaw(max_gaw);
             agaw >= 0; agaw--) {
                if (test_bit(agaw, &sagaw))
                        break;
        }

        return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
        int iommu_id;

        /* si_domain and vm domain should not get here. */
        BUG_ON(domain_type_is_vm_or_si(domain));
        for_each_domain_iommu(iommu_id, domain)
                break;

        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
                return NULL;

        return g_iommus[iommu_id];
}
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        bool found = false;
        int i;

        domain->iommu_coherency = 1;

        for_each_domain_iommu(i, domain) {
                found = true;
                if (!ecap_coherent(g_iommus[i]->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
        if (found)
                return;

        /* No hardware attached; use lowest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (!ecap_coherent(iommu->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
        rcu_read_unlock();
}
static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        int ret = 1;

        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (iommu != skip) {
                        if (!ecap_sc_support(iommu->ecap)) {
                                ret = 0;
                                break;
                        }
                }
        }
        rcu_read_unlock();

        return ret;
}
static int domain_update_iommu_superpage(struct intel_iommu *skip)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        int mask = 0xf;

        if (!intel_iommu_superpage) {
                return 0;
        }

        /* set iommu_superpage to the smallest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (iommu != skip) {
                        mask &= cap_super_page_val(iommu->cap);
                        if (!mask)
                                break;
                }
        }
        rcu_read_unlock();

        return fls(mask);
}
788 static void domain_update_iommu_cap(struct dmar_domain *domain)
790 domain_update_iommu_coherency(domain);
791 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
792 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
                                                       u8 bus, u8 devfn, int alloc)
{
        struct root_entry *root = &iommu->root_entry[bus];
        struct context_entry *context;
        u64 *entry;

        entry = &root->lo;
        if (ecs_enabled(iommu)) {
                if (devfn >= 0x80) {
                        devfn -= 0x80;
                        entry = &root->hi;
                }
                devfn *= 2;
        }
        if (*entry & 1)
                context = phys_to_virt(*entry & VTD_PAGE_MASK);
        else {
                unsigned long phy_addr;

                if (!alloc)
                        return NULL;

                context = alloc_pgtable_page(iommu->node);
                if (!context)
                        return NULL;

                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
                phy_addr = virt_to_phys((void *)context);
                *entry = phy_addr | 1;
                __iommu_flush_cache(iommu, entry, sizeof(*entry));
        }
        return &context[devfn];
}
static int iommu_dummy(struct device *dev)
{
        return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        struct intel_iommu *iommu;
        struct device *tmp;
        struct pci_dev *ptmp, *pdev = NULL;
        u16 segment = 0;
        int i;

        if (iommu_dummy(dev))
                return NULL;

        if (dev_is_pci(dev)) {
                struct pci_dev *pf_pdev;

                pdev = to_pci_dev(dev);

#ifdef CONFIG_X86
                /* VMD child devices currently cannot be handled individually */
                if (is_vmd(pdev->bus))
                        return NULL;
#endif

                /* VFs aren't listed in scope tables; we need to look up
                 * the PF instead to find the IOMMU. */
                pf_pdev = pci_physfn(pdev);
                dev = &pf_pdev->dev;
                segment = pci_domain_nr(pdev->bus);
        } else if (has_acpi_companion(dev))
                dev = &ACPI_COMPANION(dev)->dev;

        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (pdev && segment != drhd->segment)
                        continue;

                for_each_active_dev_scope(drhd->devices,
                                          drhd->devices_cnt, i, tmp) {
                        if (tmp == dev) {
                                /* For a VF use its original BDF# not that of the PF
                                 * which we used for the IOMMU lookup. Strictly speaking
                                 * we could do this for all PCI devices; we only need to
                                 * get the BDF# from the scope table for ACPI matches. */
                                if (pdev && pdev->is_virtfn)
                                        goto got_pdev;

                                *bus = drhd->devices[i].bus;
                                *devfn = drhd->devices[i].devfn;
                                goto out;
                        }

                        if (!pdev || !dev_is_pci(tmp))
                                continue;

                        ptmp = to_pci_dev(tmp);
                        if (ptmp->subordinate &&
                            ptmp->subordinate->number <= pdev->bus->number &&
                            ptmp->subordinate->busn_res.end >= pdev->bus->number)
                                goto got_pdev;
                }

                if (pdev && drhd->include_all) {
                got_pdev:
                        *bus = pdev->bus->number;
                        *devfn = pdev->devfn;
                        goto out;
                }
        }
        iommu = NULL;
out:
        rcu_read_unlock();

        return iommu;
}
static void domain_flush_cache(struct dmar_domain *domain,
                               void *addr, int size)
{
        if (!domain->iommu_coherency)
                clflush_cache_range(addr, size);
}
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
        struct context_entry *context;
        int ret = 0;
        unsigned long flags;

        spin_lock_irqsave(&iommu->lock, flags);
        context = iommu_context_addr(iommu, bus, devfn, 0);
        if (context)
                ret = context_present(context);
        spin_unlock_irqrestore(&iommu->lock, flags);
        return ret;
}
static void free_context_table(struct intel_iommu *iommu)
{
        int i;
        unsigned long flags;
        struct context_entry *context;

        spin_lock_irqsave(&iommu->lock, flags);
        if (!iommu->root_entry) {
                goto out;
        }
        for (i = 0; i < ROOT_ENTRY_NR; i++) {
                context = iommu_context_addr(iommu, i, 0, 0);
                if (context)
                        free_pgtable_page(context);

                if (!ecs_enabled(iommu))
                        continue;

                context = iommu_context_addr(iommu, i, 0x80, 0);
                if (context)
                        free_pgtable_page(context);
        }
        free_pgtable_page(iommu->root_entry);
        iommu->root_entry = NULL;
out:
        spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int *target_level)
{
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);

        if (!domain_pfn_supported(domain, pfn))
                /* Address beyond IOMMU's addressing capabilities. */
                return NULL;

        parent = domain->pgd;

        while (1) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == *target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);
                        if (!tmp_page)
                                return NULL;

                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        if (cmpxchg64(&pte->val, 0ULL, pteval))
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        else
                                domain_flush_cache(domain, pte, sizeof(*pte));
                }
                if (level == 1)
                        break;

                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        if (!*target_level)
                *target_level = level;

        return pte;
}
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
                                         int level, int *large_page)
{
        struct dma_pte *parent, *pte = NULL;
        int total = agaw_to_level(domain->agaw);
        int offset;

        parent = domain->pgd;
        while (level <= total) {
                offset = pfn_level_offset(pfn, total);
                pte = &parent[offset];
                if (level == total)
                        return pte;

                if (!dma_pte_present(pte)) {
                        *large_page = total;
                        break;
                }

                if (dma_pte_superpage(pte)) {
                        *large_page = total;
                        return pte;
                }

                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
        return NULL;
}
/* clear last level pte; should be followed by an IOTLB flush */
static void dma_pte_clear_range(struct dmar_domain *domain,
                                unsigned long start_pfn,
                                unsigned long last_pfn)
{
        unsigned int large_page = 1;
        struct dma_pte *first_pte, *pte;

        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        do {
                large_page = 1;
                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
                if (!pte) {
                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
                        continue;
                }
                do {
                        dma_clear_pte(pte);
                        start_pfn += lvl_to_nr_pages(large_page);
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));

                domain_flush_cache(domain, first_pte,
                                   (void *)pte - (void *)first_pte);

        } while (start_pfn && start_pfn <= last_pfn);
}
static void dma_pte_free_level(struct dmar_domain *domain, int level,
                               int retain_level, struct dma_pte *pte,
                               unsigned long pfn, unsigned long start_pfn,
                               unsigned long last_pfn)
{
        pfn = max(start_pfn, pfn);
        pte = &pte[pfn_level_offset(pfn, level)];

        do {
                unsigned long level_pfn;
                struct dma_pte *level_pte;

                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
                        goto next;

                level_pfn = pfn & level_mask(level);
                level_pte = phys_to_virt(dma_pte_addr(pte));

                if (level > 2) {
                        dma_pte_free_level(domain, level - 1, retain_level,
                                           level_pte, level_pfn, start_pfn,
                                           last_pfn);
                }

                /*
                 * Free the page table if we're below the level we want to
                 * retain and the range covers the entire table.
                 */
                if (level < retain_level && !(start_pfn > level_pfn ||
                      last_pfn < level_pfn + level_size(level) - 1)) {
                        dma_clear_pte(pte);
                        domain_flush_cache(domain, pte, sizeof(*pte));
                        free_pgtable_page(level_pte);
                }
next:
                pfn += level_size(level);
        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn,
                                   int retain_level)
{
        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        dma_pte_clear_range(domain, start_pfn, last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
                           domain->pgd, 0, start_pfn, last_pfn);

        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
                                            int level, struct dma_pte *pte,
                                            struct page *freelist)
{
        struct page *pg;

        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
        pg->freelist = freelist;
        freelist = pg;

        if (level == 1)
                return freelist;

        pte = page_address(pg);
        do {
                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
                        freelist = dma_pte_list_pagetables(domain, level - 1,
                                                           pte, freelist);
                pte++;
        } while (!first_pte_in_page(pte));

        return freelist;
}
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
                                        struct dma_pte *pte, unsigned long pfn,
                                        unsigned long start_pfn,
                                        unsigned long last_pfn,
                                        struct page *freelist)
{
        struct dma_pte *first_pte = NULL, *last_pte = NULL;

        pfn = max(start_pfn, pfn);
        pte = &pte[pfn_level_offset(pfn, level)];

        do {
                unsigned long level_pfn;

                if (!dma_pte_present(pte))
                        goto next;

                level_pfn = pfn & level_mask(level);

                /* If range covers entire pagetable, free it */
                if (start_pfn <= level_pfn &&
                    last_pfn >= level_pfn + level_size(level) - 1) {
                        /* These subordinate page tables are going away entirely. Don't
                           bother to clear them; we're just going to *free* them. */
                        if (level > 1 && !dma_pte_superpage(pte))
                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

                        dma_clear_pte(pte);
                        if (!first_pte)
                                first_pte = pte;
                        last_pte = pte;
                } else if (level > 1) {
                        /* Recurse down into a level that isn't *entirely* obsolete */
                        freelist = dma_pte_clear_level(domain, level - 1,
                                                       phys_to_virt(dma_pte_addr(pte)),
                                                       level_pfn, start_pfn, last_pfn,
                                                       freelist);
                }
next:
                pfn += level_size(level);
        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);

        if (first_pte)
                domain_flush_cache(domain, first_pte,
                                   (void *)++last_pte - (void *)first_pte);

        return freelist;
}
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
                                 unsigned long start_pfn,
                                 unsigned long last_pfn)
{
        struct page *freelist = NULL;

        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
                                       domain->pgd, 0, start_pfn, last_pfn, NULL);

        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                struct page *pgd_page = virt_to_page(domain->pgd);

                pgd_page->freelist = freelist;
                freelist = pgd_page;

                domain->pgd = NULL;
        }

        return freelist;
}
static void dma_free_pagelist(struct page *freelist)
{
        struct page *pg;

        while ((pg = freelist)) {
                freelist = pg->freelist;
                free_pgtable_page(page_address(pg));
        }
}

static void iova_entry_free(unsigned long data)
{
        struct page *freelist = (struct page *)data;

        dma_free_pagelist(freelist);
}
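/*
 * Editor's sketch (illustrative, not driver code): the deferred-free
 * protocol described above is always "unlink, flush, then free", e.g.
 *
 *      freelist = domain_unmap(domain, start_pfn, last_pfn);
 *      iommu_flush_iotlb_psi(iommu, domain, start_pfn, nrpages, 0, 0);
 *      dma_free_pagelist(freelist);
 *
 * Freeing before the IOTLB flush would let an in-flight hardware
 * page-walk touch pages that have already been reused.
 */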
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
        struct root_entry *root;
        unsigned long flags;

        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
        if (!root) {
                pr_err("Allocating root entry for %s failed\n",
                        iommu->name);
                return -ENOMEM;
        }

        __iommu_flush_cache(iommu, root, ROOT_SIZE);

        spin_lock_irqsave(&iommu->lock, flags);
        iommu->root_entry = root;
        spin_unlock_irqrestore(&iommu->lock, flags);

        return 0;
}
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
        u64 addr;
        u32 sts;
        unsigned long flag;

        addr = virt_to_phys(iommu->root_entry);
        if (ecs_enabled(iommu))
                addr |= DMA_RTADDR_RTT;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (sts & DMA_GSTS_RTPS), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
        u32 val;
        unsigned long flag;

        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
                return;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (!(val & DMA_GSTS_WBFS)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
                                  u16 did, u16 source_id, u8 function_mask,
                                  u64 type)
{
        u64 val = 0;
        unsigned long flag;

        switch (type) {
        case DMA_CCMD_GLOBAL_INVL:
                val = DMA_CCMD_GLOBAL_INVL;
                break;
        case DMA_CCMD_DOMAIN_INVL:
                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
                break;
        case DMA_CCMD_DEVICE_INVL:
                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
                break;
        default:
                BUG();
        }
        val |= DMA_CCMD_ICC;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
                dmar_readq, (!(val & DMA_CCMD_ICC)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
                                u64 addr, unsigned int size_order, u64 type)
{
        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
        u64 val = 0, val_iva = 0;
        unsigned long flag;

        switch (type) {
        case DMA_TLB_GLOBAL_FLUSH:
                /* global flush doesn't need to set IVA_REG */
                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
                break;
        case DMA_TLB_DSI_FLUSH:
                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                /* IH bit is passed in as part of address */
                val_iva = size_order | addr;
                break;
        default:
                BUG();
        }
        /* Note: set drain read/write */
#if 0
        /*
         * This is probably to be super secure.. Looks like we can
         * ignore it without any impact.
         */
        if (cap_read_drain(iommu->cap))
                val |= DMA_TLB_READ_DRAIN;
#endif
        if (cap_write_drain(iommu->cap))
                val |= DMA_TLB_WRITE_DRAIN;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        /* Note: Only uses first TLB reg currently */
        if (val_iva)
                dmar_writeq(iommu->reg + tlb_offset, val_iva);
        dmar_writeq(iommu->reg + tlb_offset + 8, val);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
                dmar_readq, (!(val & DMA_TLB_IVT)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

        /* check IOTLB invalidation granularity */
        if (DMA_TLB_IAIG(val) == 0)
                pr_err("Flush IOTLB failed\n");
        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
                pr_debug("TLB flush request %Lx, actual %Lx\n",
                        (unsigned long long)DMA_TLB_IIRG(type),
                        (unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
                        u8 bus, u8 devfn)
{
        struct device_domain_info *info;

        assert_spin_locked(&device_domain_lock);

        if (!iommu->qi)
                return NULL;

        list_for_each_entry(info, &domain->devices, link)
                if (info->iommu == iommu && info->bus == bus &&
                    info->devfn == devfn) {
                        if (info->ats_supported && info->dev)
                                return info;
                        break;
                }

        return NULL;
}
static void domain_update_iotlb(struct dmar_domain *domain)
{
        struct device_domain_info *info;
        bool has_iotlb_device = false;

        assert_spin_locked(&device_domain_lock);

        list_for_each_entry(info, &domain->devices, link) {
                struct pci_dev *pdev;

                if (!info->dev || !dev_is_pci(info->dev))
                        continue;

                pdev = to_pci_dev(info->dev);
                if (pdev->ats_enabled) {
                        has_iotlb_device = true;
                        break;
                }
        }

        domain->has_iotlb_device = has_iotlb_device;
}
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
        struct pci_dev *pdev;

        assert_spin_locked(&device_domain_lock);

        if (!info || !dev_is_pci(info->dev))
                return;

        pdev = to_pci_dev(info->dev);

#ifdef CONFIG_INTEL_IOMMU_SVM
        /* The PCIe spec, in its wisdom, declares that the behaviour of
           the device if you enable PASID support after ATS support is
           undefined. So always enable PASID support on devices which
           have it, even if we can't yet know if we're ever going to
           use it. */
        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
                info->pasid_enabled = 1;

        if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
                info->pri_enabled = 1;
#endif
        if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
                info->ats_enabled = 1;
                domain_update_iotlb(info->domain);
                info->ats_qdep = pci_ats_queue_depth(pdev);
        }
}
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
        struct pci_dev *pdev;

        assert_spin_locked(&device_domain_lock);

        if (!dev_is_pci(info->dev))
                return;

        pdev = to_pci_dev(info->dev);

        if (info->ats_enabled) {
                pci_disable_ats(pdev);
                info->ats_enabled = 0;
                domain_update_iotlb(info->domain);
        }
#ifdef CONFIG_INTEL_IOMMU_SVM
        if (info->pri_enabled) {
                pci_disable_pri(pdev);
                info->pri_enabled = 0;
        }
        if (info->pasid_enabled) {
                pci_disable_pasid(pdev);
                info->pasid_enabled = 0;
        }
#endif
}
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
                                  u64 addr, unsigned mask)
{
        u16 sid, qdep;
        unsigned long flags;
        struct device_domain_info *info;

        if (!domain->has_iotlb_device)
                return;

        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                if (!info->ats_enabled)
                        continue;

                sid = info->bus << 8 | info->devfn;
                qdep = info->ats_qdep;
                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
                                  struct dmar_domain *domain,
                                  unsigned long pfn, unsigned int pages,
                                  int ih, int map)
{
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
        u16 did = domain->iommu_did[iommu->seq_id];

        BUG_ON(pages == 0);

        if (ih)
                ih = 1 << 6;
        /*
         * Fallback to domain selective flush if no PSI support or the size is
         * too big.
         * PSI requires page size to be 2 ^ x, and the base address is naturally
         * aligned to the size.
         */
        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
                iommu->flush.flush_iotlb(iommu, did, 0, 0,
                                                DMA_TLB_DSI_FLUSH);
        else
                iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
                                                DMA_TLB_PSI_FLUSH);

        /*
         * In caching mode, changes of pages from non-present to present require
         * flush. However, device IOTLB doesn't need to be flushed in this case.
         */
        if (!cap_caching_mode(iommu->cap) || !map)
                iommu_flush_dev_iotlb(domain, addr, mask);
}
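/*
 * Worked example (editor's illustration): flushing 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, i.e. an
 * 8-page naturally aligned region, since PSI takes a 2^x page count
 * with the base address aligned to that size.
 */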
/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
                                        struct dmar_domain *domain,
                                        unsigned long pfn, unsigned int pages)
{
        /* It's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
                iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
}
static void iommu_flush_iova(struct iova_domain *iovad)
{
        struct dmar_domain *domain;
        int idx;

        domain = container_of(iovad, struct dmar_domain, iovad);

        for_each_domain_iommu(idx, domain) {
                struct intel_iommu *iommu = g_iommus[idx];
                u16 did = domain->iommu_did[iommu->seq_id];

                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

                if (!cap_caching_mode(iommu->cap))
                        iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
                                              0, MAX_AGAW_PFN_WIDTH);
        }
}
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
        u32 pmen;
        unsigned long flags;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);
        pmen = readl(iommu->reg + DMAR_PMEN_REG);
        pmen &= ~DMA_PMEN_EPM;
        writel(pmen, iommu->reg + DMAR_PMEN_REG);

        /* wait for the protected region status bit to clear */
        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
                readl, !(pmen & DMA_PMEN_PRS), pmen);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static void iommu_enable_translation(struct intel_iommu *iommu)
{
        u32 sts;
        unsigned long flags;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);
        iommu->gcmd |= DMA_GCMD_TE;
        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (sts & DMA_GSTS_TES), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
        u32 sts;
        unsigned long flag;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        iommu->gcmd &= ~DMA_GCMD_TE;
        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (!(sts & DMA_GSTS_TES)), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
        u32 ndomains, nlongs;
        size_t size;

        ndomains = cap_ndoms(iommu->cap);
        pr_debug("%s: Number of Domains supported <%d>\n",
                 iommu->name, ndomains);
        nlongs = BITS_TO_LONGS(ndomains);

        spin_lock_init(&iommu->lock);

        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
        if (!iommu->domain_ids) {
                pr_err("%s: Allocating domain id array failed\n",
                       iommu->name);
                return -ENOMEM;
        }

        size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
        iommu->domains = kzalloc(size, GFP_KERNEL);

        if (iommu->domains) {
                size = 256 * sizeof(struct dmar_domain *);
                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
        }

        if (!iommu->domains || !iommu->domains[0]) {
                pr_err("%s: Allocating domain array failed\n",
                       iommu->name);
                kfree(iommu->domain_ids);
                kfree(iommu->domains);
                iommu->domain_ids = NULL;
                iommu->domains = NULL;
                return -ENOMEM;
        }

        /*
         * If Caching mode is set, then invalid translations are tagged
         * with domain-id 0, hence we need to pre-allocate it. We also
         * use domain-id 0 as a marker for non-allocated domain-id, so
         * make sure it is not used for a real domain.
         */
        set_bit(0, iommu->domain_ids);

        return 0;
}
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
        struct device_domain_info *info, *tmp;
        unsigned long flags;

        if (!iommu->domains || !iommu->domain_ids)
                return;

again:
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
                struct dmar_domain *domain;

                if (info->iommu != iommu)
                        continue;

                if (!info->dev || !info->domain)
                        continue;

                domain = info->domain;

                __dmar_remove_one_dev_info(info);

                if (!domain_type_is_vm_or_si(domain)) {
                        /*
                         * The domain_exit() function can't be called under
                         * device_domain_lock, as it takes this lock itself.
                         * So release the lock here and re-run the loop
                         * afterwards.
                         */
                        spin_unlock_irqrestore(&device_domain_lock, flags);
                        domain_exit(domain);
                        goto again;
                }
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);

        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);
}
static void free_dmar_iommu(struct intel_iommu *iommu)
{
        if ((iommu->domains) && (iommu->domain_ids)) {
                int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
                int i;

                for (i = 0; i < elems; i++)
                        kfree(iommu->domains[i]);
                kfree(iommu->domains);
                kfree(iommu->domain_ids);
                iommu->domains = NULL;
                iommu->domain_ids = NULL;
        }

        g_iommus[iommu->seq_id] = NULL;

        /* free context mapping */
        free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
        if (pasid_enabled(iommu)) {
                if (ecap_prs(iommu->ecap))
                        intel_svm_finish_prq(iommu);
                intel_svm_free_pasid_tables(iommu);
        }
#endif
}
static struct dmar_domain *alloc_domain(int flags)
{
        struct dmar_domain *domain;

        domain = alloc_domain_mem();
        if (!domain)
                return NULL;

        memset(domain, 0, sizeof(*domain));
        domain->nid = -1;
        domain->flags = flags;
        domain->has_iotlb_device = false;
        INIT_LIST_HEAD(&domain->devices);

        return domain;
}
/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu)
{
        unsigned long ndomains;
        int num;

        assert_spin_locked(&device_domain_lock);
        assert_spin_locked(&iommu->lock);

        domain->iommu_refcnt[iommu->seq_id] += 1;
        domain->iommu_count += 1;
        if (domain->iommu_refcnt[iommu->seq_id] == 1) {
                ndomains = cap_ndoms(iommu->cap);
                num = find_first_zero_bit(iommu->domain_ids, ndomains);

                if (num >= ndomains) {
                        pr_err("%s: No free domain ids\n", iommu->name);
                        domain->iommu_refcnt[iommu->seq_id] -= 1;
                        domain->iommu_count -= 1;
                        return -ENOSPC;
                }

                set_bit(num, iommu->domain_ids);
                set_iommu_domain(iommu, num, domain);

                domain->iommu_did[iommu->seq_id] = num;
                domain->nid = iommu->node;

                domain_update_iommu_cap(domain);
        }

        return 0;
}
static int domain_detach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu)
{
        int num, count = INT_MAX;

        assert_spin_locked(&device_domain_lock);
        assert_spin_locked(&iommu->lock);

        domain->iommu_refcnt[iommu->seq_id] -= 1;
        count = --domain->iommu_count;
        if (domain->iommu_refcnt[iommu->seq_id] == 0) {
                num = domain->iommu_did[iommu->seq_id];
                clear_bit(num, iommu->domain_ids);
                set_iommu_domain(iommu, num, NULL);

                domain_update_iommu_cap(domain);
                domain->iommu_did[iommu->seq_id] = 0;
        }

        return count;
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
        struct pci_dev *pdev = NULL;
        struct iova *iova;
        int i;

        init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
                &reserved_rbtree_key);

        /* IOAPIC ranges shouldn't be accessed by DMA */
        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
                IOVA_PFN(IOAPIC_RANGE_END));
        if (!iova) {
                pr_err("Reserve IOAPIC range failed\n");
                return -ENODEV;
        }

        /* Reserve all PCI MMIO to avoid peer-to-peer access */
        for_each_pci_dev(pdev) {
                struct resource *r;

                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
                        r = &pdev->resource[i];
                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
                                continue;
                        iova = reserve_iova(&reserved_iova_list,
                                            IOVA_PFN(r->start),
                                            IOVA_PFN(r->end));
                        if (!iova) {
                                pr_err("Reserve iova failed\n");
                                return -ENODEV;
                        }
                }
        }
        return 0;
}
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int agaw;
        int r = (gaw - 12) % 9;

        if (r == 0)
                agaw = gaw;
        else
                agaw = gaw + 9 - r;
        if (agaw > 64)
                agaw = 64;
        return agaw;
}
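/*
 * Worked example (editor's illustration): gaw = 48 gives r = (48-12) % 9
 * = 0, so the width already sits on a 9-bit level boundary above the
 * 12-bit page offset and is kept; gaw = 40 gives r = 1 and is rounded
 * up to the next boundary, 48.
 */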
static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
                       int guest_width)
{
        int adjust_width, agaw;
        unsigned long sagaw;
        int err;

        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);

        err = init_iova_flush_queue(&domain->iovad,
                                    iommu_flush_iova, iova_entry_free);
        if (err)
                return err;

        domain_reserve_special_ranges(domain);

        /* calculate AGAW */
        if (guest_width > cap_mgaw(iommu->cap))
                guest_width = cap_mgaw(iommu->cap);
        domain->gaw = guest_width;
        adjust_width = guestwidth_to_adjustwidth(guest_width);
        agaw = width_to_agaw(adjust_width);
        sagaw = cap_sagaw(iommu->cap);
        if (!test_bit(agaw, &sagaw)) {
                /* hardware doesn't support it, choose a bigger one */
                pr_debug("Hardware doesn't support agaw %d\n", agaw);
                agaw = find_next_bit(&sagaw, 5, agaw);
                if (agaw >= 5)
                        return -ENODEV;
        }
        domain->agaw = agaw;

        if (ecap_coherent(iommu->ecap))
                domain->iommu_coherency = 1;
        else
                domain->iommu_coherency = 0;

        if (ecap_sc_support(iommu->ecap))
                domain->iommu_snooping = 1;
        else
                domain->iommu_snooping = 0;

        if (intel_iommu_superpage)
                domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
        else
                domain->iommu_superpage = 0;

        domain->nid = iommu->node;

        /* always allocate the top pgd */
        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
        if (!domain->pgd)
                return -ENOMEM;
        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
        return 0;
}
static void domain_exit(struct dmar_domain *domain)
{
        struct page *freelist = NULL;

        /* Domain 0 is reserved, so don't process it */
        if (!domain)
                return;

        /* Remove associated devices and clear attached or cached domains */
        rcu_read_lock();
        domain_remove_dev_info(domain);
        rcu_read_unlock();

        /* destroy iovas */
        put_iova_domain(&domain->iovad);

        freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

        dma_free_pagelist(freelist);

        free_domain_mem(domain);
}
static int domain_context_mapping_one(struct dmar_domain *domain,
                                      struct intel_iommu *iommu,
                                      u8 bus, u8 devfn)
{
        u16 did = domain->iommu_did[iommu->seq_id];
        int translation = CONTEXT_TT_MULTI_LEVEL;
        struct device_domain_info *info = NULL;
        struct context_entry *context;
        unsigned long flags;
        struct dma_pte *pgd;
        int ret, agaw;

        if (hw_pass_through && domain_type_is_si(domain))
                translation = CONTEXT_TT_PASS_THROUGH;

        pr_debug("Set context mapping for %02x:%02x.%d\n",
                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

        BUG_ON(!domain->pgd);

        spin_lock_irqsave(&device_domain_lock, flags);
        spin_lock(&iommu->lock);

        ret = -ENOMEM;
        context = iommu_context_addr(iommu, bus, devfn, 1);
        if (!context)
                goto out_unlock;

        ret = 0;
        if (context_present(context))
                goto out_unlock;

        /*
         * For kdump cases, old valid entries may be cached due to the
         * in-flight DMA and copied pgtable, but there is no unmapping
         * behaviour for them, thus we need an explicit cache flush for
         * the newly-mapped device. For kdump, at this point, the device
         * is supposed to finish reset at its driver probe stage, so no
         * in-flight DMA will exist, and we don't need to worry anymore
         * hereafter.
         */
        if (context_copied(context)) {
                u16 did_old = context_domain_id(context);

                if (did_old < cap_ndoms(iommu->cap)) {
                        iommu->flush.flush_context(iommu, did_old,
                                                   (((u16)bus) << 8) | devfn,
                                                   DMA_CCMD_MASK_NOBIT,
                                                   DMA_CCMD_DEVICE_INVL);
                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
                                                 DMA_TLB_DSI_FLUSH);
                }
        }

        pgd = domain->pgd;

        context_clear_entry(context);
        context_set_domain_id(context, did);

        /*
         * Skip top levels of page tables for iommu which has less agaw
         * than default. Unnecessary for PT mode.
         */
        if (translation != CONTEXT_TT_PASS_THROUGH) {
                for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
                        ret = -ENOMEM;
                        pgd = phys_to_virt(dma_pte_addr(pgd));
                        if (!dma_pte_present(pgd))
                                goto out_unlock;
                }

                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
                if (info && info->ats_supported)
                        translation = CONTEXT_TT_DEV_IOTLB;
                else
                        translation = CONTEXT_TT_MULTI_LEVEL;

                context_set_address_root(context, virt_to_phys(pgd));
                context_set_address_width(context, iommu->agaw);
        } else {
                /*
                 * In pass through mode, AW must be programmed to
                 * indicate the largest AGAW value supported by
                 * hardware. And ASR is ignored by hardware.
                 */
                context_set_address_width(context, iommu->msagaw);
        }

        context_set_translation_type(context, translation);
        context_set_fault_enable(context);
        context_set_present(context);
        domain_flush_cache(domain, context, sizeof(*context));

        /*
         * It's a non-present to present mapping. If hardware doesn't cache
         * non-present entries we only need to flush the write-buffer. If it
         * _does_ cache non-present entries, then it does so in the special
         * domain #0, which we have to flush:
         */
        if (cap_caching_mode(iommu->cap)) {
                iommu->flush.flush_context(iommu, 0,
                                           (((u16)bus) << 8) | devfn,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
        } else {
                iommu_flush_write_buffer(iommu);
        }
        iommu_enable_dev_iotlb(info);

        ret = 0;

out_unlock:
        spin_unlock(&iommu->lock);
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}
struct domain_context_mapping_data {
        struct dmar_domain *domain;
        struct intel_iommu *iommu;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
                                     u16 alias, void *opaque)
{
        struct domain_context_mapping_data *data = opaque;

        return domain_context_mapping_one(data->domain, data->iommu,
                                          PCI_BUS_NUM(alias), alias & 0xff);
}
static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
        struct intel_iommu *iommu;
        u8 bus, devfn;
        struct domain_context_mapping_data data;

        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;

        if (!dev_is_pci(dev))
                return domain_context_mapping_one(domain, iommu, bus, devfn);

        data.domain = domain;
        data.iommu = iommu;

        return pci_for_each_dma_alias(to_pci_dev(dev),
                                      &domain_context_mapping_cb, &data);
}
static int domain_context_mapped_cb(struct pci_dev *pdev,
                                    u16 alias, void *opaque)
{
        struct intel_iommu *iommu = opaque;

        return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
        struct intel_iommu *iommu;
        u8 bus, devfn;

        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;

        if (!dev_is_pci(dev))
                return device_context_mapped(iommu, bus, devfn);

        return !pci_for_each_dma_alias(to_pci_dev(dev),
                                       domain_context_mapped_cb, iommu);
}
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
                                            size_t size)
{
        host_addr &= ~PAGE_MASK;
        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
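/*
 * Worked example (editor's illustration): host_addr = 0x1234 and
 * size = 0x2000 with 4KiB pages leaves a 0x234 in-page offset, so
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000 and the result is 3 VT-d pages.
 */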
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
                                          unsigned long iov_pfn,
                                          unsigned long phy_pfn,
                                          unsigned long pages)
{
        int support, level = 1;
        unsigned long pfnmerge;

        support = domain->iommu_superpage;

        /* To use a large page, the virtual *and* physical addresses
           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
           of them will mean we have to use smaller pages. So just
           merge them and check both at once. */
        pfnmerge = iov_pfn | phy_pfn;

        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
                pages >>= VTD_STRIDE_SHIFT;
                if (!pages)
                        break;
                pfnmerge >>= VTD_STRIDE_SHIFT;
                level++;
                support--;
        }
        return level;
}
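/*
 * Worked example (editor's illustration): iov_pfn = phy_pfn = 0x200
 * (2MiB aligned) with pages = 512 yields level 2, i.e. one 2MiB
 * superpage, provided domain->iommu_superpage allows it; any low bit
 * set in iov_pfn | phy_pfn, or a count below 512, keeps level 1 (4KiB).
 */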
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                            struct scatterlist *sg, unsigned long phys_pfn,
                            unsigned long nr_pages, int prot)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        phys_addr_t uninitialized_var(pteval);
        unsigned long sg_res = 0;
        unsigned int largepage_lvl = 0;
        unsigned long lvl_pages = 0;

        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;

        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

        if (!sg) {
                sg_res = nr_pages;
                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
        }

        while (nr_pages > 0) {
                uint64_t tmp;

                if (!sg_res) {
                        unsigned int pgoff = sg->offset & ~PAGE_MASK;

                        sg_res = aligned_nrpages(sg->offset, sg->length);
                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
                        sg->dma_length = sg->length;
                        pteval = (sg_phys(sg) - pgoff) | prot;
                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
                }

                if (!pte) {
                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
                        if (!pte)
                                return -ENOMEM;
                        /* It is a large page */
                        if (largepage_lvl > 1) {
                                unsigned long nr_superpages, end_pfn;

                                pteval |= DMA_PTE_LARGE_PAGE;
                                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                                nr_superpages = sg_res / lvl_pages;
                                end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

                                /*
                                 * Ensure that old small page tables are
                                 * removed to make room for superpage(s).
                                 * We're adding new large pages, so make sure
                                 * we don't remove their parent tables.
                                 */
                                dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
                                                       largepage_lvl + 1);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }
                }
                /* We don't need lock here, nobody else
                 * touches the iova range
                 */
                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
                if (tmp) {
                        static int dumps = 5;

                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
                                iov_pfn, tmp, (unsigned long long)pteval);
                        if (dumps) {
                                dumps--;
                                debug_dma_dump_mappings(NULL);
                        }
                        WARN_ON(1);
                }

                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                BUG_ON(nr_pages < lvl_pages);
                BUG_ON(sg_res < lvl_pages);

                nr_pages -= lvl_pages;
                iov_pfn += lvl_pages;
                phys_pfn += lvl_pages;
                pteval += lvl_pages * VTD_PAGE_SIZE;
                sg_res -= lvl_pages;

                /* If the next PTE would be the first in a new page, then we
                   need to flush the cache on the entries we've just written.
                   And then we'll need to recalculate 'pte', so clear it and
                   let it get set again in the if (!pte) block above.

                   If we're done (!nr_pages) we need to flush the cache too.

                   Also if we've been setting superpages, we may need to
                   recalculate 'pte' and switch back to smaller pages for the
                   end of the mapping, if the trailing size is not enough to
                   use another superpage (i.e. sg_res < lvl_pages). */
                pte++;
                if (!nr_pages || first_pte_in_page(pte) ||
                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }

                if (!sg_res && nr_pages)
                        sg = sg_next(sg);
        }
        return 0;
}
static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                          struct scatterlist *sg, unsigned long phys_pfn,
                          unsigned long nr_pages, int prot)
{
        int ret;
        struct intel_iommu *iommu;

        /* Do the real mapping first */
        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
        if (ret)
                return ret;

        /* Notify about the new mapping */
        if (domain_type_is_vm(domain)) {
                /* VM typed domains can have more than one IOMMU */
                int iommu_id;

                for_each_domain_iommu(iommu_id, domain) {
                        iommu = g_iommus[iommu_id];
                        __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
                }
        } else {
                /* General domains only have one IOMMU */
                iommu = domain_get_iommu(domain);
                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
        }

        return 0;
}

static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                    struct scatterlist *sg, unsigned long nr_pages,
                                    int prot)
{
        return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                     unsigned long phys_pfn, unsigned long nr_pages,
                                     int prot)
{
        return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2369 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2371 unsigned long flags;
2372 struct context_entry *context;
2378 spin_lock_irqsave(&iommu->lock, flags);
2379 context = iommu_context_addr(iommu, bus, devfn, 0);
2381 spin_unlock_irqrestore(&iommu->lock, flags);
2384 did_old = context_domain_id(context);
2385 context_clear_entry(context);
2386 __iommu_flush_cache(iommu, context, sizeof(*context));
2387 spin_unlock_irqrestore(&iommu->lock, flags);
2388 iommu->flush.flush_context(iommu,
2390 (((u16)bus) << 8) | devfn,
2391 DMA_CCMD_MASK_NOBIT,
2392 DMA_CCMD_DEVICE_INVL);
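/*
 * The source-id above is the 16-bit PCI requester id (bus in bits
 * 15:8, devfn in bits 7:0); the IOTLB flush that follows is scoped
 * to did_old so that only the old domain's translations go away.
 */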
2393 iommu->flush.flush_iotlb(iommu,
2400 static inline void unlink_domain_info(struct device_domain_info *info)
2402 assert_spin_locked(&device_domain_lock);
2403 list_del(&info->link);
2404 list_del(&info->global);
2406 info->dev->archdata.iommu = NULL;
2409 static void domain_remove_dev_info(struct dmar_domain *domain)
2411 struct device_domain_info *info, *tmp;
2412 unsigned long flags;
2414 spin_lock_irqsave(&device_domain_lock, flags);
2415 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2416 __dmar_remove_one_dev_info(info);
2417 spin_unlock_irqrestore(&device_domain_lock, flags);
2422 * Note: we use struct device->archdata.iommu to store the info
2424 static struct dmar_domain *find_domain(struct device *dev)
2426 struct device_domain_info *info;
2428 /* No lock here, assumes no domain exit in normal case */
2429 info = dev->archdata.iommu;
2431 return info->domain;
2435 static inline struct device_domain_info *
2436 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2438 struct device_domain_info *info;
2440 list_for_each_entry(info, &device_domain_list, global)
2441 if (info->iommu->segment == segment && info->bus == bus &&
2442 info->devfn == devfn)
2448 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2451 struct dmar_domain *domain)
2453 struct dmar_domain *found = NULL;
2454 struct device_domain_info *info;
2455 unsigned long flags;
2458 info = alloc_devinfo_mem();
2463 info->devfn = devfn;
2464 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2465 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2468 info->domain = domain;
2469 info->iommu = iommu;
2471 if (dev && dev_is_pci(dev)) {
2472 struct pci_dev *pdev = to_pci_dev(info->dev);
2474 if (!pci_ats_disabled() &&
2475 ecap_dev_iotlb_support(iommu->ecap) &&
2476 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2477 dmar_find_matched_atsr_unit(pdev))
2478 info->ats_supported = 1;
2480 if (ecs_enabled(iommu)) {
2481 if (pasid_enabled(iommu)) {
2482 int features = pci_pasid_features(pdev);
2484 info->pasid_supported = features | 1;
2487 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2488 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2489 info->pri_supported = 1;
2493 spin_lock_irqsave(&device_domain_lock, flags);
2495 found = find_domain(dev);
2498 struct device_domain_info *info2;
2499 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2501 found = info2->domain;
2507 spin_unlock_irqrestore(&device_domain_lock, flags);
2508 free_devinfo_mem(info);
2509 /* Caller must free the original domain */
2513 spin_lock(&iommu->lock);
2514 ret = domain_attach_iommu(domain, iommu);
2515 spin_unlock(&iommu->lock);
2518 spin_unlock_irqrestore(&device_domain_lock, flags);
2519 free_devinfo_mem(info);
2523 list_add(&info->link, &domain->devices);
2524 list_add(&info->global, &device_domain_list);
2526 dev->archdata.iommu = info;
2527 spin_unlock_irqrestore(&device_domain_lock, flags);
2529 if (dev && domain_context_mapping(domain, dev)) {
2530 pr_err("Domain context map for %s failed\n", dev_name(dev));
2531 dmar_remove_one_dev_info(domain, dev);
2538 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2540 *(u16 *)opaque = alias;
2544 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2546 struct device_domain_info *info = NULL;
2547 struct dmar_domain *domain = NULL;
2548 struct intel_iommu *iommu;
2550 unsigned long flags;
2553 iommu = device_to_iommu(dev, &bus, &devfn);
2557 if (dev_is_pci(dev)) {
2558 struct pci_dev *pdev = to_pci_dev(dev);
2560 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2562 spin_lock_irqsave(&device_domain_lock, flags);
2563 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2564 PCI_BUS_NUM(dma_alias),
2567 iommu = info->iommu;
2568 domain = info->domain;
2570 spin_unlock_irqrestore(&device_domain_lock, flags);
2572 /* DMA alias already has a domain, use it */
2577 /* Allocate and initialize new domain for the device */
2578 domain = alloc_domain(0);
2581 if (domain_init(domain, iommu, gaw)) {
2582 domain_exit(domain);
2591 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2592 struct dmar_domain *domain)
2594 struct intel_iommu *iommu;
2595 struct dmar_domain *tmp;
2596 u16 req_id, dma_alias;
2599 iommu = device_to_iommu(dev, &bus, &devfn);
2603 req_id = ((u16)bus << 8) | devfn;
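/*
 * e.g. bus 0x1a, device 2, function 1 gives devfn (2 << 3) | 1 ==
 * 0x11 and hence req_id 0x1a11 -- the same requester-id encoding
 * the hardware puts on DMA transactions.
 */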
2605 if (dev_is_pci(dev)) {
2606 struct pci_dev *pdev = to_pci_dev(dev);
2608 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2610 /* register PCI DMA alias device */
2611 if (req_id != dma_alias) {
2612 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2613 dma_alias & 0xff, NULL, domain);
2615 if (!tmp || tmp != domain)
2620 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2621 if (!tmp || tmp != domain)
2627 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2629 struct dmar_domain *domain, *tmp;
2631 domain = find_domain(dev);
2635 domain = find_or_alloc_domain(dev, gaw);
2639 tmp = set_domain_for_dev(dev, domain);
2640 if (!tmp || domain != tmp) {
2641 domain_exit(domain);
2650 static int iommu_domain_identity_map(struct dmar_domain *domain,
2651 unsigned long long start,
2652 unsigned long long end)
2654 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2655 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2657 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2658 dma_to_mm_pfn(last_vpfn))) {
2659 pr_err("Reserving iova failed\n");
2663 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2665 * RMRR range might overlap with the physical memory range, so clear it first.
2668 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2670 return __domain_mapping(domain, first_vpfn, NULL,
2671 first_vpfn, last_vpfn - first_vpfn + 1,
2672 DMA_PTE_READ|DMA_PTE_WRITE);
2675 static int domain_prepare_identity_map(struct device *dev,
2676 struct dmar_domain *domain,
2677 unsigned long long start,
2678 unsigned long long end)
2680 /* For _hardware_ passthrough, don't bother. But for software
2681 passthrough, we do it anyway -- it may indicate a memory
2682 range which is reserved in E820, so which didn't get set
2683 up to start with in si_domain */
2684 if (domain == si_domain && hw_pass_through) {
2685 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2686 dev_name(dev), start, end);
2690 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2691 dev_name(dev), start, end);
2694 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2695 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2696 dmi_get_system_info(DMI_BIOS_VENDOR),
2697 dmi_get_system_info(DMI_BIOS_VERSION),
2698 dmi_get_system_info(DMI_PRODUCT_VERSION));
2702 if (end >> agaw_to_width(domain->agaw)) {
2703 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2704 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2705 agaw_to_width(domain->agaw),
2706 dmi_get_system_info(DMI_BIOS_VENDOR),
2707 dmi_get_system_info(DMI_BIOS_VERSION),
2708 dmi_get_system_info(DMI_PRODUCT_VERSION));
2712 return iommu_domain_identity_map(domain, start, end);
2715 static int iommu_prepare_identity_map(struct device *dev,
2716 unsigned long long start,
2717 unsigned long long end)
2719 struct dmar_domain *domain;
2722 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2726 ret = domain_prepare_identity_map(dev, domain, start, end);
2728 domain_exit(domain);
2733 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2736 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2738 return iommu_prepare_identity_map(dev, rmrr->base_address,
2742 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2743 static inline void iommu_prepare_isa(void)
2745 struct pci_dev *pdev;
2748 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2752 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2753 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2756 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2761 static inline void iommu_prepare_isa(void)
2765 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2767 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2769 static int __init si_domain_init(int hw)
2773 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2777 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2778 domain_exit(si_domain);
2782 pr_debug("Identity mapping domain allocated\n");
2787 for_each_online_node(nid) {
2788 unsigned long start_pfn, end_pfn;
2791 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2792 ret = iommu_domain_identity_map(si_domain,
2793 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2802 static int identity_mapping(struct device *dev)
2804 struct device_domain_info *info;
2806 if (likely(!iommu_identity_mapping))
2809 info = dev->archdata.iommu;
2810 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2811 return (info->domain == si_domain);
2816 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2818 struct dmar_domain *ndomain;
2819 struct intel_iommu *iommu;
2822 iommu = device_to_iommu(dev, &bus, &devfn);
2826 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2827 if (ndomain != domain)
2833 static bool device_has_rmrr(struct device *dev)
2835 struct dmar_rmrr_unit *rmrr;
2840 for_each_rmrr_units(rmrr) {
2842 * Return TRUE if this RMRR contains the device that is passed in.
2845 for_each_active_dev_scope(rmrr->devices,
2846 rmrr->devices_cnt, i, tmp)
2857 * There are a couple of cases where we need to restrict the functionality of
2858 * devices associated with RMRRs. The first is when evaluating a device for
2859 * identity mapping because problems exist when devices are moved in and out
2860 * of domains and their respective RMRR information is lost. This means that
2861 * a device with associated RMRRs will never be in a "passthrough" domain.
2862 * The second is use of the device through the IOMMU API. This interface
2863 * expects to have full control of the IOVA space for the device. We cannot
2864 * satisfy both the requirement that RMRR access is maintained and have an
2865 * unencumbered IOVA space. We also have no ability to quiesce the device's
2866 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2867 * We therefore prevent devices associated with an RMRR from participating in
2868 * the IOMMU API, which eliminates them from device assignment.
2870 * In both cases we assume that PCI USB devices with RMRRs have them largely
2871 * for historical reasons and that the RMRR space is not actively used post
2872 * boot. This exclusion may change if vendors begin to abuse it.
2874 * The same exception is made for graphics devices, with the requirement that
2875 * any use of the RMRR regions will be torn down before assigning the device
2878 static bool device_is_rmrr_locked(struct device *dev)
2880 if (!device_has_rmrr(dev))
2883 if (dev_is_pci(dev)) {
2884 struct pci_dev *pdev = to_pci_dev(dev);
2886 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2893 static int iommu_should_identity_map(struct device *dev, int startup)
2896 if (dev_is_pci(dev)) {
2897 struct pci_dev *pdev = to_pci_dev(dev);
2899 if (device_is_rmrr_locked(dev))
2902 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2905 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2908 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2912 * We want to start off with all devices in the 1:1 domain, and
2913 * take them out later if we find they can't access all of memory.
2915 * However, we can't do this for PCI devices behind bridges,
2916 * because all PCI devices behind the same bridge will end up
2917 * with the same source-id on their transactions.
2919 * Practically speaking, we can't change things around for these
2920 * devices at run-time, because we can't be sure there'll be no
2921 * DMA transactions in flight for any of their siblings.
2923 * So PCI devices (unless they're on the root bus) as well as
2924 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2925 * the 1:1 domain, just in _case_ one of their siblings turns out
2926 * not to be able to map all of memory.
2928 if (!pci_is_pcie(pdev)) {
2929 if (!pci_is_root_bus(pdev->bus))
2931 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2933 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2936 if (device_has_rmrr(dev))
2941 * At boot time, we don't yet know if devices will be 64-bit capable.
2942 * Assume that they will -- if they turn out not to be, then we can
2943 * take them out of the 1:1 domain later.
2947 * If the device's dma_mask is less than the system's memory
2948 * size then this is not a candidate for identity mapping.
2950 u64 dma_mask = *dev->dma_mask;
2952 if (dev->coherent_dma_mask &&
2953 dev->coherent_dma_mask < dma_mask)
2954 dma_mask = dev->coherent_dma_mask;
2956 return dma_mask >= dma_get_required_mask(dev);
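/*
 * e.g. a device limited to 32-bit DMA on a machine with RAM above
 * 4GiB has dma_mask < dma_get_required_mask() and is rejected here
 * as an identity-map candidate.
 */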
2962 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2966 if (!iommu_should_identity_map(dev, 1))
2969 ret = domain_add_dev_info(si_domain, dev);
2971 pr_info("%s identity mapping for device %s\n",
2972 hw ? "Hardware" : "Software", dev_name(dev));
2973 else if (ret == -ENODEV)
2974 /* device not associated with an iommu */
2981 static int __init iommu_prepare_static_identity_mapping(int hw)
2983 struct pci_dev *pdev = NULL;
2984 struct dmar_drhd_unit *drhd;
2985 struct intel_iommu *iommu;
2990 for_each_pci_dev(pdev) {
2991 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2996 for_each_active_iommu(iommu, drhd)
2997 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2998 struct acpi_device_physical_node *pn;
2999 struct acpi_device *adev;
3001 if (dev->bus != &acpi_bus_type)
3004 adev = to_acpi_device(dev);
3005 mutex_lock(&adev->physical_node_lock);
3006 list_for_each_entry(pn, &adev->physical_node_list, node) {
3007 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3011 mutex_unlock(&adev->physical_node_lock);
3019 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3022 * Start from a sane iommu hardware state.
3023 * If queued invalidation was already initialized by us
3024 * (for example, while enabling interrupt-remapping) then
3025 * things are already rolling from a sane state.
3029 * Clear any previous faults.
3031 dmar_fault(-1, iommu);
3033 * Disable queued invalidation if supported and already enabled
3034 * before OS handover.
3036 dmar_disable_qi(iommu);
3039 if (dmar_enable_qi(iommu)) {
3041 * Queued Invalidate not enabled, use Register Based Invalidate
3043 iommu->flush.flush_context = __iommu_flush_context;
3044 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3045 pr_info("%s: Using Register based invalidation\n",
3048 iommu->flush.flush_context = qi_flush_context;
3049 iommu->flush.flush_iotlb = qi_flush_iotlb;
3050 pr_info("%s: Using Queued invalidation\n", iommu->name);
3054 static int copy_context_table(struct intel_iommu *iommu,
3055 struct root_entry *old_re,
3056 struct context_entry **tbl,
3059 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3060 struct context_entry *new_ce = NULL, ce;
3061 struct context_entry *old_ce = NULL;
3062 struct root_entry re;
3063 phys_addr_t old_ce_phys;
3065 tbl_idx = ext ? bus * 2 : bus;
3066 memcpy(&re, old_re, sizeof(re));
3068 for (devfn = 0; devfn < 256; devfn++) {
3069 /* First calculate the correct index */
3070 idx = (ext ? devfn * 2 : devfn) % 256;
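/*
 * Legacy context entries are 128 bits, so one 4KiB page holds all
 * 256 entries of a bus.  Extended entries are 256 bits: each devfn
 * occupies two 128-bit slots, and a bus spills into a second page
 * once devfn reaches 128 -- hence devfn * 2 and the wrap at 256.
 */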
3073 /* First save what we may have and clean up */
3075 tbl[tbl_idx] = new_ce;
3076 __iommu_flush_cache(iommu, new_ce,
3086 old_ce_phys = root_entry_lctp(&re);
3088 old_ce_phys = root_entry_uctp(&re);
3091 if (ext && devfn == 0) {
3092 /* No LCTP, try UCTP */
3101 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3106 new_ce = alloc_pgtable_page(iommu->node);
3113 /* Now copy the context entry */
3114 memcpy(&ce, old_ce + idx, sizeof(ce));
3116 if (!__context_present(&ce))
3119 did = context_domain_id(&ce);
3120 if (did >= 0 && did < cap_ndoms(iommu->cap))
3121 set_bit(did, iommu->domain_ids);
3124 * We need a marker for copied context entries. This
3125 * marker needs to work for the old format as well as
3126 * for extended context entries.
3128 * Bit 67 of the context entry is used. In the old
3129 * format this bit is available to software, in the
3130 * extended format it is the PGE bit, but PGE is ignored
3131 * by HW if PASIDs are disabled (and thus still available).
3134 * So disable PASIDs first and then mark the entry
3135 * copied. This means that we don't copy PASID
3136 * translations from the old kernel, but this is fine as
3137 * faults there are not fatal.
3139 context_clear_pasid_enable(&ce);
3140 context_set_copied(&ce);
3145 tbl[tbl_idx + pos] = new_ce;
3147 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3156 static int copy_translation_tables(struct intel_iommu *iommu)
3158 struct context_entry **ctxt_tbls;
3159 struct root_entry *old_rt;
3160 phys_addr_t old_rt_phys;
3161 int ctxt_table_entries;
3162 unsigned long flags;
3167 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3168 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3169 new_ext = !!ecap_ecs(iommu->ecap);
3172 * The RTT bit can only be changed when translation is disabled,
3173 * but disabling translation means to open a window for data
3174 * corruption. So bail out and don't copy anything if we would
3175 * have to change the bit.
3180 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3184 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3188 /* This is too big for the stack - allocate it from slab */
3189 ctxt_table_entries = ext ? 512 : 256;
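/* 256 buses, and in extended mode two context pages per bus */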
3191 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3195 for (bus = 0; bus < 256; bus++) {
3196 ret = copy_context_table(iommu, &old_rt[bus],
3197 ctxt_tbls, bus, ext);
3199 pr_err("%s: Failed to copy context table for bus %d\n",
3205 spin_lock_irqsave(&iommu->lock, flags);
3207 /* Context tables are copied, now write them to the root_entry table */
3208 for (bus = 0; bus < 256; bus++) {
3209 int idx = ext ? bus * 2 : bus;
3212 if (ctxt_tbls[idx]) {
3213 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3214 iommu->root_entry[bus].lo = val;
3217 if (!ext || !ctxt_tbls[idx + 1])
3220 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3221 iommu->root_entry[bus].hi = val;
3224 spin_unlock_irqrestore(&iommu->lock, flags);
3228 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3238 static int __init init_dmars(void)
3240 struct dmar_drhd_unit *drhd;
3241 struct dmar_rmrr_unit *rmrr;
3242 bool copied_tables = false;
3244 struct intel_iommu *iommu;
3250 * initialize and program root entry to not present
3253 for_each_drhd_unit(drhd) {
3255 * lock not needed as this is only incremented in the single-
3256 * threaded kernel __init code path; all other accesses are read-only
3259 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3263 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3266 /* Preallocate enough resources for IOMMU hot-addition */
3267 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3268 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3270 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3273 pr_err("Allocating global iommu array failed\n");
3278 for_each_active_iommu(iommu, drhd) {
3279 g_iommus[iommu->seq_id] = iommu;
3281 intel_iommu_init_qi(iommu);
3283 ret = iommu_init_domains(iommu);
3287 init_translation_status(iommu);
3289 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3290 iommu_disable_translation(iommu);
3291 clear_translation_pre_enabled(iommu);
3292 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3298 * we could share the same root & context tables
3299 * among all IOMMUs. Needs to be split later.
3301 ret = iommu_alloc_root_entry(iommu);
3305 if (translation_pre_enabled(iommu)) {
3306 pr_info("Translation already enabled - trying to copy translation structures\n");
3308 ret = copy_translation_tables(iommu);
3311 * We found the IOMMU with translation
3312 * enabled - but failed to copy over the
3313 * old root-entry table. Try to proceed
3314 * by disabling translation now and
3315 * allocating a clean root-entry table.
3316 * This might cause DMAR faults, but
3317 * probably the dump will still succeed.
3319 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3321 iommu_disable_translation(iommu);
3322 clear_translation_pre_enabled(iommu);
3324 pr_info("Copied translation tables from previous kernel for %s\n",
3326 copied_tables = true;
3330 if (!ecap_pass_through(iommu->ecap))
3331 hw_pass_through = 0;
3332 #ifdef CONFIG_INTEL_IOMMU_SVM
3333 if (pasid_enabled(iommu))
3334 intel_svm_alloc_pasid_tables(iommu);
3339 * Now that qi is enabled on all iommus, set the root entry and flush
3340 * caches. This is required on some Intel X58 chipsets, otherwise the
3341 * flush_context function will loop forever and the boot hangs.
3343 for_each_active_iommu(iommu, drhd) {
3344 iommu_flush_write_buffer(iommu);
3345 iommu_set_root_entry(iommu);
3346 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3347 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3350 if (iommu_pass_through)
3351 iommu_identity_mapping |= IDENTMAP_ALL;
3353 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3354 iommu_identity_mapping |= IDENTMAP_GFX;
3357 check_tylersburg_isoch();
3359 if (iommu_identity_mapping) {
3360 ret = si_domain_init(hw_pass_through);
3367 * If we copied translations from a previous kernel in the kdump
3368 * case, we can not assign the devices to domains now, as that
3369 * would eliminate the old mappings. So skip this part and defer
3370 * the assignment to device driver initialization time.
3376 * If pass through is not set or not enabled, setup context entries for
3377 * identity mappings for rmrr, gfx, and isa and may fall back to static
3378 * identity mapping if iommu_identity_mapping is set.
3380 if (iommu_identity_mapping) {
3381 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3383 pr_crit("Failed to setup IOMMU pass-through\n");
3389 * for each dev attached to rmrr
3391 * locate drhd for dev, alloc domain for dev
3392 * allocate free domain
3393 * allocate page table entries for rmrr
3394 * if context not allocated for bus
3395 * allocate and init context
3396 * set present in root table for this bus
3397 * init context with domain, translation etc
3401 pr_info("Setting RMRR:\n");
3402 for_each_rmrr_units(rmrr) {
3403 /* some BIOSes list non-existent devices in the DMAR table. */
3404 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3406 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3408 pr_err("Mapping reserved region failed\n");
3412 iommu_prepare_isa();
3419 * global invalidate context cache
3420 * global invalidate iotlb
3421 * enable translation
3423 for_each_iommu(iommu, drhd) {
3424 if (drhd->ignored) {
3426 * we always have to disable PMRs or DMA may fail on this device
3430 iommu_disable_protect_mem_regions(iommu);
3434 iommu_flush_write_buffer(iommu);
3436 #ifdef CONFIG_INTEL_IOMMU_SVM
3437 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3438 ret = intel_svm_enable_prq(iommu);
3443 ret = dmar_set_interrupt(iommu);
3447 if (!translation_pre_enabled(iommu))
3448 iommu_enable_translation(iommu);
3450 iommu_disable_protect_mem_regions(iommu);
3456 for_each_active_iommu(iommu, drhd) {
3457 disable_dmar_iommu(iommu);
3458 free_dmar_iommu(iommu);
3467 /* This takes a number of _MM_ pages, not VTD pages */
3468 static unsigned long intel_alloc_iova(struct device *dev,
3469 struct dmar_domain *domain,
3470 unsigned long nrpages, uint64_t dma_mask)
3472 unsigned long iova_pfn = 0;
3474 /* Restrict dma_mask to the width that the iommu can handle */
3475 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3476 /* Ensure we reserve the whole size-aligned region */
3477 nrpages = __roundup_pow_of_two(nrpages);
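/*
 * The iova allocator aligns an allocation to its size, so rounding
 * up to a power of two (e.g. 5 pages -> 8) guarantees the whole
 * size-aligned region is reserved and cannot overlap a neighbour.
 */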
3479 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3481 * First try to allocate an io virtual address in
3482 * DMA_BIT_MASK(32) and if that fails then try allocating
3485 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3486 IOVA_PFN(DMA_BIT_MASK(32)), false);
3490 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3491 IOVA_PFN(dma_mask), true);
3492 if (unlikely(!iova_pfn)) {
3493 pr_err("Allocating %ld-page iova for %s failed\n",
3494 nrpages, dev_name(dev));
3501 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3503 struct dmar_domain *domain, *tmp;
3504 struct dmar_rmrr_unit *rmrr;
3505 struct device *i_dev;
3508 domain = find_domain(dev);
3512 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3516 /* We have a new domain - setup possible RMRRs for the device */
3518 for_each_rmrr_units(rmrr) {
3519 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3524 ret = domain_prepare_identity_map(dev, domain,
3528 dev_err(dev, "Mapping reserved region failed\n");
3533 tmp = set_domain_for_dev(dev, domain);
3534 if (!tmp || domain != tmp) {
3535 domain_exit(domain);
3542 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3548 /* Check if the dev needs to go through non-identity map and unmap process.*/
3549 static int iommu_no_mapping(struct device *dev)
3553 if (iommu_dummy(dev))
3556 if (!iommu_identity_mapping)
3559 found = identity_mapping(dev);
3561 if (iommu_should_identity_map(dev, 0))
3565 * A 32-bit DMA device is removed from si_domain and falls
3566 * back to non-identity mapping.
3568 dmar_remove_one_dev_info(si_domain, dev);
3569 pr_info("32bit %s uses non-identity mapping\n",
3575 * In case a 64-bit DMA device was detached from a VM, the
3576 * device is put back into si_domain for identity mapping.
3578 if (iommu_should_identity_map(dev, 0)) {
3580 ret = domain_add_dev_info(si_domain, dev);
3582 pr_info("64bit %s uses identity mapping\n",
3592 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3593 size_t size, int dir, u64 dma_mask)
3595 struct dmar_domain *domain;
3596 phys_addr_t start_paddr;
3597 unsigned long iova_pfn;
3600 struct intel_iommu *iommu;
3601 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3603 BUG_ON(dir == DMA_NONE);
3605 if (iommu_no_mapping(dev))
3608 domain = get_valid_domain_for_dev(dev);
3612 iommu = domain_get_iommu(domain);
3613 size = aligned_nrpages(paddr, size);
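/*
 * aligned_nrpages() counts the VTD pages touched once the page
 * offset of paddr is included; roughly:
 *
 *	host_addr &= ~PAGE_MASK;
 *	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
 *
 * e.g. a buffer at page offset 0x800 with size 0x1000 touches two
 * 4KiB pages, so size becomes 2 even though 0x1000 is one page.
 */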
3615 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3620 * Check if DMAR supports zero-length reads on write only
3623 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3624 !cap_zlr(iommu->cap))
3625 prot |= DMA_PTE_READ;
3626 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3627 prot |= DMA_PTE_WRITE;
3629 * The range paddr .. paddr + size might start or end on a partial
3630 * page, so we map whole pages. Note: if two parts of one page are
3631 * mapped separately, we might have two guest addresses mapping to
3632 * the same host paddr, but this is not a big problem
3634 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3635 mm_to_dma_pfn(paddr_pfn), size, prot);
3639 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3640 start_paddr += paddr & ~PAGE_MASK;
3645 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3646 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3647 dev_name(dev), size, (unsigned long long)paddr, dir);
3651 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3652 unsigned long offset, size_t size,
3653 enum dma_data_direction dir,
3654 unsigned long attrs)
3656 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3657 dir, *dev->dma_mask);
3660 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3662 struct dmar_domain *domain;
3663 unsigned long start_pfn, last_pfn;
3664 unsigned long nrpages;
3665 unsigned long iova_pfn;
3666 struct intel_iommu *iommu;
3667 struct page *freelist;
3669 if (iommu_no_mapping(dev))
3672 domain = find_domain(dev);
3675 iommu = domain_get_iommu(domain);
3677 iova_pfn = IOVA_PFN(dev_addr);
3679 nrpages = aligned_nrpages(dev_addr, size);
3680 start_pfn = mm_to_dma_pfn(iova_pfn);
3681 last_pfn = start_pfn + nrpages - 1;
3683 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3684 dev_name(dev), start_pfn, last_pfn);
3686 freelist = domain_unmap(domain, start_pfn, last_pfn);
3688 if (intel_iommu_strict) {
3689 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3690 nrpages, !freelist, 0);
3692 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3693 dma_free_pagelist(freelist);
3695 queue_iova(&domain->iovad, iova_pfn, nrpages,
3696 (unsigned long)freelist);
3698 * queue up the release of the unmap to save the roughly one sixth
3699 * of the cpu time used up by the iotlb flush operation...
3704 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3705 size_t size, enum dma_data_direction dir,
3706 unsigned long attrs)
3708 intel_unmap(dev, dev_addr, size);
3711 static void *intel_alloc_coherent(struct device *dev, size_t size,
3712 dma_addr_t *dma_handle, gfp_t flags,
3713 unsigned long attrs)
3715 struct page *page = NULL;
3718 size = PAGE_ALIGN(size);
3719 order = get_order(size);
3721 if (!iommu_no_mapping(dev))
3722 flags &= ~(GFP_DMA | GFP_DMA32);
3723 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3724 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3730 if (gfpflags_allow_blocking(flags)) {
3731 unsigned int count = size >> PAGE_SHIFT;
3733 page = dma_alloc_from_contiguous(dev, count, order, flags);
3734 if (page && iommu_no_mapping(dev) &&
3735 page_to_phys(page) + size > dev->coherent_dma_mask) {
3736 dma_release_from_contiguous(dev, page, count);
3742 page = alloc_pages(flags, order);
3745 memset(page_address(page), 0, size);
3747 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3749 dev->coherent_dma_mask);
3751 return page_address(page);
3752 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3753 __free_pages(page, order);
3758 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3759 dma_addr_t dma_handle, unsigned long attrs)
3762 struct page *page = virt_to_page(vaddr);
3764 size = PAGE_ALIGN(size);
3765 order = get_order(size);
3767 intel_unmap(dev, dma_handle, size);
3768 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3769 __free_pages(page, order);
3772 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3773 int nelems, enum dma_data_direction dir,
3774 unsigned long attrs)
3776 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3777 unsigned long nrpages = 0;
3778 struct scatterlist *sg;
3781 for_each_sg(sglist, sg, nelems, i) {
3782 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3785 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3788 static int intel_nontranslate_map_sg(struct device *hddev,
3789 struct scatterlist *sglist, int nelems, int dir)
3792 struct scatterlist *sg;
3794 for_each_sg(sglist, sg, nelems, i) {
3795 BUG_ON(!sg_page(sg));
3796 sg->dma_address = sg_phys(sg);
3797 sg->dma_length = sg->length;
3802 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3803 enum dma_data_direction dir, unsigned long attrs)
3806 struct dmar_domain *domain;
3809 unsigned long iova_pfn;
3811 struct scatterlist *sg;
3812 unsigned long start_vpfn;
3813 struct intel_iommu *iommu;
3815 BUG_ON(dir == DMA_NONE);
3816 if (iommu_no_mapping(dev))
3817 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3819 domain = get_valid_domain_for_dev(dev);
3823 iommu = domain_get_iommu(domain);
3825 for_each_sg(sglist, sg, nelems, i)
3826 size += aligned_nrpages(sg->offset, sg->length);
3828 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3831 sglist->dma_length = 0;
3836 * Check if DMAR supports zero-length reads on write only
3839 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3840 !cap_zlr(iommu->cap))
3841 prot |= DMA_PTE_READ;
3842 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3843 prot |= DMA_PTE_WRITE;
3845 start_vpfn = mm_to_dma_pfn(iova_pfn);
3847 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3848 if (unlikely(ret)) {
3849 dma_pte_free_pagetable(domain, start_vpfn,
3850 start_vpfn + size - 1,
3851 agaw_to_level(domain->agaw) + 1);
3852 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3859 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3864 const struct dma_map_ops intel_dma_ops = {
3865 .alloc = intel_alloc_coherent,
3866 .free = intel_free_coherent,
3867 .map_sg = intel_map_sg,
3868 .unmap_sg = intel_unmap_sg,
3869 .map_page = intel_map_page,
3870 .unmap_page = intel_unmap_page,
3871 .mapping_error = intel_mapping_error,
3873 .dma_supported = dma_direct_supported,
3877 static inline int iommu_domain_cache_init(void)
3881 iommu_domain_cache = kmem_cache_create("iommu_domain",
3882 sizeof(struct dmar_domain),
3887 if (!iommu_domain_cache) {
3888 pr_err("Couldn't create iommu_domain cache\n");
3895 static inline int iommu_devinfo_cache_init(void)
3899 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3900 sizeof(struct device_domain_info),
3904 if (!iommu_devinfo_cache) {
3905 pr_err("Couldn't create devinfo cache\n");
3912 static int __init iommu_init_mempool(void)
3915 ret = iova_cache_get();
3919 ret = iommu_domain_cache_init();
3923 ret = iommu_devinfo_cache_init();
3927 kmem_cache_destroy(iommu_domain_cache);
3934 static void __init iommu_exit_mempool(void)
3936 kmem_cache_destroy(iommu_devinfo_cache);
3937 kmem_cache_destroy(iommu_domain_cache);
3941 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3943 struct dmar_drhd_unit *drhd;
3947 /* We know that this device on this chipset has its own IOMMU.
3948 * If we find it under a different IOMMU, then the BIOS is lying
3949 * to us. Hope that the IOMMU for this device is actually
3950 * disabled, and it needs no translation...
3952 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3954 /* "can't" happen */
3955 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3958 vtbar &= 0xffff0000;
3960 /* we know that this iommu should be at offset 0xa000 from vtbar */
3961 drhd = dmar_find_matched_drhd_unit(pdev);
3962 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3963 TAINT_FIRMWARE_WORKAROUND,
3964 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3965 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3967 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3969 static void __init init_no_remapping_devices(void)
3971 struct dmar_drhd_unit *drhd;
3975 for_each_drhd_unit(drhd) {
3976 if (!drhd->include_all) {
3977 for_each_active_dev_scope(drhd->devices,
3978 drhd->devices_cnt, i, dev)
3980 /* ignore DMAR unit if no devices exist */
3981 if (i == drhd->devices_cnt)
3986 for_each_active_drhd_unit(drhd) {
3987 if (drhd->include_all)
3990 for_each_active_dev_scope(drhd->devices,
3991 drhd->devices_cnt, i, dev)
3992 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3994 if (i < drhd->devices_cnt)
3997 /* This IOMMU has *only* gfx devices. Either bypass it or
3998 set the gfx_mapped flag, as appropriate */
4000 intel_iommu_gfx_mapped = 1;
4003 for_each_active_dev_scope(drhd->devices,
4004 drhd->devices_cnt, i, dev)
4005 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4010 #ifdef CONFIG_SUSPEND
4011 static int init_iommu_hw(void)
4013 struct dmar_drhd_unit *drhd;
4014 struct intel_iommu *iommu = NULL;
4016 for_each_active_iommu(iommu, drhd)
4018 dmar_reenable_qi(iommu);
4020 for_each_iommu(iommu, drhd) {
4021 if (drhd->ignored) {
4023 * we always have to disable PMRs or DMA may fail on this device
4027 iommu_disable_protect_mem_regions(iommu);
4031 iommu_flush_write_buffer(iommu);
4033 iommu_set_root_entry(iommu);
4035 iommu->flush.flush_context(iommu, 0, 0, 0,
4036 DMA_CCMD_GLOBAL_INVL);
4037 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4038 iommu_enable_translation(iommu);
4039 iommu_disable_protect_mem_regions(iommu);
4045 static void iommu_flush_all(void)
4047 struct dmar_drhd_unit *drhd;
4048 struct intel_iommu *iommu;
4050 for_each_active_iommu(iommu, drhd) {
4051 iommu->flush.flush_context(iommu, 0, 0, 0,
4052 DMA_CCMD_GLOBAL_INVL);
4053 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4054 DMA_TLB_GLOBAL_FLUSH);
4058 static int iommu_suspend(void)
4060 struct dmar_drhd_unit *drhd;
4061 struct intel_iommu *iommu = NULL;
4064 for_each_active_iommu(iommu, drhd) {
4065 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4067 if (!iommu->iommu_state)
4073 for_each_active_iommu(iommu, drhd) {
4074 iommu_disable_translation(iommu);
4076 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4078 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4079 readl(iommu->reg + DMAR_FECTL_REG);
4080 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4081 readl(iommu->reg + DMAR_FEDATA_REG);
4082 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4083 readl(iommu->reg + DMAR_FEADDR_REG);
4084 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4085 readl(iommu->reg + DMAR_FEUADDR_REG);
4087 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4092 for_each_active_iommu(iommu, drhd)
4093 kfree(iommu->iommu_state);
4098 static void iommu_resume(void)
4100 struct dmar_drhd_unit *drhd;
4101 struct intel_iommu *iommu = NULL;
4104 if (init_iommu_hw()) {
4106 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4108 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4112 for_each_active_iommu(iommu, drhd) {
4114 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4116 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4117 iommu->reg + DMAR_FECTL_REG);
4118 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4119 iommu->reg + DMAR_FEDATA_REG);
4120 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4121 iommu->reg + DMAR_FEADDR_REG);
4122 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4123 iommu->reg + DMAR_FEUADDR_REG);
4125 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4128 for_each_active_iommu(iommu, drhd)
4129 kfree(iommu->iommu_state);
4132 static struct syscore_ops iommu_syscore_ops = {
4133 .resume = iommu_resume,
4134 .suspend = iommu_suspend,
4137 static void __init init_iommu_pm_ops(void)
4139 register_syscore_ops(&iommu_syscore_ops);
4143 static inline void init_iommu_pm_ops(void) {}
4144 #endif /* CONFIG_PM */
4147 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4149 struct acpi_dmar_reserved_memory *rmrr;
4150 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4151 struct dmar_rmrr_unit *rmrru;
4154 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4158 rmrru->hdr = header;
4159 rmrr = (struct acpi_dmar_reserved_memory *)header;
4160 rmrru->base_address = rmrr->base_address;
4161 rmrru->end_address = rmrr->end_address;
4163 length = rmrr->end_address - rmrr->base_address + 1;
4164 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4169 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4170 ((void *)rmrr) + rmrr->header.length,
4171 &rmrru->devices_cnt);
4172 if (rmrru->devices_cnt && rmrru->devices == NULL)
4175 list_add(&rmrru->list, &dmar_rmrr_units);
4186 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4188 struct dmar_atsr_unit *atsru;
4189 struct acpi_dmar_atsr *tmp;
4191 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4192 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4193 if (atsr->segment != tmp->segment)
4195 if (atsr->header.length != tmp->header.length)
4197 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4204 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4206 struct acpi_dmar_atsr *atsr;
4207 struct dmar_atsr_unit *atsru;
4209 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4212 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4213 atsru = dmar_find_atsr(atsr);
4217 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4222 * If memory is allocated from slab by an ACPI _DSM method, we need to
4223 * copy the content because the memory buffer will be freed on exit.
4226 atsru->hdr = (void *)(atsru + 1);
4227 memcpy(atsru->hdr, hdr, hdr->length);
4228 atsru->include_all = atsr->flags & 0x1;
4229 if (!atsru->include_all) {
4230 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4231 (void *)atsr + atsr->header.length,
4232 &atsru->devices_cnt);
4233 if (atsru->devices_cnt && atsru->devices == NULL) {
4239 list_add_rcu(&atsru->list, &dmar_atsr_units);
4244 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4246 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4250 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4252 struct acpi_dmar_atsr *atsr;
4253 struct dmar_atsr_unit *atsru;
4255 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4256 atsru = dmar_find_atsr(atsr);
4258 list_del_rcu(&atsru->list);
4260 intel_iommu_free_atsr(atsru);
4266 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4270 struct acpi_dmar_atsr *atsr;
4271 struct dmar_atsr_unit *atsru;
4273 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4274 atsru = dmar_find_atsr(atsr);
4278 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4279 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4287 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4290 struct intel_iommu *iommu = dmaru->iommu;
4292 if (g_iommus[iommu->seq_id])
4295 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4296 pr_warn("%s: Doesn't support hardware pass through.\n",
4300 if (!ecap_sc_support(iommu->ecap) &&
4301 domain_update_iommu_snooping(iommu)) {
4302 pr_warn("%s: Doesn't support snooping.\n",
4306 sp = domain_update_iommu_superpage(iommu) - 1;
4307 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4308 pr_warn("%s: Doesn't support large page.\n",
4314 * Disable translation if already enabled prior to OS handover.
4316 if (iommu->gcmd & DMA_GCMD_TE)
4317 iommu_disable_translation(iommu);
4319 g_iommus[iommu->seq_id] = iommu;
4320 ret = iommu_init_domains(iommu);
4322 ret = iommu_alloc_root_entry(iommu);
4326 #ifdef CONFIG_INTEL_IOMMU_SVM
4327 if (pasid_enabled(iommu))
4328 intel_svm_alloc_pasid_tables(iommu);
4331 if (dmaru->ignored) {
4333 * we always have to disable PMRs or DMA may fail on this device
4336 iommu_disable_protect_mem_regions(iommu);
4340 intel_iommu_init_qi(iommu);
4341 iommu_flush_write_buffer(iommu);
4343 #ifdef CONFIG_INTEL_IOMMU_SVM
4344 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4345 ret = intel_svm_enable_prq(iommu);
4350 ret = dmar_set_interrupt(iommu);
4354 iommu_set_root_entry(iommu);
4355 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4356 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4357 iommu_enable_translation(iommu);
4359 iommu_disable_protect_mem_regions(iommu);
4363 disable_dmar_iommu(iommu);
4365 free_dmar_iommu(iommu);
4369 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4372 struct intel_iommu *iommu = dmaru->iommu;
4374 if (!intel_iommu_enabled)
4380 ret = intel_iommu_add(dmaru);
4382 disable_dmar_iommu(iommu);
4383 free_dmar_iommu(iommu);
4389 static void intel_iommu_free_dmars(void)
4391 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4392 struct dmar_atsr_unit *atsru, *atsr_n;
4394 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4395 list_del(&rmrru->list);
4396 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4401 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4402 list_del(&atsru->list);
4403 intel_iommu_free_atsr(atsru);
4407 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4410 struct pci_bus *bus;
4411 struct pci_dev *bridge = NULL;
4413 struct acpi_dmar_atsr *atsr;
4414 struct dmar_atsr_unit *atsru;
4416 dev = pci_physfn(dev);
4417 for (bus = dev->bus; bus; bus = bus->parent) {
4419 /* If it's an integrated device, allow ATS */
4422 /* Connected via non-PCIe: no ATS */
4423 if (!pci_is_pcie(bridge) ||
4424 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4426 /* If we found the root port, look it up in the ATSR */
4427 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4432 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4433 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4434 if (atsr->segment != pci_domain_nr(dev->bus))
4437 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4438 if (tmp == &bridge->dev)
4441 if (atsru->include_all)
4451 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4454 struct dmar_rmrr_unit *rmrru;
4455 struct dmar_atsr_unit *atsru;
4456 struct acpi_dmar_atsr *atsr;
4457 struct acpi_dmar_reserved_memory *rmrr;
4459 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4462 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4463 rmrr = container_of(rmrru->hdr,
4464 struct acpi_dmar_reserved_memory, header);
4465 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4466 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4467 ((void *)rmrr) + rmrr->header.length,
4468 rmrr->segment, rmrru->devices,
4469 rmrru->devices_cnt);
4472 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4473 dmar_remove_dev_scope(info, rmrr->segment,
4474 rmrru->devices, rmrru->devices_cnt);
4478 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4479 if (atsru->include_all)
4482 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4483 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4484 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4485 (void *)atsr + atsr->header.length,
4486 atsr->segment, atsru->devices,
4487 atsru->devices_cnt);
4492 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4493 if (dmar_remove_dev_scope(info, atsr->segment,
4494 atsru->devices, atsru->devices_cnt))
4503 * Here we only respond to the action of a device being unbound from its driver.
4505 * An added device is not yet attached to its DMAR domain here. That will happen
4506 * when the device is mapped to an iova.
4508 static int device_notifier(struct notifier_block *nb,
4509 unsigned long action, void *data)
4511 struct device *dev = data;
4512 struct dmar_domain *domain;
4514 if (iommu_dummy(dev))
4517 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4520 domain = find_domain(dev);
4524 dmar_remove_one_dev_info(domain, dev);
4525 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4526 domain_exit(domain);
4531 static struct notifier_block device_nb = {
4532 .notifier_call = device_notifier,
4535 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4536 unsigned long val, void *v)
4538 struct memory_notify *mhp = v;
4539 unsigned long long start, end;
4540 unsigned long start_vpfn, last_vpfn;
4543 case MEM_GOING_ONLINE:
4544 start = mhp->start_pfn << PAGE_SHIFT;
4545 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4546 if (iommu_domain_identity_map(si_domain, start, end)) {
4547 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4554 case MEM_CANCEL_ONLINE:
4555 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4556 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4557 while (start_vpfn <= last_vpfn) {
4559 struct dmar_drhd_unit *drhd;
4560 struct intel_iommu *iommu;
4561 struct page *freelist;
4563 iova = find_iova(&si_domain->iovad, start_vpfn);
4565 pr_debug("Failed get IOVA for PFN %lx\n",
4570 iova = split_and_remove_iova(&si_domain->iovad, iova,
4571 start_vpfn, last_vpfn);
4573 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4574 start_vpfn, last_vpfn);
4578 freelist = domain_unmap(si_domain, iova->pfn_lo,
4582 for_each_active_iommu(iommu, drhd)
4583 iommu_flush_iotlb_psi(iommu, si_domain,
4584 iova->pfn_lo, iova_size(iova),
4587 dma_free_pagelist(freelist);
4589 start_vpfn = iova->pfn_hi + 1;
4590 free_iova_mem(iova);
4598 static struct notifier_block intel_iommu_memory_nb = {
4599 .notifier_call = intel_iommu_memory_notifier,
4603 static void free_all_cpu_cached_iovas(unsigned int cpu)
4607 for (i = 0; i < g_num_of_iommus; i++) {
4608 struct intel_iommu *iommu = g_iommus[i];
4609 struct dmar_domain *domain;
4615 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4616 domain = get_iommu_domain(iommu, (u16)did);
4620 free_cpu_cached_iovas(cpu, &domain->iovad);
4625 static int intel_iommu_cpu_dead(unsigned int cpu)
4627 free_all_cpu_cached_iovas(cpu);
4631 static void intel_disable_iommus(void)
4633 struct intel_iommu *iommu = NULL;
4634 struct dmar_drhd_unit *drhd;
4636 for_each_iommu(iommu, drhd)
4637 iommu_disable_translation(iommu);
4640 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4642 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4644 return container_of(iommu_dev, struct intel_iommu, iommu);
4647 static ssize_t intel_iommu_show_version(struct device *dev,
4648 struct device_attribute *attr,
4651 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4652 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4653 return sprintf(buf, "%d:%d\n",
4654 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4656 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4658 static ssize_t intel_iommu_show_address(struct device *dev,
4659 struct device_attribute *attr,
4662 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4663 return sprintf(buf, "%llx\n", iommu->reg_phys);
4665 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4667 static ssize_t intel_iommu_show_cap(struct device *dev,
4668 struct device_attribute *attr,
4671 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4672 return sprintf(buf, "%llx\n", iommu->cap);
4674 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4676 static ssize_t intel_iommu_show_ecap(struct device *dev,
4677 struct device_attribute *attr,
4680 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4681 return sprintf(buf, "%llx\n", iommu->ecap);
4683 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4685 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4686 struct device_attribute *attr,
4689 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4690 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4692 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4694 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4695 struct device_attribute *attr,
4698 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4699 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4700 cap_ndoms(iommu->cap)));
4702 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4704 static struct attribute *intel_iommu_attrs[] = {
4705 &dev_attr_version.attr,
4706 &dev_attr_address.attr,
4708 &dev_attr_ecap.attr,
4709 &dev_attr_domains_supported.attr,
4710 &dev_attr_domains_used.attr,
4714 static struct attribute_group intel_iommu_group = {
4715 .name = "intel-iommu",
4716 .attrs = intel_iommu_attrs,
4719 const struct attribute_group *intel_iommu_groups[] = {
4724 int __init intel_iommu_init(void)
4727 struct dmar_drhd_unit *drhd;
4728 struct intel_iommu *iommu;
4730 /* VT-d is required for a TXT/tboot launch, so enforce that */
4731 force_on = tboot_force_iommu();
4733 if (iommu_init_mempool()) {
4735 panic("tboot: Failed to initialize iommu memory\n");
4739 down_write(&dmar_global_lock);
4740 if (dmar_table_init()) {
4742 panic("tboot: Failed to initialize DMAR table\n");
4746 if (dmar_dev_scope_init() < 0) {
4748 panic("tboot: Failed to initialize DMAR device scope\n");
4752 up_write(&dmar_global_lock);
4755 * The bus notifier takes the dmar_global_lock, so lockdep will
4756 * complain later when we register it under the lock.
4758 dmar_register_bus_notifier();
4760 down_write(&dmar_global_lock);
4762 if (no_iommu || dmar_disabled) {
4764 * We exit the function here to ensure IOMMU's remapping and
4765 * mempool aren't setup, which means that the IOMMU's PMRs
4766 * won't be disabled via the call to init_dmars(). So disable
4767 * it explicitly here. The PMRs were setup by tboot prior to
4768 * calling SENTER, but the kernel is expected to reset/tear
4771 if (intel_iommu_tboot_noforce) {
4772 for_each_iommu(iommu, drhd)
4773 iommu_disable_protect_mem_regions(iommu);
4777 * Make sure the IOMMUs are switched off, even when we
4778 * boot into a kexec kernel and the previous kernel left
4781 intel_disable_iommus();
4785 if (list_empty(&dmar_rmrr_units))
4786 pr_info("No RMRR found\n");
4788 if (list_empty(&dmar_atsr_units))
4789 pr_info("No ATSR found\n");
4791 if (dmar_init_reserved_ranges()) {
4793 panic("tboot: Failed to reserve iommu ranges\n");
4794 goto out_free_reserved_range;
4797 init_no_remapping_devices();
4802 panic("tboot: Failed to initialize DMARs\n");
4803 pr_err("Initialization failed\n");
4804 goto out_free_reserved_range;
4806 up_write(&dmar_global_lock);
4807 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4809 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4812 dma_ops = &intel_dma_ops;
4814 init_iommu_pm_ops();
4816 for_each_active_iommu(iommu, drhd) {
4817 iommu_device_sysfs_add(&iommu->iommu, NULL,
4820 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4821 iommu_device_register(&iommu->iommu);
4824 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4825 bus_register_notifier(&pci_bus_type, &device_nb);
4826 if (si_domain && !hw_pass_through)
4827 register_memory_notifier(&intel_iommu_memory_nb);
4828 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4829 intel_iommu_cpu_dead);
4830 intel_iommu_enabled = 1;
4834 out_free_reserved_range:
4835 put_iova_domain(&reserved_iova_list);
4837 intel_iommu_free_dmars();
4838 up_write(&dmar_global_lock);
4839 iommu_exit_mempool();
4843 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4845 struct intel_iommu *iommu = opaque;
4847 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4852 * NB - intel-iommu lacks any sort of reference counting for the users of
4853 * dependent devices. If multiple endpoints have intersecting dependent
4854 * devices, unbinding the driver from any one of them will possibly leave
4855 * the others unable to operate.
4857 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4859 if (!iommu || !dev || !dev_is_pci(dev))
4862 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4865 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4867 struct intel_iommu *iommu;
4868 unsigned long flags;
4870 assert_spin_locked(&device_domain_lock);
4875 iommu = info->iommu;
4878 iommu_disable_dev_iotlb(info);
4879 domain_context_clear(iommu, info->dev);
4882 unlink_domain_info(info);
4884 spin_lock_irqsave(&iommu->lock, flags);
4885 domain_detach_iommu(info->domain, iommu);
4886 spin_unlock_irqrestore(&iommu->lock, flags);
4888 free_devinfo_mem(info);
4891 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4894 struct device_domain_info *info;
4895 unsigned long flags;
4897 spin_lock_irqsave(&device_domain_lock, flags);
4898 info = dev->archdata.iommu;
4899 __dmar_remove_one_dev_info(info);
4900 spin_unlock_irqrestore(&device_domain_lock, flags);
4903 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4907 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4908 domain_reserve_special_ranges(domain);
4910 /* calculate AGAW */
4911 domain->gaw = guest_width;
4912 adjust_width = guestwidth_to_adjustwidth(guest_width);
4913 domain->agaw = width_to_agaw(adjust_width);
4915 domain->iommu_coherency = 0;
4916 domain->iommu_snooping = 0;
4917 domain->iommu_superpage = 0;
4918 domain->max_addr = 0;
4920 /* always allocate the top pgd */
4921 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4924 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4928 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4930 struct dmar_domain *dmar_domain;
4931 struct iommu_domain *domain;
4933 if (type != IOMMU_DOMAIN_UNMANAGED)
4936 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4938 pr_err("Can't allocate dmar_domain\n");
4941 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4942 pr_err("Domain initialization failed\n");
4943 domain_exit(dmar_domain);
4946 domain_update_iommu_cap(dmar_domain);
4948 domain = &dmar_domain->domain;
4949 domain->geometry.aperture_start = 0;
4950 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4951 domain->geometry.force_aperture = true;
4956 static void intel_iommu_domain_free(struct iommu_domain *domain)
4958 domain_exit(to_dmar_domain(domain));
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
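
/*
 * Illustration of the level knock-down above (not from the original
 * source): a domain created with DEFAULT_DOMAIN_ADDRESS_WIDTH (57,
 * agaw 3, five levels) attached to hardware that only supports agaw 2
 * sheds its top-level table, leaving the 4-level, 48-bit page table
 * the IOMMU can actually walk.
 */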
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
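
/*
 * Example of the rounding above (illustrative numbers): hpa = 0x1ff0
 * with size = 0x20 straddles a page boundary, so aligned_nrpages()
 * returns 2 and two 4KiB PFNs get mapped even though size itself is
 * far below VTD_PAGE_SIZE.
 */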
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
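
/*
 * Example of the size rounding above (illustrative): a 4KiB unmap
 * request that lands inside a 2MiB superpage mapping finds level 2,
 * so size is bumped to 2MiB and the whole superpage is torn down;
 * the return value tells the caller how much was really unmapped.
 */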
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
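
/*
 * Example of what a caller of iommu_get_resv_regions() ends up with
 * (illustrative): a device covered by a BIOS RMRR sees that range as
 * a direct-mapped reservation, and every device gets the common MSI
 * window IOAPIC_RANGE_START..IOAPIC_RANGE_END reported as
 * IOMMU_RESV_MSI so the IOVA allocator stays out of it.
 */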
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
	/*
	 * Convert ecap_pss to extend context entry pts encoding, also
	 * respect the soft pasid_max value set by the iommu.
	 * - number of PASID bits = ecap_pss + 1
	 * - number of PASID table entries = 2^(pts + 5)
	 * Therefore, pts = ecap_pss - 4
	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
	 */
	if (ecap_pss(iommu->ecap) < 5)
		return 0;

	/* pasid_max is encoded as actual number of entries not the bits */
	return find_first_bit((unsigned long *)&iommu->pasid_max,
			      MAX_NR_PASID_BITS) - 5;
}
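
/*
 * Worked example for the encoding above (illustrative): on KBL,
 * ecap_pss = 0x13 advertises 20 PASID bits. With pasid_max = 1 << 20,
 * find_first_bit() yields 20, so pts = 15 and the PASID table holds
 * 2^(15 + 5) = 2^20 entries.
 */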
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		if (iommu->pasid_state_table)
			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
			intel_iommu_get_pts(iommu);

		/* Make the PASID table pointer visible before enabling it */
		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked, which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	if (!iommu->pasid_table) {
		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.map_sg			= default_iommu_map_sg,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
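
/*
 * Illustrative sketch (deliberately not compiled into the driver): how
 * a kernel consumer such as VFIO would exercise the ops table above
 * through the generic IOMMU API. The function name, device, and
 * addresses are assumptions made up for the example only.
 */
#if 0
static int example_use_intel_iommu(struct device *dev)
{
	struct iommu_domain *domain;
	int ret;

	/* Ends up in intel_iommu_domain_alloc(IOMMU_DOMAIN_UNMANAGED) */
	domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain)
		return -ENOMEM;

	/* Ends up in intel_iommu_attach_device() */
	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* Ends up in intel_iommu_map(): identity-map one 4KiB page */
	ret = iommu_map(domain, 0x100000, 0x100000, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* Ends up in intel_iommu_unmap() */
	iommu_unmap(domain, 0x100000, VTD_PAGE_SIZE);
out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}
#endif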
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
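
/*
 * Example decode (illustrative): a GGC value carrying 0x0900 in bits
 * 11:8 matches GGC_MEMORY_SIZE_2M_VT, i.e. 2M of stolen graphics
 * memory with the VT bit (GGC_MEMORY_VT_ENABLED) set; the quirk below
 * only cares whether that VT bit is present at all.
 */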
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}