// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 #define IOAPIC_RANGE_START (0xfee00000)
60 #define IOAPIC_RANGE_END (0xfeefffff)
61 #define IOVA_START_ADDR (0x1000)
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
74 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
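/*
 * Worked example (illustrative): for a guest address width of 48 bits and
 * VTD_PAGE_SHIFT == 12, __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1 and
 * DOMAIN_MAX_ADDR(48) is 0xFFFFFFFFF000. On 32-bit kernels DOMAIN_MAX_PFN()
 * additionally clamps the value to ULONG_MAX so PFNs always fit in an
 * unsigned long, as the comment above explains.
 */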
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN (1)
80 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
82 /* page table handling */
83 #define LEVEL_STRIDE (9)
84 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is a power-of-two multiple of 4KiB and
 * that the mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
102 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
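/*
 * Example (illustrative): ~0xFFFUL sets every bit from bit 12 upwards, so
 * the sizes advertised to the IOMMU core are 4KiB, 8KiB, 16KiB, ... i.e.
 * every power-of-two multiple of 4KiB, matching the behaviour described
 * above.
 */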
104 static inline int agaw_to_level(int agaw)
109 static inline int agaw_to_width(int agaw)
111 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 static inline int width_to_agaw(int width)
116 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
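/*
 * Rough mapping between the helpers above (assuming the usual VT-d
 * encoding, where agaw_to_level() is simply agaw + 2): agaw 1 is a 39-bit,
 * 3-level table; agaw 2 is 48-bit, 4-level; agaw 3 is 57-bit, 5-level.
 * agaw_to_width() caps the result at MAX_AGAW_WIDTH (64).
 */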
119 static inline unsigned int level_to_offset_bits(int level)
121 return (level - 1) * LEVEL_STRIDE;
124 static inline int pfn_level_offset(unsigned long pfn, int level)
126 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 static inline unsigned long level_mask(int level)
131 return -1UL << level_to_offset_bits(level);
134 static inline unsigned long level_size(int level)
136 return 1UL << level_to_offset_bits(level);
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
141 return (pfn + level_size(level) - 1) & level_mask(level);
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150 are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 return mm_to_dma_pfn(page_to_pfn(pg));
164 static inline unsigned long virt_to_dma_pfn(void *p)
166 return page_to_dma_pfn(virt_to_page(p));
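/*
 * Note (illustrative): with 4KiB kernel pages PAGE_SHIFT == VTD_PAGE_SHIFT,
 * so the mm<->dma PFN conversions above are identity operations; they only
 * shift when the kernel page size exceeds the 4KiB VT-d page, e.g.
 * mm_to_dma_pfn(1) would be 16 with 64KiB kernel pages.
 */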
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
194 return re->lo & VTD_PAGE_MASK;
198 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
206 return re->hi & VTD_PAGE_MASK;
209 static inline void context_clear_pasid_enable(struct context_entry *context)
211 context->lo &= ~(1ULL << 11);
214 static inline bool context_pasid_enabled(struct context_entry *context)
216 return !!(context->lo & (1ULL << 11));
219 static inline void context_set_copied(struct context_entry *context)
221 context->hi |= (1ull << 3);
224 static inline bool context_copied(struct context_entry *context)
226 return !!(context->hi & (1ULL << 3));
229 static inline bool __context_present(struct context_entry *context)
231 return (context->lo & 1);
234 bool context_present(struct context_entry *context)
236 return context_pasid_enabled(context) ?
237 __context_present(context) :
238 __context_present(context) && !context_copied(context);
241 static inline void context_set_present(struct context_entry *context)
246 static inline void context_set_fault_enable(struct context_entry *context)
248 context->lo &= (((u64)-1) << 2) | 1;
251 static inline void context_set_translation_type(struct context_entry *context,
254 context->lo &= (((u64)-1) << 4) | 3;
255 context->lo |= (value & 3) << 2;
258 static inline void context_set_address_root(struct context_entry *context,
261 context->lo &= ~VTD_PAGE_MASK;
262 context->lo |= value & VTD_PAGE_MASK;
265 static inline void context_set_address_width(struct context_entry *context,
268 context->hi |= value & 7;
271 static inline void context_set_domain_id(struct context_entry *context,
274 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 static inline int context_domain_id(struct context_entry *c)
279 return((c->hi >> 8) & 0xffff);
282 static inline void context_clear_entry(struct context_entry *context)
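/*
 * Summary (derived from the helpers above, legacy context-entry format):
 * in the low 64 bits, bit 0 is Present, bit 1 is Fault Processing Disable
 * (cleared by context_set_fault_enable()), bits 3:2 select the translation
 * type and bits 63:12 hold the second-level page-table root; in the high
 * 64 bits, bits 2:0 encode the address width (AGAW) and bits 23:8 the
 * domain id. Bit 11 of the low word and bit 3 of the high word are used
 * here as the PASID-enabled and "copied from the old kernel" markers.
 */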
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping of all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
/* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
/*
 * This is a DMA domain allocated through the iommu domain allocation
 * interface. But one or more devices belonging to this domain have
 * been chosen to use a private domain. We should avoid using the
 * map/unmap/iova_to_phys APIs on it.
 */
306 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
308 #define for_each_domain_iommu(idx, domain) \
309 for (idx = 0; idx < g_num_of_iommus; idx++) \
310 if (domain->iommu_refcnt[idx])
312 struct dmar_rmrr_unit {
313 struct list_head list; /* list of rmrr units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 u64 base_address; /* reserved base address*/
316 u64 end_address; /* reserved end address */
317 struct dmar_dev_scope *devices; /* target devices */
318 int devices_cnt; /* target device count */
321 struct dmar_atsr_unit {
322 struct list_head list; /* list of ATSR units */
323 struct acpi_dmar_header *hdr; /* ACPI header */
324 struct dmar_dev_scope *devices; /* target devices */
325 int devices_cnt; /* target device count */
326 u8 include_all:1; /* include all ports */
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
332 #define for_each_rmrr_units(rmrr) \
333 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 /* bitmap for indexing intel_iommus */
336 static int g_num_of_iommus;
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int domain_detach_iommu(struct dmar_domain *domain,
343 struct intel_iommu *iommu);
344 static bool device_is_rmrr_locked(struct device *dev);
345 static int intel_iommu_attach_device(struct iommu_domain *domain,
348 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349 int dmar_disabled = 0;
351 int dmar_disabled = 1;
352 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355 int intel_iommu_enabled = 0;
356 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
358 static int dmar_map_gfx = 1;
359 static int dmar_forcedac;
360 static int intel_iommu_strict;
361 static int intel_iommu_superpage = 1;
362 static int iommu_identity_mapping;
364 #define IDENTMAP_ALL 1
365 #define IDENTMAP_GFX 2
366 #define IDENTMAP_AZALIA 4
368 int intel_iommu_gfx_mapped;
369 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
371 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
372 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
373 static DEFINE_SPINLOCK(device_domain_lock);
374 static LIST_HEAD(device_domain_list);
377 * Iterate over elements in device_domain_list and call the specified
378 * callback @fn against each element.
380 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
381 void *data), void *data)
385 struct device_domain_info *info;
387 spin_lock_irqsave(&device_domain_lock, flags);
388 list_for_each_entry(info, &device_domain_list, global) {
389 ret = fn(info, data);
391 spin_unlock_irqrestore(&device_domain_lock, flags);
395 spin_unlock_irqrestore(&device_domain_lock, flags);
400 const struct iommu_ops intel_iommu_ops;
402 static bool translation_pre_enabled(struct intel_iommu *iommu)
404 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
409 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 static void init_translation_status(struct intel_iommu *iommu)
416 gsts = readl(iommu->reg + DMAR_GSTS_REG);
417 if (gsts & DMA_GSTS_TES)
418 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
422 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
424 return container_of(dom, struct dmar_domain, domain);
427 static int __init intel_iommu_setup(char *str)
432 if (!strncmp(str, "on", 2)) {
434 pr_info("IOMMU enabled\n");
435 } else if (!strncmp(str, "off", 3)) {
437 no_platform_optin = 1;
438 pr_info("IOMMU disabled\n");
439 } else if (!strncmp(str, "igfx_off", 8)) {
441 pr_info("Disable GFX device mapping\n");
442 } else if (!strncmp(str, "forcedac", 8)) {
443 pr_info("Forcing DAC for PCI devices\n");
445 } else if (!strncmp(str, "strict", 6)) {
446 pr_info("Disable batched IOTLB flush\n");
447 intel_iommu_strict = 1;
448 } else if (!strncmp(str, "sp_off", 6)) {
449 pr_info("Disable supported super page\n");
450 intel_iommu_superpage = 0;
451 } else if (!strncmp(str, "sm_on", 5)) {
452 pr_info("Intel-IOMMU: scalable mode supported\n");
454 } else if (!strncmp(str, "tboot_noforce", 13)) {
456 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
457 intel_iommu_tboot_noforce = 1;
460 str += strcspn(str, ",");
466 __setup("intel_iommu=", intel_iommu_setup);
468 static struct kmem_cache *iommu_domain_cache;
469 static struct kmem_cache *iommu_devinfo_cache;
471 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
473 struct dmar_domain **domains;
476 domains = iommu->domains[idx];
480 return domains[did & 0xff];
483 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
484 struct dmar_domain *domain)
486 struct dmar_domain **domains;
489 if (!iommu->domains[idx]) {
490 size_t size = 256 * sizeof(struct dmar_domain *);
491 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494 domains = iommu->domains[idx];
495 if (WARN_ON(!domains))
498 domains[did & 0xff] = domain;
501 void *alloc_pgtable_page(int node)
506 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 vaddr = page_address(page);
512 void free_pgtable_page(void *vaddr)
514 free_page((unsigned long)vaddr);
517 static inline void *alloc_domain_mem(void)
519 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 static void free_domain_mem(void *vaddr)
524 kmem_cache_free(iommu_domain_cache, vaddr);
527 static inline void * alloc_devinfo_mem(void)
529 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 static inline void free_devinfo_mem(void *vaddr)
534 kmem_cache_free(iommu_devinfo_cache, vaddr);
537 static inline int domain_type_is_si(struct dmar_domain *domain)
539 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 static inline int domain_pfn_supported(struct dmar_domain *domain,
545 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
547 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
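/*
 * Example (illustrative): a domain with agaw == 2 has a 48-bit adjusted
 * width, so addr_width above is 48 - 12 = 36 and any pfn needing more than
 * 36 bits is rejected as beyond the domain's addressing capability.
 */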
550 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
555 sagaw = cap_sagaw(iommu->cap);
556 for (agaw = width_to_agaw(max_gaw);
558 if (test_bit(agaw, &sagaw))
566 * Calculate max SAGAW for each iommu.
568 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
570 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
/*
 * Calculate the agaw for each iommu.
 * "SAGAW" may differ across iommus, so use a default agaw and fall back
 * to a smaller supported agaw for iommus that don't support the default.
 */
578 int iommu_calculate_agaw(struct intel_iommu *iommu)
580 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
/* This function only returns a single iommu for a domain */
584 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
588 /* si_domain and vm domain should not get here. */
589 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592 for_each_domain_iommu(iommu_id, domain)
595 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598 return g_iommus[iommu_id];
601 static void domain_update_iommu_coherency(struct dmar_domain *domain)
603 struct dmar_drhd_unit *drhd;
604 struct intel_iommu *iommu;
608 domain->iommu_coherency = 1;
610 for_each_domain_iommu(i, domain) {
612 if (!ecap_coherent(g_iommus[i]->ecap)) {
613 domain->iommu_coherency = 0;
620 /* No hardware attached; use lowest common denominator */
622 for_each_active_iommu(iommu, drhd) {
623 if (!ecap_coherent(iommu->ecap)) {
624 domain->iommu_coherency = 0;
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
633 struct dmar_drhd_unit *drhd;
634 struct intel_iommu *iommu;
638 for_each_active_iommu(iommu, drhd) {
640 if (!ecap_sc_support(iommu->ecap)) {
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
653 struct dmar_drhd_unit *drhd;
654 struct intel_iommu *iommu;
657 if (!intel_iommu_superpage) {
661 /* set iommu_superpage to the smallest common denominator */
663 for_each_active_iommu(iommu, drhd) {
665 mask &= cap_super_page_val(iommu->cap);
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
678 domain_update_iommu_coherency(domain);
679 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686 struct root_entry *root = &iommu->root_entry[bus];
687 struct context_entry *context;
691 if (sm_supported(iommu)) {
699 context = phys_to_virt(*entry & VTD_PAGE_MASK);
701 unsigned long phy_addr;
705 context = alloc_pgtable_page(iommu->node);
709 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
710 phy_addr = virt_to_phys((void *)context);
711 *entry = phy_addr | 1;
712 __iommu_flush_cache(iommu, entry, sizeof(*entry));
714 return &context[devfn];
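/*
 * Note (illustrative summary): in scalable mode each root entry covers two
 * context tables; the lower pointer (root_entry_lctp) serves devfns
 * 0x00-0x7f and the upper pointer (root_entry_uctp) serves 0x80-0xff,
 * which is why free_context_table() below also walks the 0x80 half of each
 * bus.
 */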
717 static int iommu_dummy(struct device *dev)
719 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
723 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
724 * sub-hierarchy of a candidate PCI-PCI bridge
725 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
726 * @bridge: the candidate PCI-PCI bridge
728 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
731 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
733 struct pci_dev *pdev, *pbridge;
735 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
738 pdev = to_pci_dev(dev);
739 pbridge = to_pci_dev(bridge);
741 if (pbridge->subordinate &&
742 pbridge->subordinate->number <= pdev->bus->number &&
743 pbridge->subordinate->busn_res.end >= pdev->bus->number)
749 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
751 struct dmar_drhd_unit *drhd = NULL;
752 struct intel_iommu *iommu;
754 struct pci_dev *pdev = NULL;
758 if (iommu_dummy(dev))
761 if (dev_is_pci(dev)) {
762 struct pci_dev *pf_pdev;
764 pdev = to_pci_dev(dev);
767 /* VMD child devices currently cannot be handled individually */
768 if (is_vmd(pdev->bus))
772 /* VFs aren't listed in scope tables; we need to look up
773 * the PF instead to find the IOMMU. */
774 pf_pdev = pci_physfn(pdev);
776 segment = pci_domain_nr(pdev->bus);
777 } else if (has_acpi_companion(dev))
778 dev = &ACPI_COMPANION(dev)->dev;
781 for_each_active_iommu(iommu, drhd) {
782 if (pdev && segment != drhd->segment)
785 for_each_active_dev_scope(drhd->devices,
786 drhd->devices_cnt, i, tmp) {
788 /* For a VF use its original BDF# not that of the PF
789 * which we used for the IOMMU lookup. Strictly speaking
790 * we could do this for all PCI devices; we only need to
791 * get the BDF# from the scope table for ACPI matches. */
792 if (pdev && pdev->is_virtfn)
795 *bus = drhd->devices[i].bus;
796 *devfn = drhd->devices[i].devfn;
800 if (is_downstream_to_pci_bridge(dev, tmp))
804 if (pdev && drhd->include_all) {
806 *bus = pdev->bus->number;
807 *devfn = pdev->devfn;
818 static void domain_flush_cache(struct dmar_domain *domain,
819 void *addr, int size)
821 if (!domain->iommu_coherency)
822 clflush_cache_range(addr, size);
825 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
827 struct context_entry *context;
831 spin_lock_irqsave(&iommu->lock, flags);
832 context = iommu_context_addr(iommu, bus, devfn, 0);
834 ret = context_present(context);
835 spin_unlock_irqrestore(&iommu->lock, flags);
839 static void free_context_table(struct intel_iommu *iommu)
843 struct context_entry *context;
845 spin_lock_irqsave(&iommu->lock, flags);
846 if (!iommu->root_entry) {
849 for (i = 0; i < ROOT_ENTRY_NR; i++) {
850 context = iommu_context_addr(iommu, i, 0, 0);
852 free_pgtable_page(context);
854 if (!sm_supported(iommu))
857 context = iommu_context_addr(iommu, i, 0x80, 0);
859 free_pgtable_page(context);
862 free_pgtable_page(iommu->root_entry);
863 iommu->root_entry = NULL;
865 spin_unlock_irqrestore(&iommu->lock, flags);
868 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
869 unsigned long pfn, int *target_level)
871 struct dma_pte *parent, *pte;
872 int level = agaw_to_level(domain->agaw);
875 BUG_ON(!domain->pgd);
877 if (!domain_pfn_supported(domain, pfn))
878 /* Address beyond IOMMU's addressing capabilities. */
881 parent = domain->pgd;
886 offset = pfn_level_offset(pfn, level);
887 pte = &parent[offset];
888 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
890 if (level == *target_level)
893 if (!dma_pte_present(pte)) {
896 tmp_page = alloc_pgtable_page(domain->nid);
901 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
902 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
903 if (cmpxchg64(&pte->val, 0ULL, pteval))
904 /* Someone else set it while we were thinking; use theirs. */
905 free_pgtable_page(tmp_page);
907 domain_flush_cache(domain, pte, sizeof(*pte));
912 parent = phys_to_virt(dma_pte_addr(pte));
917 *target_level = level;
922 /* return address's pte at specific level */
923 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
925 int level, int *large_page)
927 struct dma_pte *parent, *pte;
928 int total = agaw_to_level(domain->agaw);
931 parent = domain->pgd;
932 while (level <= total) {
933 offset = pfn_level_offset(pfn, total);
934 pte = &parent[offset];
938 if (!dma_pte_present(pte)) {
943 if (dma_pte_superpage(pte)) {
948 parent = phys_to_virt(dma_pte_addr(pte));
/* Clear the last-level PTEs; a TLB flush should follow. */
955 static void dma_pte_clear_range(struct dmar_domain *domain,
956 unsigned long start_pfn,
957 unsigned long last_pfn)
959 unsigned int large_page;
960 struct dma_pte *first_pte, *pte;
962 BUG_ON(!domain_pfn_supported(domain, start_pfn));
963 BUG_ON(!domain_pfn_supported(domain, last_pfn));
964 BUG_ON(start_pfn > last_pfn);
966 /* we don't need lock here; nobody else touches the iova range */
969 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
971 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
976 start_pfn += lvl_to_nr_pages(large_page);
978 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
980 domain_flush_cache(domain, first_pte,
981 (void *)pte - (void *)first_pte);
983 } while (start_pfn && start_pfn <= last_pfn);
986 static void dma_pte_free_level(struct dmar_domain *domain, int level,
987 int retain_level, struct dma_pte *pte,
988 unsigned long pfn, unsigned long start_pfn,
989 unsigned long last_pfn)
991 pfn = max(start_pfn, pfn);
992 pte = &pte[pfn_level_offset(pfn, level)];
995 unsigned long level_pfn;
996 struct dma_pte *level_pte;
998 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001 level_pfn = pfn & level_mask(level);
1002 level_pte = phys_to_virt(dma_pte_addr(pte));
1005 dma_pte_free_level(domain, level - 1, retain_level,
1006 level_pte, level_pfn, start_pfn,
1011 * Free the page table if we're below the level we want to
1012 * retain and the range covers the entire table.
1014 if (level < retain_level && !(start_pfn > level_pfn ||
1015 last_pfn < level_pfn + level_size(level) - 1)) {
1017 domain_flush_cache(domain, pte, sizeof(*pte));
1018 free_pgtable_page(level_pte);
1021 pfn += level_size(level);
1022 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1026 * clear last level (leaf) ptes and free page table pages below the
1027 * level we wish to keep intact.
1029 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1030 unsigned long start_pfn,
1031 unsigned long last_pfn,
1034 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1035 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1036 BUG_ON(start_pfn > last_pfn);
1038 dma_pte_clear_range(domain, start_pfn, last_pfn);
1040 /* We don't need lock here; nobody else touches the iova range */
1041 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1042 domain->pgd, 0, start_pfn, last_pfn);
1045 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1046 free_pgtable_page(domain->pgd);
1051 /* When a page at a given level is being unlinked from its parent, we don't
1052 need to *modify* it at all. All we need to do is make a list of all the
1053 pages which can be freed just as soon as we've flushed the IOTLB and we
1054 know the hardware page-walk will no longer touch them.
1055 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1057 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1058 int level, struct dma_pte *pte,
1059 struct page *freelist)
1063 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1064 pg->freelist = freelist;
1070 pte = page_address(pg);
1072 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1073 freelist = dma_pte_list_pagetables(domain, level - 1,
1076 } while (!first_pte_in_page(pte));
1081 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1082 struct dma_pte *pte, unsigned long pfn,
1083 unsigned long start_pfn,
1084 unsigned long last_pfn,
1085 struct page *freelist)
1087 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1089 pfn = max(start_pfn, pfn);
1090 pte = &pte[pfn_level_offset(pfn, level)];
1093 unsigned long level_pfn;
1095 if (!dma_pte_present(pte))
1098 level_pfn = pfn & level_mask(level);
1100 /* If range covers entire pagetable, free it */
1101 if (start_pfn <= level_pfn &&
1102 last_pfn >= level_pfn + level_size(level) - 1) {
/* These subordinate page tables are going away entirely. Don't
   bother to clear them; we're just going to *free* them. */
1105 if (level > 1 && !dma_pte_superpage(pte))
1106 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1112 } else if (level > 1) {
1113 /* Recurse down into a level that isn't *entirely* obsolete */
1114 freelist = dma_pte_clear_level(domain, level - 1,
1115 phys_to_virt(dma_pte_addr(pte)),
1116 level_pfn, start_pfn, last_pfn,
1120 pfn += level_size(level);
1121 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124 domain_flush_cache(domain, first_pte,
1125 (void *)++last_pte - (void *)first_pte);
1130 /* We can't just free the pages because the IOMMU may still be walking
1131 the page tables, and may have cached the intermediate levels. The
1132 pages can only be freed after the IOTLB flush has been done. */
1133 static struct page *domain_unmap(struct dmar_domain *domain,
1134 unsigned long start_pfn,
1135 unsigned long last_pfn)
1137 struct page *freelist;
1139 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1140 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1141 BUG_ON(start_pfn > last_pfn);
1143 /* we don't need lock here; nobody else touches the iova range */
1144 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1145 domain->pgd, 0, start_pfn, last_pfn, NULL);
1148 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1149 struct page *pgd_page = virt_to_page(domain->pgd);
1150 pgd_page->freelist = freelist;
1151 freelist = pgd_page;
1159 static void dma_free_pagelist(struct page *freelist)
1163 while ((pg = freelist)) {
1164 freelist = pg->freelist;
1165 free_pgtable_page(page_address(pg));
1169 static void iova_entry_free(unsigned long data)
1171 struct page *freelist = (struct page *)data;
1173 dma_free_pagelist(freelist);
1176 /* iommu handling */
1177 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1179 struct root_entry *root;
1180 unsigned long flags;
1182 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1184 pr_err("Allocating root entry for %s failed\n",
1189 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1191 spin_lock_irqsave(&iommu->lock, flags);
1192 iommu->root_entry = root;
1193 spin_unlock_irqrestore(&iommu->lock, flags);
1198 static void iommu_set_root_entry(struct intel_iommu *iommu)
1204 addr = virt_to_phys(iommu->root_entry);
1205 if (sm_supported(iommu))
1206 addr |= DMA_RTADDR_SMT;
1208 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1209 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1211 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1214 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1215 readl, (sts & DMA_GSTS_RTPS), sts);
1217 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1225 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1229 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1232 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233 readl, (!(val & DMA_GSTS_WBFS)), val);
1235 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* The return value determines whether we need a write buffer flush */
1239 static void __iommu_flush_context(struct intel_iommu *iommu,
1240 u16 did, u16 source_id, u8 function_mask,
1247 case DMA_CCMD_GLOBAL_INVL:
1248 val = DMA_CCMD_GLOBAL_INVL;
1250 case DMA_CCMD_DOMAIN_INVL:
1251 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1253 case DMA_CCMD_DEVICE_INVL:
1254 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1255 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1260 val |= DMA_CCMD_ICC;
1262 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1263 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
/* Make sure hardware completes it */
1266 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1267 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1269 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* The return value determines whether we need a write buffer flush */
1273 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1274 u64 addr, unsigned int size_order, u64 type)
1276 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1277 u64 val = 0, val_iva = 0;
1281 case DMA_TLB_GLOBAL_FLUSH:
/* a global flush doesn't need to set IVA_REG */
1283 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1285 case DMA_TLB_DSI_FLUSH:
1286 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1288 case DMA_TLB_PSI_FLUSH:
1289 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1290 /* IH bit is passed in as part of address */
1291 val_iva = size_order | addr;
/* Note: set drain read/write */
/*
 * This is probably just to be extra safe; it looks like we can
 * ignore it without any impact.
 */
1302 if (cap_read_drain(iommu->cap))
1303 val |= DMA_TLB_READ_DRAIN;
1305 if (cap_write_drain(iommu->cap))
1306 val |= DMA_TLB_WRITE_DRAIN;
1308 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1309 /* Note: Only uses first TLB reg currently */
1311 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1312 dmar_writeq(iommu->reg + tlb_offset + 8, val);
/* Make sure hardware completes it */
1315 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1316 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1318 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1320 /* check IOTLB invalidation granularity */
1321 if (DMA_TLB_IAIG(val) == 0)
1322 pr_err("Flush IOTLB failed\n");
1323 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1324 pr_debug("TLB flush request %Lx, actual %Lx\n",
1325 (unsigned long long)DMA_TLB_IIRG(type),
1326 (unsigned long long)DMA_TLB_IAIG(val));
1329 static struct device_domain_info *
1330 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1333 struct device_domain_info *info;
1335 assert_spin_locked(&device_domain_lock);
1340 list_for_each_entry(info, &domain->devices, link)
1341 if (info->iommu == iommu && info->bus == bus &&
1342 info->devfn == devfn) {
1343 if (info->ats_supported && info->dev)
1351 static void domain_update_iotlb(struct dmar_domain *domain)
1353 struct device_domain_info *info;
1354 bool has_iotlb_device = false;
1356 assert_spin_locked(&device_domain_lock);
1358 list_for_each_entry(info, &domain->devices, link) {
1359 struct pci_dev *pdev;
1361 if (!info->dev || !dev_is_pci(info->dev))
1364 pdev = to_pci_dev(info->dev);
1365 if (pdev->ats_enabled) {
1366 has_iotlb_device = true;
1371 domain->has_iotlb_device = has_iotlb_device;
1374 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1376 struct pci_dev *pdev;
1378 assert_spin_locked(&device_domain_lock);
1380 if (!info || !dev_is_pci(info->dev))
1383 pdev = to_pci_dev(info->dev);
1384 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1385 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1386 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1387 * reserved, which should be set to 0.
1389 if (!ecap_dit(info->iommu->ecap))
1392 struct pci_dev *pf_pdev;
/* pdev will be returned if the device is not a VF */
1395 pf_pdev = pci_physfn(pdev);
1396 info->pfsid = pci_dev_id(pf_pdev);
1399 #ifdef CONFIG_INTEL_IOMMU_SVM
/* The PCIe spec, in its wisdom, declares that the behaviour of
   the device if you enable PASID support after ATS support is
   undefined. So always enable PASID support on devices which
   have it, even if we can't yet know if we're ever going to
   use it. */
1405 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1406 info->pasid_enabled = 1;
1408 if (info->pri_supported &&
1409 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1410 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1411 info->pri_enabled = 1;
1413 if (!pdev->untrusted && info->ats_supported &&
1414 pci_ats_page_aligned(pdev) &&
1415 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1416 info->ats_enabled = 1;
1417 domain_update_iotlb(info->domain);
1418 info->ats_qdep = pci_ats_queue_depth(pdev);
1422 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1424 struct pci_dev *pdev;
1426 assert_spin_locked(&device_domain_lock);
1428 if (!dev_is_pci(info->dev))
1431 pdev = to_pci_dev(info->dev);
1433 if (info->ats_enabled) {
1434 pci_disable_ats(pdev);
1435 info->ats_enabled = 0;
1436 domain_update_iotlb(info->domain);
1438 #ifdef CONFIG_INTEL_IOMMU_SVM
1439 if (info->pri_enabled) {
1440 pci_disable_pri(pdev);
1441 info->pri_enabled = 0;
1443 if (info->pasid_enabled) {
1444 pci_disable_pasid(pdev);
1445 info->pasid_enabled = 0;
1450 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1451 u64 addr, unsigned mask)
1454 unsigned long flags;
1455 struct device_domain_info *info;
1457 if (!domain->has_iotlb_device)
1460 spin_lock_irqsave(&device_domain_lock, flags);
1461 list_for_each_entry(info, &domain->devices, link) {
1462 if (!info->ats_enabled)
1465 sid = info->bus << 8 | info->devfn;
1466 qdep = info->ats_qdep;
1467 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470 spin_unlock_irqrestore(&device_domain_lock, flags);
1473 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1474 struct dmar_domain *domain,
1475 unsigned long pfn, unsigned int pages,
1478 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1479 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1480 u16 did = domain->iommu_did[iommu->seq_id];
/*
 * Fall back to a domain-selective flush if there is no PSI support or
 * the size is too big.
 * PSI requires the page size to be 2 ^ x, and the base address to be
 * naturally aligned to the size.
 */
1492 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1493 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
/*
 * In caching mode, changes of pages from non-present to present require
 * a flush. However, the device IOTLB doesn't need to be flushed in this
 * case.
 */
1503 if (!cap_caching_mode(iommu->cap) || !map)
1504 iommu_flush_dev_iotlb(domain, addr, mask);
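/*
 * Example (illustrative): flushing 9 pages gives
 * mask = ilog2(__roundup_pow_of_two(9)) = 4, i.e. a 16-page (64KiB) PSI
 * invalidation whose base address must be 64KiB aligned. If mask exceeded
 * cap_max_amask_val() we would have used the DSI flush above instead.
 */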
1507 /* Notification for newly created mappings */
1508 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1509 struct dmar_domain *domain,
1510 unsigned long pfn, unsigned int pages)
1512 /* It's a non-present to present mapping. Only flush if caching mode */
1513 if (cap_caching_mode(iommu->cap))
1514 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1516 iommu_flush_write_buffer(iommu);
1519 static void iommu_flush_iova(struct iova_domain *iovad)
1521 struct dmar_domain *domain;
1524 domain = container_of(iovad, struct dmar_domain, iovad);
1526 for_each_domain_iommu(idx, domain) {
1527 struct intel_iommu *iommu = g_iommus[idx];
1528 u16 did = domain->iommu_did[iommu->seq_id];
1530 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1532 if (!cap_caching_mode(iommu->cap))
1533 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1534 0, MAX_AGAW_PFN_WIDTH);
1538 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541 unsigned long flags;
1543 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1547 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1548 pmen &= ~DMA_PMEN_EPM;
1549 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1551 /* wait for the protected region status bit to clear */
1552 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1553 readl, !(pmen & DMA_PMEN_PRS), pmen);
1555 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1558 static void iommu_enable_translation(struct intel_iommu *iommu)
1561 unsigned long flags;
1563 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1564 iommu->gcmd |= DMA_GCMD_TE;
1565 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1568 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1569 readl, (sts & DMA_GSTS_TES), sts);
1571 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1574 static void iommu_disable_translation(struct intel_iommu *iommu)
1579 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1580 iommu->gcmd &= ~DMA_GCMD_TE;
1581 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1584 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1585 readl, (!(sts & DMA_GSTS_TES)), sts);
1587 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1590 static int iommu_init_domains(struct intel_iommu *iommu)
1592 u32 ndomains, nlongs;
1595 ndomains = cap_ndoms(iommu->cap);
1596 pr_debug("%s: Number of Domains supported <%d>\n",
1597 iommu->name, ndomains);
1598 nlongs = BITS_TO_LONGS(ndomains);
1600 spin_lock_init(&iommu->lock);
1602 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1603 if (!iommu->domain_ids) {
1604 pr_err("%s: Allocating domain id array failed\n",
1609 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1610 iommu->domains = kzalloc(size, GFP_KERNEL);
1612 if (iommu->domains) {
1613 size = 256 * sizeof(struct dmar_domain *);
1614 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617 if (!iommu->domains || !iommu->domains[0]) {
1618 pr_err("%s: Allocating domain array failed\n",
1620 kfree(iommu->domain_ids);
1621 kfree(iommu->domains);
1622 iommu->domain_ids = NULL;
1623 iommu->domains = NULL;
1628 * If Caching mode is set, then invalid translations are tagged
1629 * with domain-id 0, hence we need to pre-allocate it. We also
1630 * use domain-id 0 as a marker for non-allocated domain-id, so
1631 * make sure it is not used for a real domain.
1633 set_bit(0, iommu->domain_ids);
/*
 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
 * entry for first-level or pass-through translation modes should
 * be programmed with a domain id different from those used for
 * second-level or nested translation. We reserve a domain id for
 * this purpose.
 */
1642 if (sm_supported(iommu))
1643 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1648 static void disable_dmar_iommu(struct intel_iommu *iommu)
1650 struct device_domain_info *info, *tmp;
1651 unsigned long flags;
1653 if (!iommu->domains || !iommu->domain_ids)
1656 spin_lock_irqsave(&device_domain_lock, flags);
1657 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1658 if (info->iommu != iommu)
1661 if (!info->dev || !info->domain)
1664 __dmar_remove_one_dev_info(info);
1666 spin_unlock_irqrestore(&device_domain_lock, flags);
1668 if (iommu->gcmd & DMA_GCMD_TE)
1669 iommu_disable_translation(iommu);
1672 static void free_dmar_iommu(struct intel_iommu *iommu)
1674 if ((iommu->domains) && (iommu->domain_ids)) {
1675 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678 for (i = 0; i < elems; i++)
1679 kfree(iommu->domains[i]);
1680 kfree(iommu->domains);
1681 kfree(iommu->domain_ids);
1682 iommu->domains = NULL;
1683 iommu->domain_ids = NULL;
1686 g_iommus[iommu->seq_id] = NULL;
1688 /* free context mapping */
1689 free_context_table(iommu);
1691 #ifdef CONFIG_INTEL_IOMMU_SVM
1692 if (pasid_supported(iommu)) {
1693 if (ecap_prs(iommu->ecap))
1694 intel_svm_finish_prq(iommu);
1699 static struct dmar_domain *alloc_domain(int flags)
1701 struct dmar_domain *domain;
1703 domain = alloc_domain_mem();
1707 memset(domain, 0, sizeof(*domain));
1708 domain->nid = NUMA_NO_NODE;
1709 domain->flags = flags;
1710 domain->has_iotlb_device = false;
1711 INIT_LIST_HEAD(&domain->devices);
1716 /* Must be called with iommu->lock */
1717 static int domain_attach_iommu(struct dmar_domain *domain,
1718 struct intel_iommu *iommu)
1720 unsigned long ndomains;
1723 assert_spin_locked(&device_domain_lock);
1724 assert_spin_locked(&iommu->lock);
1726 domain->iommu_refcnt[iommu->seq_id] += 1;
1727 domain->iommu_count += 1;
1728 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1729 ndomains = cap_ndoms(iommu->cap);
1730 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1732 if (num >= ndomains) {
1733 pr_err("%s: No free domain ids\n", iommu->name);
1734 domain->iommu_refcnt[iommu->seq_id] -= 1;
1735 domain->iommu_count -= 1;
1739 set_bit(num, iommu->domain_ids);
1740 set_iommu_domain(iommu, num, domain);
1742 domain->iommu_did[iommu->seq_id] = num;
1743 domain->nid = iommu->node;
1745 domain_update_iommu_cap(domain);
1751 static int domain_detach_iommu(struct dmar_domain *domain,
1752 struct intel_iommu *iommu)
1756 assert_spin_locked(&device_domain_lock);
1757 assert_spin_locked(&iommu->lock);
1759 domain->iommu_refcnt[iommu->seq_id] -= 1;
1760 count = --domain->iommu_count;
1761 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1762 num = domain->iommu_did[iommu->seq_id];
1763 clear_bit(num, iommu->domain_ids);
1764 set_iommu_domain(iommu, num, NULL);
1766 domain_update_iommu_cap(domain);
1767 domain->iommu_did[iommu->seq_id] = 0;
1773 static struct iova_domain reserved_iova_list;
1774 static struct lock_class_key reserved_rbtree_key;
1776 static int dmar_init_reserved_ranges(void)
1778 struct pci_dev *pdev = NULL;
1782 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1784 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1785 &reserved_rbtree_key);
1787 /* IOAPIC ranges shouldn't be accessed by DMA */
1788 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1789 IOVA_PFN(IOAPIC_RANGE_END));
1791 pr_err("Reserve IOAPIC range failed\n");
1795 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1796 for_each_pci_dev(pdev) {
1799 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1800 r = &pdev->resource[i];
1801 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1803 iova = reserve_iova(&reserved_iova_list,
1807 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1815 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1817 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1820 static inline int guestwidth_to_adjustwidth(int gaw)
1823 int r = (gaw - 12) % 9;
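/*
 * Note (illustrative): the adjusted width is the guest width rounded up so
 * that the bits above the 12-bit page offset divide evenly into 9-bit
 * page-table levels, e.g. 39 -> 39, 40 -> 48, 48 -> 48 and 50 -> 57.
 */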
1834 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1837 int adjust_width, agaw;
1838 unsigned long sagaw;
1841 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1843 err = init_iova_flush_queue(&domain->iovad,
1844 iommu_flush_iova, iova_entry_free);
1848 domain_reserve_special_ranges(domain);
1850 /* calculate AGAW */
1851 if (guest_width > cap_mgaw(iommu->cap))
1852 guest_width = cap_mgaw(iommu->cap);
1853 domain->gaw = guest_width;
1854 adjust_width = guestwidth_to_adjustwidth(guest_width);
1855 agaw = width_to_agaw(adjust_width);
1856 sagaw = cap_sagaw(iommu->cap);
1857 if (!test_bit(agaw, &sagaw)) {
1858 /* hardware doesn't support it, choose a bigger one */
1859 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1860 agaw = find_next_bit(&sagaw, 5, agaw);
1864 domain->agaw = agaw;
1866 if (ecap_coherent(iommu->ecap))
1867 domain->iommu_coherency = 1;
1869 domain->iommu_coherency = 0;
1871 if (ecap_sc_support(iommu->ecap))
1872 domain->iommu_snooping = 1;
1874 domain->iommu_snooping = 0;
1876 if (intel_iommu_superpage)
1877 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1879 domain->iommu_superpage = 0;
1881 domain->nid = iommu->node;
1883 /* always allocate the top pgd */
1884 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1887 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1891 static void domain_exit(struct dmar_domain *domain)
1894 /* Remove associated devices and clear attached or cached domains */
1895 domain_remove_dev_info(domain);
1898 put_iova_domain(&domain->iovad);
1901 struct page *freelist;
1903 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1904 dma_free_pagelist(freelist);
1907 free_domain_mem(domain);
1911 * Get the PASID directory size for scalable mode context entry.
1912 * Value of X in the PDTS field of a scalable mode context entry
1913 * indicates PASID directory with 2^(X + 7) entries.
1915 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1919 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1920 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
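/*
 * Example (illustrative, assuming the usual 6-bit PASID_PDE_SHIFT):
 * max_pasid == 1 << 20 gives max_pde == 1 << 14, so pds is 14 and the
 * directory needs 2^14 entries; per the comment above, the PDTS coding
 * programmed into the context entry is therefore 14 - 7 = 7.
 */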
1928 * Set the RID_PASID field of a scalable mode context entry. The
1929 * IOMMU hardware will use the PASID value set in this field for
1930 * DMA translations of DMA requests without PASID.
1933 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1935 context->hi |= pasid & ((1 << 20) - 1);
1936 context->hi |= (1 << 20);
/*
 * Set the DTE (Device-TLB Enable) field of a scalable mode context
 * entry.
 */
1943 static inline void context_set_sm_dte(struct context_entry *context)
1945 context->lo |= (1 << 2);
/*
 * Set the PRE (Page Request Enable) field of a scalable mode context
 * entry.
 */
1952 static inline void context_set_sm_pre(struct context_entry *context)
1954 context->lo |= (1 << 4);
1957 /* Convert value to context PASID directory size field coding. */
1958 #define context_pdts(pds) (((pds) & 0x7) << 9)
1960 static int domain_context_mapping_one(struct dmar_domain *domain,
1961 struct intel_iommu *iommu,
1962 struct pasid_table *table,
1965 u16 did = domain->iommu_did[iommu->seq_id];
1966 int translation = CONTEXT_TT_MULTI_LEVEL;
1967 struct device_domain_info *info = NULL;
1968 struct context_entry *context;
1969 unsigned long flags;
1974 if (hw_pass_through && domain_type_is_si(domain))
1975 translation = CONTEXT_TT_PASS_THROUGH;
1977 pr_debug("Set context mapping for %02x:%02x.%d\n",
1978 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1980 BUG_ON(!domain->pgd);
1982 spin_lock_irqsave(&device_domain_lock, flags);
1983 spin_lock(&iommu->lock);
1986 context = iommu_context_addr(iommu, bus, devfn, 1);
1991 if (context_present(context))
/*
 * For kdump cases, old valid entries may be cached due to the
 * in-flight DMA and copied pgtable, but there is no unmapping
 * behaviour for them, thus we need an explicit cache flush for
 * the newly-mapped device. For kdump, at this point, the device
 * is supposed to have finished reset at its driver probe stage,
 * so no in-flight DMA will exist, and we don't need to worry
 * about it hereafter.
 */
2003 if (context_copied(context)) {
2004 u16 did_old = context_domain_id(context);
2006 if (did_old < cap_ndoms(iommu->cap)) {
2007 iommu->flush.flush_context(iommu, did_old,
2008 (((u16)bus) << 8) | devfn,
2009 DMA_CCMD_MASK_NOBIT,
2010 DMA_CCMD_DEVICE_INVL);
2011 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2016 context_clear_entry(context);
2018 if (sm_supported(iommu)) {
2023 /* Setup the PASID DIR pointer: */
2024 pds = context_get_sm_pds(table);
2025 context->lo = (u64)virt_to_phys(table->table) |
2028 /* Setup the RID_PASID field: */
2029 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2032 * Setup the Device-TLB enable bit and Page request
2035 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2036 if (info && info->ats_supported)
2037 context_set_sm_dte(context);
2038 if (info && info->pri_supported)
2039 context_set_sm_pre(context);
2041 struct dma_pte *pgd = domain->pgd;
2044 context_set_domain_id(context, did);
2046 if (translation != CONTEXT_TT_PASS_THROUGH) {
 * Skip the top levels of the page tables for an iommu which has a
 * smaller agaw than the default. Unnecessary for PT mode.
2051 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2053 pgd = phys_to_virt(dma_pte_addr(pgd));
2054 if (!dma_pte_present(pgd))
2058 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2059 if (info && info->ats_supported)
2060 translation = CONTEXT_TT_DEV_IOTLB;
2062 translation = CONTEXT_TT_MULTI_LEVEL;
2064 context_set_address_root(context, virt_to_phys(pgd));
2065 context_set_address_width(context, agaw);
2068 * In pass through mode, AW must be programmed to
2069 * indicate the largest AGAW value supported by
2070 * hardware. And ASR is ignored by hardware.
2072 context_set_address_width(context, iommu->msagaw);
2075 context_set_translation_type(context, translation);
2078 context_set_fault_enable(context);
2079 context_set_present(context);
2080 domain_flush_cache(domain, context, sizeof(*context));
/*
 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries we only need to flush the write-buffer. If it
 * _does_ cache non-present entries, then it does so in the special
 * domain #0, which we have to flush:
 */
2088 if (cap_caching_mode(iommu->cap)) {
2089 iommu->flush.flush_context(iommu, 0,
2090 (((u16)bus) << 8) | devfn,
2091 DMA_CCMD_MASK_NOBIT,
2092 DMA_CCMD_DEVICE_INVL);
2093 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2095 iommu_flush_write_buffer(iommu);
2097 iommu_enable_dev_iotlb(info);
2102 spin_unlock(&iommu->lock);
2103 spin_unlock_irqrestore(&device_domain_lock, flags);
2109 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2111 struct pasid_table *table;
2112 struct intel_iommu *iommu;
2115 iommu = device_to_iommu(dev, &bus, &devfn);
2119 table = intel_pasid_get_table(dev);
2120 return domain_context_mapping_one(domain, iommu, table, bus, devfn);
2123 static int domain_context_mapped_cb(struct pci_dev *pdev,
2124 u16 alias, void *opaque)
2126 struct intel_iommu *iommu = opaque;
2128 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2131 static int domain_context_mapped(struct device *dev)
2133 struct intel_iommu *iommu;
2136 iommu = device_to_iommu(dev, &bus, &devfn);
2140 if (!dev_is_pci(dev))
2141 return device_context_mapped(iommu, bus, devfn);
2143 return !pci_for_each_dma_alias(to_pci_dev(dev),
2144 domain_context_mapped_cb, iommu);
2147 /* Returns a number of VTD pages, but aligned to MM page size */
2148 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2151 host_addr &= ~PAGE_MASK;
2152 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
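/*
 * Example (illustrative): a host offset of 0x234 with size 0x1000 spans two
 * 4KiB pages, so PAGE_ALIGN(0x1234) >> VTD_PAGE_SHIFT returns 2 VT-d pages
 * even though the size itself is only one page.
 */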
2155 /* Return largest possible superpage level for a given mapping */
2156 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2157 unsigned long iov_pfn,
2158 unsigned long phy_pfn,
2159 unsigned long pages)
2161 int support, level = 1;
2162 unsigned long pfnmerge;
2164 support = domain->iommu_superpage;
2166 /* To use a large page, the virtual *and* physical addresses
2167 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2168 of them will mean we have to use smaller pages. So just
2169 merge them and check both at once. */
2170 pfnmerge = iov_pfn | phy_pfn;
2172 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2173 pages >>= VTD_STRIDE_SHIFT;
2176 pfnmerge >>= VTD_STRIDE_SHIFT;
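/*
 * Example (illustrative): with iov_pfn and phy_pfn both 2MiB aligned (low
 * nine PFN bits clear) and at least 512 pages to map, the loop above
 * reaches at least level 2, i.e. a 2MiB superpage, provided
 * domain->iommu_superpage reports support for that level.
 */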
2183 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184 struct scatterlist *sg, unsigned long phys_pfn,
2185 unsigned long nr_pages, int prot)
2187 struct dma_pte *first_pte = NULL, *pte = NULL;
2188 phys_addr_t uninitialized_var(pteval);
2189 unsigned long sg_res = 0;
2190 unsigned int largepage_lvl = 0;
2191 unsigned long lvl_pages = 0;
2193 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2195 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2198 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2202 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2205 while (nr_pages > 0) {
2209 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2211 sg_res = aligned_nrpages(sg->offset, sg->length);
2212 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2213 sg->dma_length = sg->length;
2214 pteval = (sg_phys(sg) - pgoff) | prot;
2215 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2219 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2221 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
/* It is a large page */
2225 if (largepage_lvl > 1) {
2226 unsigned long nr_superpages, end_pfn;
2228 pteval |= DMA_PTE_LARGE_PAGE;
2229 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2231 nr_superpages = sg_res / lvl_pages;
2232 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2235 * Ensure that old small page tables are
2236 * removed to make room for superpage(s).
2237 * We're adding new large pages, so make sure
2238 * we don't remove their parent tables.
2240 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2243 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2247 /* We don't need lock here, nobody else
2248 * touches the iova range
2250 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2252 static int dumps = 5;
2253 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2254 iov_pfn, tmp, (unsigned long long)pteval);
2257 debug_dma_dump_mappings(NULL);
2262 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2264 BUG_ON(nr_pages < lvl_pages);
2265 BUG_ON(sg_res < lvl_pages);
2267 nr_pages -= lvl_pages;
2268 iov_pfn += lvl_pages;
2269 phys_pfn += lvl_pages;
2270 pteval += lvl_pages * VTD_PAGE_SIZE;
2271 sg_res -= lvl_pages;
2273 /* If the next PTE would be the first in a new page, then we
2274 need to flush the cache on the entries we've just written.
2275 And then we'll need to recalculate 'pte', so clear it and
2276 let it get set again in the if (!pte) block above.
2278 If we're done (!nr_pages) we need to flush the cache too.
2280 Also if we've been setting superpages, we may need to
2281 recalculate 'pte' and switch back to smaller pages for the
2282 end of the mapping, if the trailing size is not enough to
2283 use another superpage (i.e. sg_res < lvl_pages). */
2285 if (!nr_pages || first_pte_in_page(pte) ||
2286 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2287 domain_flush_cache(domain, first_pte,
2288 (void *)pte - (void *)first_pte);
2292 if (!sg_res && nr_pages)
2298 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2299 struct scatterlist *sg, unsigned long phys_pfn,
2300 unsigned long nr_pages, int prot)
2303 struct intel_iommu *iommu;
2305 /* Do the real mapping first */
2306 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2310 for_each_domain_iommu(iommu_id, domain) {
2311 iommu = g_iommus[iommu_id];
2312 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2318 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2319 struct scatterlist *sg, unsigned long nr_pages,
2322 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2325 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2326 unsigned long phys_pfn, unsigned long nr_pages,
2329 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
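/*
 * Usage note (illustrative): callers map either a scatterlist
 * (domain_sg_mapping) or a physically contiguous range (domain_pfn_mapping);
 * both funnel into domain_mapping(), which does the actual PTE setup and
 * then issues the caching-mode/write-buffer notification for every IOMMU
 * attached to the domain.
 */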
2332 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2334 unsigned long flags;
2335 struct context_entry *context;
2341 spin_lock_irqsave(&iommu->lock, flags);
2342 context = iommu_context_addr(iommu, bus, devfn, 0);
2344 spin_unlock_irqrestore(&iommu->lock, flags);
2347 did_old = context_domain_id(context);
2348 context_clear_entry(context);
2349 __iommu_flush_cache(iommu, context, sizeof(*context));
2350 spin_unlock_irqrestore(&iommu->lock, flags);
2351 iommu->flush.flush_context(iommu,
2353 (((u16)bus) << 8) | devfn,
2354 DMA_CCMD_MASK_NOBIT,
2355 DMA_CCMD_DEVICE_INVL);
2356 iommu->flush.flush_iotlb(iommu,
2363 static inline void unlink_domain_info(struct device_domain_info *info)
2365 assert_spin_locked(&device_domain_lock);
2366 list_del(&info->link);
2367 list_del(&info->global);
2369 info->dev->archdata.iommu = NULL;
2372 static void domain_remove_dev_info(struct dmar_domain *domain)
2374 struct device_domain_info *info, *tmp;
2375 unsigned long flags;
2377 spin_lock_irqsave(&device_domain_lock, flags);
2378 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2379 __dmar_remove_one_dev_info(info);
2380 spin_unlock_irqrestore(&device_domain_lock, flags);
2385 * Note: we use struct device->archdata.iommu to store the per-device domain info
2387 static struct dmar_domain *find_domain(struct device *dev)
2389 struct device_domain_info *info;
2391 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2392 struct iommu_domain *domain;
2394 dev->archdata.iommu = NULL;
2395 domain = iommu_get_domain_for_dev(dev);
2397 intel_iommu_attach_device(domain, dev);
2400 /* No lock here, assumes no domain exit in normal case */
2401 info = dev->archdata.iommu;
2404 return info->domain;
2408 static inline struct device_domain_info *
2409 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2411 struct device_domain_info *info;
2413 list_for_each_entry(info, &device_domain_list, global)
2414 if (info->iommu->segment == segment && info->bus == bus &&
2415 info->devfn == devfn)
2421 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2424 struct dmar_domain *domain)
2426 struct dmar_domain *found = NULL;
2427 struct device_domain_info *info;
2428 unsigned long flags;
2431 info = alloc_devinfo_mem();
2436 info->devfn = devfn;
2437 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2438 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2441 info->domain = domain;
2442 info->iommu = iommu;
2443 info->pasid_table = NULL;
2444 info->auxd_enabled = 0;
2445 INIT_LIST_HEAD(&info->auxiliary_domains);
2447 if (dev && dev_is_pci(dev)) {
2448 struct pci_dev *pdev = to_pci_dev(info->dev);
2450 if (!pdev->untrusted &&
2451 !pci_ats_disabled() &&
2452 ecap_dev_iotlb_support(iommu->ecap) &&
2453 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2454 dmar_find_matched_atsr_unit(pdev))
2455 info->ats_supported = 1;
2457 if (sm_supported(iommu)) {
2458 if (pasid_supported(iommu)) {
2459 int features = pci_pasid_features(pdev);
2461 info->pasid_supported = features | 1;
2464 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2465 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2466 info->pri_supported = 1;
2470 spin_lock_irqsave(&device_domain_lock, flags);
2472 found = find_domain(dev);
2475 struct device_domain_info *info2;
2476 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2478 found = info2->domain;
2484 spin_unlock_irqrestore(&device_domain_lock, flags);
2485 free_devinfo_mem(info);
2486 /* Caller must free the original domain */
2490 spin_lock(&iommu->lock);
2491 ret = domain_attach_iommu(domain, iommu);
2492 spin_unlock(&iommu->lock);
2495 spin_unlock_irqrestore(&device_domain_lock, flags);
2496 free_devinfo_mem(info);
2500 list_add(&info->link, &domain->devices);
2501 list_add(&info->global, &device_domain_list);
2503 dev->archdata.iommu = info;
2504 spin_unlock_irqrestore(&device_domain_lock, flags);
2506 /* PASID table is mandatory for a PCI device in scalable mode. */
2507 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2508 ret = intel_pasid_alloc_table(dev);
2510 dev_err(dev, "PASID table allocation failed\n");
2511 dmar_remove_one_dev_info(dev);
2515 /* Setup the PASID entry for requests without PASID: */
2516 spin_lock(&iommu->lock);
2517 if (hw_pass_through && domain_type_is_si(domain))
2518 ret = intel_pasid_setup_pass_through(iommu, domain,
2519 dev, PASID_RID2PASID);
2521 ret = intel_pasid_setup_second_level(iommu, domain,
2522 dev, PASID_RID2PASID);
2523 spin_unlock(&iommu->lock);
2525 dev_err(dev, "Setup RID2PASID failed\n");
2526 dmar_remove_one_dev_info(dev);
2531 if (dev && domain_context_mapping(domain, dev)) {
2532 dev_err(dev, "Domain context map failed\n");
2533 dmar_remove_one_dev_info(dev);
2540 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2542 *(u16 *)opaque = alias;
2546 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2548 struct device_domain_info *info;
2549 struct dmar_domain *domain = NULL;
2550 struct intel_iommu *iommu;
2552 unsigned long flags;
2555 iommu = device_to_iommu(dev, &bus, &devfn);
2559 if (dev_is_pci(dev)) {
2560 struct pci_dev *pdev = to_pci_dev(dev);
2562 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2564 spin_lock_irqsave(&device_domain_lock, flags);
2565 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2566 PCI_BUS_NUM(dma_alias),
2569 iommu = info->iommu;
2570 domain = info->domain;
2572 spin_unlock_irqrestore(&device_domain_lock, flags);
2574 /* DMA alias already has a domain, use it */
2579 /* Allocate and initialize new domain for the device */
2580 domain = alloc_domain(0);
2583 if (domain_init(domain, iommu, gaw)) {
2584 domain_exit(domain);
2592 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2593 struct dmar_domain *domain)
2595 struct intel_iommu *iommu;
2596 struct dmar_domain *tmp;
2597 u16 req_id, dma_alias;
2600 iommu = device_to_iommu(dev, &bus, &devfn);
2604 req_id = ((u16)bus << 8) | devfn;
2606 if (dev_is_pci(dev)) {
2607 struct pci_dev *pdev = to_pci_dev(dev);
2609 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2611 /* register PCI DMA alias device */
2612 if (req_id != dma_alias) {
2613 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2614 dma_alias & 0xff, NULL, domain);
2616 if (!tmp || tmp != domain)
2621 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2622 if (!tmp || tmp != domain)
2628 static int iommu_domain_identity_map(struct dmar_domain *domain,
2629 unsigned long long start,
2630 unsigned long long end)
2632 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2633 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2635 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2636 dma_to_mm_pfn(last_vpfn))) {
2637 pr_err("Reserving iova failed\n");
2641 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2643 * An RMRR range might overlap with a physical memory range; clear it first
2646 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2648 return __domain_mapping(domain, first_vpfn, NULL,
2649 first_vpfn, last_vpfn - first_vpfn + 1,
2650 DMA_PTE_READ|DMA_PTE_WRITE);
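/*
 * Illustrative sketch (hypothetical helper, not part of this driver):
 * the identity mapping above converts a physical byte range into
 * inclusive VT-d page frame numbers, assuming VTD_PAGE_SHIFT == 12.
 * An RMRR spanning 0xdd800000..0xdfffffff, for instance, becomes
 * vPFNs 0xdd800..0xdffff, i.e. 0x2800 (10240) 4 KiB pages.
 */
static unsigned long example_identity_nr_pages(unsigned long long start,
					       unsigned long long end)
{
	unsigned long first_vpfn = start >> 12;	/* inclusive */
	unsigned long last_vpfn = end >> 12;	/* inclusive */

	return last_vpfn - first_vpfn + 1;
}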
2653 static int domain_prepare_identity_map(struct device *dev,
2654 struct dmar_domain *domain,
2655 unsigned long long start,
2656 unsigned long long end)
2658 /* For _hardware_ passthrough, don't bother. But for software
2659 passthrough, we do it anyway -- it may indicate a memory
2660 range which is reserved in E820 and therefore didn't get set
2661 up to start with in si_domain */
2662 if (domain == si_domain && hw_pass_through) {
2663 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n", start, end);
2668 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2671 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2672 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2673 dmi_get_system_info(DMI_BIOS_VENDOR),
2674 dmi_get_system_info(DMI_BIOS_VERSION),
2675 dmi_get_system_info(DMI_PRODUCT_VERSION));
2679 if (end >> agaw_to_width(domain->agaw)) {
2680 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2681 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2682 agaw_to_width(domain->agaw),
2683 dmi_get_system_info(DMI_BIOS_VENDOR),
2684 dmi_get_system_info(DMI_BIOS_VERSION),
2685 dmi_get_system_info(DMI_PRODUCT_VERSION));
2689 return iommu_domain_identity_map(domain, start, end);
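/*
 * Illustrative sketch (hypothetical helper, not part of this driver):
 * the 'end >> agaw_to_width()' check above rejects RMRRs that reach
 * beyond the domain's guest address width.  With a 39-bit domain, for
 * example, any end address at or above 1ULL << 39 shifts down to a
 * non-zero value and triggers the warning.
 */
static int example_rmrr_exceeds_width(unsigned long long end, int addr_width)
{
	return (end >> addr_width) != 0;
}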
2692 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2694 static int __init si_domain_init(int hw)
2696 struct dmar_rmrr_unit *rmrr;
2700 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2704 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2705 domain_exit(si_domain);
2712 for_each_online_node(nid) {
2713 unsigned long start_pfn, end_pfn;
2716 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2717 ret = iommu_domain_identity_map(si_domain,
2718 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2725 * Normally we use DMA domains for devices which have RMRRs. But we
2726 * relax this requirement for graphics and USB devices. Identity-map
2727 * the RMRRs for graphics and USB devices so that they can use the
2728 * si_domain.
2730 for_each_rmrr_units(rmrr) {
2731 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2733 unsigned long long start = rmrr->base_address;
2734 unsigned long long end = rmrr->end_address;
2736 if (device_is_rmrr_locked(dev))
2739 if (WARN_ON(end < start ||
2740 end >> agaw_to_width(si_domain->agaw)))
2743 ret = iommu_domain_identity_map(si_domain, start, end);
2752 static int identity_mapping(struct device *dev)
2754 struct device_domain_info *info;
2756 info = dev->archdata.iommu;
2757 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2758 return (info->domain == si_domain);
2763 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2765 struct dmar_domain *ndomain;
2766 struct intel_iommu *iommu;
2769 iommu = device_to_iommu(dev, &bus, &devfn);
2773 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2774 if (ndomain != domain)
2780 static bool device_has_rmrr(struct device *dev)
2782 struct dmar_rmrr_unit *rmrr;
2787 for_each_rmrr_units(rmrr) {
2789 * Return TRUE if this RMRR contains the device that is passed in.
2792 for_each_active_dev_scope(rmrr->devices,
2793 rmrr->devices_cnt, i, tmp)
2795 if (tmp == dev || is_downstream_to_pci_bridge(dev, tmp)) {
2805 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2806 * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2807 * @dev: device handle
2809 * We assume that PCI USB devices with RMRRs have them largely
2810 * for historical reasons and that the RMRR space is not actively used post
2811 * boot. This exclusion may change if vendors begin to abuse it.
2813 * The same exception is made for graphics devices, with the requirement that
2814 * any use of the RMRR regions will be torn down before assigning the device
2815 * to a guest.
2817 * Return: true if the RMRR is relaxable, false otherwise
2819 static bool device_rmrr_is_relaxable(struct device *dev)
2821 struct pci_dev *pdev;
2823 if (!dev_is_pci(dev))
2826 pdev = to_pci_dev(dev);
2827 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2834 * There are a couple cases where we need to restrict the functionality of
2835 * devices associated with RMRRs. The first is when evaluating a device for
2836 * identity mapping because problems exist when devices are moved in and out
2837 * of domains and their respective RMRR information is lost. This means that
2838 * a device with associated RMRRs will never be in a "passthrough" domain.
2839 * The second is use of the device through the IOMMU API. This interface
2840 * expects to have full control of the IOVA space for the device. We cannot
2841 * satisfy both the requirement that RMRR access is maintained and have an
2842 * unencumbered IOVA space. We also have no ability to quiesce the device's
2843 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2844 * We therefore prevent devices associated with an RMRR from participating in
2845 * the IOMMU API, which eliminates them from device assignment.
2847 * In both cases, devices which have relaxable RMRRs are not subject to this
2848 * restriction. See the device_rmrr_is_relaxable comment.
2850 static bool device_is_rmrr_locked(struct device *dev)
2852 if (!device_has_rmrr(dev))
2855 if (device_rmrr_is_relaxable(dev))
2862 * Return the required default domain type for a specific device.
2864 * @dev: the device in question
2868 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2869 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2870 * - 0: both identity and dynamic domains work for this device
2872 static int device_def_domain_type(struct device *dev)
2874 if (dev_is_pci(dev)) {
2875 struct pci_dev *pdev = to_pci_dev(dev);
2877 if (device_is_rmrr_locked(dev))
2878 return IOMMU_DOMAIN_DMA;
2881 * Prevent any device marked as untrusted from getting
2882 * placed into the statically identity mapping domain.
2884 if (pdev->untrusted)
2885 return IOMMU_DOMAIN_DMA;
2887 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2888 return IOMMU_DOMAIN_IDENTITY;
2890 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2891 return IOMMU_DOMAIN_IDENTITY;
2894 * We want to start off with all devices in the 1:1 domain, and
2895 * take them out later if we find they can't access all of memory.
2897 * However, we can't do this for PCI devices behind bridges,
2898 * because all PCI devices behind the same bridge will end up
2899 * with the same source-id on their transactions.
2901 * Practically speaking, we can't change things around for these
2902 * devices at run-time, because we can't be sure there'll be no
2903 * DMA transactions in flight for any of their siblings.
2905 * So PCI devices (unless they're on the root bus) as well as
2906 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2907 * the 1:1 domain, just in _case_ one of their siblings turns out
2908 * not to be able to map all of memory.
2910 if (!pci_is_pcie(pdev)) {
2911 if (!pci_is_root_bus(pdev->bus))
2912 return IOMMU_DOMAIN_DMA;
2913 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2914 return IOMMU_DOMAIN_DMA;
2915 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2916 return IOMMU_DOMAIN_DMA;
2918 if (device_has_rmrr(dev))
2919 return IOMMU_DOMAIN_DMA;
2922 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2923 IOMMU_DOMAIN_IDENTITY : 0;
2926 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2929 * Start from a sane IOMMU hardware state.
2930 * If the queued invalidation is already initialized by us
2931 * (for example, while enabling interrupt-remapping) then
2932 * we already have things rolling from a sane state.
2936 * Clear any previous faults.
2938 dmar_fault(-1, iommu);
2940 * Disable queued invalidation if supported and already enabled
2941 * before OS handover.
2943 dmar_disable_qi(iommu);
2946 if (dmar_enable_qi(iommu)) {
2948 * Queued invalidation is not enabled; fall back to register-based invalidation
2950 iommu->flush.flush_context = __iommu_flush_context;
2951 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2952 pr_info("%s: Using Register based invalidation\n",
2955 iommu->flush.flush_context = qi_flush_context;
2956 iommu->flush.flush_iotlb = qi_flush_iotlb;
2957 pr_info("%s: Using Queued invalidation\n", iommu->name);
2961 static int copy_context_table(struct intel_iommu *iommu,
2962 struct root_entry *old_re,
2963 struct context_entry **tbl,
2966 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2967 struct context_entry *new_ce = NULL, ce;
2968 struct context_entry *old_ce = NULL;
2969 struct root_entry re;
2970 phys_addr_t old_ce_phys;
2972 tbl_idx = ext ? bus * 2 : bus;
2973 memcpy(&re, old_re, sizeof(re));
2975 for (devfn = 0; devfn < 256; devfn++) {
2976 /* First calculate the correct index */
2977 idx = (ext ? devfn * 2 : devfn) % 256;
2980 /* First save what we may have and clean up */
2982 tbl[tbl_idx] = new_ce;
2983 __iommu_flush_cache(iommu, new_ce,
2993 old_ce_phys = root_entry_lctp(&re);
2995 old_ce_phys = root_entry_uctp(&re);
2998 if (ext && devfn == 0) {
2999 /* No LCTP, try UCTP */
3008 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3013 new_ce = alloc_pgtable_page(iommu->node);
3020 /* Now copy the context entry */
3021 memcpy(&ce, old_ce + idx, sizeof(ce));
3023 if (!__context_present(&ce))
3026 did = context_domain_id(&ce);
3027 if (did >= 0 && did < cap_ndoms(iommu->cap))
3028 set_bit(did, iommu->domain_ids);
3031 * We need a marker for copied context entries. This
3032 * marker needs to work for the old format as well as
3033 * for extended context entries.
3035 * Bit 67 of the context entry is used. In the old
3036 * format this bit is available to software, in the
3037 * extended format it is the PGE bit, but PGE is ignored
3038 * by HW if PASIDs are disabled (and thus still available).
3041 * So disable PASIDs first and then mark the entry
3042 * copied. This means that we don't copy PASID
3043 * translations from the old kernel, but this is fine as
3044 * faults there are not fatal.
3046 context_clear_pasid_enable(&ce);
3047 context_set_copied(&ce);
3052 tbl[tbl_idx + pos] = new_ce;
3054 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
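/*
 * Illustrative sketch (hypothetical helper, not part of this driver):
 * the "copied" marker described above is bit 67 of the 128-bit context
 * entry, which lands in bit 3 of the upper 64-bit word when the entry
 * is viewed as two u64 halves (lo/hi):
 */
static void example_mark_context_copied(unsigned long long *hi)
{
	*hi |= 1ULL << (67 - 64);	/* bit 67 overall == bit 3 of 'hi' */
}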
3063 static int copy_translation_tables(struct intel_iommu *iommu)
3065 struct context_entry **ctxt_tbls;
3066 struct root_entry *old_rt;
3067 phys_addr_t old_rt_phys;
3068 int ctxt_table_entries;
3069 unsigned long flags;
3074 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3075 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3076 new_ext = !!ecap_ecs(iommu->ecap);
3079 * The RTT bit can only be changed when translation is disabled,
3080 * but disabling translation means opening a window for data
3081 * corruption. So bail out and don't copy anything if we would
3082 * have to change the bit.
3087 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3091 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3095 /* This is too big for the stack - allocate it from slab */
3096 ctxt_table_entries = ext ? 512 : 256;
3098 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3102 for (bus = 0; bus < 256; bus++) {
3103 ret = copy_context_table(iommu, &old_rt[bus],
3104 ctxt_tbls, bus, ext);
3106 pr_err("%s: Failed to copy context table for bus %d\n",
3112 spin_lock_irqsave(&iommu->lock, flags);
3114 /* Context tables are copied, now write them to the root_entry table */
3115 for (bus = 0; bus < 256; bus++) {
3116 int idx = ext ? bus * 2 : bus;
3119 if (ctxt_tbls[idx]) {
3120 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3121 iommu->root_entry[bus].lo = val;
3124 if (!ext || !ctxt_tbls[idx + 1])
3127 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3128 iommu->root_entry[bus].hi = val;
3131 spin_unlock_irqrestore(&iommu->lock, flags);
3135 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
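/*
 * Illustrative sketch (hypothetical helper, not part of this driver):
 * in the extended (ECS) format a context entry is 256 bits, i.e. it
 * occupies two 128-bit context_entry slots, so one 4 KiB table only
 * covers devfns 0-127 and each bus needs a second table for devfns
 * 128-255 (hence the 512 table pointers allocated above, two per bus).
 * The (bus, devfn) -> (table, slot) mapping used by copy_context_table()
 * then looks like this:
 */
static void example_ext_context_slot(int bus, int devfn, int *table, int *slot)
{
	*table = bus * 2 + devfn / 128;	/* devfn >= 128 spills into the second table */
	*slot = (devfn * 2) % 256;	/* two 128-bit slots per extended entry */
}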
3145 static int __init init_dmars(void)
3147 struct dmar_drhd_unit *drhd;
3148 struct intel_iommu *iommu;
3154 * initialize and program root entry to not present
3157 for_each_drhd_unit(drhd) {
3159 * lock not needed as this is only incremented in the single-
3160 * threaded kernel __init code path; all other accesses are read-only
3163 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3167 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3170 /* Preallocate enough resources for IOMMU hot-addition */
3171 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3172 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3174 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3177 pr_err("Allocating global iommu array failed\n");
3182 for_each_iommu(iommu, drhd) {
3183 if (drhd->ignored) {
3184 iommu_disable_translation(iommu);
3189 * Find the max PASID size of all IOMMUs in the system.
3190 * We need to ensure the system PASID table is no bigger
3191 * than the smallest supported size.
3193 if (pasid_supported(iommu)) {
3194 u32 temp = 2 << ecap_pss(iommu->ecap);
3196 intel_pasid_max_id = min_t(u32, temp,
3197 intel_pasid_max_id);
3200 g_iommus[iommu->seq_id] = iommu;
3202 intel_iommu_init_qi(iommu);
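/*
 * Illustrative sketch (hypothetical helper, not part of this driver):
 * ecap_pss() reports the supported PASID width minus one, so
 * '2 << pss' is the number of PASIDs a unit can address (e.g.
 * pss == 19 means 20-bit PASIDs, i.e. 2 << 19 == 1048576).  The loop
 * above keeps the minimum across all units so the system-wide PASID
 * table fits the least capable IOMMU.
 */
static unsigned int example_max_pasids(unsigned int pss)
{
	return 2u << pss;	/* 2^(pss + 1) PASIDs */
}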