1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
50
51 #define ROOT_SIZE               VTD_PAGE_SIZE
52 #define CONTEXT_SIZE            VTD_PAGE_SIZE
53
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58
59 #define IOAPIC_RANGE_START      (0xfee00000)
60 #define IOAPIC_RANGE_END        (0xfeefffff)
61 #define IOVA_START_ADDR         (0x1000)
62
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
64
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
67
68 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
70
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
74                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
76
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN          (1)
79
80 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
81
82 /* page table handling */
83 #define LEVEL_STRIDE            (9)
84 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
85
86 /*
 87  * This bitmap is used to advertise the page sizes our hardware supports
88  * to the IOMMU core, which will then use this information to split
89  * physically contiguous memory regions it is mapping into page sizes
90  * that we support.
91  *
92  * Traditionally the IOMMU core just handed us the mappings directly,
 93  * after making sure the size is a power-of-two multiple of 4KiB and
 94  * that the mapping has natural alignment.
95  *
96  * To retain this behavior, we currently advertise that we support
 97  * all power-of-two page sizes that are multiples of 4KiB.
98  *
99  * If at some point we'd like to utilize the IOMMU core's new behavior,
100  * we could change this to advertise the real page sizes we support.
101  */
102 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
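/*
 * Illustrative note: ~0xFFFUL leaves bits 0-11 clear and sets every bit
 * from 12 upwards, so the core is told we support 4KiB, 8KiB, 16KiB, ...
 * i.e. every power-of-two size that is a multiple of 4KiB.
 */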
103
104 static inline int agaw_to_level(int agaw)
105 {
106         return agaw + 2;
107 }
108
109 static inline int agaw_to_width(int agaw)
110 {
111         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 }
113
114 static inline int width_to_agaw(int width)
115 {
116         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117 }
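/*
 * Worked example (illustration only): a 48-bit address width gives
 * width_to_agaw(48) == 2 and agaw_to_level(2) == 4, i.e. a 4-level page
 * table; DEFAULT_DOMAIN_ADDRESS_WIDTH (57) maps to agaw 3 and a 5-level
 * table.
 */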
118
119 static inline unsigned int level_to_offset_bits(int level)
120 {
121         return (level - 1) * LEVEL_STRIDE;
122 }
123
124 static inline int pfn_level_offset(unsigned long pfn, int level)
125 {
126         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 }
128
129 static inline unsigned long level_mask(int level)
130 {
131         return -1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long level_size(int level)
135 {
136         return 1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 {
141         return (pfn + level_size(level) - 1) & level_mask(level);
142 }
143
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 {
146         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 }
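/*
 * Illustrative sketch of the level arithmetic above: each level indexes
 * LEVEL_STRIDE (9) bits of the DMA PFN, so level 1 covers pfn bits 0-8,
 * level 2 covers bits 9-17, and so on; level_size(2) == 512 pages (2MiB
 * of IOVA space) and level_size(3) == 262144 pages (1GiB).
 */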
148
 149 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
150    are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 {
153         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 {
158         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 {
162         return mm_to_dma_pfn(page_to_pfn(pg));
163 }
164 static inline unsigned long virt_to_dma_pfn(void *p)
165 {
166         return page_to_dma_pfn(virt_to_page(p));
167 }
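/*
 * Note (illustrative): with 4KiB MM pages PAGE_SHIFT == VTD_PAGE_SHIFT,
 * so the conversions above are identity operations; on configurations
 * with larger MM pages one MM PFN spans several DMA PFNs and the shifts
 * above do the scaling.
 */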
168
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
171
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
174
175 /*
 176  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
 177  * (used when the kernel is launched w/ TXT)
178  */
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
182
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
184
185 /*
186  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
187  * if marked present.
188  */
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
190 {
191         if (!(re->lo & 1))
192                 return 0;
193
194         return re->lo & VTD_PAGE_MASK;
195 }
196
197 /*
198  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
202 {
203         if (!(re->hi & 1))
204                 return 0;
205
206         return re->hi & VTD_PAGE_MASK;
207 }
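/*
 * Sketch of the root entry layout as used by the helpers above: bit 0 of
 * each half is the present bit and the page-aligned remainder is a
 * context-table pointer; the upper half (UCTP) is only used in scalable
 * mode, where it covers devfns 0x80-0xff (see iommu_context_addr()).
 */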
208
209 static inline void context_clear_pasid_enable(struct context_entry *context)
210 {
211         context->lo &= ~(1ULL << 11);
212 }
213
214 static inline bool context_pasid_enabled(struct context_entry *context)
215 {
216         return !!(context->lo & (1ULL << 11));
217 }
218
219 static inline void context_set_copied(struct context_entry *context)
220 {
221         context->hi |= (1ull << 3);
222 }
223
224 static inline bool context_copied(struct context_entry *context)
225 {
226         return !!(context->hi & (1ULL << 3));
227 }
228
229 static inline bool __context_present(struct context_entry *context)
230 {
231         return (context->lo & 1);
232 }
233
234 bool context_present(struct context_entry *context)
235 {
236         return context_pasid_enabled(context) ?
237              __context_present(context) :
238              __context_present(context) && !context_copied(context);
239 }
240
241 static inline void context_set_present(struct context_entry *context)
242 {
243         context->lo |= 1;
244 }
245
246 static inline void context_set_fault_enable(struct context_entry *context)
247 {
248         context->lo &= (((u64)-1) << 2) | 1;
249 }
250
251 static inline void context_set_translation_type(struct context_entry *context,
252                                                 unsigned long value)
253 {
254         context->lo &= (((u64)-1) << 4) | 3;
255         context->lo |= (value & 3) << 2;
256 }
257
258 static inline void context_set_address_root(struct context_entry *context,
259                                             unsigned long value)
260 {
261         context->lo &= ~VTD_PAGE_MASK;
262         context->lo |= value & VTD_PAGE_MASK;
263 }
264
265 static inline void context_set_address_width(struct context_entry *context,
266                                              unsigned long value)
267 {
268         context->hi |= value & 7;
269 }
270
271 static inline void context_set_domain_id(struct context_entry *context,
272                                          unsigned long value)
273 {
274         context->hi |= (value & ((1 << 16) - 1)) << 8;
275 }
276
277 static inline int context_domain_id(struct context_entry *c)
278 {
279         return((c->hi >> 8) & 0xffff);
280 }
281
282 static inline void context_clear_entry(struct context_entry *context)
283 {
284         context->lo = 0;
285         context->hi = 0;
286 }
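/*
 * Summary, derived from the helpers above, of how a legacy-mode context
 * entry is encoded: lo bit 0 = present, lo bits 2-3 = translation type,
 * lo bits 12+ = address root (page-table pointer); hi bits 0-2 = address
 * width, hi bits 8-23 = domain id. Bit 11 of lo (PASID enable) and bit 3
 * of hi (copied) are consulted when translation was pre-enabled.
 */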
287
288 /*
 289  * This domain is a static identity mapping domain.
 290  *      1. This domain creates a static 1:1 mapping of all usable memory.
 291  *      2. It maps to each iommu if successful.
 292  *      3. Each iommu maps to this domain if successful.
293  */
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
296
 297 /* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
299
300 /*
301  * This is a DMA domain allocated through the iommu domain allocation
302  * interface. But one or more devices belonging to this domain have
 303  * been chosen to use a private domain. We should avoid using the
304  * map/unmap/iova_to_phys APIs on it.
305  */
306 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
307
308 #define for_each_domain_iommu(idx, domain)                      \
309         for (idx = 0; idx < g_num_of_iommus; idx++)             \
310                 if (domain->iommu_refcnt[idx])
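/*
 * Typical use of the macro above (illustrative; do_something() is a
 * placeholder):
 *
 *	for_each_domain_iommu(i, domain)
 *		do_something(g_iommus[i]);
 *
 * i.e. only IOMMUs that hold a reference on the domain are visited.
 */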
311
312 struct dmar_rmrr_unit {
313         struct list_head list;          /* list of rmrr units   */
314         struct acpi_dmar_header *hdr;   /* ACPI header          */
315         u64     base_address;           /* reserved base address*/
316         u64     end_address;            /* reserved end address */
317         struct dmar_dev_scope *devices; /* target devices */
318         int     devices_cnt;            /* target device count */
319 };
320
321 struct dmar_atsr_unit {
322         struct list_head list;          /* list of ATSR units */
323         struct acpi_dmar_header *hdr;   /* ACPI header */
324         struct dmar_dev_scope *devices; /* target devices */
325         int devices_cnt;                /* target device count */
326         u8 include_all:1;               /* include all ports */
327 };
328
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
331
332 #define for_each_rmrr_units(rmrr) \
333         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
334
 335 /* number of IOMMUs; bounds indexing into g_iommus */
336 static int g_num_of_iommus;
337
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static void domain_context_clear(struct intel_iommu *iommu,
343                                  struct device *dev);
344 static int domain_detach_iommu(struct dmar_domain *domain,
345                                struct intel_iommu *iommu);
346 static bool device_is_rmrr_locked(struct device *dev);
347 static int intel_iommu_attach_device(struct iommu_domain *domain,
348                                      struct device *dev);
349
350 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
351 int dmar_disabled = 0;
352 #else
353 int dmar_disabled = 1;
354 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355
356 int intel_iommu_sm;
357 int intel_iommu_enabled = 0;
358 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
359
360 static int dmar_map_gfx = 1;
361 static int dmar_forcedac;
362 static int intel_iommu_strict;
363 static int intel_iommu_superpage = 1;
364 static int iommu_identity_mapping;
365
366 #define IDENTMAP_ALL            1
367 #define IDENTMAP_GFX            2
368 #define IDENTMAP_AZALIA         4
369
370 int intel_iommu_gfx_mapped;
371 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
372
373 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
374 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
375 static DEFINE_SPINLOCK(device_domain_lock);
376 static LIST_HEAD(device_domain_list);
377
378 /*
379  * Iterate over elements in device_domain_list and call the specified
380  * callback @fn against each element.
381  */
382 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
383                                      void *data), void *data)
384 {
385         int ret = 0;
386         unsigned long flags;
387         struct device_domain_info *info;
388
389         spin_lock_irqsave(&device_domain_lock, flags);
390         list_for_each_entry(info, &device_domain_list, global) {
391                 ret = fn(info, data);
392                 if (ret) {
393                         spin_unlock_irqrestore(&device_domain_lock, flags);
394                         return ret;
395                 }
396         }
397         spin_unlock_irqrestore(&device_domain_lock, flags);
398
399         return 0;
400 }
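/*
 * Usage sketch (count_one() is a hypothetical callback, not part of this
 * file):
 *
 *	static int count_one(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int n = 0;
 *	for_each_device_domain(count_one, &n);
 *
 * A non-zero return from the callback stops the walk and is propagated
 * back to the caller.
 */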
401
402 const struct iommu_ops intel_iommu_ops;
403
404 static bool translation_pre_enabled(struct intel_iommu *iommu)
405 {
406         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 }
408
409 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
410 {
411         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 }
413
414 static void init_translation_status(struct intel_iommu *iommu)
415 {
416         u32 gsts;
417
418         gsts = readl(iommu->reg + DMAR_GSTS_REG);
419         if (gsts & DMA_GSTS_TES)
420                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
421 }
422
 423 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
424 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
425 {
426         return container_of(dom, struct dmar_domain, domain);
427 }
428
429 static int __init intel_iommu_setup(char *str)
430 {
431         if (!str)
432                 return -EINVAL;
433         while (*str) {
434                 if (!strncmp(str, "on", 2)) {
435                         dmar_disabled = 0;
436                         pr_info("IOMMU enabled\n");
437                 } else if (!strncmp(str, "off", 3)) {
438                         dmar_disabled = 1;
439                         no_platform_optin = 1;
440                         pr_info("IOMMU disabled\n");
441                 } else if (!strncmp(str, "igfx_off", 8)) {
442                         dmar_map_gfx = 0;
443                         pr_info("Disable GFX device mapping\n");
444                 } else if (!strncmp(str, "forcedac", 8)) {
445                         pr_info("Forcing DAC for PCI devices\n");
446                         dmar_forcedac = 1;
447                 } else if (!strncmp(str, "strict", 6)) {
448                         pr_info("Disable batched IOTLB flush\n");
449                         intel_iommu_strict = 1;
450                 } else if (!strncmp(str, "sp_off", 6)) {
451                         pr_info("Disable supported super page\n");
452                         intel_iommu_superpage = 0;
453                 } else if (!strncmp(str, "sm_on", 5)) {
454                         pr_info("Intel-IOMMU: scalable mode supported\n");
455                         intel_iommu_sm = 1;
456                 } else if (!strncmp(str, "tboot_noforce", 13)) {
457                         printk(KERN_INFO
458                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
459                         intel_iommu_tboot_noforce = 1;
460                 }
461
462                 str += strcspn(str, ",");
463                 while (*str == ',')
464                         str++;
465         }
466         return 0;
467 }
468 __setup("intel_iommu=", intel_iommu_setup);
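/*
 * Example (illustrative) of a command line accepted by the parser above,
 * with options separated by commas:
 *
 *	intel_iommu=on,sm_on,strict
 *
 * which enables the IOMMU, opts in to scalable mode and disables batched
 * IOTLB flushing.
 */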
469
470 static struct kmem_cache *iommu_domain_cache;
471 static struct kmem_cache *iommu_devinfo_cache;
472
473 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
474 {
475         struct dmar_domain **domains;
476         int idx = did >> 8;
477
478         domains = iommu->domains[idx];
479         if (!domains)
480                 return NULL;
481
482         return domains[did & 0xff];
483 }
484
485 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
486                              struct dmar_domain *domain)
487 {
488         struct dmar_domain **domains;
489         int idx = did >> 8;
490
491         if (!iommu->domains[idx]) {
492                 size_t size = 256 * sizeof(struct dmar_domain *);
493                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494         }
495
496         domains = iommu->domains[idx];
497         if (WARN_ON(!domains))
498                 return;
499         else
500                 domains[did & 0xff] = domain;
501 }
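/*
 * Note on the lookups above (illustrative): iommu->domains is a
 * two-level array keyed by domain id, with the high byte (did >> 8)
 * selecting a lazily allocated page of 256 pointers and the low byte
 * (did & 0xff) selecting the entry within that page.
 */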
502
503 void *alloc_pgtable_page(int node)
504 {
505         struct page *page;
506         void *vaddr = NULL;
507
508         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
509         if (page)
510                 vaddr = page_address(page);
511         return vaddr;
512 }
513
514 void free_pgtable_page(void *vaddr)
515 {
516         free_page((unsigned long)vaddr);
517 }
518
519 static inline void *alloc_domain_mem(void)
520 {
521         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 }
523
524 static void free_domain_mem(void *vaddr)
525 {
526         kmem_cache_free(iommu_domain_cache, vaddr);
527 }
528
529 static inline void * alloc_devinfo_mem(void)
530 {
531         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 }
533
534 static inline void free_devinfo_mem(void *vaddr)
535 {
536         kmem_cache_free(iommu_devinfo_cache, vaddr);
537 }
538
539 static inline int domain_type_is_si(struct dmar_domain *domain)
540 {
541         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 }
543
544 static inline int domain_pfn_supported(struct dmar_domain *domain,
545                                        unsigned long pfn)
546 {
547         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
548
549         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
550 }
551
552 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
553 {
554         unsigned long sagaw;
555         int agaw = -1;
556
557         sagaw = cap_sagaw(iommu->cap);
558         for (agaw = width_to_agaw(max_gaw);
559              agaw >= 0; agaw--) {
560                 if (test_bit(agaw, &sagaw))
561                         break;
562         }
563
564         return agaw;
565 }
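/*
 * Worked example (illustrative): if cap_sagaw() reports only bit 2 set
 * (48-bit, 4-level tables), a request for DEFAULT_DOMAIN_ADDRESS_WIDTH
 * (57) starts at agaw 3, finds it unsupported and settles on agaw 2.
 */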
566
567 /*
568  * Calculate max SAGAW for each iommu.
569  */
570 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
571 {
572         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
573 }
574
575 /*
 576  * Calculate agaw for each iommu.
 577  * "SAGAW" may be different across iommus; use a default agaw, and
 578  * fall back to a smaller supported agaw for iommus that don't support it.
579  */
580 int iommu_calculate_agaw(struct intel_iommu *iommu)
581 {
582         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
583 }
584
 585 /* This function only returns a single iommu in a domain */
586 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
587 {
588         int iommu_id;
589
590         /* si_domain and vm domain should not get here. */
591         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592                 return NULL;
593
594         for_each_domain_iommu(iommu_id, domain)
595                 break;
596
597         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598                 return NULL;
599
600         return g_iommus[iommu_id];
601 }
602
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
604 {
605         struct dmar_drhd_unit *drhd;
606         struct intel_iommu *iommu;
607         bool found = false;
608         int i;
609
610         domain->iommu_coherency = 1;
611
612         for_each_domain_iommu(i, domain) {
613                 found = true;
614                 if (!ecap_coherent(g_iommus[i]->ecap)) {
615                         domain->iommu_coherency = 0;
616                         break;
617                 }
618         }
619         if (found)
620                 return;
621
622         /* No hardware attached; use lowest common denominator */
623         rcu_read_lock();
624         for_each_active_iommu(iommu, drhd) {
625                 if (!ecap_coherent(iommu->ecap)) {
626                         domain->iommu_coherency = 0;
627                         break;
628                 }
629         }
630         rcu_read_unlock();
631 }
632
633 static int domain_update_iommu_snooping(struct intel_iommu *skip)
634 {
635         struct dmar_drhd_unit *drhd;
636         struct intel_iommu *iommu;
637         int ret = 1;
638
639         rcu_read_lock();
640         for_each_active_iommu(iommu, drhd) {
641                 if (iommu != skip) {
642                         if (!ecap_sc_support(iommu->ecap)) {
643                                 ret = 0;
644                                 break;
645                         }
646                 }
647         }
648         rcu_read_unlock();
649
650         return ret;
651 }
652
653 static int domain_update_iommu_superpage(struct intel_iommu *skip)
654 {
655         struct dmar_drhd_unit *drhd;
656         struct intel_iommu *iommu;
657         int mask = 0xf;
658
659         if (!intel_iommu_superpage) {
660                 return 0;
661         }
662
663         /* set iommu_superpage to the smallest common denominator */
664         rcu_read_lock();
665         for_each_active_iommu(iommu, drhd) {
666                 if (iommu != skip) {
667                         mask &= cap_super_page_val(iommu->cap);
668                         if (!mask)
669                                 break;
670                 }
671         }
672         rcu_read_unlock();
673
674         return fls(mask);
675 }
676
677 /* Some capabilities may be different across iommus */
678 static void domain_update_iommu_cap(struct dmar_domain *domain)
679 {
680         domain_update_iommu_coherency(domain);
681         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
682         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 }
684
685 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686                                          u8 devfn, int alloc)
687 {
688         struct root_entry *root = &iommu->root_entry[bus];
689         struct context_entry *context;
690         u64 *entry;
691
692         entry = &root->lo;
693         if (sm_supported(iommu)) {
694                 if (devfn >= 0x80) {
695                         devfn -= 0x80;
696                         entry = &root->hi;
697                 }
698                 devfn *= 2;
699         }
700         if (*entry & 1)
701                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
702         else {
703                 unsigned long phy_addr;
704                 if (!alloc)
705                         return NULL;
706
707                 context = alloc_pgtable_page(iommu->node);
708                 if (!context)
709                         return NULL;
710
711                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
712                 phy_addr = virt_to_phys((void *)context);
713                 *entry = phy_addr | 1;
714                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
715         }
716         return &context[devfn];
717 }
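/*
 * Layout note, derived from the code above: in scalable mode each half
 * of a root entry covers 128 devfns (lo: 0x00-0x7f, hi: 0x80-0xff) and
 * every devfn occupies two context-entry slots, hence the "devfn *= 2";
 * in legacy mode the low half covers all 256 devfns directly.
 */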
718
719 static int iommu_dummy(struct device *dev)
720 {
721         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
722 }
723
724 /**
725  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
726  *                               sub-hierarchy of a candidate PCI-PCI bridge
727  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
728  * @bridge: the candidate PCI-PCI bridge
729  *
730  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
731  */
732 static bool
733 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
734 {
735         struct pci_dev *pdev, *pbridge;
736
737         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
738                 return false;
739
740         pdev = to_pci_dev(dev);
741         pbridge = to_pci_dev(bridge);
742
743         if (pbridge->subordinate &&
744             pbridge->subordinate->number <= pdev->bus->number &&
745             pbridge->subordinate->busn_res.end >= pdev->bus->number)
746                 return true;
747
748         return false;
749 }
750
751 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
752 {
753         struct dmar_drhd_unit *drhd = NULL;
754         struct intel_iommu *iommu;
755         struct device *tmp;
756         struct pci_dev *pdev = NULL;
757         u16 segment = 0;
758         int i;
759
760         if (iommu_dummy(dev))
761                 return NULL;
762
763         if (dev_is_pci(dev)) {
764                 struct pci_dev *pf_pdev;
765
766                 pdev = to_pci_dev(dev);
767
768 #ifdef CONFIG_X86
769                 /* VMD child devices currently cannot be handled individually */
770                 if (is_vmd(pdev->bus))
771                         return NULL;
772 #endif
773
774                 /* VFs aren't listed in scope tables; we need to look up
775                  * the PF instead to find the IOMMU. */
776                 pf_pdev = pci_physfn(pdev);
777                 dev = &pf_pdev->dev;
778                 segment = pci_domain_nr(pdev->bus);
779         } else if (has_acpi_companion(dev))
780                 dev = &ACPI_COMPANION(dev)->dev;
781
782         rcu_read_lock();
783         for_each_active_iommu(iommu, drhd) {
784                 if (pdev && segment != drhd->segment)
785                         continue;
786
787                 for_each_active_dev_scope(drhd->devices,
788                                           drhd->devices_cnt, i, tmp) {
789                         if (tmp == dev) {
790                                 /* For a VF use its original BDF# not that of the PF
791                                  * which we used for the IOMMU lookup. Strictly speaking
792                                  * we could do this for all PCI devices; we only need to
793                                  * get the BDF# from the scope table for ACPI matches. */
794                                 if (pdev && pdev->is_virtfn)
795                                         goto got_pdev;
796
797                                 *bus = drhd->devices[i].bus;
798                                 *devfn = drhd->devices[i].devfn;
799                                 goto out;
800                         }
801
802                         if (is_downstream_to_pci_bridge(dev, tmp))
803                                 goto got_pdev;
804                 }
805
806                 if (pdev && drhd->include_all) {
807                 got_pdev:
808                         *bus = pdev->bus->number;
809                         *devfn = pdev->devfn;
810                         goto out;
811                 }
812         }
813         iommu = NULL;
814  out:
815         rcu_read_unlock();
816
817         return iommu;
818 }
819
820 static void domain_flush_cache(struct dmar_domain *domain,
821                                void *addr, int size)
822 {
823         if (!domain->iommu_coherency)
824                 clflush_cache_range(addr, size);
825 }
826
827 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
828 {
829         struct context_entry *context;
830         int ret = 0;
831         unsigned long flags;
832
833         spin_lock_irqsave(&iommu->lock, flags);
834         context = iommu_context_addr(iommu, bus, devfn, 0);
835         if (context)
836                 ret = context_present(context);
837         spin_unlock_irqrestore(&iommu->lock, flags);
838         return ret;
839 }
840
841 static void free_context_table(struct intel_iommu *iommu)
842 {
843         int i;
844         unsigned long flags;
845         struct context_entry *context;
846
847         spin_lock_irqsave(&iommu->lock, flags);
848         if (!iommu->root_entry) {
849                 goto out;
850         }
851         for (i = 0; i < ROOT_ENTRY_NR; i++) {
852                 context = iommu_context_addr(iommu, i, 0, 0);
853                 if (context)
854                         free_pgtable_page(context);
855
856                 if (!sm_supported(iommu))
857                         continue;
858
859                 context = iommu_context_addr(iommu, i, 0x80, 0);
860                 if (context)
861                         free_pgtable_page(context);
862
863         }
864         free_pgtable_page(iommu->root_entry);
865         iommu->root_entry = NULL;
866 out:
867         spin_unlock_irqrestore(&iommu->lock, flags);
868 }
869
870 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
871                                       unsigned long pfn, int *target_level)
872 {
873         struct dma_pte *parent, *pte;
874         int level = agaw_to_level(domain->agaw);
875         int offset;
876
877         BUG_ON(!domain->pgd);
878
879         if (!domain_pfn_supported(domain, pfn))
880                 /* Address beyond IOMMU's addressing capabilities. */
881                 return NULL;
882
883         parent = domain->pgd;
884
885         while (1) {
886                 void *tmp_page;
887
888                 offset = pfn_level_offset(pfn, level);
889                 pte = &parent[offset];
890                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
891                         break;
892                 if (level == *target_level)
893                         break;
894
895                 if (!dma_pte_present(pte)) {
896                         uint64_t pteval;
897
898                         tmp_page = alloc_pgtable_page(domain->nid);
899
900                         if (!tmp_page)
901                                 return NULL;
902
903                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
904                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
905                         if (cmpxchg64(&pte->val, 0ULL, pteval))
906                                 /* Someone else set it while we were thinking; use theirs. */
907                                 free_pgtable_page(tmp_page);
908                         else
909                                 domain_flush_cache(domain, pte, sizeof(*pte));
910                 }
911                 if (level == 1)
912                         break;
913
914                 parent = phys_to_virt(dma_pte_addr(pte));
915                 level--;
916         }
917
918         if (!*target_level)
919                 *target_level = level;
920
921         return pte;
922 }
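/*
 * Calling convention sketch for pfn_to_dma_pte() (illustrative): pass
 * *target_level == 0 to find whatever leaf currently maps the PFN
 * (stopping at a superpage or a non-present entry), or a specific level
 * (1 for 4KiB, 2 for 2MiB, ...) to get a PTE at exactly that level,
 * allocating intermediate tables on the way down.
 */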
923
 924 /* return the address's pte at a specific level */
925 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
926                                          unsigned long pfn,
927                                          int level, int *large_page)
928 {
929         struct dma_pte *parent, *pte;
930         int total = agaw_to_level(domain->agaw);
931         int offset;
932
933         parent = domain->pgd;
934         while (level <= total) {
935                 offset = pfn_level_offset(pfn, total);
936                 pte = &parent[offset];
937                 if (level == total)
938                         return pte;
939
940                 if (!dma_pte_present(pte)) {
941                         *large_page = total;
942                         break;
943                 }
944
945                 if (dma_pte_superpage(pte)) {
946                         *large_page = total;
947                         return pte;
948                 }
949
950                 parent = phys_to_virt(dma_pte_addr(pte));
951                 total--;
952         }
953         return NULL;
954 }
955
 956 /* clear last level pte; a tlb flush should follow */
957 static void dma_pte_clear_range(struct dmar_domain *domain,
958                                 unsigned long start_pfn,
959                                 unsigned long last_pfn)
960 {
961         unsigned int large_page;
962         struct dma_pte *first_pte, *pte;
963
964         BUG_ON(!domain_pfn_supported(domain, start_pfn));
965         BUG_ON(!domain_pfn_supported(domain, last_pfn));
966         BUG_ON(start_pfn > last_pfn);
967
968         /* we don't need lock here; nobody else touches the iova range */
969         do {
970                 large_page = 1;
971                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
972                 if (!pte) {
973                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
974                         continue;
975                 }
976                 do {
977                         dma_clear_pte(pte);
978                         start_pfn += lvl_to_nr_pages(large_page);
979                         pte++;
980                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
981
982                 domain_flush_cache(domain, first_pte,
983                                    (void *)pte - (void *)first_pte);
984
985         } while (start_pfn && start_pfn <= last_pfn);
986 }
987
988 static void dma_pte_free_level(struct dmar_domain *domain, int level,
989                                int retain_level, struct dma_pte *pte,
990                                unsigned long pfn, unsigned long start_pfn,
991                                unsigned long last_pfn)
992 {
993         pfn = max(start_pfn, pfn);
994         pte = &pte[pfn_level_offset(pfn, level)];
995
996         do {
997                 unsigned long level_pfn;
998                 struct dma_pte *level_pte;
999
1000                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001                         goto next;
1002
1003                 level_pfn = pfn & level_mask(level);
1004                 level_pte = phys_to_virt(dma_pte_addr(pte));
1005
1006                 if (level > 2) {
1007                         dma_pte_free_level(domain, level - 1, retain_level,
1008                                            level_pte, level_pfn, start_pfn,
1009                                            last_pfn);
1010                 }
1011
1012                 /*
1013                  * Free the page table if we're below the level we want to
1014                  * retain and the range covers the entire table.
1015                  */
1016                 if (level < retain_level && !(start_pfn > level_pfn ||
1017                       last_pfn < level_pfn + level_size(level) - 1)) {
1018                         dma_clear_pte(pte);
1019                         domain_flush_cache(domain, pte, sizeof(*pte));
1020                         free_pgtable_page(level_pte);
1021                 }
1022 next:
1023                 pfn += level_size(level);
1024         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1025 }
1026
1027 /*
1028  * clear last level (leaf) ptes and free page table pages below the
1029  * level we wish to keep intact.
1030  */
1031 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1032                                    unsigned long start_pfn,
1033                                    unsigned long last_pfn,
1034                                    int retain_level)
1035 {
1036         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1037         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1038         BUG_ON(start_pfn > last_pfn);
1039
1040         dma_pte_clear_range(domain, start_pfn, last_pfn);
1041
1042         /* We don't need lock here; nobody else touches the iova range */
1043         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1044                            domain->pgd, 0, start_pfn, last_pfn);
1045
1046         /* free pgd */
1047         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1048                 free_pgtable_page(domain->pgd);
1049                 domain->pgd = NULL;
1050         }
1051 }
1052
1053 /* When a page at a given level is being unlinked from its parent, we don't
1054    need to *modify* it at all. All we need to do is make a list of all the
1055    pages which can be freed just as soon as we've flushed the IOTLB and we
1056    know the hardware page-walk will no longer touch them.
1057    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1058    be freed. */
1059 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1060                                             int level, struct dma_pte *pte,
1061                                             struct page *freelist)
1062 {
1063         struct page *pg;
1064
1065         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1066         pg->freelist = freelist;
1067         freelist = pg;
1068
1069         if (level == 1)
1070                 return freelist;
1071
1072         pte = page_address(pg);
1073         do {
1074                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1075                         freelist = dma_pte_list_pagetables(domain, level - 1,
1076                                                            pte, freelist);
1077                 pte++;
1078         } while (!first_pte_in_page(pte));
1079
1080         return freelist;
1081 }
1082
1083 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1084                                         struct dma_pte *pte, unsigned long pfn,
1085                                         unsigned long start_pfn,
1086                                         unsigned long last_pfn,
1087                                         struct page *freelist)
1088 {
1089         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1090
1091         pfn = max(start_pfn, pfn);
1092         pte = &pte[pfn_level_offset(pfn, level)];
1093
1094         do {
1095                 unsigned long level_pfn;
1096
1097                 if (!dma_pte_present(pte))
1098                         goto next;
1099
1100                 level_pfn = pfn & level_mask(level);
1101
1102                 /* If range covers entire pagetable, free it */
1103                 if (start_pfn <= level_pfn &&
1104                     last_pfn >= level_pfn + level_size(level) - 1) {
 1105                         /* These subordinate page tables are going away entirely. Don't
1106                            bother to clear them; we're just going to *free* them. */
1107                         if (level > 1 && !dma_pte_superpage(pte))
1108                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1109
1110                         dma_clear_pte(pte);
1111                         if (!first_pte)
1112                                 first_pte = pte;
1113                         last_pte = pte;
1114                 } else if (level > 1) {
1115                         /* Recurse down into a level that isn't *entirely* obsolete */
1116                         freelist = dma_pte_clear_level(domain, level - 1,
1117                                                        phys_to_virt(dma_pte_addr(pte)),
1118                                                        level_pfn, start_pfn, last_pfn,
1119                                                        freelist);
1120                 }
1121 next:
1122                 pfn += level_size(level);
1123         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124
1125         if (first_pte)
1126                 domain_flush_cache(domain, first_pte,
1127                                    (void *)++last_pte - (void *)first_pte);
1128
1129         return freelist;
1130 }
1131
1132 /* We can't just free the pages because the IOMMU may still be walking
1133    the page tables, and may have cached the intermediate levels. The
1134    pages can only be freed after the IOTLB flush has been done. */
1135 static struct page *domain_unmap(struct dmar_domain *domain,
1136                                  unsigned long start_pfn,
1137                                  unsigned long last_pfn)
1138 {
1139         struct page *freelist;
1140
1141         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1142         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1143         BUG_ON(start_pfn > last_pfn);
1144
1145         /* we don't need lock here; nobody else touches the iova range */
1146         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1147                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1148
1149         /* free pgd */
1150         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1151                 struct page *pgd_page = virt_to_page(domain->pgd);
1152                 pgd_page->freelist = freelist;
1153                 freelist = pgd_page;
1154
1155                 domain->pgd = NULL;
1156         }
1157
1158         return freelist;
1159 }
1160
1161 static void dma_free_pagelist(struct page *freelist)
1162 {
1163         struct page *pg;
1164
1165         while ((pg = freelist)) {
1166                 freelist = pg->freelist;
1167                 free_pgtable_page(page_address(pg));
1168         }
1169 }
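/*
 * Freelist sketch (illustrative of how these helpers are used elsewhere
 * in this file): page-table pages queued for freeing are chained through
 * page->freelist, so a typical unmap sequence looks like
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	iommu_flush_iotlb_psi(iommu, domain, start_pfn, npages, 0, 0);
 *	dma_free_pagelist(freelist);
 *
 * i.e. the pages are only released after the IOTLB flush has completed.
 */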
1170
1171 static void iova_entry_free(unsigned long data)
1172 {
1173         struct page *freelist = (struct page *)data;
1174
1175         dma_free_pagelist(freelist);
1176 }
1177
1178 /* iommu handling */
1179 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1180 {
1181         struct root_entry *root;
1182         unsigned long flags;
1183
1184         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1185         if (!root) {
1186                 pr_err("Allocating root entry for %s failed\n",
1187                         iommu->name);
1188                 return -ENOMEM;
1189         }
1190
1191         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1192
1193         spin_lock_irqsave(&iommu->lock, flags);
1194         iommu->root_entry = root;
1195         spin_unlock_irqrestore(&iommu->lock, flags);
1196
1197         return 0;
1198 }
1199
1200 static void iommu_set_root_entry(struct intel_iommu *iommu)
1201 {
1202         u64 addr;
1203         u32 sts;
1204         unsigned long flag;
1205
1206         addr = virt_to_phys(iommu->root_entry);
1207         if (sm_supported(iommu))
1208                 addr |= DMA_RTADDR_SMT;
1209
1210         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1211         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1212
1213         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1214
 1215         /* Make sure hardware completes it */
1216         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217                       readl, (sts & DMA_GSTS_RTPS), sts);
1218
1219         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 }
1221
1222 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1223 {
1224         u32 val;
1225         unsigned long flag;
1226
1227         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228                 return;
1229
1230         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1232
 1233         /* Make sure hardware completes it */
1234         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235                       readl, (!(val & DMA_GSTS_WBFS)), val);
1236
1237         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 }
1239
 1240 /* return value determines if we need a write buffer flush */
1241 static void __iommu_flush_context(struct intel_iommu *iommu,
1242                                   u16 did, u16 source_id, u8 function_mask,
1243                                   u64 type)
1244 {
1245         u64 val = 0;
1246         unsigned long flag;
1247
1248         switch (type) {
1249         case DMA_CCMD_GLOBAL_INVL:
1250                 val = DMA_CCMD_GLOBAL_INVL;
1251                 break;
1252         case DMA_CCMD_DOMAIN_INVL:
1253                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1254                 break;
1255         case DMA_CCMD_DEVICE_INVL:
1256                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1257                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1258                 break;
1259         default:
1260                 BUG();
1261         }
1262         val |= DMA_CCMD_ICC;
1263
1264         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1265         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1266
 1267         /* Make sure hardware completes it */
1268         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1269                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1270
1271         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1272 }
1273
 1274 /* return value determines if we need a write buffer flush */
1275 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1276                                 u64 addr, unsigned int size_order, u64 type)
1277 {
1278         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1279         u64 val = 0, val_iva = 0;
1280         unsigned long flag;
1281
1282         switch (type) {
1283         case DMA_TLB_GLOBAL_FLUSH:
 1284                 /* global flush doesn't need to set IVA_REG */
1285                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1286                 break;
1287         case DMA_TLB_DSI_FLUSH:
1288                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1289                 break;
1290         case DMA_TLB_PSI_FLUSH:
1291                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1292                 /* IH bit is passed in as part of address */
1293                 val_iva = size_order | addr;
1294                 break;
1295         default:
1296                 BUG();
1297         }
1298         /* Note: set drain read/write */
1299 #if 0
1300         /*
 1301          * This is probably only meant to be extra safe. It looks like we
 1302          * can ignore it without any impact.
1303          */
1304         if (cap_read_drain(iommu->cap))
1305                 val |= DMA_TLB_READ_DRAIN;
1306 #endif
1307         if (cap_write_drain(iommu->cap))
1308                 val |= DMA_TLB_WRITE_DRAIN;
1309
1310         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1311         /* Note: Only uses first TLB reg currently */
1312         if (val_iva)
1313                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1314         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1315
 1316         /* Make sure hardware completes it */
1317         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1318                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1319
1320         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1321
1322         /* check IOTLB invalidation granularity */
1323         if (DMA_TLB_IAIG(val) == 0)
1324                 pr_err("Flush IOTLB failed\n");
1325         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1326                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1327                         (unsigned long long)DMA_TLB_IIRG(type),
1328                         (unsigned long long)DMA_TLB_IAIG(val));
1329 }
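/*
 * Encoding note, derived from the register writes above: for a
 * page-selective (PSI) flush the IVA register takes the page-aligned
 * address (with the invalidation-hint bit already folded in by the
 * caller) OR'ed with size_order, the log2 of the number of 4KiB pages
 * being invalidated.
 */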
1330
1331 static struct device_domain_info *
 1332 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1333                          u8 bus, u8 devfn)
1334 {
1335         struct device_domain_info *info;
1336
1337         assert_spin_locked(&device_domain_lock);
1338
1339         if (!iommu->qi)
1340                 return NULL;
1341
1342         list_for_each_entry(info, &domain->devices, link)
1343                 if (info->iommu == iommu && info->bus == bus &&
1344                     info->devfn == devfn) {
1345                         if (info->ats_supported && info->dev)
1346                                 return info;
1347                         break;
1348                 }
1349
1350         return NULL;
1351 }
1352
1353 static void domain_update_iotlb(struct dmar_domain *domain)
1354 {
1355         struct device_domain_info *info;
1356         bool has_iotlb_device = false;
1357
1358         assert_spin_locked(&device_domain_lock);
1359
1360         list_for_each_entry(info, &domain->devices, link) {
1361                 struct pci_dev *pdev;
1362
1363                 if (!info->dev || !dev_is_pci(info->dev))
1364                         continue;
1365
1366                 pdev = to_pci_dev(info->dev);
1367                 if (pdev->ats_enabled) {
1368                         has_iotlb_device = true;
1369                         break;
1370                 }
1371         }
1372
1373         domain->has_iotlb_device = has_iotlb_device;
1374 }
1375
1376 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1377 {
1378         struct pci_dev *pdev;
1379
1380         assert_spin_locked(&device_domain_lock);
1381
1382         if (!info || !dev_is_pci(info->dev))
1383                 return;
1384
1385         pdev = to_pci_dev(info->dev);
 1386         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1387          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1388          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1389          * reserved, which should be set to 0.
1390          */
1391         if (!ecap_dit(info->iommu->ecap))
1392                 info->pfsid = 0;
1393         else {
1394                 struct pci_dev *pf_pdev;
1395
1396                 /* pdev will be returned if device is not a vf */
1397                 pf_pdev = pci_physfn(pdev);
1398                 info->pfsid = pci_dev_id(pf_pdev);
1399         }
1400
1401 #ifdef CONFIG_INTEL_IOMMU_SVM
 1402         /* The PCIe spec, in its wisdom, declares that the behaviour of
 1403            the device is undefined if you enable PASID support after ATS
 1404            support. So always enable PASID support on devices which
1405            have it, even if we can't yet know if we're ever going to
1406            use it. */
1407         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1408                 info->pasid_enabled = 1;
1409
1410         if (info->pri_supported &&
1411             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1412             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1413                 info->pri_enabled = 1;
1414 #endif
1415         if (!pdev->untrusted && info->ats_supported &&
1416             pci_ats_page_aligned(pdev) &&
1417             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1418                 info->ats_enabled = 1;
1419                 domain_update_iotlb(info->domain);
1420                 info->ats_qdep = pci_ats_queue_depth(pdev);
1421         }
1422 }
1423
1424 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1425 {
1426         struct pci_dev *pdev;
1427
1428         assert_spin_locked(&device_domain_lock);
1429
1430         if (!dev_is_pci(info->dev))
1431                 return;
1432
1433         pdev = to_pci_dev(info->dev);
1434
1435         if (info->ats_enabled) {
1436                 pci_disable_ats(pdev);
1437                 info->ats_enabled = 0;
1438                 domain_update_iotlb(info->domain);
1439         }
1440 #ifdef CONFIG_INTEL_IOMMU_SVM
1441         if (info->pri_enabled) {
1442                 pci_disable_pri(pdev);
1443                 info->pri_enabled = 0;
1444         }
1445         if (info->pasid_enabled) {
1446                 pci_disable_pasid(pdev);
1447                 info->pasid_enabled = 0;
1448         }
1449 #endif
1450 }
1451
1452 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1453                                   u64 addr, unsigned mask)
1454 {
1455         u16 sid, qdep;
1456         unsigned long flags;
1457         struct device_domain_info *info;
1458
1459         if (!domain->has_iotlb_device)
1460                 return;
1461
1462         spin_lock_irqsave(&device_domain_lock, flags);
1463         list_for_each_entry(info, &domain->devices, link) {
1464                 if (!info->ats_enabled)
1465                         continue;
1466
1467                 sid = info->bus << 8 | info->devfn;
1468                 qdep = info->ats_qdep;
1469                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470                                 qdep, addr, mask);
1471         }
1472         spin_unlock_irqrestore(&device_domain_lock, flags);
1473 }
1474
1475 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1476                                   struct dmar_domain *domain,
1477                                   unsigned long pfn, unsigned int pages,
1478                                   int ih, int map)
1479 {
1480         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1481         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1482         u16 did = domain->iommu_did[iommu->seq_id];
1483
1484         BUG_ON(pages == 0);
1485
1486         if (ih)
1487                 ih = 1 << 6;
1488         /*
 1489          * Fall back to domain-selective flush if there is no PSI support
 1490          * or the size is too big.
 1491          * PSI requires the page size to be 2 ^ x and the base address to
 1492          * be naturally aligned to the size.
1493          */
1494         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1495                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496                                                 DMA_TLB_DSI_FLUSH);
1497         else
1498                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1499                                                 DMA_TLB_PSI_FLUSH);
1500
1501         /*
1502          * In caching mode, changes of pages from non-present to present require
1503          * flush. However, device IOTLB doesn't need to be flushed in this case.
1504          */
1505         if (!cap_caching_mode(iommu->cap) || !map)
1506                 iommu_flush_dev_iotlb(domain, addr, mask);
1507 }
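/*
 * Example (illustrative): invalidating 16 contiguous 4KiB pages gives
 * mask == ilog2(16) == 4, i.e. a 64KiB aligned PSI flush; if the
 * hardware's cap_max_amask_val() is smaller than the required mask, the
 * code above falls back to a domain-selective flush instead.
 */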
1508
1509 /* Notification for newly created mappings */
1510 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1511                                         struct dmar_domain *domain,
1512                                         unsigned long pfn, unsigned int pages)
1513 {
 1514         /* It's a non-present to present mapping. Only flush in caching mode */
1515         if (cap_caching_mode(iommu->cap))
1516                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1517         else
1518                 iommu_flush_write_buffer(iommu);
1519 }
1520
1521 static void iommu_flush_iova(struct iova_domain *iovad)
1522 {
1523         struct dmar_domain *domain;
1524         int idx;
1525
1526         domain = container_of(iovad, struct dmar_domain, iovad);
1527
1528         for_each_domain_iommu(idx, domain) {
1529                 struct intel_iommu *iommu = g_iommus[idx];
1530                 u16 did = domain->iommu_did[iommu->seq_id];
1531
1532                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1533
1534                 if (!cap_caching_mode(iommu->cap))
1535                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1536                                               0, MAX_AGAW_PFN_WIDTH);
1537         }
1538 }
1539
1540 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541 {
1542         u32 pmen;
1543         unsigned long flags;
1544
1545         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546                 return;
1547
1548         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1549         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1550         pmen &= ~DMA_PMEN_EPM;
1551         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1552
1553         /* wait for the protected region status bit to clear */
1554         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1555                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1556
1557         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1558 }
1559
1560 static void iommu_enable_translation(struct intel_iommu *iommu)
1561 {
1562         u32 sts;
1563         unsigned long flags;
1564
1565         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566         iommu->gcmd |= DMA_GCMD_TE;
1567         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1568
1569         /* Make sure the hardware has completed it */
1570         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571                       readl, (sts & DMA_GSTS_TES), sts);
1572
1573         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1574 }
1575
1576 static void iommu_disable_translation(struct intel_iommu *iommu)
1577 {
1578         u32 sts;
1579         unsigned long flag;
1580
1581         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1582         iommu->gcmd &= ~DMA_GCMD_TE;
1583         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1584
1585         /* Make sure the hardware has completed it */
1586         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587                       readl, (!(sts & DMA_GSTS_TES)), sts);
1588
1589         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1590 }
1591
1592 static int iommu_init_domains(struct intel_iommu *iommu)
1593 {
1594         u32 ndomains, nlongs;
1595         size_t size;
1596
1597         ndomains = cap_ndoms(iommu->cap);
1598         pr_debug("%s: Number of Domains supported <%d>\n",
1599                  iommu->name, ndomains);
1600         nlongs = BITS_TO_LONGS(ndomains);
1601
1602         spin_lock_init(&iommu->lock);
1603
1604         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1605         if (!iommu->domain_ids) {
1606                 pr_err("%s: Allocating domain id array failed\n",
1607                        iommu->name);
1608                 return -ENOMEM;
1609         }
1610
1611         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1612         iommu->domains = kzalloc(size, GFP_KERNEL);
1613
1614         if (iommu->domains) {
1615                 size = 256 * sizeof(struct dmar_domain *);
1616                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617         }
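        /*
         * Illustrative sizing: iommu->domains is a two-level array whose
         * second-level chunks each cover 256 domain ids. With
         * cap_ndoms() == 65536 the outer array holds 256 pointers, and
         * only chunk 0 is allocated up front here.
         */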
1618
1619         if (!iommu->domains || !iommu->domains[0]) {
1620                 pr_err("%s: Allocating domain array failed\n",
1621                        iommu->name);
1622                 kfree(iommu->domain_ids);
1623                 kfree(iommu->domains);
1624                 iommu->domain_ids = NULL;
1625                 iommu->domains    = NULL;
1626                 return -ENOMEM;
1627         }
1628
1629         /*
1630          * If Caching mode is set, then invalid translations are tagged
1631          * with domain-id 0, hence we need to pre-allocate it. We also
1632          * use domain-id 0 as a marker for non-allocated domain-id, so
1633          * make sure it is not used for a real domain.
1634          */
1635         set_bit(0, iommu->domain_ids);
1636
1637         /*
1638          * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1639          * entry for first-level or pass-through translation modes be
1640          * programmed with a domain id different from those used for
1641          * second-level or nested translation. We reserve a domain id for
1642          * this purpose.
1643          */
1644         if (sm_supported(iommu))
1645                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1646
1647         return 0;
1648 }
1649
1650 static void disable_dmar_iommu(struct intel_iommu *iommu)
1651 {
1652         struct device_domain_info *info, *tmp;
1653         unsigned long flags;
1654
1655         if (!iommu->domains || !iommu->domain_ids)
1656                 return;
1657
1658         spin_lock_irqsave(&device_domain_lock, flags);
1659         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1660                 if (info->iommu != iommu)
1661                         continue;
1662
1663                 if (!info->dev || !info->domain)
1664                         continue;
1665
1666                 __dmar_remove_one_dev_info(info);
1667         }
1668         spin_unlock_irqrestore(&device_domain_lock, flags);
1669
1670         if (iommu->gcmd & DMA_GCMD_TE)
1671                 iommu_disable_translation(iommu);
1672 }
1673
1674 static void free_dmar_iommu(struct intel_iommu *iommu)
1675 {
1676         if ((iommu->domains) && (iommu->domain_ids)) {
1677                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678                 int i;
1679
1680                 for (i = 0; i < elems; i++)
1681                         kfree(iommu->domains[i]);
1682                 kfree(iommu->domains);
1683                 kfree(iommu->domain_ids);
1684                 iommu->domains = NULL;
1685                 iommu->domain_ids = NULL;
1686         }
1687
1688         g_iommus[iommu->seq_id] = NULL;
1689
1690         /* free context mapping */
1691         free_context_table(iommu);
1692
1693 #ifdef CONFIG_INTEL_IOMMU_SVM
1694         if (pasid_supported(iommu)) {
1695                 if (ecap_prs(iommu->ecap))
1696                         intel_svm_finish_prq(iommu);
1697         }
1698 #endif
1699 }
1700
1701 static struct dmar_domain *alloc_domain(int flags)
1702 {
1703         struct dmar_domain *domain;
1704
1705         domain = alloc_domain_mem();
1706         if (!domain)
1707                 return NULL;
1708
1709         memset(domain, 0, sizeof(*domain));
1710         domain->nid = NUMA_NO_NODE;
1711         domain->flags = flags;
1712         domain->has_iotlb_device = false;
1713         INIT_LIST_HEAD(&domain->devices);
1714
1715         return domain;
1716 }
1717
1718 /* Must be called with iommu->lock */
1719 static int domain_attach_iommu(struct dmar_domain *domain,
1720                                struct intel_iommu *iommu)
1721 {
1722         unsigned long ndomains;
1723         int num;
1724
1725         assert_spin_locked(&device_domain_lock);
1726         assert_spin_locked(&iommu->lock);
1727
1728         domain->iommu_refcnt[iommu->seq_id] += 1;
1729         domain->iommu_count += 1;
1730         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1731                 ndomains = cap_ndoms(iommu->cap);
1732                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1733
1734                 if (num >= ndomains) {
1735                         pr_err("%s: No free domain ids\n", iommu->name);
1736                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1737                         domain->iommu_count -= 1;
1738                         return -ENOSPC;
1739                 }
1740
1741                 set_bit(num, iommu->domain_ids);
1742                 set_iommu_domain(iommu, num, domain);
1743
1744                 domain->iommu_did[iommu->seq_id] = num;
1745                 domain->nid                      = iommu->node;
1746
1747                 domain_update_iommu_cap(domain);
1748         }
1749
1750         return 0;
1751 }
1752
1753 static int domain_detach_iommu(struct dmar_domain *domain,
1754                                struct intel_iommu *iommu)
1755 {
1756         int num, count;
1757
1758         assert_spin_locked(&device_domain_lock);
1759         assert_spin_locked(&iommu->lock);
1760
1761         domain->iommu_refcnt[iommu->seq_id] -= 1;
1762         count = --domain->iommu_count;
1763         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1764                 num = domain->iommu_did[iommu->seq_id];
1765                 clear_bit(num, iommu->domain_ids);
1766                 set_iommu_domain(iommu, num, NULL);
1767
1768                 domain_update_iommu_cap(domain);
1769                 domain->iommu_did[iommu->seq_id] = 0;
1770         }
1771
1772         return count;
1773 }
1774
1775 static struct iova_domain reserved_iova_list;
1776 static struct lock_class_key reserved_rbtree_key;
1777
1778 static int dmar_init_reserved_ranges(void)
1779 {
1780         struct pci_dev *pdev = NULL;
1781         struct iova *iova;
1782         int i;
1783
1784         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1785
1786         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1787                 &reserved_rbtree_key);
1788
1789         /* IOAPIC ranges shouldn't be accessed by DMA */
1790         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1791                 IOVA_PFN(IOAPIC_RANGE_END));
1792         if (!iova) {
1793                 pr_err("Reserve IOAPIC range failed\n");
1794                 return -ENODEV;
1795         }
1796
1797         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1798         for_each_pci_dev(pdev) {
1799                 struct resource *r;
1800
1801                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1802                         r = &pdev->resource[i];
1803                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1804                                 continue;
1805                         iova = reserve_iova(&reserved_iova_list,
1806                                             IOVA_PFN(r->start),
1807                                             IOVA_PFN(r->end));
1808                         if (!iova) {
1809                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1810                                 return -ENODEV;
1811                         }
1812                 }
1813         }
1814         return 0;
1815 }
1816
1817 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1818 {
1819         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1820 }
1821
1822 static inline int guestwidth_to_adjustwidth(int gaw)
1823 {
1824         int agaw;
1825         int r = (gaw - 12) % 9;
1826
1827         if (r == 0)
1828                 agaw = gaw;
1829         else
1830                 agaw = gaw + 9 - r;
1831         if (agaw > 64)
1832                 agaw = 64;
1833         return agaw;
1834 }
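/*
 * Example: a guest width of 48 already satisfies (48 - 12) % 9 == 0 and is
 * returned unchanged, while a guest width of 40 is rounded up to the next
 * 9-bit page-table step, i.e. 40 + 9 - 1 = 48.
 */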
1835
1836 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1837                        int guest_width)
1838 {
1839         int adjust_width, agaw;
1840         unsigned long sagaw;
1841         int err;
1842
1843         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1844
1845         err = init_iova_flush_queue(&domain->iovad,
1846                                     iommu_flush_iova, iova_entry_free);
1847         if (err)
1848                 return err;
1849
1850         domain_reserve_special_ranges(domain);
1851
1852         /* calculate AGAW */
1853         if (guest_width > cap_mgaw(iommu->cap))
1854                 guest_width = cap_mgaw(iommu->cap);
1855         domain->gaw = guest_width;
1856         adjust_width = guestwidth_to_adjustwidth(guest_width);
1857         agaw = width_to_agaw(adjust_width);
1858         sagaw = cap_sagaw(iommu->cap);
1859         if (!test_bit(agaw, &sagaw)) {
1860                 /* hardware doesn't support it, choose a bigger one */
1861                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1862                 agaw = find_next_bit(&sagaw, 5, agaw);
1863                 if (agaw >= 5)
1864                         return -ENODEV;
1865         }
1866         domain->agaw = agaw;
1867
1868         if (ecap_coherent(iommu->ecap))
1869                 domain->iommu_coherency = 1;
1870         else
1871                 domain->iommu_coherency = 0;
1872
1873         if (ecap_sc_support(iommu->ecap))
1874                 domain->iommu_snooping = 1;
1875         else
1876                 domain->iommu_snooping = 0;
1877
1878         if (intel_iommu_superpage)
1879                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1880         else
1881                 domain->iommu_superpage = 0;
1882
1883         domain->nid = iommu->node;
1884
1885         /* always allocate the top pgd */
1886         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1887         if (!domain->pgd)
1888                 return -ENOMEM;
1889         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1890         return 0;
1891 }
1892
1893 static void domain_exit(struct dmar_domain *domain)
1894 {
1895
1896         /* Remove associated devices and clear attached or cached domains */
1897         domain_remove_dev_info(domain);
1898
1899         /* destroy iovas */
1900         put_iova_domain(&domain->iovad);
1901
1902         if (domain->pgd) {
1903                 struct page *freelist;
1904
1905                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1906                 dma_free_pagelist(freelist);
1907         }
1908
1909         free_domain_mem(domain);
1910 }
1911
1912 /*
1913  * Get the PASID directory size for scalable mode context entry.
1914  * Value of X in the PDTS field of a scalable mode context entry
1915  * indicates PASID directory with 2^(X + 7) entries.
1916  */
1917 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1918 {
1919         int pds, max_pde;
1920
1921         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1922         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1923         if (pds < 7)
1924                 return 0;
1925
1926         return pds - 7;
1927 }
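/*
 * Example: a max_pde of 1024 (2^10) yields pds = 10 - 7 = 3, which the PDTS
 * field encodes as a PASID directory with 2^(3 + 7) = 1024 entries.
 */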
1928
1929 /*
1930  * Set the RID_PASID field of a scalable mode context entry. The
1931  * IOMMU hardware will use the PASID value set in this field to
1932  * translate DMA requests that carry no PASID.
1933  */
1934 static inline void
1935 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1936 {
1937         context->hi |= pasid & ((1 << 20) - 1);
1938         context->hi |= (1 << 20);
1939 }
1940
1941 /*
1942  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1943  * entry.
1944  */
1945 static inline void context_set_sm_dte(struct context_entry *context)
1946 {
1947         context->lo |= (1 << 2);
1948 }
1949
1950 /*
1951  * Set the PRE(Page Request Enable) field of a scalable mode context
1952  * entry.
1953  */
1954 static inline void context_set_sm_pre(struct context_entry *context)
1955 {
1956         context->lo |= (1 << 4);
1957 }
1958
1959 /* Convert value to context PASID directory size field coding. */
1960 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1961
1962 static int domain_context_mapping_one(struct dmar_domain *domain,
1963                                       struct intel_iommu *iommu,
1964                                       struct pasid_table *table,
1965                                       u8 bus, u8 devfn)
1966 {
1967         u16 did = domain->iommu_did[iommu->seq_id];
1968         int translation = CONTEXT_TT_MULTI_LEVEL;
1969         struct device_domain_info *info = NULL;
1970         struct context_entry *context;
1971         unsigned long flags;
1972         int ret;
1973
1974         WARN_ON(did == 0);
1975
1976         if (hw_pass_through && domain_type_is_si(domain))
1977                 translation = CONTEXT_TT_PASS_THROUGH;
1978
1979         pr_debug("Set context mapping for %02x:%02x.%d\n",
1980                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1981
1982         BUG_ON(!domain->pgd);
1983
1984         spin_lock_irqsave(&device_domain_lock, flags);
1985         spin_lock(&iommu->lock);
1986
1987         ret = -ENOMEM;
1988         context = iommu_context_addr(iommu, bus, devfn, 1);
1989         if (!context)
1990                 goto out_unlock;
1991
1992         ret = 0;
1993         if (context_present(context))
1994                 goto out_unlock;
1995
1996         /*
1997          * In kdump cases, old valid entries may be cached due to in-flight
1998          * DMA and the copied page tables, but there is no unmapping
1999          * behaviour for them, so we need an explicit cache flush for
2000          * the newly-mapped device. At this point the device is expected
2001          * to have finished its reset during driver probe, so no
2002          * in-flight DMA will exist and we don't need to worry about it
2003          * hereafter.
2004          */
2005         if (context_copied(context)) {
2006                 u16 did_old = context_domain_id(context);
2007
2008                 if (did_old < cap_ndoms(iommu->cap)) {
2009                         iommu->flush.flush_context(iommu, did_old,
2010                                                    (((u16)bus) << 8) | devfn,
2011                                                    DMA_CCMD_MASK_NOBIT,
2012                                                    DMA_CCMD_DEVICE_INVL);
2013                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2014                                                  DMA_TLB_DSI_FLUSH);
2015                 }
2016         }
2017
2018         context_clear_entry(context);
2019
2020         if (sm_supported(iommu)) {
2021                 unsigned long pds;
2022
2023                 WARN_ON(!table);
2024
2025                 /* Setup the PASID DIR pointer: */
2026                 pds = context_get_sm_pds(table);
2027                 context->lo = (u64)virt_to_phys(table->table) |
2028                                 context_pdts(pds);
2029
2030                 /* Setup the RID_PASID field: */
2031                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2032
2033                 /*
2034                  * Setup the Device-TLB enable bit and Page request
2035                  * Enable bit:
2036                  */
2037                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2038                 if (info && info->ats_supported)
2039                         context_set_sm_dte(context);
2040                 if (info && info->pri_supported)
2041                         context_set_sm_pre(context);
2042         } else {
2043                 struct dma_pte *pgd = domain->pgd;
2044                 int agaw;
2045
2046                 context_set_domain_id(context, did);
2047
2048                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2049                         /*
2050                          * Skip top levels of page tables for an IOMMU whose
2051                          * agaw is smaller than the default. Unnecessary in PT mode.
2052                          */
2053                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2054                                 ret = -ENOMEM;
2055                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2056                                 if (!dma_pte_present(pgd))
2057                                         goto out_unlock;
2058                         }
2059
2060                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2061                         if (info && info->ats_supported)
2062                                 translation = CONTEXT_TT_DEV_IOTLB;
2063                         else
2064                                 translation = CONTEXT_TT_MULTI_LEVEL;
2065
2066                         context_set_address_root(context, virt_to_phys(pgd));
2067                         context_set_address_width(context, agaw);
2068                 } else {
2069                         /*
2070                          * In pass through mode, AW must be programmed to
2071                          * indicate the largest AGAW value supported by
2072                          * hardware. And ASR is ignored by hardware.
2073                          */
2074                         context_set_address_width(context, iommu->msagaw);
2075                 }
2076
2077                 context_set_translation_type(context, translation);
2078         }
2079
2080         context_set_fault_enable(context);
2081         context_set_present(context);
2082         domain_flush_cache(domain, context, sizeof(*context));
2083
2084         /*
2085          * It's a non-present to present mapping. If the hardware doesn't cache
2086          * non-present entries we only need to flush the write buffer. If it
2087          * _does_ cache non-present entries, then it does so in the special
2088          * domain #0, which we have to flush:
2089          */
2090         if (cap_caching_mode(iommu->cap)) {
2091                 iommu->flush.flush_context(iommu, 0,
2092                                            (((u16)bus) << 8) | devfn,
2093                                            DMA_CCMD_MASK_NOBIT,
2094                                            DMA_CCMD_DEVICE_INVL);
2095                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2096         } else {
2097                 iommu_flush_write_buffer(iommu);
2098         }
2099         iommu_enable_dev_iotlb(info);
2100
2101         ret = 0;
2102
2103 out_unlock:
2104         spin_unlock(&iommu->lock);
2105         spin_unlock_irqrestore(&device_domain_lock, flags);
2106
2107         return ret;
2108 }
2109
2110 struct domain_context_mapping_data {
2111         struct dmar_domain *domain;
2112         struct intel_iommu *iommu;
2113         struct pasid_table *table;
2114 };
2115
2116 static int domain_context_mapping_cb(struct pci_dev *pdev,
2117                                      u16 alias, void *opaque)
2118 {
2119         struct domain_context_mapping_data *data = opaque;
2120
2121         return domain_context_mapping_one(data->domain, data->iommu,
2122                                           data->table, PCI_BUS_NUM(alias),
2123                                           alias & 0xff);
2124 }
2125
2126 static int
2127 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2128 {
2129         struct domain_context_mapping_data data;
2130         struct pasid_table *table;
2131         struct intel_iommu *iommu;
2132         u8 bus, devfn;
2133
2134         iommu = device_to_iommu(dev, &bus, &devfn);
2135         if (!iommu)
2136                 return -ENODEV;
2137
2138         table = intel_pasid_get_table(dev);
2139
2140         if (!dev_is_pci(dev))
2141                 return domain_context_mapping_one(domain, iommu, table,
2142                                                   bus, devfn);
2143
2144         data.domain = domain;
2145         data.iommu = iommu;
2146         data.table = table;
2147
2148         return pci_for_each_dma_alias(to_pci_dev(dev),
2149                                       &domain_context_mapping_cb, &data);
2150 }
2151
2152 static int domain_context_mapped_cb(struct pci_dev *pdev,
2153                                     u16 alias, void *opaque)
2154 {
2155         struct intel_iommu *iommu = opaque;
2156
2157         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2158 }
2159
2160 static int domain_context_mapped(struct device *dev)
2161 {
2162         struct intel_iommu *iommu;
2163         u8 bus, devfn;
2164
2165         iommu = device_to_iommu(dev, &bus, &devfn);
2166         if (!iommu)
2167                 return -ENODEV;
2168
2169         if (!dev_is_pci(dev))
2170                 return device_context_mapped(iommu, bus, devfn);
2171
2172         return !pci_for_each_dma_alias(to_pci_dev(dev),
2173                                        domain_context_mapped_cb, iommu);
2174 }
2175
2176 /* Returns the number of VT-d pages, rounded up to the MM page size */
2177 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2178                                             size_t size)
2179 {
2180         host_addr &= ~PAGE_MASK;
2181         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2182 }
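/*
 * Example (4KiB pages): a buffer at page offset 0x200 with size 0x1800 spans
 * PAGE_ALIGN(0x200 + 0x1800) = 0x2000 bytes, i.e. two VT-d pages.
 */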
2183
2184 /* Return largest possible superpage level for a given mapping */
2185 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2186                                           unsigned long iov_pfn,
2187                                           unsigned long phy_pfn,
2188                                           unsigned long pages)
2189 {
2190         int support, level = 1;
2191         unsigned long pfnmerge;
2192
2193         support = domain->iommu_superpage;
2194
2195         /* To use a large page, the virtual *and* physical addresses
2196            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2197            of them will mean we have to use smaller pages. So just
2198            merge them and check both at once. */
2199         pfnmerge = iov_pfn | phy_pfn;
2200
2201         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2202                 pages >>= VTD_STRIDE_SHIFT;
2203                 if (!pages)
2204                         break;
2205                 pfnmerge >>= VTD_STRIDE_SHIFT;
2206                 level++;
2207                 support--;
2208         }
2209         return level;
2210 }
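/*
 * Example: with one level of superpage support, iov_pfn and phy_pfn both
 * aligned to 512 pfns (the 9-bit stride) and pages == 512, the loop above
 * runs once and returns level 2, i.e. a 2MiB superpage.
 */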
2211
2212 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2213                             struct scatterlist *sg, unsigned long phys_pfn,
2214                             unsigned long nr_pages, int prot)
2215 {
2216         struct dma_pte *first_pte = NULL, *pte = NULL;
2217         phys_addr_t uninitialized_var(pteval);
2218         unsigned long sg_res = 0;
2219         unsigned int largepage_lvl = 0;
2220         unsigned long lvl_pages = 0;
2221
2222         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2223
2224         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2225                 return -EINVAL;
2226
2227         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2228
2229         if (!sg) {
2230                 sg_res = nr_pages;
2231                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2232         }
2233
2234         while (nr_pages > 0) {
2235                 uint64_t tmp;
2236
2237                 if (!sg_res) {
2238                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2239
2240                         sg_res = aligned_nrpages(sg->offset, sg->length);
2241                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2242                         sg->dma_length = sg->length;
2243                         pteval = (sg_phys(sg) - pgoff) | prot;
2244                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2245                 }
2246
2247                 if (!pte) {
2248                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2249
2250                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2251                         if (!pte)
2252                                 return -ENOMEM;
2253                         /* It is a large page */
2254                         if (largepage_lvl > 1) {
2255                                 unsigned long nr_superpages, end_pfn;
2256
2257                                 pteval |= DMA_PTE_LARGE_PAGE;
2258                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2259
2260                                 nr_superpages = sg_res / lvl_pages;
2261                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2262
2263                                 /*
2264                                  * Ensure that old small page tables are
2265                                  * removed to make room for superpage(s).
2266                                  * We're adding new large pages, so make sure
2267                                  * we don't remove their parent tables.
2268                                  */
2269                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2270                                                        largepage_lvl + 1);
2271                         } else {
2272                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2273                         }
2274
2275                 }
2276                 /* We don't need a lock here; nobody else
2277                  * touches this iova range.
2278                  */
2279                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2280                 if (tmp) {
2281                         static int dumps = 5;
2282                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2283                                 iov_pfn, tmp, (unsigned long long)pteval);
2284                         if (dumps) {
2285                                 dumps--;
2286                                 debug_dma_dump_mappings(NULL);
2287                         }
2288                         WARN_ON(1);
2289                 }
2290
2291                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2292
2293                 BUG_ON(nr_pages < lvl_pages);
2294                 BUG_ON(sg_res < lvl_pages);
2295
2296                 nr_pages -= lvl_pages;
2297                 iov_pfn += lvl_pages;
2298                 phys_pfn += lvl_pages;
2299                 pteval += lvl_pages * VTD_PAGE_SIZE;
2300                 sg_res -= lvl_pages;
2301
2302                 /* If the next PTE would be the first in a new page, then we
2303                    need to flush the cache on the entries we've just written.
2304                    And then we'll need to recalculate 'pte', so clear it and
2305                    let it get set again in the if (!pte) block above.
2306
2307                    If we're done (!nr_pages) we need to flush the cache too.
2308
2309                    Also if we've been setting superpages, we may need to
2310                    recalculate 'pte' and switch back to smaller pages for the
2311                    end of the mapping, if the trailing size is not enough to
2312                    use another superpage (i.e. sg_res < lvl_pages). */
2313                 pte++;
2314                 if (!nr_pages || first_pte_in_page(pte) ||
2315                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2316                         domain_flush_cache(domain, first_pte,
2317                                            (void *)pte - (void *)first_pte);
2318                         pte = NULL;
2319                 }
2320
2321                 if (!sg_res && nr_pages)
2322                         sg = sg_next(sg);
2323         }
2324         return 0;
2325 }
2326
2327 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2328                           struct scatterlist *sg, unsigned long phys_pfn,
2329                           unsigned long nr_pages, int prot)
2330 {
2331         int iommu_id, ret;
2332         struct intel_iommu *iommu;
2333
2334         /* Do the real mapping first */
2335         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2336         if (ret)
2337                 return ret;
2338
2339         for_each_domain_iommu(iommu_id, domain) {
2340                 iommu = g_iommus[iommu_id];
2341                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2342         }
2343
2344         return 0;
2345 }
2346
2347 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2348                                     struct scatterlist *sg, unsigned long nr_pages,
2349                                     int prot)
2350 {
2351         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2352 }
2353
2354 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2355                                      unsigned long phys_pfn, unsigned long nr_pages,
2356                                      int prot)
2357 {
2358         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2359 }
2360
2361 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2362 {
2363         unsigned long flags;
2364         struct context_entry *context;
2365         u16 did_old;
2366
2367         if (!iommu)
2368                 return;
2369
2370         spin_lock_irqsave(&iommu->lock, flags);
2371         context = iommu_context_addr(iommu, bus, devfn, 0);
2372         if (!context) {
2373                 spin_unlock_irqrestore(&iommu->lock, flags);
2374                 return;
2375         }
2376         did_old = context_domain_id(context);
2377         context_clear_entry(context);
2378         __iommu_flush_cache(iommu, context, sizeof(*context));
2379         spin_unlock_irqrestore(&iommu->lock, flags);
2380         iommu->flush.flush_context(iommu,
2381                                    did_old,
2382                                    (((u16)bus) << 8) | devfn,
2383                                    DMA_CCMD_MASK_NOBIT,
2384                                    DMA_CCMD_DEVICE_INVL);
2385         iommu->flush.flush_iotlb(iommu,
2386                                  did_old,
2387                                  0,
2388                                  0,
2389                                  DMA_TLB_DSI_FLUSH);
2390 }
2391
2392 static inline void unlink_domain_info(struct device_domain_info *info)
2393 {
2394         assert_spin_locked(&device_domain_lock);
2395         list_del(&info->link);
2396         list_del(&info->global);
2397         if (info->dev)
2398                 info->dev->archdata.iommu = NULL;
2399 }
2400
2401 static void domain_remove_dev_info(struct dmar_domain *domain)
2402 {
2403         struct device_domain_info *info, *tmp;
2404         unsigned long flags;
2405
2406         spin_lock_irqsave(&device_domain_lock, flags);
2407         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2408                 __dmar_remove_one_dev_info(info);
2409         spin_unlock_irqrestore(&device_domain_lock, flags);
2410 }
2411
2412 /*
2413  * find_domain
2414  * Note: the domain info is stored in struct device->archdata.iommu
2415  */
2416 static struct dmar_domain *find_domain(struct device *dev)
2417 {
2418         struct device_domain_info *info;
2419
2420         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2421                 struct iommu_domain *domain;
2422
2423                 dev->archdata.iommu = NULL;
2424                 domain = iommu_get_domain_for_dev(dev);
2425                 if (domain)
2426                         intel_iommu_attach_device(domain, dev);
2427         }
2428
2429         /* No lock here, assumes no domain exit in normal case */
2430         info = dev->archdata.iommu;
2431
2432         if (likely(info))
2433                 return info->domain;
2434         return NULL;
2435 }
2436
2437 static inline struct device_domain_info *
2438 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2439 {
2440         struct device_domain_info *info;
2441
2442         list_for_each_entry(info, &device_domain_list, global)
2443                 if (info->iommu->segment == segment && info->bus == bus &&
2444                     info->devfn == devfn)
2445                         return info;
2446
2447         return NULL;
2448 }
2449
2450 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2451                                                     int bus, int devfn,
2452                                                     struct device *dev,
2453                                                     struct dmar_domain *domain)
2454 {
2455         struct dmar_domain *found = NULL;
2456         struct device_domain_info *info;
2457         unsigned long flags;
2458         int ret;
2459
2460         info = alloc_devinfo_mem();
2461         if (!info)
2462                 return NULL;
2463
2464         info->bus = bus;
2465         info->devfn = devfn;
2466         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2467         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2468         info->ats_qdep = 0;
2469         info->dev = dev;
2470         info->domain = domain;
2471         info->iommu = iommu;
2472         info->pasid_table = NULL;
2473         info->auxd_enabled = 0;
2474         INIT_LIST_HEAD(&info->auxiliary_domains);
2475
2476         if (dev && dev_is_pci(dev)) {
2477                 struct pci_dev *pdev = to_pci_dev(info->dev);
2478
2479                 if (!pdev->untrusted &&
2480                     !pci_ats_disabled() &&
2481                     ecap_dev_iotlb_support(iommu->ecap) &&
2482                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2483                     dmar_find_matched_atsr_unit(pdev))
2484                         info->ats_supported = 1;
2485
2486                 if (sm_supported(iommu)) {
2487                         if (pasid_supported(iommu)) {
2488                                 int features = pci_pasid_features(pdev);
2489                                 if (features >= 0)
2490                                         info->pasid_supported = features | 1;
2491                         }
2492
2493                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2494                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2495                                 info->pri_supported = 1;
2496                 }
2497         }
2498
2499         spin_lock_irqsave(&device_domain_lock, flags);
2500         if (dev)
2501                 found = find_domain(dev);
2502
2503         if (!found) {
2504                 struct device_domain_info *info2;
2505                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2506                 if (info2) {
2507                         found      = info2->domain;
2508                         info2->dev = dev;
2509                 }
2510         }
2511
2512         if (found) {
2513                 spin_unlock_irqrestore(&device_domain_lock, flags);
2514                 free_devinfo_mem(info);
2515                 /* Caller must free the original domain */
2516                 return found;
2517         }
2518
2519         spin_lock(&iommu->lock);
2520         ret = domain_attach_iommu(domain, iommu);
2521         spin_unlock(&iommu->lock);
2522
2523         if (ret) {
2524                 spin_unlock_irqrestore(&device_domain_lock, flags);
2525                 free_devinfo_mem(info);
2526                 return NULL;
2527         }
2528
2529         list_add(&info->link, &domain->devices);
2530         list_add(&info->global, &device_domain_list);
2531         if (dev)
2532                 dev->archdata.iommu = info;
2533         spin_unlock_irqrestore(&device_domain_lock, flags);
2534
2535         /* PASID table is mandatory for a PCI device in scalable mode. */
2536         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2537                 ret = intel_pasid_alloc_table(dev);
2538                 if (ret) {
2539                         dev_err(dev, "PASID table allocation failed\n");
2540                         dmar_remove_one_dev_info(dev);
2541                         return NULL;
2542                 }
2543
2544                 /* Setup the PASID entry for requests without PASID: */
2545                 spin_lock(&iommu->lock);
2546                 if (hw_pass_through && domain_type_is_si(domain))
2547                         ret = intel_pasid_setup_pass_through(iommu, domain,
2548                                         dev, PASID_RID2PASID);
2549                 else
2550                         ret = intel_pasid_setup_second_level(iommu, domain,
2551                                         dev, PASID_RID2PASID);
2552                 spin_unlock(&iommu->lock);
2553                 if (ret) {
2554                         dev_err(dev, "Setup RID2PASID failed\n");
2555                         dmar_remove_one_dev_info(dev);
2556                         return NULL;
2557                 }
2558         }
2559
2560         if (dev && domain_context_mapping(domain, dev)) {
2561                 dev_err(dev, "Domain context map failed\n");
2562                 dmar_remove_one_dev_info(dev);
2563                 return NULL;
2564         }
2565
2566         return domain;
2567 }
2568
2569 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2570 {
2571         *(u16 *)opaque = alias;
2572         return 0;
2573 }
2574
2575 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2576 {
2577         struct device_domain_info *info;
2578         struct dmar_domain *domain = NULL;
2579         struct intel_iommu *iommu;
2580         u16 dma_alias;
2581         unsigned long flags;
2582         u8 bus, devfn;
2583
2584         iommu = device_to_iommu(dev, &bus, &devfn);
2585         if (!iommu)
2586                 return NULL;
2587
2588         if (dev_is_pci(dev)) {
2589                 struct pci_dev *pdev = to_pci_dev(dev);
2590
2591                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2592
2593                 spin_lock_irqsave(&device_domain_lock, flags);
2594                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2595                                                       PCI_BUS_NUM(dma_alias),
2596                                                       dma_alias & 0xff);
2597                 if (info) {
2598                         iommu = info->iommu;
2599                         domain = info->domain;
2600                 }
2601                 spin_unlock_irqrestore(&device_domain_lock, flags);
2602
2603                 /* DMA alias already has a domain, use it */
2604                 if (info)
2605                         goto out;
2606         }
2607
2608         /* Allocate and initialize new domain for the device */
2609         domain = alloc_domain(0);
2610         if (!domain)
2611                 return NULL;
2612         if (domain_init(domain, iommu, gaw)) {
2613                 domain_exit(domain);
2614                 return NULL;
2615         }
2616
2617 out:
2618         return domain;
2619 }
2620
2621 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2622                                               struct dmar_domain *domain)
2623 {
2624         struct intel_iommu *iommu;
2625         struct dmar_domain *tmp;
2626         u16 req_id, dma_alias;
2627         u8 bus, devfn;
2628
2629         iommu = device_to_iommu(dev, &bus, &devfn);
2630         if (!iommu)
2631                 return NULL;
2632
2633         req_id = ((u16)bus << 8) | devfn;
2634
2635         if (dev_is_pci(dev)) {
2636                 struct pci_dev *pdev = to_pci_dev(dev);
2637
2638                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2639
2640                 /* register PCI DMA alias device */
2641                 if (req_id != dma_alias) {
2642                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2643                                         dma_alias & 0xff, NULL, domain);
2644
2645                         if (!tmp || tmp != domain)
2646                                 return tmp;
2647                 }
2648         }
2649
2650         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2651         if (!tmp || tmp != domain)
2652                 return tmp;
2653
2654         return domain;
2655 }
2656
2657 static int iommu_domain_identity_map(struct dmar_domain *domain,
2658                                      unsigned long long start,
2659                                      unsigned long long end)
2660 {
2661         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2662         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2663
2664         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2665                           dma_to_mm_pfn(last_vpfn))) {
2666                 pr_err("Reserving iova failed\n");
2667                 return -ENOMEM;
2668         }
2669
2670         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2671         /*
2672          * The RMRR range might overlap with a physical memory range,
2673          * so clear it first.
2674          */
2675         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2676
2677         return __domain_mapping(domain, first_vpfn, NULL,
2678                                 first_vpfn, last_vpfn - first_vpfn + 1,
2679                                 DMA_PTE_READ|DMA_PTE_WRITE);
2680 }
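/*
 * Example: an identity-map request for [0x0, 0xfffff] covers vpfns 0 through
 * 0xff, i.e. 256 4KiB pages mapped 1:1.
 */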
2681
2682 static int domain_prepare_identity_map(struct device *dev,
2683                                        struct dmar_domain *domain,
2684                                        unsigned long long start,
2685                                        unsigned long long end)
2686 {
2687         /* For _hardware_ passthrough, don't bother. But for software
2688            passthrough, we do it anyway -- it may indicate a memory
2689            range which is reserved in E820 and therefore didn't get set
2690            up in si_domain to start with */
2691         if (domain == si_domain && hw_pass_through) {
2692                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2693                          start, end);
2694                 return 0;
2695         }
2696
2697         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2698
2699         if (end < start) {
2700                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2701                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2702                         dmi_get_system_info(DMI_BIOS_VENDOR),
2703                         dmi_get_system_info(DMI_BIOS_VERSION),
2704                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2705                 return -EIO;
2706         }
2707
2708         if (end >> agaw_to_width(domain->agaw)) {
2709                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2710                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2711                      agaw_to_width(domain->agaw),
2712                      dmi_get_system_info(DMI_BIOS_VENDOR),
2713                      dmi_get_system_info(DMI_BIOS_VERSION),
2714                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2715                 return -EIO;
2716         }
2717
2718         return iommu_domain_identity_map(domain, start, end);
2719 }
2720
2721 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2722
2723 static int __init si_domain_init(int hw)
2724 {
2725         struct dmar_rmrr_unit *rmrr;
2726         struct device *dev;
2727         int i, nid, ret;
2728
2729         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2730         if (!si_domain)
2731                 return -EFAULT;
2732
2733         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2734                 domain_exit(si_domain);
2735                 return -EFAULT;
2736         }
2737
2738         if (hw)
2739                 return 0;
2740
2741         for_each_online_node(nid) {
2742                 unsigned long start_pfn, end_pfn;
2743                 int i;
2744
2745                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2746                         ret = iommu_domain_identity_map(si_domain,
2747                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2748                         if (ret)
2749                                 return ret;
2750                 }
2751         }
2752
2753         /*
2754          * Normally we use DMA domains for devices which have RMRRs. But we
2755          * relax this requirement for graphics and USB devices. Identity map
2756          * the RMRRs for graphics and USB devices so that they can use the
2757          * si_domain.
2758          */
2759         for_each_rmrr_units(rmrr) {
2760                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2761                                           i, dev) {
2762                         unsigned long long start = rmrr->base_address;
2763                         unsigned long long end = rmrr->end_address;
2764
2765                         if (device_is_rmrr_locked(dev))
2766                                 continue;
2767
2768                         if (WARN_ON(end < start ||
2769                                     end >> agaw_to_width(si_domain->agaw)))
2770                                 continue;
2771
2772                         ret = iommu_domain_identity_map(si_domain, start, end);
2773                         if (ret)
2774                                 return ret;
2775                 }
2776         }
2777
2778         return 0;
2779 }
2780
2781 static int identity_mapping(struct device *dev)
2782 {
2783         struct device_domain_info *info;
2784
2785         info = dev->archdata.iommu;
2786         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2787                 return (info->domain == si_domain);
2788
2789         return 0;
2790 }
2791
2792 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2793 {
2794         struct dmar_domain *ndomain;
2795         struct intel_iommu *iommu;
2796         u8 bus, devfn;
2797
2798         iommu = device_to_iommu(dev, &bus, &devfn);
2799         if (!iommu)
2800                 return -ENODEV;
2801
2802         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2803         if (ndomain != domain)
2804                 return -EBUSY;
2805
2806         return 0;
2807 }
2808
2809 static bool device_has_rmrr(struct device *dev)
2810 {
2811         struct dmar_rmrr_unit *rmrr;
2812         struct device *tmp;
2813         int i;
2814
2815         rcu_read_lock();
2816         for_each_rmrr_units(rmrr) {
2817                 /*
2818                  * Return TRUE if this RMRR contains the device that
2819                  * is passed in.
2820                  */
2821                 for_each_active_dev_scope(rmrr->devices,
2822                                           rmrr->devices_cnt, i, tmp)
2823                         if (tmp == dev ||
2824                             is_downstream_to_pci_bridge(dev, tmp)) {
2825                                 rcu_read_unlock();
2826                                 return true;
2827                         }
2828         }
2829         rcu_read_unlock();
2830         return false;
2831 }
2832
2833 /**
2834  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2835  * is relaxable (i.e. is allowed to be not enforced under some conditions)
2836  * @dev: device handle
2837  *
2838  * We assume that PCI USB devices with RMRRs have them largely
2839  * for historical reasons and that the RMRR space is not actively used post
2840  * boot.  This exclusion may change if vendors begin to abuse it.
2841  *
2842  * The same exception is made for graphics devices, with the requirement that
2843  * any use of the RMRR regions will be torn down before assigning the device
2844  * to a guest.
2845  *
2846  * Return: true if the RMRR is relaxable, false otherwise
2847  */
2848 static bool device_rmrr_is_relaxable(struct device *dev)
2849 {
2850         struct pci_dev *pdev;
2851
2852         if (!dev_is_pci(dev))
2853                 return false;
2854
2855         pdev = to_pci_dev(dev);
2856         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2857                 return true;
2858         else
2859                 return false;
2860 }
2861
2862 /*
2863  * There are a couple of cases where we need to restrict the functionality of
2864  * devices associated with RMRRs.  The first is when evaluating a device for
2865  * identity mapping because problems exist when devices are moved in and out
2866  * of domains and their respective RMRR information is lost.  This means that
2867  * a device with associated RMRRs will never be in a "passthrough" domain.
2868  * The second is use of the device through the IOMMU API.  This interface
2869  * expects to have full control of the IOVA space for the device.  We cannot
2870  * satisfy both the requirement that RMRR access is maintained and have an
2871  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2872  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2873  * We therefore prevent devices associated with an RMRR from participating in
2874  * the IOMMU API, which eliminates them from device assignment.
2875  *
2876  * In both cases, devices which have relaxable RMRRs are not concerned by this
2877  * restriction. See device_rmrr_is_relaxable comment.
2878  */
2879 static bool device_is_rmrr_locked(struct device *dev)
2880 {
2881         if (!device_has_rmrr(dev))
2882                 return false;
2883
2884         if (device_rmrr_is_relaxable(dev))
2885                 return false;
2886
2887         return true;
2888 }
2889
2890 /*
2891  * Return the required default domain type for a specific device.
2892  *
2893  * @dev: the device in question
2894  *
2896  * Returns:
2897  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2898  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2899  *  - 0: both identity and dynamic domains work for this device
2900  */
2901 static int device_def_domain_type(struct device *dev)
2902 {
2903         if (dev_is_pci(dev)) {
2904                 struct pci_dev *pdev = to_pci_dev(dev);
2905
2906                 if (device_is_rmrr_locked(dev))
2907                         return IOMMU_DOMAIN_DMA;
2908
2909                 /*
2910                  * Prevent any device marked as untrusted from getting
2911                  * placed into the static identity mapping domain.
2912                  */
2913                 if (pdev->untrusted)
2914                         return IOMMU_DOMAIN_DMA;
2915
2916                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2917                         return IOMMU_DOMAIN_IDENTITY;
2918
2919                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2920                         return IOMMU_DOMAIN_IDENTITY;
2921
2922                 /*
2923                  * We want to start off with all devices in the 1:1 domain, and
2924                  * take them out later if we find they can't access all of memory.
2925                  *
2926                  * However, we can't do this for PCI devices behind bridges,
2927                  * because all PCI devices behind the same bridge will end up
2928                  * with the same source-id on their transactions.
2929                  *
2930                  * Practically speaking, we can't change things around for these
2931                  * devices at run-time, because we can't be sure there'll be no
2932                  * DMA transactions in flight for any of their siblings.
2933                  *
2934                  * So PCI devices (unless they're on the root bus) as well as
2935                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2936                  * the 1:1 domain, just in _case_ one of their siblings turns out
2937                  * not to be able to map all of memory.
2938                  */
2939                 if (!pci_is_pcie(pdev)) {
2940                         if (!pci_is_root_bus(pdev->bus))
2941                                 return IOMMU_DOMAIN_DMA;
2942                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2943                                 return IOMMU_DOMAIN_DMA;
2944                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2945                         return IOMMU_DOMAIN_DMA;
2946         } else {
2947                 if (device_has_rmrr(dev))
2948                         return IOMMU_DOMAIN_DMA;
2949         }
2950
2951         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2952                         IOMMU_DOMAIN_IDENTITY : 0;
2953 }
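/*
 * Example: a conventional (non-PCIe) endpoint that is not on the root bus
 * shares a source-id with its siblings behind the bridge, so the checks
 * above always give it IOMMU_DOMAIN_DMA.
 */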
2954
2955 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2956 {
2957         /*
2958          * Start from a sane IOMMU hardware state.
2959          * If queued invalidation was already initialized by us
2960          * (for example, while enabling interrupt remapping), then
2961          * things are already rolling from a sane state.
2962          */
2963         if (!iommu->qi) {
2964                 /*
2965                  * Clear any previous faults.
2966                  */
2967                 dmar_fault(-1, iommu);
2968                 /*
2969                  * Disable queued invalidation if supported and already enabled
2970                  * before OS handover.
2971                  */
2972                 dmar_disable_qi(iommu);
2973         }
2974
2975         if (dmar_enable_qi(iommu)) {
2976                 /*
2977                  * Queued invalidation is not enabled; use register-based invalidation
2978                  */
2979                 iommu->flush.flush_context = __iommu_flush_context;
2980                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2981                 pr_info("%s: Using Register based invalidation\n",
2982                         iommu->name);
2983         } else {
2984                 iommu->flush.flush_context = qi_flush_context;
2985                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2986                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2987         }
2988 }
2989
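/*
 * copy_context_table() - copy one bus's context table(s) from the previous
 * kernel during a crash/kdump handover, so that DMA programmed before the
 * crash keeps translating until the devices are reinitialized.
 *
 * With the extended (ECS) root-table format each bus has a lower context
 * table (devfn < 0x80) and an upper one, and extended context entries are
 * twice the size of legacy ones; this is where the bus * 2 table index and
 * the devfn * 2 entry index below come from.
 */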
2990 static int copy_context_table(struct intel_iommu *iommu,
2991                               struct root_entry *old_re,
2992                               struct context_entry **tbl,
2993                               int bus, bool ext)
2994 {
2995         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2996         struct context_entry *new_ce = NULL, ce;
2997         struct context_entry *old_ce = NULL;
2998         struct root_entry re;
2999         phys_addr_t old_ce_phys;
3000
3001         tbl_idx = ext ? bus * 2 : bus;
3002         memcpy(&re, old_re, sizeof(re));
3003
3004         for (devfn = 0; devfn < 256; devfn++) {
3005                 /* First calculate the correct index */
3006                 idx = (ext ? devfn * 2 : devfn) % 256;
3007
3008                 if (idx == 0) {
3009                         /* First save what we may have and clean up */
3010                         if (new_ce) {
3011                                 tbl[tbl_idx] = new_ce;
3012                                 __iommu_flush_cache(iommu, new_ce,
3013                                                     VTD_PAGE_SIZE);
3014                                 pos = 1;
3015                         }
3016
3017                         if (old_ce)
3018                                 memunmap(old_ce);
3019
3020                         ret = 0;
3021                         if (devfn < 0x80)
3022                                 old_ce_phys = root_entry_lctp(&re);
3023                         else
3024                                 old_ce_phys = root_entry_uctp(&re);
3025
3026                         if (!old_ce_phys) {
3027                                 if (ext && devfn == 0) {
3028                                         /* No LCTP, try UCTP */
3029                                         devfn = 0x7f;
3030                                         continue;
3031                                 } else {
3032                                         goto out;
3033                                 }
3034                         }
3035
3036                         ret = -ENOMEM;
3037                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3038                                         MEMREMAP_WB);
3039                         if (!old_ce)
3040                                 goto out;
3041
3042                         new_ce = alloc_pgtable_page(iommu->node);
3043                         if (!new_ce)
3044                                 goto out_unmap;
3045
3046                         ret = 0;
3047                 }
3048
3049                 /* Now copy the context entry */
3050                 memcpy(&ce, old_ce + idx, sizeof(ce));
3051
3052                 if (!__context_present(&ce))
3053                         continue;
3054
3055                 did = context_domain_id(&ce);
3056                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3057                         set_bit(did, iommu->domain_ids);
3058
3059                 /*
3060                  * We need a marker for copied context entries. This
3061                  * marker needs to work for the old format as well as
3062                  * for extended context entries.
3063                  *
3064                  * Bit 67 of the context entry is used. In the old
3065                  * format this bit is available to software, in the
3066                  * extended format it is the PGE bit, but PGE is ignored
3067                  * by HW if PASIDs are disabled (and thus still
3068                  * available).
3069                  *
3070                  * So disable PASIDs first and then mark the entry
3071                  * copied. This means that we don't copy PASID
3072                  * translations from the old kernel, but this is fine as
3073                  * faults there are not fatal.
3074                  */
3075                 context_clear_pasid_enable(&ce);
3076                 context_set_copied(&ce);
3077
3078                 new_ce[idx] = ce;
3079         }
3080
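        /* Store the last table built; pos selects the upper slot when the
         * lower table was already saved inside the loop above. */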
3081         tbl[tbl_idx + pos] = new_ce;
3082
3083         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3084
3085 out_unmap:
3086         memunmap(old_ce);
3087
3088 out:
3089         return ret;
3090 }
3091
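/*
 * copy_translation_tables() - take over the live DMA translation tables left
 * programmed by the previous (crashed) kernel.
 *
 * The old root table is found via the address the previous kernel wrote to
 * DMAR_RTADDR_REG, mapped with memremap(), and its per-bus context tables
 * are copied into this kernel's root_entry table. Copying is refused when
 * the root-table format (legacy vs. extended) would have to change, since
 * flipping the RTT bit requires translation to be disabled.
 */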
3092 static int copy_translation_tables(struct intel_iommu *iommu)
3093 {
3094         struct context_entry **ctxt_tbls;
3095         struct root_entry *old_rt;
3096         phys_addr_t old_rt_phys;
3097         int ctxt_table_entries;
3098         unsigned long flags;
3099         u64 rtaddr_reg;
3100         int bus, ret;
3101         bool new_ext, ext;
3102
3103         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3104         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3105         new_ext    = !!ecap_ecs(iommu->ecap);
3106
3107         /*
3108          * The RTT bit can only be changed when translation is disabled,
3109          * but disabling translation would open a window for data
3110          * corruption. So bail out and don't copy anything if we would
3111          * have to change the bit.
3112          */
3113         if (new_ext != ext)
3114                 return -EINVAL;
3115
3116         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3117         if (!old_rt_phys)
3118                 return -EINVAL;
3119
3120         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3121         if (!old_rt)
3122                 return -ENOMEM;
3123
3124         /* This is too big for the stack - allocate it from slab */
3125         ctxt_table_entries = ext ? 512 : 256;
3126         ret = -ENOMEM;
3127         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3128         if (!ctxt_tbls)
3129                 goto out_unmap;
3130
3131         for (bus = 0; bus < 256; bus++) {
3132                 ret = copy_context_table(iommu, &old_rt[bus],
3133                                          ctxt_tbls, bus, ext);
3134                 if (ret) {
3135                         pr_err("%s: Failed to copy context table for bus %d\n",
3136                                 iommu->name, bus);
3137                         continue;
3138                 }
3139         }
3140
3141         spin_lock_irqsave(&iommu->lock, flags);
3142
3143         /* Context tables are copied, now write them to the root_entry table */
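        /*
         * In the extended format the lo/hi halves of a root entry point to
         * the lower (devfn < 0x80) and upper context tables respectively;
         * in the legacy format only lo is used.
         */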
3144         for (bus = 0; bus < 256; bus++) {
3145                 int idx = ext ? bus * 2 : bus;
3146                 u64 val;
3147
3148                 if (ctxt_tbls[idx]) {
3149                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3150                         iommu->root_entry[bus].lo = val;
3151                 }
3152
3153                 if (!ext || !ctxt_tbls[idx + 1])
3154                         continue;
3155
3156                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3157                 iommu->root_entry[bus].hi = val;
3158         }
3159
3160         spin_unlock_irqrestore(&iommu->lock, flags);
3161
3162         kfree(ctxt_tbls);
3163
3164         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3165
3166         ret = 0;
3167
3168 out_unmap:
3169         memunmap(old_rt);
3170
3171         return ret;
3172 }
3173
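/*
 * init_dmars() - one-time set-up of all DMA remapping hardware units found
 * in the ACPI DMAR table.
 *
 * The DRHD units are counted (capped at DMAR_UNITS_SUPPORTED) and the
 * g_iommus array is preallocated large enough to also cover IOMMU
 * hot-addition, before each unit is initialized as sketched in the
 * pseudo-code comment below.
 */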
3174 static int __init init_dmars(void)
3175 {
3176         struct dmar_drhd_unit *drhd;
3177         struct intel_iommu *iommu;
3178         int ret;
3179
3180         /*
3181          * for each drhd
3182          *    allocate root
3183          *    initialize and program root entry to not present
3184          * endfor
3185          */
3186         for_each_drhd_unit(drhd) {
3187                 /*
3188                  * No lock needed: this is only incremented in the
3189                  * single-threaded kernel __init code path; all other
3190                  * accesses are read-only.
3191                  */
3192                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3193                         g_num_of_iommus++;
3194                         continue;
3195                 }
3196                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3197         }
3198
3199         /* Preallocate enough resources for IOMMU hot-addition */
3200         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3201                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3202
3203         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3204                         GFP_KERNEL);