Merge git://git.infradead.org/intel-iommu
[sfrench/cifs-2.6.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  */
19
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #include "irq_remapping.h"
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
54 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
55 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56
57 #define IOAPIC_RANGE_START      (0xfee00000)
58 #define IOAPIC_RANGE_END        (0xfeefffff)
59 #define IOVA_START_ADDR         (0x1000)
60
61 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62
63 #define MAX_AGAW_WIDTH 64
64 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65
66 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
67 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68
69 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
70    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
71 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
72                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
73 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
74
75 /* IO virtual address start page frame number */
76 #define IOVA_START_PFN          (1)
77
78 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
79 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
80 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
81
82 /* page table handling */
83 #define LEVEL_STRIDE            (9)
84 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
85
86 /*
87  * This bitmap is used to advertise the page sizes our hardware supports
88  * to the IOMMU core, which will then use this information to split
89  * physically contiguous memory regions it is mapping into page sizes
90  * that we support.
91  *
92  * Traditionally the IOMMU core just handed us the mappings directly,
93  * after making sure the size is an order of a 4KiB page and that the
94  * mapping has natural alignment.
95  *
96  * To retain this behavior, we currently advertise that we support
97  * all page sizes that are an order of 4KiB.
98  *
99  * If at some point we'd like to utilize the IOMMU core's new behavior,
100  * we could change this to advertise the real page sizes we support.
101  */
102 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
103
104 static inline int agaw_to_level(int agaw)
105 {
106         return agaw + 2;
107 }
108
109 static inline int agaw_to_width(int agaw)
110 {
111         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 }
113
114 static inline int width_to_agaw(int width)
115 {
116         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117 }
118
119 static inline unsigned int level_to_offset_bits(int level)
120 {
121         return (level - 1) * LEVEL_STRIDE;
122 }
123
124 static inline int pfn_level_offset(unsigned long pfn, int level)
125 {
126         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 }
128
129 static inline unsigned long level_mask(int level)
130 {
131         return -1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long level_size(int level)
135 {
136         return 1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 {
141         return (pfn + level_size(level) - 1) & level_mask(level);
142 }
143
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 {
146         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 }
148
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
        /* Convert a VT-d (4KiB) pfn to a CPU-page-size pfn. */
        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
        /* Convert a CPU-page-size pfn to a VT-d (4KiB) pfn. */
        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        /* VT-d pfn of the first 4KiB chunk of this struct page. */
        return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
        /* VT-d pfn backing a kernel virtual address (lowmem only). */
        return page_to_dma_pfn(virt_to_page(p));
}
168
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
171
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
174
175 /*
176  * set to 1 to panic kernel if can't successfully enable VT-d
177  * (used when kernel is launched w/ TXT)
178  */
179 static int force_on = 0;
180
181 /*
182  * 0: Present
183  * 1-11: Reserved
184  * 12-63: Context Ptr (12 - (haw-1))
185  * 64-127: Reserved
186  */
187 struct root_entry {
188         u64     lo;
189         u64     hi;
190 };
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192
193
194 /*
195  * low 64 bits:
196  * 0: present
197  * 1: fault processing disable
198  * 2-3: translation type
199  * 12-63: address space root
200  * high 64 bits:
201  * 0-2: address width
202  * 3-6: aval
203  * 8-23: domain id
204  */
205 struct context_entry {
206         u64 lo;
207         u64 hi;
208 };
209
/*
 * Accessors for the context-entry bit-fields; the layout is defined by
 * the VT-d architecture spec (see the comment above struct context_entry).
 */
static inline bool context_present(struct context_entry *context)
{
        /* Bit 0 of the low word is the Present bit. */
        return (context->lo & 1);
}
static inline void context_set_present(struct context_entry *context)
{
        context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
        /* Clear bit 1 (FPD); fault processing is enabled when FPD == 0. */
        context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
                                                unsigned long value)
{
        /* Translation type occupies bits 2-3: clear them, then set. */
        context->lo &= (((u64)-1) << 4) | 3;
        context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
                                            unsigned long value)
{
        /* Bits 12-63: page-aligned physical address of the page table. */
        context->lo &= ~VTD_PAGE_MASK;
        context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
                                             unsigned long value)
{
        /* Bits 0-2 of the high word: address width (AGAW encoding).
           NOTE(review): OR-only - assumes the field is currently zero. */
        context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
                                         unsigned long value)
{
        /* Bits 8-23 of the high word: 16-bit domain id (OR-only,
           assumes the field is currently zero). */
        context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_clear_entry(struct context_entry *context)
{
        /* Wipe both halves, clearing Present along the way. */
        context->lo = 0;
        context->hi = 0;
}
255
256 /*
257  * 0: readable
258  * 1: writable
259  * 2-6: reserved
260  * 7: super page
261  * 8-10: available
262  * 11: snoop behavior
263  * 12-63: Host physical address
264  */
265 struct dma_pte {
266         u64 val;
267 };
268
static inline void dma_clear_pte(struct dma_pte *pte)
{
        pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
        return pte->val & VTD_PAGE_MASK;
#else
        /* Must have a full atomic 64-bit read */
        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
        /* Present if either the read (bit 0) or write (bit 1) bit is set. */
        return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
        /* Bit 7 set: this entry maps a superpage, not a next-level table. */
        return (pte->val & DMA_PTE_LARGE_PAGE);
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
        /* True when pte is the first entry of its 4KiB page-table page. */
        return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
298
299 /*
300  * This domain is a statically identity mapping domain.
301  *      1. This domain creates a static 1:1 mapping to all usable memory.
302  *      2. It maps to each iommu if successful.
303  *      3. Each iommu maps to this domain if successful.
304  */
305 static struct dmar_domain *si_domain;
306 static int hw_pass_through = 1;
307
308 /* domain represents a virtual machine, more than one devices
309  * across iommus may be owned in one domain, e.g. kvm guest.
310  */
311 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
312
313 /* si_domain contains multiple devices */
314 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
315
/*
 * A DMA remapping domain: one translation context (page table plus IOVA
 * allocator) that one or more devices, possibly behind several IOMMUs,
 * are attached to.
 */
struct dmar_domain {
        int     id;                     /* domain id */
        int     nid;                    /* node id */
        DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
                                        /* bitmap of iommus this domain uses*/

        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* flags to find out type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */

        struct iommu_domain domain;     /* generic domain data structure for
                                           iommu core */
};
345
346 /* PCI domain-device relationship */
347 struct device_domain_info {
348         struct list_head link;  /* link to domain siblings */
349         struct list_head global; /* link to global list */
350         u8 bus;                 /* PCI bus number */
351         u8 devfn;               /* PCI devfn number */
352         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
353         struct intel_iommu *iommu; /* IOMMU used by this device */
354         struct dmar_domain *domain; /* pointer to domain */
355 };
356
357 struct dmar_rmrr_unit {
358         struct list_head list;          /* list of rmrr units   */
359         struct acpi_dmar_header *hdr;   /* ACPI header          */
360         u64     base_address;           /* reserved base address*/
361         u64     end_address;            /* reserved end address */
362         struct dmar_dev_scope *devices; /* target devices */
363         int     devices_cnt;            /* target device count */
364 };
365
366 struct dmar_atsr_unit {
367         struct list_head list;          /* list of ATSR units */
368         struct acpi_dmar_header *hdr;   /* ACPI header */
369         struct dmar_dev_scope *devices; /* target devices */
370         int devices_cnt;                /* target device count */
371         u8 include_all:1;               /* include all ports */
372 };
373
374 static LIST_HEAD(dmar_atsr_units);
375 static LIST_HEAD(dmar_rmrr_units);
376
377 #define for_each_rmrr_units(rmrr) \
378         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
379
380 static void flush_unmaps_timeout(unsigned long data);
381
382 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
383
384 #define HIGH_WATER_MARK 250
385 struct deferred_flush_tables {
386         int next;
387         struct iova *iova[HIGH_WATER_MARK];
388         struct dmar_domain *domain[HIGH_WATER_MARK];
389         struct page *freelist[HIGH_WATER_MARK];
390 };
391
392 static struct deferred_flush_tables *deferred_flush;
393
394 /* bitmap for indexing intel_iommus */
395 static int g_num_of_iommus;
396
397 static DEFINE_SPINLOCK(async_umap_flush_lock);
398 static LIST_HEAD(unmaps_to_do);
399
400 static int timer_on;
401 static long list_size;
402
403 static void domain_exit(struct dmar_domain *domain);
404 static void domain_remove_dev_info(struct dmar_domain *domain);
405 static void domain_remove_one_dev_info(struct dmar_domain *domain,
406                                        struct device *dev);
407 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
408                                            struct device *dev);
409 static int domain_detach_iommu(struct dmar_domain *domain,
410                                struct intel_iommu *iommu);
411
412 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
413 int dmar_disabled = 0;
414 #else
415 int dmar_disabled = 1;
416 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
417
418 int intel_iommu_enabled = 0;
419 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
420
421 static int dmar_map_gfx = 1;
422 static int dmar_forcedac;
423 static int intel_iommu_strict;
424 static int intel_iommu_superpage = 1;
425
426 int intel_iommu_gfx_mapped;
427 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
428
429 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
430 static DEFINE_SPINLOCK(device_domain_lock);
431 static LIST_HEAD(device_domain_list);
432
433 static const struct iommu_ops intel_iommu_ops;
434
/* Convert a generic 'struct iommu_domain' to the private 'struct
 * dmar_domain' that embeds it (see the 'domain' member). */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
        return container_of(dom, struct dmar_domain, domain);
}
440
/*
 * Parse the "intel_iommu=" kernel command-line option: a comma-separated
 * list of the tokens matched below.  Unrecognized tokens are silently
 * ignored.  Always returns 0 (handled) except for a NULL argument.
 */
static int __init intel_iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;
        while (*str) {
                if (!strncmp(str, "on", 2)) {
                        dmar_disabled = 0;
                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
                } else if (!strncmp(str, "off", 3)) {
                        dmar_disabled = 1;
                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
                } else if (!strncmp(str, "igfx_off", 8)) {
                        dmar_map_gfx = 0;
                        printk(KERN_INFO
                                "Intel-IOMMU: disable GFX device mapping\n");
                } else if (!strncmp(str, "forcedac", 8)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
                        dmar_forcedac = 1;
                } else if (!strncmp(str, "strict", 6)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: disable batched IOTLB flush\n");
                        intel_iommu_strict = 1;
                } else if (!strncmp(str, "sp_off", 6)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: disable supported super page\n");
                        intel_iommu_superpage = 0;
                }

                /* Skip to the character after the next run of commas. */
                str += strcspn(str, ",");
                while (*str == ',')
                        str++;
        }
        return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
477
478 static struct kmem_cache *iommu_domain_cache;
479 static struct kmem_cache *iommu_devinfo_cache;
480
481 static inline void *alloc_pgtable_page(int node)
482 {
483         struct page *page;
484         void *vaddr = NULL;
485
486         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
487         if (page)
488                 vaddr = page_address(page);
489         return vaddr;
490 }
491
492 static inline void free_pgtable_page(void *vaddr)
493 {
494         free_page((unsigned long)vaddr);
495 }
496
497 static inline void *alloc_domain_mem(void)
498 {
499         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
500 }
501
502 static void free_domain_mem(void *vaddr)
503 {
504         kmem_cache_free(iommu_domain_cache, vaddr);
505 }
506
507 static inline void * alloc_devinfo_mem(void)
508 {
509         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
510 }
511
512 static inline void free_devinfo_mem(void *vaddr)
513 {
514         kmem_cache_free(iommu_devinfo_cache, vaddr);
515 }
516
static inline int domain_type_is_vm(struct dmar_domain *domain)
{
        /* True for "virtual machine" domains (may span several IOMMUs). */
        return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
        /* True for VM domains and the static-identity (si) domain. */
        return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
                                DOMAIN_FLAG_STATIC_IDENTITY);
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
                                       unsigned long pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

        /* pfn is supported if it has no bits above the domain's address
           width; when addr_width >= BITS_PER_LONG the shift would be
           undefined, so every representable pfn fits. */
        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
535
536 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
537 {
538         unsigned long sagaw;
539         int agaw = -1;
540
541         sagaw = cap_sagaw(iommu->cap);
542         for (agaw = width_to_agaw(max_gaw);
543              agaw >= 0; agaw--) {
544                 if (test_bit(agaw, &sagaw))
545                         break;
546         }
547
548         return agaw;
549 }
550
/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus: start from the default width
 * and fall back to a smaller supported AGAW on iommus that can't do it.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
568
/* This function only returns a single iommu in a domain */
static struct dmar_domain *domain_get_iommu_doc_marker__unused;
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
        int iommu_id;

        /* si_domain and vm domain should not get here. */
        BUG_ON(domain_type_is_vm_or_si(domain));
        /* find_first_bit returns >= g_num_of_iommus when no bit is set,
           so an empty bitmap yields NULL. */
        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
                return NULL;

        return g_iommus[iommu_id];
}
582
/* Recompute domain->iommu_coherency: 1 only if every relevant IOMMU's
 * page-walk is coherent with CPU caches (ECAP.C). */
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        bool found = false;
        int i;

        domain->iommu_coherency = 1;

        /* First pass: only the IOMMUs this domain is attached to. */
        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
                found = true;
                if (!ecap_coherent(g_iommus[i]->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
        if (found)
                return;

        /* No hardware attached; use lowest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (!ecap_coherent(iommu->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
        rcu_read_unlock();
}
612
613 static int domain_update_iommu_snooping(struct intel_iommu *skip)
614 {
615         struct dmar_drhd_unit *drhd;
616         struct intel_iommu *iommu;
617         int ret = 1;
618
619         rcu_read_lock();
620         for_each_active_iommu(iommu, drhd) {
621                 if (iommu != skip) {
622                         if (!ecap_sc_support(iommu->ecap)) {
623                                 ret = 0;
624                                 break;
625                         }
626                 }
627         }
628         rcu_read_unlock();
629
630         return ret;
631 }
632
633 static int domain_update_iommu_superpage(struct intel_iommu *skip)
634 {
635         struct dmar_drhd_unit *drhd;
636         struct intel_iommu *iommu;
637         int mask = 0xf;
638
639         if (!intel_iommu_superpage) {
640                 return 0;
641         }
642
643         /* set iommu_superpage to the smallest common denominator */
644         rcu_read_lock();
645         for_each_active_iommu(iommu, drhd) {
646                 if (iommu != skip) {
647                         mask &= cap_super_page_val(iommu->cap);
648                         if (!mask)
649                                 break;
650                 }
651         }
652         rcu_read_unlock();
653
654         return fls(mask);
655 }
656
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
        /* Refresh the domain's cached coherency, snooping and superpage
         * capabilities from the currently active IOMMUs. */
        domain_update_iommu_coherency(domain);
        domain->iommu_snooping = domain_update_iommu_snooping(NULL);
        domain->iommu_superpage = domain_update_iommu_superpage(NULL);
}
664
665 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
666                                                        u8 bus, u8 devfn, int alloc)
667 {
668         struct root_entry *root = &iommu->root_entry[bus];
669         struct context_entry *context;
670         u64 *entry;
671
672         if (ecap_ecs(iommu->ecap)) {
673                 if (devfn >= 0x80) {
674                         devfn -= 0x80;
675                         entry = &root->hi;
676                 }
677                 devfn *= 2;
678         }
679         entry = &root->lo;
680         if (*entry & 1)
681                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
682         else {
683                 unsigned long phy_addr;
684                 if (!alloc)
685                         return NULL;
686
687                 context = alloc_pgtable_page(iommu->node);
688                 if (!context)
689                         return NULL;
690
691                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
692                 phy_addr = virt_to_phys((void *)context);
693                 *entry = phy_addr | 1;
694                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
695         }
696         return &context[devfn];
697 }
698
/*
 * Find the IOMMU (DRHD unit) responsible for @dev, and report the
 * bus/devfn the IOMMU will see for it via @bus/@devfn.  For a device
 * behind a PCIe-to-PCI bridge this is the bridge's bus number, not the
 * device's own.  Returns NULL if no active IOMMU covers the device.
 */
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        struct intel_iommu *iommu;
        struct device *tmp;
        struct pci_dev *ptmp, *pdev = NULL;
        u16 segment = 0;
        int i;

        if (dev_is_pci(dev)) {
                pdev = to_pci_dev(dev);
                segment = pci_domain_nr(pdev->bus);
        } else if (has_acpi_companion(dev))
                /* Non-PCI devices are matched via their ACPI companion. */
                dev = &ACPI_COMPANION(dev)->dev;

        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                /* PCI devices can only match units on their segment. */
                if (pdev && segment != drhd->segment)
                        continue;

                for_each_active_dev_scope(drhd->devices,
                                          drhd->devices_cnt, i, tmp) {
                        if (tmp == dev) {
                                /* Exact device-scope match. */
                                *bus = drhd->devices[i].bus;
                                *devfn = drhd->devices[i].devfn;
                                goto out;
                        }

                        if (!pdev || !dev_is_pci(tmp))
                                continue;

                        /* A bridge in the scope whose subordinate bus
                         * range contains pdev's bus also claims pdev. */
                        ptmp = to_pci_dev(tmp);
                        if (ptmp->subordinate &&
                            ptmp->subordinate->number <= pdev->bus->number &&
                            ptmp->subordinate->busn_res.end >= pdev->bus->number)
                                goto got_pdev;
                }

                /* INCLUDE_ALL units cover every PCI device on the segment. */
                if (pdev && drhd->include_all) {
                got_pdev:
                        *bus = pdev->bus->number;
                        *devfn = pdev->devfn;
                        goto out;
                }
        }
        iommu = NULL;
 out:
        rcu_read_unlock();

        return iommu;
}
750
/* Flush CPU cache lines covering page-table memory, but only when some
 * IOMMU in the domain does not snoop CPU caches during page walks. */
static void domain_flush_cache(struct dmar_domain *domain,
                               void *addr, int size)
{
        if (!domain->iommu_coherency)
                clflush_cache_range(addr, size);
}
757
758 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
759 {
760         struct context_entry *context;
761         int ret = 0;
762         unsigned long flags;
763
764         spin_lock_irqsave(&iommu->lock, flags);
765         context = iommu_context_addr(iommu, bus, devfn, 0);
766         if (context)
767                 ret = context_present(context);
768         spin_unlock_irqrestore(&iommu->lock, flags);
769         return ret;
770 }
771
/* Clear the context entry for bus/devfn (if present) and flush the
 * write to memory; IOTLB/context-cache invalidation is the caller's job. */
static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
        struct context_entry *context;
        unsigned long flags;

        spin_lock_irqsave(&iommu->lock, flags);
        context = iommu_context_addr(iommu, bus, devfn, 0);
        if (context) {
                context_clear_entry(context);
                __iommu_flush_cache(iommu, context, sizeof(*context));
        }
        spin_unlock_irqrestore(&iommu->lock, flags);
}
785
786 static void free_context_table(struct intel_iommu *iommu)
787 {
788         int i;
789         unsigned long flags;
790         struct context_entry *context;
791
792         spin_lock_irqsave(&iommu->lock, flags);
793         if (!iommu->root_entry) {
794                 goto out;
795         }
796         for (i = 0; i < ROOT_ENTRY_NR; i++) {
797                 context = iommu_context_addr(iommu, i, 0, 0);
798                 if (context)
799                         free_pgtable_page(context);
800
801                 if (!ecap_ecs(iommu->ecap))
802                         continue;
803
804                 context = iommu_context_addr(iommu, i, 0x80, 0);
805                 if (context)
806                         free_pgtable_page(context);
807
808         }
809         free_pgtable_page(iommu->root_entry);
810         iommu->root_entry = NULL;
811 out:
812         spin_unlock_irqrestore(&iommu->lock, flags);
813 }
814
/*
 * Walk (and, where needed, build) the page table down to @pfn.
 *
 * On entry *target_level is the level at which the caller wants the
 * pte (e.g. 2 for a 2MiB superpage mapping), or 0 meaning "stop at the
 * first superpage or non-present entry"; in the 0 case *target_level
 * is updated to the level actually reached.  Returns NULL if pfn is
 * beyond the domain's addressable range or allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int *target_level)
{
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);

        if (!domain_pfn_supported(domain, pfn))
                /* Address beyond IOMMU's addressing capabilities. */
                return NULL;

        parent = domain->pgd;

        while (1) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == *target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);

                        if (!tmp_page)
                                return NULL;

                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        /* Lockless table install: cmpxchg decides who wins
                         * a race to populate this entry. */
                        if (cmpxchg64(&pte->val, 0ULL, pteval))
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        else
                                domain_flush_cache(domain, pte, sizeof(*pte));
                }
                if (level == 1)
                        break;

                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        if (!*target_level)
                *target_level = level;

        return pte;
}
868
869
870 /* return address's pte at specific level */
/*
 * Return the pte for @pfn at exactly @level, walking from the top of
 * the table.  If the walk ends early at a superpage entry, return that
 * entry and report its level in *large_page; if it ends at a hole,
 * return NULL with *large_page set to the level of the hole.
 */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
                                         int level, int *large_page)
{
        struct dma_pte *parent, *pte = NULL;
        int total = agaw_to_level(domain->agaw);
        int offset;

        parent = domain->pgd;
        while (level <= total) {
                offset = pfn_level_offset(pfn, total);
                pte = &parent[offset];
                if (level == total)
                        return pte;

                if (!dma_pte_present(pte)) {
                        /* Hole: nothing mapped below this level. */
                        *large_page = total;
                        break;
                }

                if (dma_pte_superpage(pte)) {
                        /* Superpage ends the walk above the target level. */
                        *large_page = total;
                        return pte;
                }

                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
        return NULL;
}
901
/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
                                unsigned long start_pfn,
                                unsigned long last_pfn)
{
        unsigned int large_page = 1;
        struct dma_pte *first_pte, *pte;

        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        do {
                large_page = 1;
                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
                if (!pte) {
                        /* Hole in the table: skip past it in one step. */
                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
                        continue;
                }
                /* Clear consecutive ptes until the range ends or we hit
                 * the end of this page-table page (then re-walk). */
                do {
                        dma_clear_pte(pte);
                        start_pfn += lvl_to_nr_pages(large_page);
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));

                domain_flush_cache(domain, first_pte,
                                   (void *)pte - (void *)first_pte);

        /* start_pfn == 0 can only happen after wrap-around: done. */
        } while (start_pfn && start_pfn <= last_pfn);
}
933
934 static void dma_pte_free_level(struct dmar_domain *domain, int level,
935                                struct dma_pte *pte, unsigned long pfn,
936                                unsigned long start_pfn, unsigned long last_pfn)
937 {
938         pfn = max(start_pfn, pfn);
939         pte = &pte[pfn_level_offset(pfn, level)];
940
941         do {
942                 unsigned long level_pfn;
943                 struct dma_pte *level_pte;
944
945                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
946                         goto next;
947
948                 level_pfn = pfn & level_mask(level - 1);
949                 level_pte = phys_to_virt(dma_pte_addr(pte));
950
951                 if (level > 2)
952                         dma_pte_free_level(domain, level - 1, level_pte,
953                                            level_pfn, start_pfn, last_pfn);
954
955                 /* If range covers entire pagetable, free it */
956                 if (!(start_pfn > level_pfn ||
957                       last_pfn < level_pfn + level_size(level) - 1)) {
958                         dma_clear_pte(pte);
959                         domain_flush_cache(domain, pte, sizeof(*pte));
960                         free_pgtable_page(level_pte);
961                 }
962 next:
963                 pfn += level_size(level);
964         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
965 }
966
/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* clear any remaining leaf PTEs in the range first */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw),
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		/* entire address space is gone; drop the top level too */
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
988
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	/* push the page @pte points at onto the head of the freelist,
	   chaining through page->freelist */
	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	/* recurse into every present, non-superpage child entry */
	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}
1018
/*
 * Clear the PTEs covering [start_pfn, last_pfn] at @level and collect any
 * page-table pages that become unreferenced onto @freelist, which the
 * caller releases after the IOTLB flush.  Returns the new freelist head.
 */
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		/* first pfn covered by this PTE */
		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* flush all PTEs cleared above in one contiguous range */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
1067
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
struct page *domain_unmap(struct dmar_domain *domain,
			  unsigned long start_pfn,
			  unsigned long last_pfn)
{
	struct page *freelist = NULL;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		/* whole address space unmapped; queue the pgd page as well */
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}
1096
1097 void dma_free_pagelist(struct page *freelist)
1098 {
1099         struct page *pg;
1100
1101         while ((pg = freelist)) {
1102                 freelist = pg->freelist;
1103                 free_pgtable_page(page_address(pg));
1104         }
1105 }
1106
1107 /* iommu handling */
/* Allocate the root-entry table page for @iommu; it is programmed into
   hardware separately by iommu_set_root_entry(). */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	/* one zeroed page from the IOMMU's home NUMA node */
	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("IOMMU: allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	/* make the empty table visible to non-coherent hardware */
	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
1128
/* Program the root table address into hardware and wait for it to latch */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (ecap_ecs(iommu->ecap))
		addr |= DMA_RTADDR_RTT;	/* use extended root/context table format */

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	/* issue the Set Root Table Pointer command */
	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1150
/* Flush the IOMMU's internal write buffer, when hardware requires it */
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	/* only needed if the capability bit (or an erratum quirk) says so */
	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1168
/*
 * Invalidate context-cache entries through the CCMD register.  @type
 * selects global, domain-selective or device-selective invalidation;
 * @did, @source_id and @function_mask qualify the narrower scopes.
 * Spins until hardware clears the ICC bit to signal completion.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;	/* setting ICC kicks off the invalidation */

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1202
/*
 * Invalidate IOTLB entries via the register interface.  @type selects
 * global, domain-selective (DSI) or page-selective (PSI) invalidation.
 * For PSI, @addr/@size_order describe the naturally-aligned region and
 * the IH (invalidation hint) bit may already be folded into @addr.
 * Spins until hardware clears the IVT bit.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1259
/*
 * Check whether the device at @bus/@devfn in @domain can use a device
 * IOTLB (ATS): the IOMMU must advertise device-IOTLB support and have
 * queued invalidation active, and the device must expose the PCIe ATS
 * capability and be covered by an ATSR unit.  Returns the matching
 * device_domain_info on success, NULL otherwise.
 */
static struct device_domain_info *
iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
			 u8 bus, u8 devfn)
{
	bool found = false;
	unsigned long flags;
	struct device_domain_info *info;
	struct pci_dev *pdev;

	if (!ecap_dev_iotlb_support(iommu->ecap))
		return NULL;

	if (!iommu->qi)
		return NULL;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			found = true;
			break;
		}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	/*
	 * NOTE(review): info is dereferenced below after device_domain_lock
	 * has been dropped; presumably callers serialize against device
	 * removal — confirm before adding new call paths.
	 */
	if (!found || !info->dev || !dev_is_pci(info->dev))
		return NULL;

	pdev = to_pci_dev(info->dev);

	if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
		return NULL;

	if (!dmar_find_matched_atsr_unit(pdev))
		return NULL;

	return info;
}
1297
1298 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1299 {
1300         if (!info || !dev_is_pci(info->dev))
1301                 return;
1302
1303         pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1304 }
1305
1306 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1307 {
1308         if (!info->dev || !dev_is_pci(info->dev) ||
1309             !pci_ats_enabled(to_pci_dev(info->dev)))
1310                 return;
1311
1312         pci_disable_ats(to_pci_dev(info->dev));
1313 }
1314
/* Send a device-IOTLB (ATS) invalidation for @addr/@mask to every
   ATS-enabled PCI device attached to @domain */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;
		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (!pci_ats_enabled(pdev))
			continue;

		/* source-id is bus:devfn; qdep bounds in-flight requests */
		sid = info->bus << 8 | info->devfn;
		qdep = pci_ats_queue_depth(pdev);
		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1338
/*
 * Flush the IOTLB for @pages pages starting at @pfn in domain @did,
 * preferring page-selective invalidation.  @ih requests the invalidation
 * hint; @map indicates the flush is for a not-present -> present change.
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages, int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;	/* position of the IH bit within the IVA */
	/*
	 * Fallback to domain selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}
1369
/* Disable the protected memory regions so DMA remapping governs them */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;	/* clear Enable Protected Memory */
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1386
/* Turn DMA remapping on and wait for hardware to acknowledge */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;	/* gcmd caches the enabled-command bits */
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1402
/* Turn DMA remapping off and wait for hardware to acknowledge */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1418
1419
/* Allocate the per-IOMMU domain-id bitmap and domain pointer array.
   Returns 0 on success, -ENOMEM on allocation failure. */
static int iommu_init_domains(struct intel_iommu *iommu)
{
	unsigned long ndomains;
	unsigned long nlongs;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
		 iommu->seq_id, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	/* TBD: there might be 64K domains,
	 * consider other allocation for future chip
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("IOMMU%d: allocating domain id array failed\n",
		       iommu->seq_id);
		return -ENOMEM;
	}
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
			GFP_KERNEL);
	if (!iommu->domains) {
		pr_err("IOMMU%d: allocating domain array failed\n",
		       iommu->seq_id);
		/* keep the invariant: both arrays valid or both NULL */
		kfree(iommu->domain_ids);
		iommu->domain_ids = NULL;
		return -ENOMEM;
	}

	/*
	 * if Caching mode is set, then invalid translations are tagged
	 * with domainid 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);
	return 0;
}
1459
/* Detach every domain from @iommu and turn translation off */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;

	if ((iommu->domains) && (iommu->domain_ids)) {
		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
			/*
			 * Domain id 0 is reserved for invalid translation
			 * if hardware supports caching mode.
			 */
			if (cap_caching_mode(iommu->cap) && i == 0)
				continue;

			domain = iommu->domains[i];
			clear_bit(i, iommu->domain_ids);
			/* destroy the domain once its last IOMMU reference
			   drops, unless it is a VM domain */
			if (domain_detach_iommu(domain, iommu) == 0 &&
			    !domain_type_is_vm(domain))
				domain_exit(domain);
		}
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
1485
1486 static void free_dmar_iommu(struct intel_iommu *iommu)
1487 {
1488         if ((iommu->domains) && (iommu->domain_ids)) {
1489                 kfree(iommu->domains);
1490                 kfree(iommu->domain_ids);
1491                 iommu->domains = NULL;
1492                 iommu->domain_ids = NULL;
1493         }
1494
1495         g_iommus[iommu->seq_id] = NULL;
1496
1497         /* free context mapping */
1498         free_context_table(iommu);
1499 }
1500
/* Allocate and zero-initialize a dmar_domain; VM domains are assigned a
   software-only id immediately, others get ids via iommu_attach_domain(). */
static struct dmar_domain *alloc_domain(int flags)
{
	/* domain id for virtual machine, it won't be set in context */
	static atomic_t vm_domid = ATOMIC_INIT(0);
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = -1;	/* no NUMA node until an IOMMU is attached */
	domain->flags = flags;
	spin_lock_init(&domain->iommu_lock);
	INIT_LIST_HEAD(&domain->devices);
	if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
		domain->id = atomic_inc_return(&vm_domid);

	return domain;
}
1521
1522 static int __iommu_attach_domain(struct dmar_domain *domain,
1523                                  struct intel_iommu *iommu)
1524 {
1525         int num;
1526         unsigned long ndomains;
1527
1528         ndomains = cap_ndoms(iommu->cap);
1529         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1530         if (num < ndomains) {
1531                 set_bit(num, iommu->domain_ids);
1532                 iommu->domains[num] = domain;
1533         } else {
1534                 num = -ENOSPC;
1535         }
1536
1537         return num;
1538 }
1539
/* Locked wrapper around __iommu_attach_domain(); logs id exhaustion.
   Returns the allocated domain id or a negative errno. */
static int iommu_attach_domain(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	num = __iommu_attach_domain(domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);
	if (num < 0)
		pr_err("IOMMU: no free domain ids\n");

	return num;
}
1554
/*
 * Return the id a VM domain already holds on @iommu, or attach it and
 * allocate a fresh one.  NOTE(review): unlike iommu_attach_domain() this
 * does not take iommu->lock itself; presumably callers hold it — confirm
 * at the call sites.
 */
static int iommu_attach_vm_domain(struct dmar_domain *domain,
				  struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;

	ndomains = cap_ndoms(iommu->cap);
	for_each_set_bit(num, iommu->domain_ids, ndomains)
		if (iommu->domains[num] == domain)
			return num;

	return __iommu_attach_domain(domain, iommu);
}
1568
/* Release @domain's id on @iommu.  VM and si domains may use different
   ids per IOMMU, so they are located by scanning the id bitmap. */
static void iommu_detach_domain(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	unsigned long flags;
	int num, ndomains;

	spin_lock_irqsave(&iommu->lock, flags);
	if (domain_type_is_vm_or_si(domain)) {
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == domain) {
				clear_bit(num, iommu->domain_ids);
				iommu->domains[num] = NULL;
				break;
			}
		}
	} else {
		/* ordinary domains store their id directly */
		clear_bit(domain->id, iommu->domain_ids);
		iommu->domains[domain->id] = NULL;
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
1591
/* Record that @iommu now references @domain; the first attachment also
   sets the domain's NUMA node and the capabilities are re-derived. */
static void domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long flags;

	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
		domain->iommu_count++;
		if (domain->iommu_count == 1)
			domain->nid = iommu->node;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
}
1606
/*
 * Drop @iommu's reference on @domain and re-derive the domain caps.
 * Returns the remaining reference count, or INT_MAX if @iommu was not
 * attached in the first place.
 */
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long flags;
	int count = INT_MAX;

	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
		count = --domain->iommu_count;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);

	return count;
}
1622
1623 static struct iova_domain reserved_iova_list;
1624 static struct lock_class_key reserved_rbtree_key;
1625
1626 static int dmar_init_reserved_ranges(void)
1627 {
1628         struct pci_dev *pdev = NULL;
1629         struct iova *iova;
1630         int i;
1631
1632         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1633                         DMA_32BIT_PFN);
1634
1635         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1636                 &reserved_rbtree_key);
1637
1638         /* IOAPIC ranges shouldn't be accessed by DMA */
1639         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1640                 IOVA_PFN(IOAPIC_RANGE_END));
1641         if (!iova) {
1642                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1643                 return -ENODEV;
1644         }
1645
1646         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1647         for_each_pci_dev(pdev) {
1648                 struct resource *r;
1649
1650                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1651                         r = &pdev->resource[i];
1652                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1653                                 continue;
1654                         iova = reserve_iova(&reserved_iova_list,
1655                                             IOVA_PFN(r->start),
1656                                             IOVA_PFN(r->end));
1657                         if (!iova) {
1658                                 printk(KERN_ERR "Reserve iova failed\n");
1659                                 return -ENODEV;
1660                         }
1661                 }
1662         }
1663         return 0;
1664 }
1665
/* Seed @domain's IOVA allocator with the globally reserved ranges */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1670
/*
 * Round a guest address width up to the nearest adjusted width the page
 * tables can express: each level translates 9 bits on top of the 12-bit
 * page offset, so (gaw - 12) is rounded up to a multiple of 9, capped
 * at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = gaw;

	if (rem)
		agaw += 9 - rem;

	return agaw > 64 ? 64 : agaw;
}
1684
/* Initialize a freshly allocated domain: IOVA allocator, address width
   (AGAW), capability mirrors, and the top-level page directory.
   Returns 0 on success, -ENODEV/-ENOMEM on failure. */
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
			DMA_32BIT_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);	/* clamp to hardware max */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;

	/* mirror the IOMMU's coherency/snooping/superpage abilities */
	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	if (intel_iommu_superpage)
		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	else
		domain->iommu_superpage = 0;

	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
1736
/* Tear down a domain: devices, IOVA allocator, page tables, the memory */
static void domain_exit(struct dmar_domain *domain)
{
	struct page *freelist = NULL;
	int i;

	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	/* Flush any lazy unmaps that may reference this domain */
	if (!intel_iommu_strict)
		flush_unmaps_timeout(0);

	/* remove associated devices */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* unlink all page tables; pages are only freed further below, once
	   no hardware walk can reach them */
	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* clear attached or cached domains */
	rcu_read_lock();
	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus)
		iommu_detach_domain(domain, g_iommus[i]);
	rcu_read_unlock();

	dma_free_pagelist(freelist);

	free_domain_mem(domain);
}
1768
/*
 * Program the context entry for (bus, devfn) on @iommu to point at
 * @domain's page tables (or pass-through, per @translation), then issue
 * the invalidations the spec requires for a non-present -> present change.
 * Returns 0 on success, -ENOMEM on allocation failure, -EFAULT if no
 * domain id is available.  A context entry that is already present is
 * left untouched (returns 0).
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      u8 bus, u8 devfn, int translation)
{
	struct context_entry *context;
	unsigned long flags;
	struct dma_pte *pgd;
	int id;
	int agaw;
	struct device_domain_info *info = NULL;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);
	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
	       translation != CONTEXT_TT_MULTI_LEVEL);

	/*
	 * Find (and allocate, if necessary) the context entry.  The lock is
	 * dropped and retaken around the possibly-allocating lookup, so
	 * context_present() is re-checked once the lock is held again in
	 * case somebody else programmed the entry in between.
	 */
	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 1);
	spin_unlock_irqrestore(&iommu->lock, flags);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	if (domain_type_is_vm_or_si(domain)) {
		if (domain_type_is_vm(domain)) {
			/* VM domains get a per-iommu domain id on demand. */
			id = iommu_attach_vm_domain(domain, iommu);
			if (id < 0) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				pr_err("IOMMU: no free domain ids\n");
				return -EFAULT;
			}
		}

		/* Skip top levels of page tables for
		 * iommu which has less agaw than default.
		 * Unnecessary for PT mode.
		 */
		if (translation != CONTEXT_TT_PASS_THROUGH) {
			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd)) {
					spin_unlock_irqrestore(&iommu->lock, flags);
					return -ENOMEM;
				}
			}
		}
	}

	context_set_domain_id(context, id);

	if (translation != CONTEXT_TT_PASS_THROUGH) {
		/* Prefer device-IOTLB (ATS) translation when supported. */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		translation = info ? CONTEXT_TT_DEV_IOTLB :
				     CONTEXT_TT_MULTI_LEVEL;
	}
	/*
	 * In pass through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware. And ASR is ignored by hardware.
	 */
	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
		context_set_address_width(context, iommu->msagaw);
	else {
		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, iommu->agaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);
	spin_unlock_irqrestore(&iommu->lock, flags);

	domain_attach_iommu(domain, iommu);

	return 0;
}
1871
/*
 * Argument bundle for domain_context_mapping_cb(): pci_for_each_dma_alias()
 * only forwards a single opaque pointer to its callback.
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	int translation;	/* CONTEXT_TT_* translation type */
};
1877
1878 static int domain_context_mapping_cb(struct pci_dev *pdev,
1879                                      u16 alias, void *opaque)
1880 {
1881         struct domain_context_mapping_data *data = opaque;
1882
1883         return domain_context_mapping_one(data->domain, data->iommu,
1884                                           PCI_BUS_NUM(alias), alias & 0xff,
1885                                           data->translation);
1886 }
1887
1888 static int
1889 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1890                        int translation)
1891 {
1892         struct intel_iommu *iommu;
1893         u8 bus, devfn;
1894         struct domain_context_mapping_data data;
1895
1896         iommu = device_to_iommu(dev, &bus, &devfn);
1897         if (!iommu)
1898                 return -ENODEV;
1899
1900         if (!dev_is_pci(dev))
1901                 return domain_context_mapping_one(domain, iommu, bus, devfn,
1902                                                   translation);
1903
1904         data.domain = domain;
1905         data.iommu = iommu;
1906         data.translation = translation;
1907
1908         return pci_for_each_dma_alias(to_pci_dev(dev),
1909                                       &domain_context_mapping_cb, &data);
1910 }
1911
1912 static int domain_context_mapped_cb(struct pci_dev *pdev,
1913                                     u16 alias, void *opaque)
1914 {
1915         struct intel_iommu *iommu = opaque;
1916
1917         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1918 }
1919
1920 static int domain_context_mapped(struct device *dev)
1921 {
1922         struct intel_iommu *iommu;
1923         u8 bus, devfn;
1924
1925         iommu = device_to_iommu(dev, &bus, &devfn);
1926         if (!iommu)
1927                 return -ENODEV;
1928
1929         if (!dev_is_pci(dev))
1930                 return device_context_mapped(iommu, bus, devfn);
1931
1932         return !pci_for_each_dma_alias(to_pci_dev(dev),
1933                                        domain_context_mapped_cb, iommu);
1934 }
1935
1936 /* Returns a number of VTD pages, but aligned to MM page size */
1937 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1938                                             size_t size)
1939 {
1940         host_addr &= ~PAGE_MASK;
1941         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1942 }
1943
1944 /* Return largest possible superpage level for a given mapping */
1945 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1946                                           unsigned long iov_pfn,
1947                                           unsigned long phy_pfn,
1948                                           unsigned long pages)
1949 {
1950         int support, level = 1;
1951         unsigned long pfnmerge;
1952
1953         support = domain->iommu_superpage;
1954
1955         /* To use a large page, the virtual *and* physical addresses
1956            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1957            of them will mean we have to use smaller pages. So just
1958            merge them and check both at once. */
1959         pfnmerge = iov_pfn | phy_pfn;
1960
1961         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1962                 pages >>= VTD_STRIDE_SHIFT;
1963                 if (!pages)
1964                         break;
1965                 pfnmerge >>= VTD_STRIDE_SHIFT;
1966                 level++;
1967                 support--;
1968         }
1969         return level;
1970 }
1971
/*
 * Core mapping routine: populate @domain's page tables for @nr_pages VTD
 * pages starting at @iov_pfn.  The physical side comes either from @sg
 * (scatterlist walk) or, when @sg is NULL, from the contiguous range
 * starting at @phys_pfn.  Uses superpages where alignment and remaining
 * length allow.  Returns 0 or -ENOMEM/-EINVAL.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;	/* VTD pages left in current sg entry */
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;	/* VTD pages covered per PTE at level */

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	/* Contiguous case: pretend it's one big sg entry. */
	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			/* Advance to the next scatterlist entry and record
			 * its DMA address/length for the caller. */
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1) {
				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);
				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage,
				 * if they exist.
				 */
				dma_pte_free_pagetable(domain, iov_pfn,
						       iov_pfn + lvl_pages - 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set: a mapping bug.  Complain
			 * (rate-limited to 5 dumps) but keep going. */
			static int dumps = 5;
			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
			       iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		/* One PTE written: advance all cursors by its coverage. */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
2077
/* Map a scatterlist at IOVA pfn @iov_pfn; physical pages come from @sg. */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
2084
/* Map a physically-contiguous pfn range (no scatterlist) at @iov_pfn. */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2091
/*
 * Clear the context entry for (bus, devfn) on @iommu and invalidate the
 * context cache and IOTLB globally so the hardware forgets the mapping.
 * Tolerates a NULL @iommu.
 */
static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
2102
/*
 * Remove @info from both its domain's device list and the global device
 * list, and clear the device's archdata back-pointer.  Caller must hold
 * device_domain_lock (enforced by the assertion).
 */
static inline void unlink_domain_info(struct device_domain_info *info)
{
	assert_spin_locked(&device_domain_lock);
	list_del(&info->link);
	list_del(&info->global);
	if (info->dev)
		info->dev->archdata.iommu = NULL;
}
2111
/*
 * Detach and free every device_domain_info attached to @domain.  The lock
 * is dropped while detaching each device (the detach path issues hardware
 * flushes that must not run under the spinlock) and retaken before the
 * list is examined again, so each iteration restarts from the list head.
 */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &domain->devices, link) {
		unlink_domain_info(info);
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu_detach_dev(info->iommu, info->bus, info->devfn);

		if (domain_type_is_vm(domain)) {
			/* VM domains also drop the per-iommu attachment. */
			iommu_detach_dependent_devices(info->iommu, info->dev);
			domain_detach_iommu(domain, info->iommu);
		}

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
2135
2136 /*
 * find_domain
 * Note: we use struct device->archdata.iommu to store the device's iommu info
2139  */
2140 static struct dmar_domain *find_domain(struct device *dev)
2141 {
2142         struct device_domain_info *info;
2143
2144         /* No lock here, assumes no domain exit in normal case */
2145         info = dev->archdata.iommu;
2146         if (info)
2147                 return info->domain;
2148         return NULL;
2149 }
2150
2151 static inline struct device_domain_info *
2152 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2153 {
2154         struct device_domain_info *info;
2155
2156         list_for_each_entry(info, &device_domain_list, global)
2157                 if (info->iommu->segment == segment && info->bus == bus &&
2158                     info->devfn == devfn)
2159                         return info;
2160
2161         return NULL;
2162 }
2163
/*
 * Record that (bus, devfn) / @dev belongs to @domain.  If another thread
 * already associated the device (or its alias) with a domain, that
 * existing domain is returned instead and the caller must free the one
 * it passed in.  Returns NULL on allocation failure, otherwise the
 * domain the device ended up in.
 */
static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
						int bus, int devfn,
						struct device *dev,
						struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;

	/* Allocate outside the lock; freed again if we lose the race. */
	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	info->bus = bus;
	info->devfn = devfn;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);
	else {
		/* No struct device (DMA alias stub): search by ids. */
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
		if (info2)
			found = info2->domain;
	}
	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return domain;
}
2207
2208 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2209 {
2210         *(u16 *)opaque = alias;
2211         return 0;
2212 }
2213
/*
 * Find or create the dmar_domain for @dev (the returned domain is
 * initialized).  For PCI devices the domain is shared with the device's
 * last DMA alias, so a domain created for the alias earlier is reused.
 * Returns NULL on any failure.
 */
static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
{
	struct dmar_domain *domain, *tmp;
	struct intel_iommu *iommu;
	struct device_domain_info *info;
	u16 dma_alias;
	unsigned long flags;
	u8 bus, devfn;

	domain = find_domain(dev);
	if (domain)
		return domain;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		/* NOTE(review): dma_alias relies on the walk invoking
		 * get_last_alias() at least once (the device itself) —
		 * presumably always true; verify against the PCI core. */
		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		spin_lock_irqsave(&device_domain_lock, flags);
		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
						      PCI_BUS_NUM(dma_alias),
						      dma_alias & 0xff);
		if (info) {
			iommu = info->iommu;
			domain = info->domain;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);

		/* DMA alias already has a domain, use it */
		if (info)
			goto found_domain;
	}

	/* Allocate and initialize new domain for the device */
	domain = alloc_domain(0);
	if (!domain)
		return NULL;
	domain->id = iommu_attach_domain(domain, iommu);
	if (domain->id < 0) {
		free_domain_mem(domain);
		return NULL;
	}
	domain_attach_iommu(domain, iommu);
	if (domain_init(domain, gaw)) {
		domain_exit(domain);
		return NULL;
	}

	/* register PCI DMA alias device */
	if (dev_is_pci(dev)) {
		tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
					   dma_alias & 0xff, NULL, domain);

		/* Lost the race: another thread registered the alias first;
		 * drop our domain and adopt the winner's. */
		if (!tmp || tmp != domain) {
			domain_exit(domain);
			domain = tmp;
		}

		if (!domain)
			return NULL;
	}

found_domain:
	tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);

	if (!tmp || tmp != domain) {
		domain_exit(domain);
		domain = tmp;
	}

	return domain;
}
2291
/* Identity-mapping policy: 0 = disabled, else a mask of IDENTMAP_* bits. */
static int iommu_identity_mapping;
#define IDENTMAP_ALL		1	/* identity-map all suitable devices */
#define IDENTMAP_GFX		2	/* identity-map graphics devices */
#define IDENTMAP_AZALIA		4	/* identity-map Azalia HD-audio */
2296
/*
 * Identity-map the physical range [start, end] into @domain: reserve the
 * corresponding IOVA range (so the allocator never hands it out) and
 * install 1:1 read/write page-table entries.  Returns 0 or -ENOMEM.
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
		 start, end, domain->id);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	/* 1:1 mapping: virtual pfn == physical pfn. */
	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}
2322
2323 static int iommu_prepare_identity_map(struct device *dev,
2324                                       unsigned long long start,
2325                                       unsigned long long end)
2326 {
2327         struct dmar_domain *domain;
2328         int ret;
2329
2330         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2331         if (!domain)
2332                 return -ENOMEM;
2333
2334         /* For _hardware_ passthrough, don't bother. But for software
2335            passthrough, we do it anyway -- it may indicate a memory
2336            range which is reserved in E820, so which didn't get set
2337            up to start with in si_domain */
2338         if (domain == si_domain && hw_pass_through) {
2339                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2340                        dev_name(dev), start, end);
2341                 return 0;
2342         }
2343
2344         printk(KERN_INFO
2345                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2346                dev_name(dev), start, end);
2347         
2348         if (end < start) {
2349                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2350                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2351                         dmi_get_system_info(DMI_BIOS_VENDOR),
2352                         dmi_get_system_info(DMI_BIOS_VERSION),
2353                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2354                 ret = -EIO;
2355                 goto error;
2356         }
2357
2358         if (end >> agaw_to_width(domain->agaw)) {
2359                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2360                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2361                      agaw_to_width(domain->agaw),
2362                      dmi_get_system_info(DMI_BIOS_VENDOR),
2363                      dmi_get_system_info(DMI_BIOS_VERSION),
2364                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2365                 ret = -EIO;
2366                 goto error;
2367         }
2368
2369         ret = iommu_domain_identity_map(domain, start, end);
2370         if (ret)
2371                 goto error;
2372
2373         /* context entry init */
2374         ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2375         if (ret)
2376                 goto error;
2377
2378         return 0;
2379
2380  error:
2381         domain_exit(domain);
2382         return ret;
2383 }
2384
2385 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2386                                          struct device *dev)
2387 {
2388         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2389                 return 0;
2390         return iommu_prepare_identity_map(dev, rmrr->base_address,
2391                                           rmrr->end_address);
2392 }
2393
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: identity-map the first 16MiB for the LPC/ISA bridge
 * so legacy floppy DMA keeps working.  Best-effort; failure only logs.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

	pci_dev_put(pdev);
}
#else
static inline void iommu_prepare_isa(void)
{
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2419
2420 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2421
/*
 * Create and initialize the static-identity (si) domain, attaching it to
 * every active IOMMU under one shared domain id.  When @hw is false
 * (software passthrough), additionally identity-map all online system
 * memory into it.  Returns 0 on success or a negative errno.
 */
static int __init si_domain_init(int hw)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int nid, ret = 0;
	bool first = true;

	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	for_each_active_iommu(iommu, drhd) {
		ret = iommu_attach_domain(si_domain, iommu);
		if (ret < 0) {
			domain_exit(si_domain);
			return -EFAULT;
		} else if (first) {
			/* First IOMMU decides the shared domain id. */
			si_domain->id = ret;
			first = false;
		} else if (si_domain->id != ret) {
			/* All IOMMUs must agree on the si domain id. */
			domain_exit(si_domain);
			return -EFAULT;
		}
		domain_attach_iommu(si_domain, iommu);
	}

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	pr_debug("IOMMU: identity mapping domain is domain %d\n",
		 si_domain->id);

	/* Hardware passthrough needs no page tables at all. */
	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			if (ret)
				return ret;
		}
	}

	return 0;
}
2473
2474 static int identity_mapping(struct device *dev)
2475 {
2476         struct device_domain_info *info;
2477
2478         if (likely(!iommu_identity_mapping))
2479                 return 0;
2480
2481         info = dev->archdata.iommu;
2482         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2483                 return (info->domain == si_domain);
2484
2485         return 0;
2486 }
2487
/*
 * Attach @dev to @domain: record the association and program the context
 * entry with the given CONTEXT_TT_* @translation.  Returns 0, -ENODEV if
 * the device has no IOMMU, -EBUSY if the device already belongs to a
 * different domain, or the context-mapping error (after unwinding).
 */
static int domain_add_dev_info(struct dmar_domain *domain,
			       struct device *dev, int translation)
{
	struct dmar_domain *ndomain;
	struct intel_iommu *iommu;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
	if (ndomain != domain)
		return -EBUSY;

	ret = domain_context_mapping(domain, dev, translation);
	if (ret) {
		/* Roll back the association made above. */
		domain_remove_one_dev_info(domain, dev);
		return ret;
	}

	return 0;
}
2512
2513 static bool device_has_rmrr(struct device *dev)
2514 {
2515         struct dmar_rmrr_unit *rmrr;
2516         struct device *tmp;
2517         int i;
2518
2519         rcu_read_lock();
2520         for_each_rmrr_units(rmrr) {
2521                 /*
2522                  * Return TRUE if this RMRR contains the device that
2523                  * is passed in.
2524                  */
2525                 for_each_active_dev_scope(rmrr->devices,
2526                                           rmrr->devices_cnt, i, tmp)
2527                         if (tmp == dev) {
2528                                 rcu_read_unlock();
2529                                 return true;
2530                         }
2531         }
2532         rcu_read_unlock();
2533         return false;
2534 }
2535
2536 /*
2537  * There are a couple cases where we need to restrict the functionality of
2538  * devices associated with RMRRs.  The first is when evaluating a device for
2539  * identity mapping because problems exist when devices are moved in and out
2540  * of domains and their respective RMRR information is lost.  This means that
2541  * a device with associated RMRRs will never be in a "passthrough" domain.
2542  * The second is use of the device through the IOMMU API.  This interface
2543  * expects to have full control of the IOVA space for the device.  We cannot
2544  * satisfy both the requirement that RMRR access is maintained and have an
2545  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2546  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2547  * We therefore prevent devices associated with an RMRR from participating in
2548  * the IOMMU API, which eliminates them from device assignment.
2549  *
2550  * In both cases we assume that PCI USB devices with RMRRs have them largely
2551  * for historical reasons and that the RMRR space is not actively used post
2552  * boot.  This exclusion may change if vendors begin to abuse it.
2553  *
2554  * The same exception is made for graphics devices, with the requirement that
2555  * any use of the RMRR regions will be torn down before assigning the device
2556  * to a guest.
2557  */
2558 static bool device_is_rmrr_locked(struct device *dev)
2559 {
2560         if (!device_has_rmrr(dev))
2561                 return false;
2562
2563         if (dev_is_pci(dev)) {
2564                 struct pci_dev *pdev = to_pci_dev(dev);
2565
2566                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2567                         return false;
2568         }
2569
2570         return true;
2571 }
2572
/*
 * iommu_should_identity_map - decide whether @dev belongs in the static
 * 1:1 (identity) domain.
 *
 * @dev:     device being evaluated
 * @startup: non-zero when called during boot-time setup (DMA masks are
 *           not yet trustworthy); zero when called at run time.
 *
 * Returns 1 if the device should be identity-mapped, 0 otherwise.
 * The order of the checks below is significant: RMRR lock-out first,
 * then per-class IDENTMAP flags, then the IDENTMAP_ALL policy.
 */
static int iommu_should_identity_map(struct device *dev, int startup)
{

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		/* Devices tied to an RMRR must never be in a passthrough domain. */
		if (device_is_rmrr_locked(dev))
			return 0;

		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
			return 1;

		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
			return 1;

		if (!(iommu_identity_mapping & IDENTMAP_ALL))
			return 0;

		/*
		 * We want to start off with all devices in the 1:1 domain, and
		 * take them out later if we find they can't access all of memory.
		 *
		 * However, we can't do this for PCI devices behind bridges,
		 * because all PCI devices behind the same bridge will end up
		 * with the same source-id on their transactions.
		 *
		 * Practically speaking, we can't change things around for these
		 * devices at run-time, because we can't be sure there'll be no
		 * DMA transactions in flight for any of their siblings.
		 *
		 * So PCI devices (unless they're on the root bus) as well as
		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
		 * the 1:1 domain, just in _case_ one of their siblings turns out
		 * not to be able to map all of memory.
		 */
		if (!pci_is_pcie(pdev)) {
			if (!pci_is_root_bus(pdev->bus))
				return 0;
			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
				return 0;
		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
	} else {
		/* Non-PCI (e.g. ACPI-enumerated) devices: only the RMRR check applies. */
		if (device_has_rmrr(dev))
			return 0;
	}

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will — if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = *dev->dma_mask;

		/* The effective mask is the smaller of streaming and coherent masks. */
		if (dev->coherent_dma_mask &&
		    dev->coherent_dma_mask < dma_mask)
			dma_mask = dev->coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(dev);
	}

	return 1;
}
2641
2642 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2643 {
2644         int ret;
2645
2646         if (!iommu_should_identity_map(dev, 1))
2647                 return 0;
2648
2649         ret = domain_add_dev_info(si_domain, dev,
2650                                   hw ? CONTEXT_TT_PASS_THROUGH :
2651                                        CONTEXT_TT_MULTI_LEVEL);
2652         if (!ret)
2653                 pr_info("IOMMU: %s identity mapping for device %s\n",
2654                         hw ? "hardware" : "software", dev_name(dev));
2655         else if (ret == -ENODEV)
2656                 /* device not associated with an iommu */
2657                 ret = 0;
2658
2659         return ret;
2660 }
2661
2662
2663 static int __init iommu_prepare_static_identity_mapping(int hw)
2664 {
2665         struct pci_dev *pdev = NULL;
2666         struct dmar_drhd_unit *drhd;
2667         struct intel_iommu *iommu;
2668         struct device *dev;
2669         int i;
2670         int ret = 0;
2671
2672         ret = si_domain_init(hw);
2673         if (ret)
2674                 return -EFAULT;
2675
2676         for_each_pci_dev(pdev) {
2677                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2678                 if (ret)
2679                         return ret;
2680         }
2681
2682         for_each_active_iommu(iommu, drhd)
2683                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2684                         struct acpi_device_physical_node *pn;
2685                         struct acpi_device *adev;
2686
2687                         if (dev->bus != &acpi_bus_type)
2688                                 continue;
2689                                 
2690                         adev= to_acpi_device(dev);
2691                         mutex_lock(&adev->physical_node_lock);
2692                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2693                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2694                                 if (ret)
2695                                         break;
2696                         }
2697                         mutex_unlock(&adev->physical_node_lock);
2698                         if (ret)
2699                                 return ret;
2700                 }
2701
2702         return 0;
2703 }
2704
2705 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2706 {
2707         /*
2708          * Start from the sane iommu hardware state.
2709          * If the queued invalidation is already initialized by us
2710          * (for example, while enabling interrupt-remapping) then
2711          * we got the things already rolling from a sane state.
2712          */
2713         if (!iommu->qi) {
2714                 /*
2715                  * Clear any previous faults.
2716                  */
2717                 dmar_fault(-1, iommu);
2718                 /*
2719                  * Disable queued invalidation if supported and already enabled
2720                  * before OS handover.
2721                  */
2722                 dmar_disable_qi(iommu);
2723         }
2724
2725         if (dmar_enable_qi(iommu)) {
2726                 /*
2727                  * Queued Invalidate not enabled, use Register Based Invalidate
2728                  */
2729                 iommu->flush.flush_context = __iommu_flush_context;
2730                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2731                 pr_info("IOMMU: %s using Register based invalidation\n",
2732                         iommu->name);
2733         } else {
2734                 iommu->flush.flush_context = qi_flush_context;
2735                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2736                 pr_info("IOMMU: %s using Queued invalidation\n", iommu->name);
2737         }
2738 }
2739
/*
 * init_dmars - one-time boot initialization of all DMA remapping units.
 *
 * Sizes and allocates the global iommu array and deferred-flush tables,
 * initializes per-iommu domain bookkeeping and root entries, selects
 * the invalidation mechanism, programs static identity and RMRR/ISA
 * mappings, then enables translation on every active unit.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * allocated here is torn down via the goto-cleanup chain at the end.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct device *dev;
	struct intel_iommu *iommu;
	int i, ret;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path all other access are read
		 * only
		 */
		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
			g_num_of_iommus++;
			continue;
		}
		printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
			  DMAR_UNITS_SUPPORTED);
	}

	/* Preallocate enough resources for IOMMU hot-addition */
	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
		g_num_of_iommus = DMAR_UNITS_SUPPORTED;

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			GFP_KERNEL);
	if (!g_iommus) {
		printk(KERN_ERR "Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	deferred_flush = kzalloc(g_num_of_iommus *
		sizeof(struct deferred_flush_tables), GFP_KERNEL);
	if (!deferred_flush) {
		ret = -ENOMEM;
		goto free_g_iommus;
	}

	for_each_active_iommu(iommu, drhd) {
		g_iommus[iommu->seq_id] = iommu;

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;
		/* hw passthrough only works if every unit supports it */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
	}

	for_each_active_iommu(iommu, drhd)
		intel_iommu_init_qi(iommu);

	if (iommu_pass_through)
		iommu_identity_mapping |= IDENTMAP_ALL;

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	iommu_identity_mapping |= IDENTMAP_GFX;
#endif

	check_tylersburg_isoch();

	/*
	 * If pass through is not set or not enabled, setup context entries for
	 * identity mappings for rmrr, gfx, and isa and may fall back to static
	 * identity mapping if iommu_identity_mapping is set.
	 */
	if (iommu_identity_mapping) {
		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
		if (ret) {
			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
			goto free_iommu;
		}
	}
	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *    endfor
	 * endfor
	 */
	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
	for_each_rmrr_units(rmrr) {
		/* some BIOS lists non-exist devices in DMAR table. */
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			ret = iommu_prepare_rmrr_dev(rmrr, dev);
			/* RMRR mapping failures are logged but not fatal */
			if (ret)
				printk(KERN_ERR
				       "IOMMU: mapping reserved region failed\n");
		}
	}

	iommu_prepare_isa();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;

		iommu_set_root_entry(iommu);

		/* Globally invalidate stale context/IOTLB entries before enabling */
		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	kfree(deferred_flush);
free_g_iommus:
	kfree(g_iommus);
error:
	return ret;
}
2903
2904 /* This takes a number of _MM_ pages, not VTD pages */
2905 static struct iova *intel_alloc_iova(struct device *dev,
2906                                      struct dmar_domain *domain,
2907                                      unsigned long nrpages, uint64_t dma_mask)
2908 {
2909         struct iova *iova = NULL;
2910
2911         /* Restrict dma_mask to the width that the iommu can handle */
2912         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2913
2914         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2915                 /*
2916                  * First try to allocate an io virtual address in
2917                  * DMA_BIT_MASK(32) and if that fails then try allocating
2918                  * from higher range
2919                  */
2920                 iova = alloc_iova(&domain->iovad, nrpages,
2921                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2922                 if (iova)
2923                         return iova;
2924         }
2925         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2926         if (unlikely(!iova)) {
2927                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2928                        nrpages, dev_name(dev));
2929                 return NULL;
2930         }
2931
2932         return iova;
2933 }
2934
2935 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2936 {
2937         struct dmar_domain *domain;
2938         int ret;
2939
2940         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2941         if (!domain) {
2942                 printk(KERN_ERR "Allocating domain for %s failed",
2943                        dev_name(dev));
2944                 return NULL;
2945         }
2946
2947         /* make sure context mapping is ok */
2948         if (unlikely(!domain_context_mapped(dev))) {
2949                 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2950                 if (ret) {
2951                         printk(KERN_ERR "Domain context map for %s failed",
2952                                dev_name(dev));
2953                         return NULL;
2954                 }
2955         }
2956
2957         return domain;
2958 }
2959
2960 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2961 {
2962         struct device_domain_info *info;
2963
2964         /* No lock here, assumes no domain exit in normal case */
2965         info = dev->archdata.iommu;
2966         if (likely(info))
2967                 return info->domain;
2968
2969         return __get_valid_domain_for_dev(dev);
2970 }
2971
/*
 * Return non-zero if @dev's archdata carries the DUMMY_DEVICE_DOMAIN_INFO
 * sentinel, i.e. the device is excluded from DMA remapping.
 */
static int iommu_dummy(struct device *dev)
{
	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
2976
2977 /* Check if the dev needs to go through non-identity map and unmap process.*/
2978 static int iommu_no_mapping(struct device *dev)
2979 {
2980         int found;
2981
2982         if (iommu_dummy(dev))
2983                 return 1;
2984
2985         if (!iommu_identity_mapping)
2986                 return 0;
2987
2988         found = identity_mapping(dev);
2989         if (found) {
2990                 if (iommu_should_identity_map(dev, 0))
2991                         return 1;
2992                 else {
2993                         /*
2994                          * 32 bit DMA is removed from si_domain and fall back
2995                          * to non-identity mapping.
2996                          */
2997                         domain_remove_one_dev_info(si_domain, dev);
2998                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2999                                dev_name(dev));
3000                         return 0;
3001                 }
3002         } else {
3003                 /*
3004                  * In case of a detached 64 bit DMA device from vm, the device
3005                  * is put into si_domain for identity mapping.
3006                  */
3007                 if (iommu_should_identity_map(dev, 0)) {
3008                         int ret;
3009                         ret = domain_add_dev_info(si_domain, dev,
3010                                                   hw_pass_through ?
3011                                                   CONTEXT_TT_PASS_THROUGH :
3012                                                   CONTEXT_TT_MULTI_LEVEL);
3013                         if (!ret) {
3014                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
3015                                        dev_name(dev));
3016                                 return 1;
3017                         }
3018                 }
3019         }
3020
3021         return 0;
3022 }
3023
/*
 * __intel_map_single - map @size bytes of physically-contiguous memory
 * at @paddr for DMA by @dev.
 *
 * Returns the resulting bus/DMA address, @paddr unchanged when the
 * device bypasses translation (identity mapped / dummy), or 0 on
 * failure.  @dir is a DMA_* direction; DMA_NONE is a bug.
 */
static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	/* Identity-mapped or dummy devices use the physical address as-is. */
	if (iommu_no_mapping(dev))
		return paddr;

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	/* size becomes a VT-d page count from here on */
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
	else
		iommu_flush_write_buffer(iommu);

	/* Return the bus address including the sub-page offset of paddr. */
	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
		dev_name(dev), size, (unsigned long long)paddr, dir);
	return 0;
}
3088
3089 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3090                                  unsigned long offset, size_t size,
3091                                  enum dma_data_direction dir,
3092                                  struct dma_attrs *attrs)
3093 {
3094         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3095                                   dir, *dev->dma_mask);
3096 }
3097
/*
 * flush_unmaps - drain every per-iommu deferred-flush table.
 *
 * Invalidates the IOTLB for all deferred unmaps, frees their iovas and
 * page freelists, and resets the tables.  Caller must hold
 * async_umap_flush_lock (see flush_unmaps_timeout()/add_unmap()).
 */
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (!deferred_flush[i].next)
			continue;

		/* In caching mode, global flushes turn emulation expensive */
		if (!cap_caching_mode(iommu->cap))
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];
			struct dmar_domain *domain = deferred_flush[i].domain[j];

			/* On real hardware multiple invalidations are expensive */
			if (cap_caching_mode(iommu->cap))
				iommu_flush_iotlb_psi(iommu, domain->id,
					iova->pfn_lo, iova_size(iova),
					!deferred_flush[i].freelist[j], 0);
			else {
				/* global flush already done above; just flush dev-IOTLB */
				mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			}
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
			if (deferred_flush[i].freelist[j])
				dma_free_pagelist(deferred_flush[i].freelist[j]);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
3141
/*
 * Timer callback armed by add_unmap(): drain all deferred IOTLB
 * flushes.  Takes async_umap_flush_lock as flush_unmaps() requires.
 */
static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
3150
/*
 * add_unmap - queue an unmapped range for batched IOTLB invalidation.
 *
 * Records @iova (and its page @freelist, which will be freed after the
 * flush) in the deferred-flush table of @dom's iommu.  Flushes
 * synchronously when HIGH_WATER_MARK entries are pending; otherwise the
 * 10 ms unmap_timer performs the flush.  All state is protected by
 * async_umap_flush_lock.
 */
static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	/* Table full: flush now to make room for this entry. */
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].freelist[next] = freelist;
	deferred_flush[iommu_id].next++;

	/* Arm the flush timer on the first deferred entry. */
	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
3177
3178 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3179 {
3180         struct dmar_domain *domain;
3181         unsigned long start_pfn, last_pfn;
3182         struct iova *iova;
3183         struct intel_iommu *iommu;
3184         struct page *freelist;
3185
3186         if (iommu_no_mapping(dev))
3187                 return;
3188
3189         domain = find_domain(dev);
3190         BUG_ON(!domain);
3191
3192         iommu = domain_get_iommu(domain);
3193
3194         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3195         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3196                       (unsigned long long)dev_addr))
3197                 return;
3198
3199         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3200         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3201
3202         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3203                  dev_name(dev), start_pfn, last_pfn);
3204
3205         freelist = domain_unmap(domain, start_pfn, last_pfn);
3206