drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
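
/*
 * Worked example (editorial, not part of the original source): with the
 * default guest address width gaw = 48 and VTD_PAGE_SHIFT = 12,
 *
 *      __DOMAIN_MAX_PFN(48)  = (1ULL << (48 - 12)) - 1 = 2^36 - 1
 *      __DOMAIN_MAX_ADDR(48) = (1ULL << 48) - 1        = 256 TiB - 1
 *
 * On 64-bit builds DOMAIN_MAX_PFN(48) equals __DOMAIN_MAX_PFN(48); on a
 * 32-bit build the min_t() clamp caps it at ULONG_MAX so PFNs still fit
 * in an unsigned long.
 */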
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware supports
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is a power-of-two multiple of 4KiB and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are a power-of-two multiple of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
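
/*
 * Worked example (editorial): bit n of this bitmap advertises support for
 * a page size of 2^n bytes.  ~0xFFFUL clears bits 0-11 and sets every bit
 * from 12 upwards, i.e. it advertises 4KiB (bit 12), 8KiB (bit 13), 16KiB
 * and so on - every power-of-two size of at least 4KiB, matching the
 * behaviour described above.  Advertising only the real hardware sizes
 * would instead look something like (SZ_4K | SZ_2M | SZ_1G).
 */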
98
99 static inline int agaw_to_level(int agaw)
100 {
101         return agaw + 2;
102 }
103
104 static inline int agaw_to_width(int agaw)
105 {
106         return 30 + agaw * LEVEL_STRIDE;
107 }
108
109 static inline int width_to_agaw(int width)
110 {
111         return (width - 30) / LEVEL_STRIDE;
112 }
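
/*
 * Editorial example of the agaw/width/level relationship above: a 4-level
 * page table covers 48 bits of address space, so
 *
 *      width_to_agaw(48) = (48 - 30) / 9 = 2
 *      agaw_to_width(2)  = 30 + 2 * 9   = 48
 *      agaw_to_level(2)  = 2 + 2        = 4 levels
 *
 * i.e. agaw 0 is a 2-level/30-bit table, and each increment of agaw adds
 * one level and 9 bits of address space.
 */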
113
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116         return (level - 1) * LEVEL_STRIDE;
117 }
118
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123
124 static inline unsigned long level_mask(int level)
125 {
126         return -1UL << level_to_offset_bits(level);
127 }
128
129 static inline unsigned long level_size(int level)
130 {
131         return 1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 {
136         return (pfn + level_size(level) - 1) & level_mask(level);
137 }
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
143
144 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
145    are never going to work. */
146 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 {
148         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149 }
150
151 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 {
153         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 {
157         return mm_to_dma_pfn(page_to_pfn(pg));
158 }
159 static inline unsigned long virt_to_dma_pfn(void *p)
160 {
161         return page_to_dma_pfn(virt_to_page(p));
162 }
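
/*
 * Editorial example: VTD_PAGE_SHIFT is 12 (4KiB VT-d pages).  On a kernel
 * with 4KiB MM pages (PAGE_SHIFT == 12) the two PFN spaces are identical
 * and the shifts above are no-ops.  On a hypothetical 64KiB-page kernel
 * (PAGE_SHIFT == 16) one MM page spans 16 VT-d pages, so
 * mm_to_dma_pfn(5) == 80 and dma_to_mm_pfn(80..95) == 5.
 */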
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
172  * (used when the kernel is launched with TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183         u64     val;
184         u64     rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
187 static inline bool root_present(struct root_entry *root)
188 {
189         return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193         root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197         root->val |= value & VTD_PAGE_MASK;
198 }
199
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203         return (struct context_entry *)
204                 (root_present(root)?phys_to_virt(
205                 root->val & VTD_PAGE_MASK) :
206                 NULL);
207 }
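
/*
 * Editorial sketch of how a freshly cleared root entry is populated by the
 * helpers above (device_to_context_entry() below is the real call site):
 * the physical address of a page-aligned context table is OR'd into bits
 * 12-63 and the present bit is set, e.g.
 *
 *      set_root_value(root, virt_to_phys(context_table));
 *      set_root_present(root);         -> root->val == table_phys | 1
 */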
208
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: aval
218  * 8-23: domain id
219  */
220 struct context_entry {
221         u64 lo;
222         u64 hi;
223 };
224
225 static inline bool context_present(struct context_entry *context)
226 {
227         return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231         context->lo |= 1;
232 }
233
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236         context->lo &= (((u64)-1) << 2) | 1;
237 }
238
239 static inline void context_set_translation_type(struct context_entry *context,
240                                                 unsigned long value)
241 {
242         context->lo &= (((u64)-1) << 4) | 3;
243         context->lo |= (value & 3) << 2;
244 }
245
246 static inline void context_set_address_root(struct context_entry *context,
247                                             unsigned long value)
248 {
249         context->lo |= value & VTD_PAGE_MASK;
250 }
251
252 static inline void context_set_address_width(struct context_entry *context,
253                                              unsigned long value)
254 {
255         context->hi |= value & 7;
256 }
257
258 static inline void context_set_domain_id(struct context_entry *context,
259                                          unsigned long value)
260 {
261         context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266         context->lo = 0;
267         context->hi = 0;
268 }
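
/*
 * Editorial sketch (mirrors what domain_context_mapping_one() does later
 * in this file): a context entry for an ordinary multi-level translation
 * is assembled from the helpers above roughly as
 *
 *      context_set_domain_id(context, id);                     hi[8:23]
 *      context_set_address_root(context, virt_to_phys(pgd));   lo[12:63]
 *      context_set_address_width(context, agaw);               hi[0:2]
 *      context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_fault_enable(context);
 *      context_set_present(context);
 */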
269
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physical address
278  */
279 struct dma_pte {
280         u64 val;
281 };
282
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285         pte->val = 0;
286 }
287
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290         pte->val |= DMA_PTE_READ;
291 }
292
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295         pte->val |= DMA_PTE_WRITE;
296 }
297
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300         pte->val |= DMA_PTE_SNP;
301 }
302
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305         pte->val = (pte->val & ~3) | (prot & 3);
306 }
307
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311         return pte->val & VTD_PAGE_MASK;
312 #else
313         /* Must have a full atomic 64-bit read */
314         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
317
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325         return (pte->val & 3) != 0;
326 }
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
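
/*
 * Editorial example of the PTE layout above: a readable, writable 4KiB
 * mapping of host physical page frame 0x12345 (an arbitrary illustrative
 * value) ends up as
 *
 *      pte->val == ((u64)0x12345 << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE
 *
 * which is what dma_set_pte_pfn() plus the read/write helpers build; a
 * superpage mapping additionally sets bit 7 (DMA_PTE_LARGE_PAGE).
 */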
337
338 /*
339  * This domain is a static identity mapping domain.
340  *      1. This domain creates a static 1:1 mapping to all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu maps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine; more than one device
351  * across iommus may be owned in one domain, e.g. a kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains multiple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
358 /* define the limit of IOMMUs supported in each domain */
359 #ifdef  CONFIG_X86
360 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
361 #else
362 # define        IOMMU_UNITS_SUPPORTED   64
363 #endif
364
365 struct dmar_domain {
366         int     id;                     /* domain id */
367         int     nid;                    /* node id */
368         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
369                                         /* bitmap of iommus this domain uses*/
370
371         struct list_head devices;       /* all devices' list */
372         struct iova_domain iovad;       /* iova's that belong to this domain */
373
374         struct dma_pte  *pgd;           /* virtual address */
375         int             gaw;            /* max guest address width */
376
377         /* adjusted guest address width, 0 is level 2 30-bit */
378         int             agaw;
379
380         int             flags;          /* flags to find out type of domain */
381
382         int             iommu_coherency;/* indicate coherency of iommu access */
383         int             iommu_snooping; /* indicate snooping control feature*/
384         int             iommu_count;    /* reference count of iommu */
385         int             iommu_superpage;/* Level of superpages supported:
386                                            0 == 4KiB (no superpages), 1 == 2MiB,
387                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
388         spinlock_t      iommu_lock;     /* protect iommu set in domain */
389         u64             max_addr;       /* maximum mapped address */
390 };
391
392 /* PCI domain-device relationship */
393 struct device_domain_info {
394         struct list_head link;  /* link to domain siblings */
395         struct list_head global; /* link to global list */
396         int segment;            /* PCI domain */
397         u8 bus;                 /* PCI bus number */
398         u8 devfn;               /* PCI devfn number */
399         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400         struct intel_iommu *iommu; /* IOMMU used by this device */
401         struct dmar_domain *domain; /* pointer to domain */
402 };
403
404 static void flush_unmaps_timeout(unsigned long data);
405
406 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
407
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410         int next;
411         struct iova *iova[HIGH_WATER_MARK];
412         struct dmar_domain *domain[HIGH_WATER_MARK];
413 };
414
415 static struct deferred_flush_tables *deferred_flush;
416
417 /* number of IOMMUs in the system; sizes g_iommus and bounds bitmap searches */
418 static int g_num_of_iommus;
419
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422
423 static int timer_on;
424 static long list_size;
425
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427
428 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429 int dmar_disabled = 0;
430 #else
431 int dmar_disabled = 1;
432 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433
434 int intel_iommu_enabled = 0;
435 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436
437 static int dmar_map_gfx = 1;
438 static int dmar_forcedac;
439 static int intel_iommu_strict;
440 static int intel_iommu_superpage = 1;
441
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448
449 static struct iommu_ops intel_iommu_ops;
450
451 static int __init intel_iommu_setup(char *str)
452 {
453         if (!str)
454                 return -EINVAL;
455         while (*str) {
456                 if (!strncmp(str, "on", 2)) {
457                         dmar_disabled = 0;
458                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
459                 } else if (!strncmp(str, "off", 3)) {
460                         dmar_disabled = 1;
461                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: disable GFX device mapping\n");
466                 } else if (!strncmp(str, "forcedac", 8)) {
467                         printk(KERN_INFO
468                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
469                         dmar_forcedac = 1;
470                 } else if (!strncmp(str, "strict", 6)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: disable batched IOTLB flush\n");
473                         intel_iommu_strict = 1;
474                 } else if (!strncmp(str, "sp_off", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable supported super page\n");
477                         intel_iommu_superpage = 0;
478                 }
479
480                 str += strcspn(str, ",");
481                 while (*str == ',')
482                         str++;
483         }
484         return 0;
485 }
486 __setup("intel_iommu=", intel_iommu_setup);
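
/*
 * Usage note (editorial): the options parsed above are comma-separated on
 * the kernel command line, e.g.
 *
 *      intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage use in one go.
 */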
487
488 static struct kmem_cache *iommu_domain_cache;
489 static struct kmem_cache *iommu_devinfo_cache;
490 static struct kmem_cache *iommu_iova_cache;
491
492 static inline void *alloc_pgtable_page(int node)
493 {
494         struct page *page;
495         void *vaddr = NULL;
496
497         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498         if (page)
499                 vaddr = page_address(page);
500         return vaddr;
501 }
502
503 static inline void free_pgtable_page(void *vaddr)
504 {
505         free_page((unsigned long)vaddr);
506 }
507
508 static inline void *alloc_domain_mem(void)
509 {
510         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512
513 static void free_domain_mem(void *vaddr)
514 {
515         kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517
518 static inline void * alloc_devinfo_mem(void)
519 {
520         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527
528 struct iova *alloc_iova_mem(void)
529 {
530         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 }
532
533 void free_iova_mem(struct iova *iova)
534 {
535         kmem_cache_free(iommu_iova_cache, iova);
536 }
537
538
539 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 {
541         unsigned long sagaw;
542         int agaw = -1;
543
544         sagaw = cap_sagaw(iommu->cap);
545         for (agaw = width_to_agaw(max_gaw);
546              agaw >= 0; agaw--) {
547                 if (test_bit(agaw, &sagaw))
548                         break;
549         }
550
551         return agaw;
552 }
553
554 /*
555  * Calculate max SAGAW for each iommu.
556  */
557 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 {
559         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560 }
561
562 /*
563  * calculate agaw for each iommu.
564  * "SAGAW" may be different across iommus; use a default agaw, and
565  * fall back to a smaller supported agaw for iommus that don't support the default.
566  */
567 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 {
569         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570 }
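
/*
 * Editorial example of the SAGAW walk in __iommu_calculate_agaw(): if the
 * capability register reports sagaw == 0x4 (only bit 2 set, i.e. only
 * 4-level/48-bit tables supported), then for the default 48-bit width the
 * loop starts at agaw = width_to_agaw(48) = 2, finds bit 2 set and returns
 * 2.  If no supported agaw at or below the requested width exists, the
 * loop falls through and -1 is returned.
 */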
571
572 /* This function only returns a single iommu in a domain */
573 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 {
575         int iommu_id;
576
577         /* si_domain and vm domain should not get here. */
578         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580
581         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583                 return NULL;
584
585         return g_iommus[iommu_id];
586 }
587
588 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 {
590         int i;
591
592         i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
593
594         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
595
596         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
597                 if (!ecap_coherent(g_iommus[i]->ecap)) {
598                         domain->iommu_coherency = 0;
599                         break;
600                 }
601         }
602 }
603
604 static void domain_update_iommu_snooping(struct dmar_domain *domain)
605 {
606         int i;
607
608         domain->iommu_snooping = 1;
609
610         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
611                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
612                         domain->iommu_snooping = 0;
613                         break;
614                 }
615         }
616 }
617
618 static void domain_update_iommu_superpage(struct dmar_domain *domain)
619 {
620         struct dmar_drhd_unit *drhd;
621         struct intel_iommu *iommu = NULL;
622         int mask = 0xf;
623
624         if (!intel_iommu_superpage) {
625                 domain->iommu_superpage = 0;
626                 return;
627         }
628
629         /* set iommu_superpage to the smallest common denominator */
630         for_each_active_iommu(iommu, drhd) {
631                 mask &= cap_super_page_val(iommu->cap);
632                 if (!mask) {
633                         break;
634                 }
635         }
636         domain->iommu_superpage = fls(mask);
637 }
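
/*
 * Editorial example of the mask logic above: with two active iommus whose
 * cap_super_page_val() values are 0x3 (2MiB and 1GiB) and 0x1 (2MiB only),
 * the running AND leaves mask == 0x1 and fls(0x1) == 1, so the domain is
 * limited to 2MiB superpages, the largest level every iommu can handle.
 */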
638
639 /* Some capabilities may be different across iommus */
640 static void domain_update_iommu_cap(struct dmar_domain *domain)
641 {
642         domain_update_iommu_coherency(domain);
643         domain_update_iommu_snooping(domain);
644         domain_update_iommu_superpage(domain);
645 }
646
647 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
648 {
649         struct dmar_drhd_unit *drhd = NULL;
650         int i;
651
652         for_each_drhd_unit(drhd) {
653                 if (drhd->ignored)
654                         continue;
655                 if (segment != drhd->segment)
656                         continue;
657
658                 for (i = 0; i < drhd->devices_cnt; i++) {
659                         if (drhd->devices[i] &&
660                             drhd->devices[i]->bus->number == bus &&
661                             drhd->devices[i]->devfn == devfn)
662                                 return drhd->iommu;
663                         if (drhd->devices[i] &&
664                             drhd->devices[i]->subordinate &&
665                             drhd->devices[i]->subordinate->number <= bus &&
666                             drhd->devices[i]->subordinate->busn_res.end >= bus)
667                                 return drhd->iommu;
668                 }
669
670                 if (drhd->include_all)
671                         return drhd->iommu;
672         }
673
674         return NULL;
675 }
676
677 static void domain_flush_cache(struct dmar_domain *domain,
678                                void *addr, int size)
679 {
680         if (!domain->iommu_coherency)
681                 clflush_cache_range(addr, size);
682 }
683
684 /* Gets context entry for a given bus and devfn */
685 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
686                 u8 bus, u8 devfn)
687 {
688         struct root_entry *root;
689         struct context_entry *context;
690         unsigned long phy_addr;
691         unsigned long flags;
692
693         spin_lock_irqsave(&iommu->lock, flags);
694         root = &iommu->root_entry[bus];
695         context = get_context_addr_from_root(root);
696         if (!context) {
697                 context = (struct context_entry *)
698                                 alloc_pgtable_page(iommu->node);
699                 if (!context) {
700                         spin_unlock_irqrestore(&iommu->lock, flags);
701                         return NULL;
702                 }
703                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
704                 phy_addr = virt_to_phys((void *)context);
705                 set_root_value(root, phy_addr);
706                 set_root_present(root);
707                 __iommu_flush_cache(iommu, root, sizeof(*root));
708         }
709         spin_unlock_irqrestore(&iommu->lock, flags);
710         return &context[devfn];
711 }
712
713 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
714 {
715         struct root_entry *root;
716         struct context_entry *context;
717         int ret;
718         unsigned long flags;
719
720         spin_lock_irqsave(&iommu->lock, flags);
721         root = &iommu->root_entry[bus];
722         context = get_context_addr_from_root(root);
723         if (!context) {
724                 ret = 0;
725                 goto out;
726         }
727         ret = context_present(&context[devfn]);
728 out:
729         spin_unlock_irqrestore(&iommu->lock, flags);
730         return ret;
731 }
732
733 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
734 {
735         struct root_entry *root;
736         struct context_entry *context;
737         unsigned long flags;
738
739         spin_lock_irqsave(&iommu->lock, flags);
740         root = &iommu->root_entry[bus];
741         context = get_context_addr_from_root(root);
742         if (context) {
743                 context_clear_entry(&context[devfn]);
744                 __iommu_flush_cache(iommu, &context[devfn], \
745                         sizeof(*context));
746         }
747         spin_unlock_irqrestore(&iommu->lock, flags);
748 }
749
750 static void free_context_table(struct intel_iommu *iommu)
751 {
752         struct root_entry *root;
753         int i;
754         unsigned long flags;
755         struct context_entry *context;
756
757         spin_lock_irqsave(&iommu->lock, flags);
758         if (!iommu->root_entry) {
759                 goto out;
760         }
761         for (i = 0; i < ROOT_ENTRY_NR; i++) {
762                 root = &iommu->root_entry[i];
763                 context = get_context_addr_from_root(root);
764                 if (context)
765                         free_pgtable_page(context);
766         }
767         free_pgtable_page(iommu->root_entry);
768         iommu->root_entry = NULL;
769 out:
770         spin_unlock_irqrestore(&iommu->lock, flags);
771 }
772
773 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
774                                       unsigned long pfn, int target_level)
775 {
776         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
777         struct dma_pte *parent, *pte = NULL;
778         int level = agaw_to_level(domain->agaw);
779         int offset;
780
781         BUG_ON(!domain->pgd);
782         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
783         parent = domain->pgd;
784
785         while (level > 0) {
786                 void *tmp_page;
787
788                 offset = pfn_level_offset(pfn, level);
789                 pte = &parent[offset];
790                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
791                         break;
792                 if (level == target_level)
793                         break;
794
795                 if (!dma_pte_present(pte)) {
796                         uint64_t pteval;
797
798                         tmp_page = alloc_pgtable_page(domain->nid);
799
800                         if (!tmp_page)
801                                 return NULL;
802
803                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
804                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
805                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
806                                 /* Someone else set it while we were thinking; use theirs. */
807                                 free_pgtable_page(tmp_page);
808                         } else {
809                                 dma_pte_addr(pte);
810                                 domain_flush_cache(domain, pte, sizeof(*pte));
811                         }
812                 }
813                 parent = phys_to_virt(dma_pte_addr(pte));
814                 level--;
815         }
816
817         return pte;
818 }
819
820
821 /* return address's pte at specific level */
822 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
823                                          unsigned long pfn,
824                                          int level, int *large_page)
825 {
826         struct dma_pte *parent, *pte = NULL;
827         int total = agaw_to_level(domain->agaw);
828         int offset;
829
830         parent = domain->pgd;
831         while (level <= total) {
832                 offset = pfn_level_offset(pfn, total);
833                 pte = &parent[offset];
834                 if (level == total)
835                         return pte;
836
837                 if (!dma_pte_present(pte)) {
838                         *large_page = total;
839                         break;
840                 }
841
842                 if (pte->val & DMA_PTE_LARGE_PAGE) {
843                         *large_page = total;
844                         return pte;
845                 }
846
847                 parent = phys_to_virt(dma_pte_addr(pte));
848                 total--;
849         }
850         return NULL;
851 }
852
853 /* clear last level pte; a tlb flush should follow */
854 static int dma_pte_clear_range(struct dmar_domain *domain,
855                                 unsigned long start_pfn,
856                                 unsigned long last_pfn)
857 {
858         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
859         unsigned int large_page = 1;
860         struct dma_pte *first_pte, *pte;
861         int order;
862
863         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
864         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
865         BUG_ON(start_pfn > last_pfn);
866
867         /* we don't need lock here; nobody else touches the iova range */
868         do {
869                 large_page = 1;
870                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
871                 if (!pte) {
872                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
873                         continue;
874                 }
875                 do {
876                         dma_clear_pte(pte);
877                         start_pfn += lvl_to_nr_pages(large_page);
878                         pte++;
879                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
880
881                 domain_flush_cache(domain, first_pte,
882                                    (void *)pte - (void *)first_pte);
883
884         } while (start_pfn && start_pfn <= last_pfn);
885
886         order = (large_page - 1) * 9;
887         return order;
888 }
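
/*
 * Editorial note on the return value: "order" reflects the size of the
 * last PTE cleared.  For ordinary 4KiB PTEs large_page stays 1 and the
 * function returns 0; if the final entry was a 2MiB superpage
 * (large_page == 2) it returns (2 - 1) * 9 == 9, an order covering
 * 512 4KiB pages.
 */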
889
890 /* free page table pages. last level pte should already be cleared */
891 static void dma_pte_free_pagetable(struct dmar_domain *domain,
892                                    unsigned long start_pfn,
893                                    unsigned long last_pfn)
894 {
895         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
896         struct dma_pte *first_pte, *pte;
897         int total = agaw_to_level(domain->agaw);
898         int level;
899         unsigned long tmp;
900         int large_page = 2;
901
902         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
903         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
904         BUG_ON(start_pfn > last_pfn);
905
906         /* We don't need lock here; nobody else touches the iova range */
907         level = 2;
908         while (level <= total) {
909                 tmp = align_to_level(start_pfn, level);
910
911                 /* If we can't even clear one PTE at this level, we're done */
912                 if (tmp + level_size(level) - 1 > last_pfn)
913                         return;
914
915                 do {
916                         large_page = level;
917                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
918                         if (large_page > level)
919                                 level = large_page + 1;
920                         if (!pte) {
921                                 tmp = align_to_level(tmp + 1, level + 1);
922                                 continue;
923                         }
924                         do {
925                                 if (dma_pte_present(pte)) {
926                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
927                                         dma_clear_pte(pte);
928                                 }
929                                 pte++;
930                                 tmp += level_size(level);
931                         } while (!first_pte_in_page(pte) &&
932                                  tmp + level_size(level) - 1 <= last_pfn);
933
934                         domain_flush_cache(domain, first_pte,
935                                            (void *)pte - (void *)first_pte);
936                         
937                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
938                 level++;
939         }
940         /* free pgd */
941         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
942                 free_pgtable_page(domain->pgd);
943                 domain->pgd = NULL;
944         }
945 }
946
947 /* iommu handling */
948 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
949 {
950         struct root_entry *root;
951         unsigned long flags;
952
953         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
954         if (!root)
955                 return -ENOMEM;
956
957         __iommu_flush_cache(iommu, root, ROOT_SIZE);
958
959         spin_lock_irqsave(&iommu->lock, flags);
960         iommu->root_entry = root;
961         spin_unlock_irqrestore(&iommu->lock, flags);
962
963         return 0;
964 }
965
966 static void iommu_set_root_entry(struct intel_iommu *iommu)
967 {
968         void *addr;
969         u32 sts;
970         unsigned long flag;
971
972         addr = iommu->root_entry;
973
974         raw_spin_lock_irqsave(&iommu->register_lock, flag);
975         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
976
977         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
978
979         /* Make sure hardware completes it */
980         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
981                       readl, (sts & DMA_GSTS_RTPS), sts);
982
983         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
984 }
985
986 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
987 {
988         u32 val;
989         unsigned long flag;
990
991         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
992                 return;
993
994         raw_spin_lock_irqsave(&iommu->register_lock, flag);
995         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
996
997         /* Make sure hardware completes it */
998         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
999                       readl, (!(val & DMA_GSTS_WBFS)), val);
1000
1001         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1002 }
1003
1004 /* return value determines if we need a write buffer flush */
1005 static void __iommu_flush_context(struct intel_iommu *iommu,
1006                                   u16 did, u16 source_id, u8 function_mask,
1007                                   u64 type)
1008 {
1009         u64 val = 0;
1010         unsigned long flag;
1011
1012         switch (type) {
1013         case DMA_CCMD_GLOBAL_INVL:
1014                 val = DMA_CCMD_GLOBAL_INVL;
1015                 break;
1016         case DMA_CCMD_DOMAIN_INVL:
1017                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1018                 break;
1019         case DMA_CCMD_DEVICE_INVL:
1020                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1021                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1022                 break;
1023         default:
1024                 BUG();
1025         }
1026         val |= DMA_CCMD_ICC;
1027
1028         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1029         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1030
1031         /* Make sure hardware completes it */
1032         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1033                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1034
1035         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1036 }
1037
1038 /* return value determines if we need a write buffer flush */
1039 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1040                                 u64 addr, unsigned int size_order, u64 type)
1041 {
1042         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1043         u64 val = 0, val_iva = 0;
1044         unsigned long flag;
1045
1046         switch (type) {
1047         case DMA_TLB_GLOBAL_FLUSH:
1048                 /* global flush doesn't need to set IVA_REG */
1049                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1050                 break;
1051         case DMA_TLB_DSI_FLUSH:
1052                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053                 break;
1054         case DMA_TLB_PSI_FLUSH:
1055                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056                 /* Note: always flush non-leaf currently */
1057                 val_iva = size_order | addr;
1058                 break;
1059         default:
1060                 BUG();
1061         }
1062         /* Note: set drain read/write */
1063 #if 0
1064         /*
1065          * This is probably only needed to be extra safe.  It looks like we
1066          * can ignore it without any impact.
1067          */
1068         if (cap_read_drain(iommu->cap))
1069                 val |= DMA_TLB_READ_DRAIN;
1070 #endif
1071         if (cap_write_drain(iommu->cap))
1072                 val |= DMA_TLB_WRITE_DRAIN;
1073
1074         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1075         /* Note: Only uses first TLB reg currently */
1076         if (val_iva)
1077                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1078         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1079
1080         /* Make sure hardware completes it */
1081         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1082                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1083
1084         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085
1086         /* check IOTLB invalidation granularity */
1087         if (DMA_TLB_IAIG(val) == 0)
1088                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1089         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1090                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1091                         (unsigned long long)DMA_TLB_IIRG(type),
1092                         (unsigned long long)DMA_TLB_IAIG(val));
1093 }
1094
1095 static struct device_domain_info *iommu_support_dev_iotlb(
1096         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1097 {
1098         int found = 0;
1099         unsigned long flags;
1100         struct device_domain_info *info;
1101         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1102
1103         if (!ecap_dev_iotlb_support(iommu->ecap))
1104                 return NULL;
1105
1106         if (!iommu->qi)
1107                 return NULL;
1108
1109         spin_lock_irqsave(&device_domain_lock, flags);
1110         list_for_each_entry(info, &domain->devices, link)
1111                 if (info->bus == bus && info->devfn == devfn) {
1112                         found = 1;
1113                         break;
1114                 }
1115         spin_unlock_irqrestore(&device_domain_lock, flags);
1116
1117         if (!found || !info->dev)
1118                 return NULL;
1119
1120         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1121                 return NULL;
1122
1123         if (!dmar_find_matched_atsr_unit(info->dev))
1124                 return NULL;
1125
1126         info->iommu = iommu;
1127
1128         return info;
1129 }
1130
1131 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1132 {
1133         if (!info)
1134                 return;
1135
1136         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1137 }
1138
1139 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1140 {
1141         if (!info->dev || !pci_ats_enabled(info->dev))
1142                 return;
1143
1144         pci_disable_ats(info->dev);
1145 }
1146
1147 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1148                                   u64 addr, unsigned mask)
1149 {
1150         u16 sid, qdep;
1151         unsigned long flags;
1152         struct device_domain_info *info;
1153
1154         spin_lock_irqsave(&device_domain_lock, flags);
1155         list_for_each_entry(info, &domain->devices, link) {
1156                 if (!info->dev || !pci_ats_enabled(info->dev))
1157                         continue;
1158
1159                 sid = info->bus << 8 | info->devfn;
1160                 qdep = pci_ats_queue_depth(info->dev);
1161                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1162         }
1163         spin_unlock_irqrestore(&device_domain_lock, flags);
1164 }
1165
1166 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1167                                   unsigned long pfn, unsigned int pages, int map)
1168 {
1169         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1170         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1171
1172         BUG_ON(pages == 0);
1173
1174         /*
1175          * Fall back to a domain-selective flush if there is no PSI support or
1176          * the size is too big.
1177          * PSI requires the number of pages to be a power of two, and the base
1178          * address to be naturally aligned to that size.
1179          */
1180         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1181                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1182                                                 DMA_TLB_DSI_FLUSH);
1183         else
1184                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1185                                                 DMA_TLB_PSI_FLUSH);
1186
1187         /*
1188          * In caching mode, changes of pages from non-present to present require a
1189          * flush. However, the device IOTLB doesn't need to be flushed in this case.
1190          */
1191         if (!cap_caching_mode(iommu->cap) || !map)
1192                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1193 }
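
/*
 * Editorial example of the PSI mask computation above: a request to flush
 * 9 pages gives mask = ilog2(__roundup_pow_of_two(9)) = ilog2(16) = 4, so
 * the hardware is asked to invalidate a naturally aligned block of
 * 2^4 = 16 pages covering the range; slight over-invalidation is the
 * price of the power-of-two requirement.
 */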
1194
1195 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1196 {
1197         u32 pmen;
1198         unsigned long flags;
1199
1200         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1201         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1202         pmen &= ~DMA_PMEN_EPM;
1203         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1204
1205         /* wait for the protected region status bit to clear */
1206         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1207                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1208
1209         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1210 }
1211
1212 static int iommu_enable_translation(struct intel_iommu *iommu)
1213 {
1214         u32 sts;
1215         unsigned long flags;
1216
1217         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218         iommu->gcmd |= DMA_GCMD_TE;
1219         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure hardware completes it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_TES), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226         return 0;
1227 }
1228
1229 static int iommu_disable_translation(struct intel_iommu *iommu)
1230 {
1231         u32 sts;
1232         unsigned long flag;
1233
1234         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1235         iommu->gcmd &= ~DMA_GCMD_TE;
1236         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
1238         /* Make sure hardware completes it */
1239         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240                       readl, (!(sts & DMA_GSTS_TES)), sts);
1241
1242         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243         return 0;
1244 }
1245
1246
1247 static int iommu_init_domains(struct intel_iommu *iommu)
1248 {
1249         unsigned long ndomains;
1250         unsigned long nlongs;
1251
1252         ndomains = cap_ndoms(iommu->cap);
1253         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1254                         ndomains);
1255         nlongs = BITS_TO_LONGS(ndomains);
1256
1257         spin_lock_init(&iommu->lock);
1258
1259         /* TBD: there might be 64K domains,
1260          * consider another allocation scheme for future chips
1261          */
1262         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1263         if (!iommu->domain_ids) {
1264                 printk(KERN_ERR "Allocating domain id array failed\n");
1265                 return -ENOMEM;
1266         }
1267         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1268                         GFP_KERNEL);
1269         if (!iommu->domains) {
1270                 printk(KERN_ERR "Allocating domain array failed\n");
1271                 return -ENOMEM;
1272         }
1273
1274         /*
1275          * If Caching mode is set, then invalid translations are tagged
1276          * with domain id 0. Hence we need to pre-allocate it.
1277          */
1278         if (cap_caching_mode(iommu->cap))
1279                 set_bit(0, iommu->domain_ids);
1280         return 0;
1281 }
1282
1283
1284 static void domain_exit(struct dmar_domain *domain);
1285 static void vm_domain_exit(struct dmar_domain *domain);
1286
1287 void free_dmar_iommu(struct intel_iommu *iommu)
1288 {
1289         struct dmar_domain *domain;
1290         int i;
1291         unsigned long flags;
1292
1293         if ((iommu->domains) && (iommu->domain_ids)) {
1294                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1295                         domain = iommu->domains[i];
1296                         clear_bit(i, iommu->domain_ids);
1297
1298                         spin_lock_irqsave(&domain->iommu_lock, flags);
1299                         if (--domain->iommu_count == 0) {
1300                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1301                                         vm_domain_exit(domain);
1302                                 else
1303                                         domain_exit(domain);
1304                         }
1305                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1306                 }
1307         }
1308
1309         if (iommu->gcmd & DMA_GCMD_TE)
1310                 iommu_disable_translation(iommu);
1311
1312         if (iommu->irq) {
1313                 irq_set_handler_data(iommu->irq, NULL);
1314                 /* This will mask the irq */
1315                 free_irq(iommu->irq, iommu);
1316                 destroy_irq(iommu->irq);
1317         }
1318
1319         kfree(iommu->domains);
1320         kfree(iommu->domain_ids);
1321
1322         g_iommus[iommu->seq_id] = NULL;
1323
1324         /* if all iommus are freed, free g_iommus */
1325         for (i = 0; i < g_num_of_iommus; i++) {
1326                 if (g_iommus[i])
1327                         break;
1328         }
1329
1330         if (i == g_num_of_iommus)
1331                 kfree(g_iommus);
1332
1333         /* free context mapping */
1334         free_context_table(iommu);
1335 }
1336
1337 static struct dmar_domain *alloc_domain(void)
1338 {
1339         struct dmar_domain *domain;
1340
1341         domain = alloc_domain_mem();
1342         if (!domain)
1343                 return NULL;
1344
1345         domain->nid = -1;
1346         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1347         domain->flags = 0;
1348
1349         return domain;
1350 }
1351
1352 static int iommu_attach_domain(struct dmar_domain *domain,
1353                                struct intel_iommu *iommu)
1354 {
1355         int num;
1356         unsigned long ndomains;
1357         unsigned long flags;
1358
1359         ndomains = cap_ndoms(iommu->cap);
1360
1361         spin_lock_irqsave(&iommu->lock, flags);
1362
1363         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1364         if (num >= ndomains) {
1365                 spin_unlock_irqrestore(&iommu->lock, flags);
1366                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1367                 return -ENOMEM;
1368         }
1369
1370         domain->id = num;
1371         set_bit(num, iommu->domain_ids);
1372         set_bit(iommu->seq_id, domain->iommu_bmp);
1373         iommu->domains[num] = domain;
1374         spin_unlock_irqrestore(&iommu->lock, flags);
1375
1376         return 0;
1377 }
1378
1379 static void iommu_detach_domain(struct dmar_domain *domain,
1380                                 struct intel_iommu *iommu)
1381 {
1382         unsigned long flags;
1383         int num, ndomains;
1384         int found = 0;
1385
1386         spin_lock_irqsave(&iommu->lock, flags);
1387         ndomains = cap_ndoms(iommu->cap);
1388         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1389                 if (iommu->domains[num] == domain) {
1390                         found = 1;
1391                         break;
1392                 }
1393         }
1394
1395         if (found) {
1396                 clear_bit(num, iommu->domain_ids);
1397                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1398                 iommu->domains[num] = NULL;
1399         }
1400         spin_unlock_irqrestore(&iommu->lock, flags);
1401 }
1402
1403 static struct iova_domain reserved_iova_list;
1404 static struct lock_class_key reserved_rbtree_key;
1405
1406 static int dmar_init_reserved_ranges(void)
1407 {
1408         struct pci_dev *pdev = NULL;
1409         struct iova *iova;
1410         int i;
1411
1412         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1413
1414         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1415                 &reserved_rbtree_key);
1416
1417         /* IOAPIC ranges shouldn't be accessed by DMA */
1418         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1419                 IOVA_PFN(IOAPIC_RANGE_END));
1420         if (!iova) {
1421                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1422                 return -ENODEV;
1423         }
1424
1425         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1426         for_each_pci_dev(pdev) {
1427                 struct resource *r;
1428
1429                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1430                         r = &pdev->resource[i];
1431                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1432                                 continue;
1433                         iova = reserve_iova(&reserved_iova_list,
1434                                             IOVA_PFN(r->start),
1435                                             IOVA_PFN(r->end));
1436                         if (!iova) {
1437                                 printk(KERN_ERR "Reserve iova failed\n");
1438                                 return -ENODEV;
1439                         }
1440                 }
1441         }
1442         return 0;
1443 }
1444
1445 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1446 {
1447         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1448 }
1449
1450 static inline int guestwidth_to_adjustwidth(int gaw)
1451 {
1452         int agaw;
1453         int r = (gaw - 12) % 9;
1454
1455         if (r == 0)
1456                 agaw = gaw;
1457         else
1458                 agaw = gaw + 9 - r;
1459         if (agaw > 64)
1460                 agaw = 64;
1461         return agaw;
1462 }
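
/*
 * Editorial examples of the rounding above: guest widths are rounded up to
 * the next value the page-table levels can express (12 + 9*n bits), capped
 * at 64:
 *
 *      gaw = 39  ->  r = (39 - 12) % 9 = 0  ->  agaw = 39
 *      gaw = 40  ->  r = (40 - 12) % 9 = 1  ->  agaw = 40 + 9 - 1 = 48
 *      gaw = 62  ->  r = (62 - 12) % 9 = 5  ->  agaw = 62 + 9 - 5 = 66, capped to 64
 */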
1463
1464 static int domain_init(struct dmar_domain *domain, int guest_width)
1465 {
1466         struct intel_iommu *iommu;
1467         int adjust_width, agaw;
1468         unsigned long sagaw;
1469
1470         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1471         spin_lock_init(&domain->iommu_lock);
1472
1473         domain_reserve_special_ranges(domain);
1474
1475         /* calculate AGAW */
1476         iommu = domain_get_iommu(domain);
1477         if (guest_width > cap_mgaw(iommu->cap))
1478                 guest_width = cap_mgaw(iommu->cap);
1479         domain->gaw = guest_width;
1480         adjust_width = guestwidth_to_adjustwidth(guest_width);
1481         agaw = width_to_agaw(adjust_width);
1482         sagaw = cap_sagaw(iommu->cap);
1483         if (!test_bit(agaw, &sagaw)) {
1484                 /* hardware doesn't support it, choose a bigger one */
1485                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1486                 agaw = find_next_bit(&sagaw, 5, agaw);
1487                 if (agaw >= 5)
1488                         return -ENODEV;
1489         }
1490         domain->agaw = agaw;
1491         INIT_LIST_HEAD(&domain->devices);
1492
1493         if (ecap_coherent(iommu->ecap))
1494                 domain->iommu_coherency = 1;
1495         else
1496                 domain->iommu_coherency = 0;
1497
1498         if (ecap_sc_support(iommu->ecap))
1499                 domain->iommu_snooping = 1;
1500         else
1501                 domain->iommu_snooping = 0;
1502
1503         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1504         domain->iommu_count = 1;
1505         domain->nid = iommu->node;
1506
1507         /* always allocate the top pgd */
1508         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1509         if (!domain->pgd)
1510                 return -ENOMEM;
1511         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1512         return 0;
1513 }
1514
1515 static void domain_exit(struct dmar_domain *domain)
1516 {
1517         struct dmar_drhd_unit *drhd;
1518         struct intel_iommu *iommu;
1519
1520         /* Domain 0 is reserved, so don't process it */
1521         if (!domain)
1522                 return;
1523
1524         /* Flush any lazy unmaps that may reference this domain */
1525         if (!intel_iommu_strict)
1526                 flush_unmaps_timeout(0);
1527
1528         domain_remove_dev_info(domain);
1529         /* destroy iovas */
1530         put_iova_domain(&domain->iovad);
1531
1532         /* clear ptes */
1533         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534
1535         /* free page tables */
1536         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537
1538         for_each_active_iommu(iommu, drhd)
1539                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1540                         iommu_detach_domain(domain, iommu);
1541
1542         free_domain_mem(domain);
1543 }
1544
1545 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1546                                  u8 bus, u8 devfn, int translation)
1547 {
1548         struct context_entry *context;
1549         unsigned long flags;
1550         struct intel_iommu *iommu;
1551         struct dma_pte *pgd;
1552         unsigned long num;
1553         unsigned long ndomains;
1554         int id;
1555         int agaw;
1556         struct device_domain_info *info = NULL;
1557
1558         pr_debug("Set context mapping for %02x:%02x.%d\n",
1559                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560
1561         BUG_ON(!domain->pgd);
1562         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1563                translation != CONTEXT_TT_MULTI_LEVEL);
1564
1565         iommu = device_to_iommu(segment, bus, devfn);
1566         if (!iommu)
1567                 return -ENODEV;
1568
1569         context = device_to_context_entry(iommu, bus, devfn);
1570         if (!context)
1571                 return -ENOMEM;
1572         spin_lock_irqsave(&iommu->lock, flags);
1573         if (context_present(context)) {
1574                 spin_unlock_irqrestore(&iommu->lock, flags);
1575                 return 0;
1576         }
1577
1578         id = domain->id;
1579         pgd = domain->pgd;
1580
1581         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1582             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1583                 int found = 0;
1584
1585                 /* find an available domain id for this device in iommu */
1586                 ndomains = cap_ndoms(iommu->cap);
1587                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1588                         if (iommu->domains[num] == domain) {
1589                                 id = num;
1590                                 found = 1;
1591                                 break;
1592                         }
1593                 }
1594
1595                 if (found == 0) {
1596                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1597                         if (num >= ndomains) {
1598                                 spin_unlock_irqrestore(&iommu->lock, flags);
1599                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1600                                 return -EFAULT;
1601                         }
1602
1603                         set_bit(num, iommu->domain_ids);
1604                         iommu->domains[num] = domain;
1605                         id = num;
1606                 }
1607
1608                 /* Skip top levels of page tables for an
1609                  * iommu which has a smaller agaw than the default.
1610                  * Unnecessary for PT mode.
1611                  */
1612                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1613                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1614                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1615                                 if (!dma_pte_present(pgd)) {
1616                                         spin_unlock_irqrestore(&iommu->lock, flags);
1617                                         return -ENOMEM;
1618                                 }
1619                         }
1620                 }
1621         }
1622
1623         context_set_domain_id(context, id);
1624
1625         if (translation != CONTEXT_TT_PASS_THROUGH) {
1626                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1627                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1628                                      CONTEXT_TT_MULTI_LEVEL;
1629         }
1630         /*
1631          * In pass through mode, AW must be programmed to indicate the largest
1632          * AGAW value supported by hardware. And ASR is ignored by hardware.
1633          */
1634         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1635                 context_set_address_width(context, iommu->msagaw);
1636         else {
1637                 context_set_address_root(context, virt_to_phys(pgd));
1638                 context_set_address_width(context, iommu->agaw);
1639         }
1640
1641         context_set_translation_type(context, translation);
1642         context_set_fault_enable(context);
1643         context_set_present(context);
1644         domain_flush_cache(domain, context, sizeof(*context));
1645
1646         /*
1647          * It's a non-present to present mapping. If hardware doesn't cache
1648          * non-present entries we only need to flush the write-buffer. If it
1649          * _does_ cache non-present entries, then it does so in the special
1650          * domain #0, which we have to flush:
1651          */
1652         if (cap_caching_mode(iommu->cap)) {
1653                 iommu->flush.flush_context(iommu, 0,
1654                                            (((u16)bus) << 8) | devfn,
1655                                            DMA_CCMD_MASK_NOBIT,
1656                                            DMA_CCMD_DEVICE_INVL);
1657                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1658         } else {
1659                 iommu_flush_write_buffer(iommu);
1660         }
1661         iommu_enable_dev_iotlb(info);
1662         spin_unlock_irqrestore(&iommu->lock, flags);
1663
1664         spin_lock_irqsave(&domain->iommu_lock, flags);
1665         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1666                 domain->iommu_count++;
1667                 if (domain->iommu_count == 1)
1668                         domain->nid = iommu->node;
1669                 domain_update_iommu_cap(domain);
1670         }
1671         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1672         return 0;
1673 }
1674
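/*
 * Set up context entries for the device itself and, if it sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path as well: devices behind
 * such a bridge share the bridge's source-id, so the bridge needs a context
 * entry too.
 */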
1675 static int
1676 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1677                         int translation)
1678 {
1679         int ret;
1680         struct pci_dev *tmp, *parent;
1681
1682         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1683                                          pdev->bus->number, pdev->devfn,
1684                                          translation);
1685         if (ret)
1686                 return ret;
1687
1688         /* dependent device mapping */
1689         tmp = pci_find_upstream_pcie_bridge(pdev);
1690         if (!tmp)
1691                 return 0;
1692         /* Secondary interface's bus number and devfn 0 */
1693         parent = pdev->bus->self;
1694         while (parent != tmp) {
1695                 ret = domain_context_mapping_one(domain,
1696                                                  pci_domain_nr(parent->bus),
1697                                                  parent->bus->number,
1698                                                  parent->devfn, translation);
1699                 if (ret)
1700                         return ret;
1701                 parent = parent->bus->self;
1702         }
1703         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1704                 return domain_context_mapping_one(domain,
1705                                         pci_domain_nr(tmp->subordinate),
1706                                         tmp->subordinate->number, 0,
1707                                         translation);
1708         else /* this is a legacy PCI bridge */
1709                 return domain_context_mapping_one(domain,
1710                                                   pci_domain_nr(tmp->bus),
1711                                                   tmp->bus->number,
1712                                                   tmp->devfn,
1713                                                   translation);
1714 }
1715
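/*
 * Check whether context entries are already present for the device and for
 * every bridge between it and its upstream PCIe-to-PCI bridge.  Returns 0 as
 * soon as one entry on the path is missing, non-zero otherwise (or -ENODEV
 * if no IOMMU covers the device).
 */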
1716 static int domain_context_mapped(struct pci_dev *pdev)
1717 {
1718         int ret;
1719         struct pci_dev *tmp, *parent;
1720         struct intel_iommu *iommu;
1721
1722         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1723                                 pdev->devfn);
1724         if (!iommu)
1725                 return -ENODEV;
1726
1727         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1728         if (!ret)
1729                 return ret;
1730         /* dependent device mapping */
1731         tmp = pci_find_upstream_pcie_bridge(pdev);
1732         if (!tmp)
1733                 return ret;
1734         /* Secondary interface's bus number and devfn 0 */
1735         parent = pdev->bus->self;
1736         while (parent != tmp) {
1737                 ret = device_context_mapped(iommu, parent->bus->number,
1738                                             parent->devfn);
1739                 if (!ret)
1740                         return ret;
1741                 parent = parent->bus->self;
1742         }
1743         if (pci_is_pcie(tmp))
1744                 return device_context_mapped(iommu, tmp->subordinate->number,
1745                                              0);
1746         else
1747                 return device_context_mapped(iommu, tmp->bus->number,
1748                                              tmp->devfn);
1749 }
1750
1751 /* Returns a number of VTD pages, but aligned to MM page size */
1752 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1753                                             size_t size)
1754 {
1755         host_addr &= ~PAGE_MASK;
1756         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1757 }
1758
1759 /* Return largest possible superpage level for a given mapping */
1760 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1761                                           unsigned long iov_pfn,
1762                                           unsigned long phy_pfn,
1763                                           unsigned long pages)
1764 {
1765         int support, level = 1;
1766         unsigned long pfnmerge;
1767
1768         support = domain->iommu_superpage;
1769
1770         /* To use a large page, the virtual *and* physical addresses
1771            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1772            of them will mean we have to use smaller pages. So just
1773            merge them and check both at once. */
1774         pfnmerge = iov_pfn | phy_pfn;
1775
1776         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1777                 pages >>= VTD_STRIDE_SHIFT;
1778                 if (!pages)
1779                         break;
1780                 pfnmerge >>= VTD_STRIDE_SHIFT;
1781                 level++;
1782                 support--;
1783         }
1784         return level;
1785 }
1786
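/*
 * Fill the IOVA range [iov_pfn, iov_pfn + nr_pages) with PTEs, taking the
 * physical pages either from the scatterlist (sg != NULL) or from the
 * contiguous range starting at phys_pfn.  Superpage PTEs are used where the
 * addresses and the remaining length allow it; the PTE cache is flushed each
 * time a page of PTEs has been filled or the mapping is complete.
 */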
1787 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1788                             struct scatterlist *sg, unsigned long phys_pfn,
1789                             unsigned long nr_pages, int prot)
1790 {
1791         struct dma_pte *first_pte = NULL, *pte = NULL;
1792         phys_addr_t uninitialized_var(pteval);
1793         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1794         unsigned long sg_res;
1795         unsigned int largepage_lvl = 0;
1796         unsigned long lvl_pages = 0;
1797
1798         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1799
1800         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1801                 return -EINVAL;
1802
1803         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1804
1805         if (sg)
1806                 sg_res = 0;
1807         else {
1808                 sg_res = nr_pages + 1;
1809                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1810         }
1811
1812         while (nr_pages > 0) {
1813                 uint64_t tmp;
1814
1815                 if (!sg_res) {
1816                         sg_res = aligned_nrpages(sg->offset, sg->length);
1817                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1818                         sg->dma_length = sg->length;
1819                         pteval = page_to_phys(sg_page(sg)) | prot;
1820                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1821                 }
1822
1823                 if (!pte) {
1824                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1825
1826                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1827                         if (!pte)
1828                                 return -ENOMEM;
1829                         /* It is a large page */
1830                         if (largepage_lvl > 1)
1831                                 pteval |= DMA_PTE_LARGE_PAGE;
1832                         else
1833                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1834
1835                 }
1836                 /* We don't need a lock here; nobody else
1837                  * touches the iova range
1838                  */
1839                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1840                 if (tmp) {
1841                         static int dumps = 5;
1842                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1843                                iov_pfn, tmp, (unsigned long long)pteval);
1844                         if (dumps) {
1845                                 dumps--;
1846                                 debug_dma_dump_mappings(NULL);
1847                         }
1848                         WARN_ON(1);
1849                 }
1850
1851                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1852
1853                 BUG_ON(nr_pages < lvl_pages);
1854                 BUG_ON(sg_res < lvl_pages);
1855
1856                 nr_pages -= lvl_pages;
1857                 iov_pfn += lvl_pages;
1858                 phys_pfn += lvl_pages;
1859                 pteval += lvl_pages * VTD_PAGE_SIZE;
1860                 sg_res -= lvl_pages;
1861
1862                 /* If the next PTE would be the first in a new page, then we
1863                    need to flush the cache on the entries we've just written.
1864                    And then we'll need to recalculate 'pte', so clear it and
1865                    let it get set again in the if (!pte) block above.
1866
1867                    If we're done (!nr_pages) we need to flush the cache too.
1868
1869                    Also if we've been setting superpages, we may need to
1870                    recalculate 'pte' and switch back to smaller pages for the
1871                    end of the mapping, if the trailing size is not enough to
1872                    use another superpage (i.e. sg_res < lvl_pages). */
1873                 pte++;
1874                 if (!nr_pages || first_pte_in_page(pte) ||
1875                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1876                         domain_flush_cache(domain, first_pte,
1877                                            (void *)pte - (void *)first_pte);
1878                         pte = NULL;
1879                 }
1880
1881                 if (!sg_res && nr_pages)
1882                         sg = sg_next(sg);
1883         }
1884         return 0;
1885 }
1886
1887 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1888                                     struct scatterlist *sg, unsigned long nr_pages,
1889                                     int prot)
1890 {
1891         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1892 }
1893
1894 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1895                                      unsigned long phys_pfn, unsigned long nr_pages,
1896                                      int prot)
1897 {
1898         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1899 }
1900
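/*
 * Clear the context entry for (bus, devfn) and invalidate the context cache
 * and IOTLB so the hardware stops using the old translation.
 */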
1901 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1902 {
1903         if (!iommu)
1904                 return;
1905
1906         clear_context_table(iommu, bus, devfn);
1907         iommu->flush.flush_context(iommu, 0, 0, 0,
1908                                            DMA_CCMD_GLOBAL_INVL);
1909         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1910 }
1911
1912 static inline void unlink_domain_info(struct device_domain_info *info)
1913 {
1914         assert_spin_locked(&device_domain_lock);
1915         list_del(&info->link);
1916         list_del(&info->global);
1917         if (info->dev)
1918                 info->dev->dev.archdata.iommu = NULL;
1919 }
1920
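/*
 * Tear down every device attached to the domain: unlink the per-device info,
 * disable the device IOTLB, clear the context entry and free the bookkeeping
 * structure.
 */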
1921 static void domain_remove_dev_info(struct dmar_domain *domain)
1922 {
1923         struct device_domain_info *info;
1924         unsigned long flags;
1925         struct intel_iommu *iommu;
1926
1927         spin_lock_irqsave(&device_domain_lock, flags);
1928         while (!list_empty(&domain->devices)) {
1929                 info = list_entry(domain->devices.next,
1930                         struct device_domain_info, link);
1931                 unlink_domain_info(info);
1932                 spin_unlock_irqrestore(&device_domain_lock, flags);
1933
1934                 iommu_disable_dev_iotlb(info);
1935                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1936                 iommu_detach_dev(iommu, info->bus, info->devfn);
1937                 free_devinfo_mem(info);
1938
1939                 spin_lock_irqsave(&device_domain_lock, flags);
1940         }
1941         spin_unlock_irqrestore(&device_domain_lock, flags);
1942 }
1943
1944 /*
1945  * find_domain
1946  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1947  */
1948 static struct dmar_domain *
1949 find_domain(struct pci_dev *pdev)
1950 {
1951         struct device_domain_info *info;
1952
1953         /* No lock here, assumes no domain exit in normal case */
1954         info = pdev->dev.archdata.iommu;
1955         if (info)
1956                 return info->domain;
1957         return NULL;
1958 }
1959
1960 /* Find or allocate the domain for a device; the returned domain is initialized */
1961 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1962 {
1963         struct dmar_domain *domain, *found = NULL;
1964         struct intel_iommu *iommu;
1965         struct dmar_drhd_unit *drhd;
1966         struct device_domain_info *info, *tmp;
1967         struct pci_dev *dev_tmp;
1968         unsigned long flags;
1969         int bus = 0, devfn = 0;
1970         int segment;
1971         int ret;
1972
1973         domain = find_domain(pdev);
1974         if (domain)
1975                 return domain;
1976
1977         segment = pci_domain_nr(pdev->bus);
1978
1979         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1980         if (dev_tmp) {
1981                 if (pci_is_pcie(dev_tmp)) {
1982                         bus = dev_tmp->subordinate->number;
1983                         devfn = 0;
1984                 } else {
1985                         bus = dev_tmp->bus->number;
1986                         devfn = dev_tmp->devfn;
1987                 }
1988                 spin_lock_irqsave(&device_domain_lock, flags);
1989                 list_for_each_entry(info, &device_domain_list, global) {
1990                         if (info->segment == segment &&
1991                             info->bus == bus && info->devfn == devfn) {
1992                                 found = info->domain;
1993                                 break;
1994                         }
1995                 }
1996                 spin_unlock_irqrestore(&device_domain_lock, flags);
1997                 /* pcie-pci bridge already has a domain, use it */
1998                 if (found) {
1999                         domain = found;
2000                         goto found_domain;
2001                 }
2002         }
2003
2004         domain = alloc_domain();
2005         if (!domain)
2006                 goto error;
2007
2008         /* Allocate new domain for the device */
2009         drhd = dmar_find_matched_drhd_unit(pdev);
2010         if (!drhd) {
2011                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2012                         pci_name(pdev));
2013                 free_domain_mem(domain);
2014                 return NULL;
2015         }
2016         iommu = drhd->iommu;
2017
2018         ret = iommu_attach_domain(domain, iommu);
2019         if (ret) {
2020                 free_domain_mem(domain);
2021                 goto error;
2022         }
2023
2024         if (domain_init(domain, gaw)) {
2025                 domain_exit(domain);
2026                 goto error;
2027         }
2028
2029         /* register pcie-to-pci device */
2030         if (dev_tmp) {
2031                 info = alloc_devinfo_mem();
2032                 if (!info) {
2033                         domain_exit(domain);
2034                         goto error;
2035                 }
2036                 info->segment = segment;
2037                 info->bus = bus;
2038                 info->devfn = devfn;
2039                 info->dev = NULL;
2040                 info->domain = domain;
2041                 /* This domain is shared by devices under p2p bridge */
2042                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2043
2044                 /* pcie-to-pci bridge already has a domain, use it */
2045                 found = NULL;
2046                 spin_lock_irqsave(&device_domain_lock, flags);
2047                 list_for_each_entry(tmp, &device_domain_list, global) {
2048                         if (tmp->segment == segment &&
2049                             tmp->bus == bus && tmp->devfn == devfn) {
2050                                 found = tmp->domain;
2051                                 break;
2052                         }
2053                 }
2054                 if (found) {
2055                         spin_unlock_irqrestore(&device_domain_lock, flags);
2056                         free_devinfo_mem(info);
2057                         domain_exit(domain);
2058                         domain = found;
2059                 } else {
2060                         list_add(&info->link, &domain->devices);
2061                         list_add(&info->global, &device_domain_list);
2062                         spin_unlock_irqrestore(&device_domain_lock, flags);
2063                 }
2064         }
2065
2066 found_domain:
2067         info = alloc_devinfo_mem();
2068         if (!info)
2069                 goto error;
2070         info->segment = segment;
2071         info->bus = pdev->bus->number;
2072         info->devfn = pdev->devfn;
2073         info->dev = pdev;
2074         info->domain = domain;
2075         spin_lock_irqsave(&device_domain_lock, flags);
2076         /* somebody else raced us and set it up already */
2077         found = find_domain(pdev);
2078         if (found != NULL) {
2079                 spin_unlock_irqrestore(&device_domain_lock, flags);
2080                 if (found != domain) {
2081                         domain_exit(domain);
2082                         domain = found;
2083                 }
2084                 free_devinfo_mem(info);
2085                 return domain;
2086         }
2087         list_add(&info->link, &domain->devices);
2088         list_add(&info->global, &device_domain_list);
2089         pdev->dev.archdata.iommu = info;
2090         spin_unlock_irqrestore(&device_domain_lock, flags);
2091         return domain;
2092 error:
2093         /* recheck it here, maybe others set it */
2094         return find_domain(pdev);
2095 }
2096
2097 static int iommu_identity_mapping;
2098 #define IDENTMAP_ALL            1
2099 #define IDENTMAP_GFX            2
2100 #define IDENTMAP_AZALIA         4
2101
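/*
 * Reserve the IOVA range [start, end] in the domain and install a 1:1
 * (virtual == physical) mapping for it, clearing any PTEs left over from an
 * earlier mapping of the same range.
 */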
2102 static int iommu_domain_identity_map(struct dmar_domain *domain,
2103                                      unsigned long long start,
2104                                      unsigned long long end)
2105 {
2106         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2107         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2108
2109         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2110                           dma_to_mm_pfn(last_vpfn))) {
2111                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2112                 return -ENOMEM;
2113         }
2114
2115         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2116                  start, end, domain->id);
2117         /*
2118          * The RMRR range might overlap with the physical memory range;
2119          * clear it first
2120          */
2121         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2122
2123         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2124                                   last_vpfn - first_vpfn + 1,
2125                                   DMA_PTE_READ|DMA_PTE_WRITE);
2126 }
2127
2128 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2129                                       unsigned long long start,
2130                                       unsigned long long end)
2131 {
2132         struct dmar_domain *domain;
2133         int ret;
2134
2135         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2136         if (!domain)
2137                 return -ENOMEM;
2138
2139         /* For _hardware_ passthrough, don't bother. But for software
2140            passthrough, we do it anyway -- it may indicate a memory
2141            range which is reserved in E820, and so didn't get set
2142            up to start with in si_domain */
2143         if (domain == si_domain && hw_pass_through) {
2144                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2145                        pci_name(pdev), start, end);
2146                 return 0;
2147         }
2148
2149         printk(KERN_INFO
2150                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2151                pci_name(pdev), start, end);
2152         
2153         if (end < start) {
2154                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2155                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2156                         dmi_get_system_info(DMI_BIOS_VENDOR),
2157                         dmi_get_system_info(DMI_BIOS_VERSION),
2158                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2159                 ret = -EIO;
2160                 goto error;
2161         }
2162
2163         if (end >> agaw_to_width(domain->agaw)) {
2164                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2165                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2166                      agaw_to_width(domain->agaw),
2167                      dmi_get_system_info(DMI_BIOS_VENDOR),
2168                      dmi_get_system_info(DMI_BIOS_VERSION),
2169                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2170                 ret = -EIO;
2171                 goto error;
2172         }
2173
2174         ret = iommu_domain_identity_map(domain, start, end);
2175         if (ret)
2176                 goto error;
2177
2178         /* context entry init */
2179         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2180         if (ret)
2181                 goto error;
2182
2183         return 0;
2184
2185  error:
2186         domain_exit(domain);
2187         return ret;
2188 }
2189
2190 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2191         struct pci_dev *pdev)
2192 {
2193         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2194                 return 0;
2195         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2196                 rmrr->end_address);
2197 }
2198
2199 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2200 static inline void iommu_prepare_isa(void)
2201 {
2202         struct pci_dev *pdev;
2203         int ret;
2204
2205         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2206         if (!pdev)
2207                 return;
2208
2209         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2210         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2211
2212         if (ret)
2213                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2214                        "floppy might not work\n");
2215
2216 }
2217 #else
2218 static inline void iommu_prepare_isa(void)
2219 {
2220         return;
2221 }
2222 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2223
2224 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2225
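/*
 * Create the static identity (si) domain, attach it to every active IOMMU
 * and, unless hardware pass-through is in use, install 1:1 mappings for all
 * usable memory ranges of every online node.
 */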
2226 static int __init si_domain_init(int hw)
2227 {
2228         struct dmar_drhd_unit *drhd;
2229         struct intel_iommu *iommu;
2230         int nid, ret = 0;
2231
2232         si_domain = alloc_domain();
2233         if (!si_domain)
2234                 return -EFAULT;
2235
2236         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2237
2238         for_each_active_iommu(iommu, drhd) {
2239                 ret = iommu_attach_domain(si_domain, iommu);
2240                 if (ret) {
2241                         domain_exit(si_domain);
2242                         return -EFAULT;
2243                 }
2244         }
2245
2246         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2247                 domain_exit(si_domain);
2248                 return -EFAULT;
2249         }
2250
2251         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2252
2253         if (hw)
2254                 return 0;
2255
2256         for_each_online_node(nid) {
2257                 unsigned long start_pfn, end_pfn;
2258                 int i;
2259
2260                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2261                         ret = iommu_domain_identity_map(si_domain,
2262                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2263                         if (ret)
2264                                 return ret;
2265                 }
2266         }
2267
2268         return 0;
2269 }
2270
2271 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2272                                           struct pci_dev *pdev);
2273 static int identity_mapping(struct pci_dev *pdev)
2274 {
2275         struct device_domain_info *info;
2276
2277         if (likely(!iommu_identity_mapping))
2278                 return 0;
2279
2280         info = pdev->dev.archdata.iommu;
2281         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2282                 return (info->domain == si_domain);
2283
2284         return 0;
2285 }
2286
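/*
 * Attach the device to the domain: allocate and link the device_domain_info
 * and program the context entries; on failure the info is unlinked and freed
 * again.
 */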
2287 static int domain_add_dev_info(struct dmar_domain *domain,
2288                                struct pci_dev *pdev,
2289                                int translation)
2290 {
2291         struct device_domain_info *info;
2292         unsigned long flags;
2293         int ret;
2294
2295         info = alloc_devinfo_mem();
2296         if (!info)
2297                 return -ENOMEM;
2298
2299         info->segment = pci_domain_nr(pdev->bus);
2300         info->bus = pdev->bus->number;
2301         info->devfn = pdev->devfn;
2302         info->dev = pdev;
2303         info->domain = domain;
2304
2305         spin_lock_irqsave(&device_domain_lock, flags);
2306         list_add(&info->link, &domain->devices);
2307         list_add(&info->global, &device_domain_list);
2308         pdev->dev.archdata.iommu = info;
2309         spin_unlock_irqrestore(&device_domain_lock, flags);
2310
2311         ret = domain_context_mapping(domain, pdev, translation);
2312         if (ret) {
2313                 spin_lock_irqsave(&device_domain_lock, flags);
2314                 unlink_domain_info(info);
2315                 spin_unlock_irqrestore(&device_domain_lock, flags);
2316                 free_devinfo_mem(info);
2317                 return ret;
2318         }
2319
2320         return 0;
2321 }
2322
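/*
 * Decide whether the device is a candidate for the 1:1 (identity) domain:
 * azalia/gfx overrides apply, devices behind conventional PCI bridges are
 * excluded, and outside of startup the device's DMA mask must also cover all
 * of memory.
 */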
2323 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2324 {
2325         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2326                 return 1;
2327
2328         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2329                 return 1;
2330
2331         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2332                 return 0;
2333
2334         /*
2335          * We want to start off with all devices in the 1:1 domain, and
2336          * take them out later if we find they can't access all of memory.
2337          *
2338          * However, we can't do this for PCI devices behind bridges,
2339          * because all PCI devices behind the same bridge will end up
2340          * with the same source-id on their transactions.
2341          *
2342          * Practically speaking, we can't change things around for these
2343          * devices at run-time, because we can't be sure there'll be no
2344          * DMA transactions in flight for any of their siblings.
2345          * 
2346          * So PCI devices (unless they're on the root bus) as well as
2347          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2348          * the 1:1 domain, just in _case_ one of their siblings turns out
2349          * not to be able to map all of memory.
2350          */
2351         if (!pci_is_pcie(pdev)) {
2352                 if (!pci_is_root_bus(pdev->bus))
2353                         return 0;
2354                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2355                         return 0;
2356         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2357                 return 0;
2358
2359         /* 
2360          * At boot time, we don't yet know if devices will be 64-bit capable.
2361          * Assume that they will -- if they turn out not to be, then we can 
2362          * take them out of the 1:1 domain later.
2363          */
2364         if (!startup) {
2365                 /*
2366                  * If the device's dma_mask is less than the system's memory
2367                  * size then this is not a candidate for identity mapping.
2368                  */
2369                 u64 dma_mask = pdev->dma_mask;
2370
2371                 if (pdev->dev.coherent_dma_mask &&
2372                     pdev->dev.coherent_dma_mask < dma_mask)
2373                         dma_mask = pdev->dev.coherent_dma_mask;
2374
2375                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2376         }
2377
2378         return 1;
2379 }
2380
2381 static int __init iommu_prepare_static_identity_mapping(int hw)
2382 {
2383         struct pci_dev *pdev = NULL;
2384         int ret;
2385
2386         ret = si_domain_init(hw);
2387         if (ret)
2388                 return -EFAULT;
2389
2390         for_each_pci_dev(pdev) {
2391                 if (iommu_should_identity_map(pdev, 1)) {
2392                         ret = domain_add_dev_info(si_domain, pdev,
2393                                              hw ? CONTEXT_TT_PASS_THROUGH :
2394                                                   CONTEXT_TT_MULTI_LEVEL);
2395                         if (ret) {
2396                                 /* device not associated with an iommu */
2397                                 if (ret == -ENODEV)
2398                                         continue;
2399                                 return ret;
2400                         }
2401                         pr_info("IOMMU: %s identity mapping for device %s\n",
2402                                 hw ? "hardware" : "software", pci_name(pdev));
2403                 }
2404         }
2405
2406         return 0;
2407 }
2408
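/*
 * One-time initialisation of all DMAR units: allocate root and context
 * tables, pick register-based or queued invalidation, set up the static
 * identity, RMRR and ISA mappings, and finally enable translation.
 */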
2409 static int __init init_dmars(void)
2410 {
2411         struct dmar_drhd_unit *drhd;
2412         struct dmar_rmrr_unit *rmrr;
2413         struct pci_dev *pdev;
2414         struct intel_iommu *iommu;
2415         int i, ret;
2416
2417         /*
2418          * for each drhd
2419          *    allocate root
2420          *    initialize and program root entry to not present
2421          * endfor
2422          */
2423         for_each_drhd_unit(drhd) {
2424                 /*
2425                  * lock not needed as this is only incremented in the
2426                  * single-threaded kernel __init code path; all other
2427                  * accesses are read-only
2428                  */
2429                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2430                         g_num_of_iommus++;
2431                         continue;
2432                 }
2433                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2434                           IOMMU_UNITS_SUPPORTED);
2435         }
2436
2437         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2438                         GFP_KERNEL);
2439         if (!g_iommus) {
2440                 printk(KERN_ERR "Allocating global iommu array failed\n");
2441                 ret = -ENOMEM;
2442                 goto error;
2443         }
2444
2445         deferred_flush = kzalloc(g_num_of_iommus *
2446                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2447         if (!deferred_flush) {
2448                 ret = -ENOMEM;
2449                 goto error;
2450         }
2451
2452         for_each_drhd_unit(drhd) {
2453                 if (drhd->ignored)
2454                         continue;
2455
2456                 iommu = drhd->iommu;
2457                 g_iommus[iommu->seq_id] = iommu;
2458
2459                 ret = iommu_init_domains(iommu);
2460                 if (ret)
2461                         goto error;
2462
2463                 /*
2464                  * TBD:
2465                  * we could share the same root & context tables
2466                  * among all IOMMU's. Need to Split it later.
2467                  */
2468                 ret = iommu_alloc_root_entry(iommu);
2469                 if (ret) {
2470                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2471                         goto error;
2472                 }
2473                 if (!ecap_pass_through(iommu->ecap))
2474                         hw_pass_through = 0;
2475         }
2476
2477         /*
2478          * Start from a sane iommu hardware state.
2479          */
2480         for_each_drhd_unit(drhd) {
2481                 if (drhd->ignored)
2482                         continue;
2483
2484                 iommu = drhd->iommu;
2485
2486                 /*
2487                  * If the queued invalidation is already initialized by us
2488                  * (for example, while enabling interrupt-remapping) then
2489                  * we got the things already rolling from a sane state.
2490                  */
2491                 if (iommu->qi)
2492                         continue;
2493
2494                 /*
2495                  * Clear any previous faults.
2496                  */
2497                 dmar_fault(-1, iommu);
2498                 /*
2499                  * Disable queued invalidation if supported and already enabled
2500                  * before OS handover.
2501                  */
2502                 dmar_disable_qi(iommu);
2503         }
2504
2505         for_each_drhd_unit(drhd) {
2506                 if (drhd->ignored)
2507                         continue;
2508
2509                 iommu = drhd->iommu;
2510
2511                 if (dmar_enable_qi(iommu)) {
2512                         /*
2513                          * Queued Invalidate not enabled, use Register Based
2514                          * Invalidate
2515                          */
2516                         iommu->flush.flush_context = __iommu_flush_context;
2517                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2518                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2519                                "invalidation\n",
2520                                 iommu->seq_id,
2521                                (unsigned long long)drhd->reg_base_addr);
2522                 } else {
2523                         iommu->flush.flush_context = qi_flush_context;
2524                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2525                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2526                                "invalidation\n",
2527                                 iommu->seq_id,
2528                                (unsigned long long)drhd->reg_base_addr);
2529                 }
2530         }
2531
2532         if (iommu_pass_through)
2533                 iommu_identity_mapping |= IDENTMAP_ALL;
2534
2535 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2536         iommu_identity_mapping |= IDENTMAP_GFX;
2537 #endif
2538
2539         check_tylersburg_isoch();
2540
2541         /*
2542          * If any identity mapping was requested (pass-through, gfx, azalia
2543          * or "all"), set up the static identity domain and its context
2544          * entries before the RMRR, GFX and ISA mappings below.
2545          */
2546         if (iommu_identity_mapping) {
2547                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2548                 if (ret) {
2549                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2550                         goto error;
2551                 }
2552         }
2553         /*
2554          * For each rmrr
2555          *   for each dev attached to rmrr
2556          *   do
2557          *     locate drhd for dev, alloc domain for dev
2558          *     allocate free domain
2559          *     allocate page table entries for rmrr
2560          *     if context not allocated for bus
2561          *           allocate and init context
2562          *           set present in root table for this bus
2563          *     init context with domain, translation etc
2564          *    endfor
2565          * endfor
2566          */
2567         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2568         for_each_rmrr_units(rmrr) {
2569                 for (i = 0; i < rmrr->devices_cnt; i++) {
2570                         pdev = rmrr->devices[i];
2571                         /*
2572                          * some BIOSes list non-existent devices in the
2573                          * DMAR table.
2574                          */
2575                         if (!pdev)
2576                                 continue;
2577                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2578                         if (ret)
2579                                 printk(KERN_ERR
2580                                        "IOMMU: mapping reserved region failed\n");
2581                 }
2582         }
2583
2584         iommu_prepare_isa();
2585
2586         /*
2587          * for each drhd
2588          *   enable fault log
2589          *   global invalidate context cache
2590          *   global invalidate iotlb
2591          *   enable translation
2592          */
2593         for_each_drhd_unit(drhd) {
2594                 if (drhd->ignored) {
2595                         /*
2596                          * we always have to disable PMRs or DMA may fail on
2597                          * this device
2598                          */
2599                         if (force_on)
2600                                 iommu_disable_protect_mem_regions(drhd->iommu);
2601                         continue;
2602                 }
2603                 iommu = drhd->iommu;
2604
2605                 iommu_flush_write_buffer(iommu);
2606
2607                 ret = dmar_set_interrupt(iommu);
2608                 if (ret)
2609                         goto error;
2610
2611                 iommu_set_root_entry(iommu);
2612
2613                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2614                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2615
2616                 ret = iommu_enable_translation(iommu);
2617                 if (ret)
2618                         goto error;
2619
2620                 iommu_disable_protect_mem_regions(iommu);
2621         }
2622
2623         return 0;
2624 error:
2625         for_each_drhd_unit(drhd) {
2626                 if (drhd->ignored)
2627                         continue;
2628                 iommu = drhd->iommu;
2629                 free_iommu(iommu);
2630         }
2631         kfree(g_iommus);
2632         return ret;
2633 }
2634
2635 /* This takes a number of _MM_ pages, not VTD pages */
2636 static struct iova *intel_alloc_iova(struct device *dev,
2637                                      struct dmar_domain *domain,
2638                                      unsigned long nrpages, uint64_t dma_mask)
2639 {
2640         struct pci_dev *pdev = to_pci_dev(dev);
2641         struct iova *iova = NULL;
2642
2643         /* Restrict dma_mask to the width that the iommu can handle */
2644         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2645
2646         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2647                 /*
2648                  * First try to allocate an io virtual address in
2649                  * DMA_BIT_MASK(32) and if that fails then try allocating
2650                  * from higher range
2651                  */
2652                 iova = alloc_iova(&domain->iovad, nrpages,
2653                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2654                 if (iova)
2655                         return iova;
2656         }
2657         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2658         if (unlikely(!iova)) {
2659                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2660                        nrpages, pci_name(pdev));
2661                 return NULL;
2662         }
2663
2664         return iova;
2665 }
2666
2667 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2668 {
2669         struct dmar_domain *domain;
2670         int ret;
2671
2672         domain = get_domain_for_dev(pdev,
2673                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2674         if (!domain) {
2675                 printk(KERN_ERR
2676                         "Allocating domain for %s failed\n", pci_name(pdev));
2677                 return NULL;
2678         }
2679
2680         /* make sure context mapping is ok */
2681         if (unlikely(!domain_context_mapped(pdev))) {
2682                 ret = domain_context_mapping(domain, pdev,
2683                                              CONTEXT_TT_MULTI_LEVEL);
2684                 if (ret) {
2685                         printk(KERN_ERR
2686                                 "Domain context map for %s failed\n",
2687                                 pci_name(pdev));
2688                         return NULL;
2689                 }
2690         }
2691
2692         return domain;
2693 }
2694
2695 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2696 {
2697         struct device_domain_info *info;
2698
2699         /* No lock here, assumes no domain exit in normal case */
2700         info = dev->dev.archdata.iommu;
2701         if (likely(info))
2702                 return info->domain;
2703
2704         return __get_valid_domain_for_dev(dev);
2705 }
2706
2707 static int iommu_dummy(struct pci_dev *pdev)
2708 {
2709         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2710 }
2711
2712 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2713 static int iommu_no_mapping(struct device *dev)
2714 {
2715         struct pci_dev *pdev;
2716         int found;
2717
2718         if (unlikely(dev->bus != &pci_bus_type))
2719                 return 1;
2720
2721         pdev = to_pci_dev(dev);
2722         if (iommu_dummy(pdev))
2723                 return 1;
2724
2725         if (!iommu_identity_mapping)
2726                 return 0;
2727
2728         found = identity_mapping(pdev);
2729         if (found) {
2730                 if (iommu_should_identity_map(pdev, 0))
2731                         return 1;
2732                 else {
2733                         /*
2734                          * Device is only 32-bit capable: remove it from si_domain
2735                          * and fall back to non-identity mapping.
2736                          */
2737                         domain_remove_one_dev_info(si_domain, pdev);
2738                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2739                                pci_name(pdev));
2740                         return 0;
2741                 }
2742         } else {
2743                 /*
2744                  * A 64-bit capable device that was detached from a VM domain
2745                  * is put back into si_domain for identity mapping.
2746                  */
2747                 if (iommu_should_identity_map(pdev, 0)) {
2748                         int ret;
2749                         ret = domain_add_dev_info(si_domain, pdev,
2750                                                   hw_pass_through ?
2751                                                   CONTEXT_TT_PASS_THROUGH :
2752                                                   CONTEXT_TT_MULTI_LEVEL);
2753                         if (!ret) {
2754                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2755                                        pci_name(pdev));
2756                                 return 1;
2757                         }
2758                 }
2759         }
2760
2761         return 0;
2762 }
2763
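/*
 * Core of the DMA map path: allocate an IOVA below dma_mask for the physical
 * range [paddr, paddr + size), map it with the appropriate read/write
 * permissions and return the resulting bus address (or the physical address
 * unchanged for devices that bypass the IOMMU).  Returns 0 on failure.
 */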
2764 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2765                                      size_t size, int dir, u64 dma_mask)
2766 {
2767         struct pci_dev *pdev = to_pci_dev(hwdev);
2768         struct dmar_domain *domain;
2769         phys_addr_t start_paddr;
2770         struct iova *iova;
2771         int prot = 0;
2772         int ret;
2773         struct intel_iommu *iommu;
2774         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2775
2776         BUG_ON(dir == DMA_NONE);
2777
2778         if (iommu_no_mapping(hwdev))
2779                 return paddr;
2780
2781         domain = get_valid_domain_for_dev(pdev);
2782         if (!domain)
2783                 return 0;
2784
2785         iommu = domain_get_iommu(domain);
2786         size = aligned_nrpages(paddr, size);
2787
2788         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2789         if (!iova)
2790                 goto error;
2791
2792         /*
2793          * Check if DMAR supports zero-length reads on write only
2794          * mappings.
2795          */
2796         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2797                         !cap_zlr(iommu->cap))
2798                 prot |= DMA_PTE_READ;
2799         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2800                 prot |= DMA_PTE_WRITE;
2801         /*
2802          * paddr .. paddr + size might cover only part of a page, but we must
2803          * map whole pages.  Note: if two parts of one page are mapped
2804          * separately, we might have two guest addresses mapping to the same
2805          * host paddr, but this is not a big problem
2806          */
2807         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2808                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2809         if (ret)
2810                 goto error;
2811
2812         /* it's a non-present to present mapping. Only flush if caching mode */
2813         if (cap_caching_mode(iommu->cap))
2814                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2815         else
2816                 iommu_flush_write_buffer(iommu);
2817
2818         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2819         start_paddr += paddr & ~PAGE_MASK;
2820         return start_paddr;
2821
2822 error:
2823         if (iova)
2824                 __free_iova(&domain->iovad, iova);
2825         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2826                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2827         return 0;
2828 }
2829
2830 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2831                                  unsigned long offset, size_t size,
2832                                  enum dma_data_direction dir,
2833                                  struct dma_attrs *attrs)
2834 {
2835         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2836                                   dir, to_pci_dev(dev)->dma_mask);
2837 }
2838
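/*
 * Drain the deferred-unmap queue: flush the IOTLB of every IOMMU that has
 * pending entries and release the corresponding IOVAs.  Called with
 * async_umap_flush_lock held.
 */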
2839 static void flush_unmaps(void)
2840 {
2841         int i, j;
2842
2843         timer_on = 0;
2844
2845         /* just flush them all */
2846         for (i = 0; i < g_num_of_iommus; i++) {
2847                 struct intel_iommu *iommu = g_iommus[i];
2848                 if (!iommu)
2849                         continue;
2850
2851                 if (!deferred_flush[i].next)
2852                         continue;
2853
2854                 /* In caching mode, global flushes make emulation expensive */
2855                 if (!cap_caching_mode(iommu->cap))
2856                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2857                                          DMA_TLB_GLOBAL_FLUSH);
2858                 for (j = 0; j < deferred_flush[i].next; j++) {
2859                         unsigned long mask;
2860                         struct iova *iova = deferred_flush[i].iova[j];
2861                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2862
2863                         /* On real hardware multiple invalidations are expensive */
2864                         if (cap_caching_mode(iommu->cap))
2865                                 iommu_flush_iotlb_psi(iommu, domain->id,
2866                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2867                         else {
2868                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2869                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2870                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2871                         }
2872                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2873                 }
2874                 deferred_flush[i].next = 0;
2875         }
2876
2877         list_size = 0;
2878 }
2879
2880 static void flush_unmaps_timeout(unsigned long data)
2881 {
2882         unsigned long flags;
2883
2884         spin_lock_irqsave(&async_umap_flush_lock, flags);
2885         flush_unmaps();
2886         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2887 }
2888
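/*
 * Queue an IOVA for deferred freeing instead of flushing the IOTLB
 * immediately; the queue is drained by the unmap timer or as soon as it
 * reaches HIGH_WATER_MARK entries.
 */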
2889 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2890 {
2891         unsigned long flags;
2892         int next, iommu_id;
2893         struct intel_iommu *iommu;
2894
2895         spin_lock_irqsave(&async_umap_flush_lock, flags);
2896         if (list_size == HIGH_WATER_MARK)
2897                 flush_unmaps();
2898
2899         iommu = domain_get_iommu(dom);
2900         iommu_id = iommu->seq_id;
2901
2902         next = deferred_flush[iommu_id].next;
2903         deferred_flush[iommu_id].domain[next] = dom;
2904         deferred_flush[iommu_id].iova[next] = iova;
2905         deferred_flush[iommu_id].next++;
2906
2907         if (!timer_on) {
2908                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2909                 timer_on = 1;
2910         }
2911         list_size++;
2912         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2913 }
2914
2915 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2916                              size_t size, enum dma_data_direction dir,
2917                              struct dma_attrs *attrs)
2918 {
2919         struct pci_dev *pdev = to_pci_dev(dev);
2920         struct dmar_domain *domain;
2921         unsigned long start_pfn, last_pfn;
2922         struct iova *iova;
2923         struct intel_iommu *iommu;
2924
2925         if (iommu_no_mapping(dev))
2926                 return;
2927
2928         domain = find_domain(pdev);
2929         BUG_ON(!domain);
2930
2931         iommu = domain_get_iommu(domain);
2932
2933         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2934         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2935                       (unsigned long long)dev_addr))
2936                 return;
2937
2938         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2939         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2940
2941         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2942                  pci_name(pdev), start_pfn, last_pfn);
2943
2944         /*  clear the whole page */
2945         dma_pte_clear_range(domain, start_pfn, last_pfn);
2946
2947         /* free page tables */
2948         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2949
2950         if (intel_iommu_strict) {
2951                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2952                                       last_pfn - start_pfn + 1, 0);
2953                 /* free iova */
2954                 __free_iova(&domain->iovad, iova);
2955         } else {
2956                 add_unmap(domain, iova);
2957                 /*
2958                  * queue up the release of the unmap to save roughly 1/6th of
2959                  * the cpu otherwise used up by the iotlb flush operation...
2960                  */
2961         }
2962 }
2963
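/*
 * dma_alloc_coherent() backend: allocate zeroed pages (honouring the
 * coherent DMA mask via GFP_DMA/GFP_DMA32 only for devices that bypass the
 * IOMMU) and map them bidirectionally through __intel_map_single().
 */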
2964 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2965                                   dma_addr_t *dma_handle, gfp_t flags,
2966                                   struct dma_attrs *attrs)
2967 {
2968         void *vaddr;
2969         int order;
2970
2971         size = PAGE_ALIGN(size);
2972         order = get_order(size);
2973
2974         if (!iommu_no_mapping(hwdev))
2975                 flags &= ~(GFP_DMA | GFP_DMA32);
2976         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2977                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2978                         flags |= GFP_DMA;
2979                 else
2980                         flags |= GFP_DMA32;
2981         }
2982
2983         vaddr = (void *)__get_free_pages(flags, order);
2984         if (!vaddr)
2985                 return NULL;
2986         memset(vaddr, 0, size);
2987
2988         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2989                                          DMA_BIDIRECTIONAL,
2990                                          hwdev->coherent_dma_mask);
2991         if (*dma_handle)
2992                 return vaddr;
2993         free_pages((unsigned long)vaddr, order);
2994         return NULL;
2995 }
2996
2997 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2998                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
2999 {
3000         int order;
3001
3002         size = PAGE_ALIGN(size);
3003         order = get_order(size);
3004
3005         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3006         free_pages((unsigned long)vaddr, order);
3007 }
3008
3009 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3010                            int nelems, enum dma_data_direction dir,
3011                            struct dma_attrs *attrs)
3012 {
3013         struct pci_dev *pdev = to_pci_dev(hwdev);
3014         struct dmar_domain *domain;
3015         unsigned long start_pfn, last_pfn;
3016         struct iova *iova;
3017         struct intel_iommu *iommu;
3018
3019         if (iommu_no_mapping(hwdev))
3020                 return;
3021
3022         domain = find_domain(pdev);
3023         BUG_ON(!domain);
3024
3025         iommu = domain_get_iommu(domain);
3026
3027         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3028         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3029                       (unsigned long long)sglist[0].dma_address))
3030                 return;
3031
3032         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3033         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3034
3035         /*  clear the whole page */
3036         dma_pte_clear_range(domain, start_pfn, last_pfn);
3037
3038         /* free page tables */
3039         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3040
3041         if (intel_iommu_strict) {
3042                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3043                                       last_pfn - start_pfn + 1, 0);
3044                 /* free iova */
3045                 __free_iova(&domain->iovad, iova);
3046         } else {
3047                 add_unmap(domain, iova);
3048                 /*
3049                  * queue up the release of the unmap to save the ~1/6th of the
3050                  * CPU time used up by the iotlb flush operation...
3051                  */
3052         }
3053 }
3054
3055 static int intel_nontranslate_map_sg(struct device *hwdev,
3056         struct scatterlist *sglist, int nelems, int dir)
3057 {
3058         int i;
3059         struct scatterlist *sg;
3060
3061         for_each_sg(sglist, sg, nelems, i) {
3062                 BUG_ON(!sg_page(sg));
3063                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3064                 sg->dma_length = sg->length;
3065         }
3066         return nelems;
3067 }
3068
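/*
 * dma_map_ops map_sg callback: allocate a single IOVA range big enough
 * for the whole scatterlist, map every segment into it with
 * domain_sg_mapping(), then flush the IOTLB (caching mode) or the write
 * buffer as required.  Returns the number of mapped elements, 0 on error.
 */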
3069 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3070                         enum dma_data_direction dir, struct dma_attrs *attrs)
3071 {
3072         int i;
3073         struct pci_dev *pdev = to_pci_dev(hwdev);
3074         struct dmar_domain *domain;
3075         size_t size = 0;
3076         int prot = 0;
3077         struct iova *iova = NULL;
3078         int ret;
3079         struct scatterlist *sg;
3080         unsigned long start_vpfn;
3081         struct intel_iommu *iommu;
3082
3083         BUG_ON(dir == DMA_NONE);
3084         if (iommu_no_mapping(hwdev))
3085                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3086
3087         domain = get_valid_domain_for_dev(pdev);
3088         if (!domain)
3089                 return 0;
3090
3091         iommu = domain_get_iommu(domain);
3092
3093         for_each_sg(sglist, sg, nelems, i)
3094                 size += aligned_nrpages(sg->offset, sg->length);
3095
3096         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3097                                 pdev->dma_mask);
3098         if (!iova) {
3099                 sglist->dma_length = 0;
3100                 return 0;
3101         }
3102
3103         /*
3104          * Check if DMAR supports zero-length reads on write-only
3105          * mappings.
3106          */
3107         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3108                         !cap_zlr(iommu->cap))
3109                 prot |= DMA_PTE_READ;
3110         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3111                 prot |= DMA_PTE_WRITE;
3112
3113         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3114
3115         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3116         if (unlikely(ret)) {
3117                 /*  clear the page */
3118                 dma_pte_clear_range(domain, start_vpfn,
3119                                     start_vpfn + size - 1);
3120                 /* free page tables */
3121                 dma_pte_free_pagetable(domain, start_vpfn,
3122                                        start_vpfn + size - 1);
3123                 /* free iova */
3124                 __free_iova(&domain->iovad, iova);
3125                 return 0;
3126         }
3127
3128         /* it's a non-present to present mapping. Only flush if caching mode */
3129         if (cap_caching_mode(iommu->cap))
3130                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3131         else
3132                 iommu_flush_write_buffer(iommu);
3133
3134         return nelems;
3135 }
3136
3137 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3138 {
3139         return !dma_addr;
3140 }
3141
3142 struct dma_map_ops intel_dma_ops = {
3143         .alloc = intel_alloc_coherent,
3144         .free = intel_free_coherent,
3145         .map_sg = intel_map_sg,
3146         .unmap_sg = intel_unmap_sg,
3147         .map_page = intel_map_page,
3148         .unmap_page = intel_unmap_page,
3149         .mapping_error = intel_mapping_error,
3150 };
3151
3152 static inline int iommu_domain_cache_init(void)
3153 {
3154         int ret = 0;
3155
3156         iommu_domain_cache = kmem_cache_create("iommu_domain",
3157                                          sizeof(struct dmar_domain),
3158                                          0,
3159                                          SLAB_HWCACHE_ALIGN,
3161                                          NULL);
3162         if (!iommu_domain_cache) {
3163                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3164                 ret = -ENOMEM;
3165         }
3166
3167         return ret;
3168 }
3169
3170 static inline int iommu_devinfo_cache_init(void)
3171 {
3172         int ret = 0;
3173
3174         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3175                                          sizeof(struct device_domain_info),
3176                                          0,
3177                                          SLAB_HWCACHE_ALIGN,
3178                                          NULL);
3179         if (!iommu_devinfo_cache) {
3180                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3181                 ret = -ENOMEM;
3182         }
3183
3184         return ret;
3185 }
3186
3187 static inline int iommu_iova_cache_init(void)
3188 {
3189         int ret = 0;
3190
3191         iommu_iova_cache = kmem_cache_create("iommu_iova",
3192                                          sizeof(struct iova),
3193                                          0,
3194                                          SLAB_HWCACHE_ALIGN,
3195                                          NULL);
3196         if (!iommu_iova_cache) {
3197                 printk(KERN_ERR "Couldn't create iova cache\n");
3198                 ret = -ENOMEM;
3199         }
3200
3201         return ret;
3202 }
3203
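/*
 * Create the slab caches (iova, dmar_domain, device_domain_info) used
 * throughout the driver, tearing down whatever was already created if
 * any allocation fails.
 */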
3204 static int __init iommu_init_mempool(void)
3205 {
3206         int ret;
3207         ret = iommu_iova_cache_init();
3208         if (ret)
3209                 return ret;
3210
3211         ret = iommu_domain_cache_init();
3212         if (ret)
3213                 goto domain_error;
3214
3215         ret = iommu_devinfo_cache_init();
3216         if (!ret)
3217                 return ret;
3218
3219         kmem_cache_destroy(iommu_domain_cache);
3220 domain_error:
3221         kmem_cache_destroy(iommu_iova_cache);
3222
3223         return -ENOMEM;
3224 }
3225
3226 static void __init iommu_exit_mempool(void)
3227 {
3228         kmem_cache_destroy(iommu_devinfo_cache);
3229         kmem_cache_destroy(iommu_domain_cache);
3230         kmem_cache_destroy(iommu_iova_cache);
3232 }
3233
3234 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3235 {
3236         struct dmar_drhd_unit *drhd;
3237         u32 vtbar;
3238         int rc;
3239
3240         /* We know that this device on this chipset has its own IOMMU.
3241          * If we find it under a different IOMMU, then the BIOS is lying
3242          * to us. Hope that the IOMMU for this device is actually
3243          * disabled, and it needs no translation...
3244          */
3245         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3246         if (rc) {
3247                 /* "can't" happen */
3248                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3249                 return;
3250         }
3251         vtbar &= 0xffff0000;
3252
3253         /* we know that this iommu should be at offset 0xa000 from vtbar */
3254         drhd = dmar_find_matched_drhd_unit(pdev);
3255         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3256                             TAINT_FIRMWARE_WORKAROUND,
3257                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3258                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3259 }
3260 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3261
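/*
 * Mark DMAR units that cover no PCI devices as ignored.  Units that
 * cover only graphics devices are either recorded as translating gfx
 * (intel_iommu_gfx_mapped) or, when gfx mapping is disabled, bypassed
 * with their devices assigned the dummy identity domain.
 */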
3262 static void __init init_no_remapping_devices(void)
3263 {
3264         struct dmar_drhd_unit *drhd;
3265
3266         for_each_drhd_unit(drhd) {
3267                 if (!drhd->include_all) {
3268                         int i;
3269                         for (i = 0; i < drhd->devices_cnt; i++)
3270                                 if (drhd->devices[i] != NULL)
3271                                         break;
3272                         /* ignore DMAR unit if no pci devices exist */
3273                         if (i == drhd->devices_cnt)
3274                                 drhd->ignored = 1;
3275                 }
3276         }
3277
3278         for_each_drhd_unit(drhd) {
3279                 int i;
3280                 if (drhd->ignored || drhd->include_all)
3281                         continue;
3282
3283                 for (i = 0; i < drhd->devices_cnt; i++)
3284                         if (drhd->devices[i] &&
3285                             !IS_GFX_DEVICE(drhd->devices[i]))
3286                                 break;
3287
3288                 if (i < drhd->devices_cnt)
3289                         continue;
3290
3291                 /* This IOMMU has *only* gfx devices. Either bypass it or
3292                    set the gfx_mapped flag, as appropriate */
3293                 if (dmar_map_gfx) {
3294                         intel_iommu_gfx_mapped = 1;
3295                 } else {
3296                         drhd->ignored = 1;
3297                         for (i = 0; i < drhd->devices_cnt; i++) {
3298                                 if (!drhd->devices[i])
3299                                         continue;
3300                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3301                         }
3302                 }
3303         }
3304 }
3305
3306 #ifdef CONFIG_SUSPEND
3307 static int init_iommu_hw(void)
3308 {
3309         struct dmar_drhd_unit *drhd;
3310         struct intel_iommu *iommu = NULL;
3311
3312         for_each_active_iommu(iommu, drhd)
3313                 if (iommu->qi)
3314                         dmar_reenable_qi(iommu);
3315
3316         for_each_iommu(iommu, drhd) {
3317                 if (drhd->ignored) {
3318                         /*
3319                          * we always have to disable PMRs or DMA may fail on
3320                          * this device
3321                          */
3322                         if (force_on)
3323                                 iommu_disable_protect_mem_regions(iommu);
3324                         continue;
3325                 }
3326
3327                 iommu_flush_write_buffer(iommu);
3328
3329                 iommu_set_root_entry(iommu);
3330
3331                 iommu->flush.flush_context(iommu, 0, 0, 0,
3332                                            DMA_CCMD_GLOBAL_INVL);
3333                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3334                                          DMA_TLB_GLOBAL_FLUSH);
3335                 if (iommu_enable_translation(iommu))
3336                         return 1;
3337                 iommu_disable_protect_mem_regions(iommu);
3338         }
3339
3340         return 0;
3341 }
3342
3343 static void iommu_flush_all(void)
3344 {
3345         struct dmar_drhd_unit *drhd;
3346         struct intel_iommu *iommu;
3347
3348         for_each_active_iommu(iommu, drhd) {
3349                 iommu->flush.flush_context(iommu, 0, 0, 0,
3350                                            DMA_CCMD_GLOBAL_INVL);
3351                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3352                                          DMA_TLB_GLOBAL_FLUSH);
3353         }
3354 }
3355
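/*
 * Syscore suspend callback: flush all context and IOTLB caches, disable
 * translation, and save each active IOMMU's fault-event registers so
 * that iommu_resume() can restore them.
 */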
3356 static int iommu_suspend(void)
3357 {
3358         struct dmar_drhd_unit *drhd;
3359         struct intel_iommu *iommu = NULL;
3360         unsigned long flag;
3361
3362         for_each_active_iommu(iommu, drhd) {
3363                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3364                                                  GFP_ATOMIC);
3365                 if (!iommu->iommu_state)
3366                         goto nomem;
3367         }
3368
3369         iommu_flush_all();
3370
3371         for_each_active_iommu(iommu, drhd) {
3372                 iommu_disable_translation(iommu);
3373
3374                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3375
3376                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3377                         readl(iommu->reg + DMAR_FECTL_REG);
3378                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3379                         readl(iommu->reg + DMAR_FEDATA_REG);
3380                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3381                         readl(iommu->reg + DMAR_FEADDR_REG);
3382                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3383                         readl(iommu->reg + DMAR_FEUADDR_REG);
3384
3385                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3386         }
3387         return 0;
3388
3389 nomem:
3390         for_each_active_iommu(iommu, drhd)
3391                 kfree(iommu->iommu_state);
3392
3393         return -ENOMEM;
3394 }
3395
3396 static void iommu_resume(void)
3397 {
3398         struct dmar_drhd_unit *drhd;
3399         struct intel_iommu *iommu = NULL;
3400         unsigned long flag;
3401
3402         if (init_iommu_hw()) {
3403                 if (force_on)
3404                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3405                 else
3406                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3407                 return;
3408         }
3409
3410         for_each_active_iommu(iommu, drhd) {
3411
3412                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3413
3414                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3415                         iommu->reg + DMAR_FECTL_REG);
3416                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3417                         iommu->reg + DMAR_FEDATA_REG);
3418                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3419                         iommu->reg + DMAR_FEADDR_REG);
3420                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3421                         iommu->reg + DMAR_FEUADDR_REG);
3422
3423                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3424         }
3425
3426         for_each_active_iommu(iommu, drhd)
3427                 kfree(iommu->iommu_state);
3428 }
3429
3430 static struct syscore_ops iommu_syscore_ops = {
3431         .resume         = iommu_resume,
3432         .suspend        = iommu_suspend,
3433 };
3434
3435 static void __init init_iommu_pm_ops(void)
3436 {
3437         register_syscore_ops(&iommu_syscore_ops);
3438 }
3439
3440 #else
3441 static inline void init_iommu_pm_ops(void) {}
3442 #endif  /* CONFIG_SUSPEND */
3443
3444 LIST_HEAD(dmar_rmrr_units);
3445
3446 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3447 {
3448         list_add(&rmrr->list, &dmar_rmrr_units);
3449 }
3450
3452 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3453 {
3454         struct acpi_dmar_reserved_memory *rmrr;
3455         struct dmar_rmrr_unit *rmrru;
3456
3457         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3458         if (!rmrru)
3459                 return -ENOMEM;
3460
3461         rmrru->hdr = header;
3462         rmrr = (struct acpi_dmar_reserved_memory *)header;
3463         rmrru->base_address = rmrr->base_address;
3464         rmrru->end_address = rmrr->end_address;
3465
3466         dmar_register_rmrr_unit(rmrru);
3467         return 0;
3468 }
3469
3470 static int __init
3471 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3472 {
3473         struct acpi_dmar_reserved_memory *rmrr;
3474         int ret;
3475
3476         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3477         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3478                 ((void *)rmrr) + rmrr->header.length,
3479                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3480
3481         if (ret || (rmrru->devices_cnt == 0)) {
3482                 list_del(&rmrru->list);
3483                 kfree(rmrru);
3484         }
3485         return ret;
3486 }
3487
3488 static LIST_HEAD(dmar_atsr_units);
3489
3490 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3491 {
3492         struct acpi_dmar_atsr *atsr;
3493         struct dmar_atsr_unit *atsru;
3494
3495         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3496         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3497         if (!atsru)
3498                 return -ENOMEM;
3499
3500         atsru->hdr = hdr;
3501         atsru->include_all = atsr->flags & 0x1;
3502
3503         list_add(&atsru->list, &dmar_atsr_units);
3504
3505         return 0;
3506 }
3507
3508 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3509 {
3510         int rc;
3511         struct acpi_dmar_atsr *atsr;
3512
3513         if (atsru->include_all)
3514                 return 0;
3515
3516         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3517         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3518                                 (void *)atsr + atsr->header.length,
3519                                 &atsru->devices_cnt, &atsru->devices,
3520                                 atsr->segment);
3521         if (rc || !atsru->devices_cnt) {
3522                 list_del(&atsru->list);
3523                 kfree(atsru);
3524         }
3525
3526         return rc;
3527 }
3528
3529 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3530 {
3531         int i;
3532         struct pci_bus *bus;
3533         struct acpi_dmar_atsr *atsr;
3534         struct dmar_atsr_unit *atsru;
3535
3536         dev = pci_physfn(dev);
3537
3538         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3539                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3540                 if (atsr->segment == pci_domain_nr(dev->bus))
3541                         goto found;
3542         }
3543
3544         return 0;
3545
3546 found:
3547         for (bus = dev->bus; bus; bus = bus->parent) {
3548                 struct pci_dev *bridge = bus->self;
3549
3550                 if (!bridge || !pci_is_pcie(bridge) ||
3551                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3552                         return 0;
3553
3554                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3555                         for (i = 0; i < atsru->devices_cnt; i++)
3556                                 if (atsru->devices[i] == bridge)
3557                                         return 1;
3558                         break;
3559                 }
3560         }
3561
3562         if (atsru->include_all)
3563                 return 1;
3564
3565         return 0;
3566 }
3567
3568 int __init dmar_parse_rmrr_atsr_dev(void)
3569 {
3570         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3571         struct dmar_atsr_unit *atsr, *atsr_n;
3572         int ret = 0;
3573
3574         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3575                 ret = rmrr_parse_dev(rmrr);
3576                 if (ret)
3577                         return ret;
3578         }
3579
3580         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3581                 ret = atsr_parse_dev(atsr);
3582                 if (ret)
3583                         return ret;
3584         }
3585
3586         return ret;
3587 }
3588
3589 /*
3590  * Here we only respond to a device being unbound from its driver.
3591  *
3592  * A newly added device is not attached to its DMAR domain here yet; that
3593  * happens when the device is first mapped to an iova.
3594  */
3595 static int device_notifier(struct notifier_block *nb,
3596                                   unsigned long action, void *data)
3597 {
3598         struct device *dev = data;
3599         struct pci_dev *pdev = to_pci_dev(dev);
3600         struct dmar_domain *domain;
3601
3602         if (iommu_no_mapping(dev))
3603                 return 0;
3604
3605         domain = find_domain(pdev);
3606         if (!domain)
3607                 return 0;
3608
3609         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3610                 domain_remove_one_dev_info(domain, pdev);
3611
3612                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3613                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3614                     list_empty(&domain->devices))
3615                         domain_exit(domain);
3616         }
3617
3618         return 0;
3619 }
3620
3621 static struct notifier_block device_nb = {
3622         .notifier_call = device_notifier,
3623 };
3624
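/*
 * Main VT-d initialisation: parse the DMAR tables and device scopes,
 * set up domains and default mappings via init_dmars(), then install
 * intel_dma_ops, the IOMMU API ops and the bus notifier.  Under a
 * TXT/tboot launch, failures panic instead of falling back silently.
 */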
3625 int __init intel_iommu_init(void)
3626 {
3627         int ret = 0;
3628
3629         /* VT-d is required for a TXT/tboot launch, so enforce that */
3630         force_on = tboot_force_iommu();
3631
3632         if (dmar_table_init()) {
3633                 if (force_on)
3634                         panic("tboot: Failed to initialize DMAR table\n");
3635                 return  -ENODEV;
3636         }
3637
3638         if (dmar_dev_scope_init() < 0) {
3639                 if (force_on)
3640                         panic("tboot: Failed to initialize DMAR device scope\n");
3641                 return  -ENODEV;
3642         }
3643
3644         if (no_iommu || dmar_disabled)
3645                 return -ENODEV;
3646
3647         if (iommu_init_mempool()) {
3648                 if (force_on)
3649                         panic("tboot: Failed to initialize iommu memory\n");
3650                 return  -ENODEV;
3651         }
3652
3653         if (list_empty(&dmar_rmrr_units))
3654                 printk(KERN_INFO "DMAR: No RMRR found\n");
3655
3656         if (list_empty(&dmar_atsr_units))
3657                 printk(KERN_INFO "DMAR: No ATSR found\n");
3658
3659         if (dmar_init_reserved_ranges()) {
3660                 if (force_on)
3661                         panic("tboot: Failed to reserve iommu ranges\n");
3662                 return  -ENODEV;
3663         }
3664
3665         init_no_remapping_devices();
3666
3667         ret = init_dmars();
3668         if (ret) {
3669                 if (force_on)
3670                         panic("tboot: Failed to initialize DMARs\n");
3671                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3672                 put_iova_domain(&reserved_iova_list);
3673                 iommu_exit_mempool();
3674                 return ret;
3675         }
3676         printk(KERN_INFO
3677         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3678
3679         init_timer(&unmap_timer);
3680 #ifdef CONFIG_SWIOTLB
3681         swiotlb = 0;
3682 #endif
3683         dma_ops = &intel_dma_ops;
3684
3685         init_iommu_pm_ops();
3686
3687         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3688
3689         bus_register_notifier(&pci_bus_type, &device_nb);
3690
3691         intel_iommu_enabled = 1;
3692
3693         return 0;
3694 }
3695
3696 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3697                                            struct pci_dev *pdev)
3698 {
3699         struct pci_dev *tmp, *parent;
3700
3701         if (!iommu || !pdev)
3702                 return;
3703
3704         /* dependent device detach */
3705         tmp = pci_find_upstream_pcie_bridge(pdev);
3706         /* Secondary interface's bus number and devfn 0 */
3707         if (tmp) {
3708                 parent = pdev->bus->self;
3709                 while (parent != tmp) {
3710                         iommu_detach_dev(iommu, parent->bus->number,
3711                                          parent->devfn);
3712                         parent = parent->bus->self;
3713                 }
3714                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3715                         iommu_detach_dev(iommu,
3716                                 tmp->subordinate->number, 0);
3717                 else /* this is a legacy PCI bridge */
3718                         iommu_detach_dev(iommu, tmp->bus->number,
3719                                          tmp->devfn);
3720         }
3721 }
3722
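/*
 * Detach one PCI device (and the bridges it sits behind) from @domain.
 * If no other device on the same IOMMU remains in the domain, clear the
 * IOMMU from the domain's bitmap and, for non-VM/non-SI domains, release
 * the domain id on that IOMMU.
 */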
3723 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3724                                           struct pci_dev *pdev)
3725 {
3726         struct device_domain_info *info;
3727         struct intel_iommu *iommu;
3728         unsigned long flags;
3729         int found = 0;
3730         struct list_head *entry, *tmp;
3731
3732         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3733                                 pdev->devfn);
3734         if (!iommu)
3735                 return;
3736
3737         spin_lock_irqsave(&device_domain_lock, flags);
3738         list_for_each_safe(entry, tmp, &domain->devices) {
3739                 info = list_entry(entry, struct device_domain_info, link);
3740                 if (info->segment == pci_domain_nr(pdev->bus) &&
3741                     info->bus == pdev->bus->number &&
3742                     info->devfn == pdev->devfn) {
3743                         unlink_domain_info(info);
3744                         spin_unlock_irqrestore(&device_domain_lock, flags);
3745
3746                         iommu_disable_dev_iotlb(info);
3747                         iommu_detach_dev(iommu, info->bus, info->devfn);
3748                         iommu_detach_dependent_devices(iommu, pdev);
3749                         free_devinfo_mem(info);
3750
3751                         spin_lock_irqsave(&device_domain_lock, flags);
3752
3753                         if (found)
3754                                 break;
3755                         else
3756                                 continue;
3757                 }
3758
3759                 /* if there are no other devices under the same iommu
3760                  * owned by this domain, clear this iommu in iommu_bmp,
3761                  * update iommu count and coherency
3762                  */
3763                 if (iommu == device_to_iommu(info->segment, info->bus,
3764                                             info->devfn))
3765                         found = 1;
3766         }
3767
3768         spin_unlock_irqrestore(&device_domain_lock, flags);
3769
3770         if (found == 0) {
3771                 unsigned long tmp_flags;
3772                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3773                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3774                 domain->iommu_count--;
3775                 domain_update_iommu_cap(domain);
3776                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3777
3778                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3779                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3780                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3781                         clear_bit(domain->id, iommu->domain_ids);
3782                         iommu->domains[domain->id] = NULL;
3783                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3784                 }
3785         }
3786 }
3787
3788 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3789 {
3790         struct device_domain_info *info;
3791         struct intel_iommu *iommu;
3792         unsigned long flags1, flags2;
3793
3794         spin_lock_irqsave(&device_domain_lock, flags1);
3795         while (!list_empty(&domain->devices)) {
3796                 info = list_entry(domain->devices.next,
3797                         struct device_domain_info, link);
3798                 unlink_domain_info(info);
3799                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3800
3801                 iommu_disable_dev_iotlb(info);
3802                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3803                 iommu_detach_dev(iommu, info->bus, info->devfn);
3804                 iommu_detach_dependent_devices(iommu, info->dev);
3805
3806                 /* clear this iommu in iommu_bmp, update iommu count
3807                  * and capabilities
3808                  */
3809                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3810                 if (test_and_clear_bit(iommu->seq_id,
3811                                        domain->iommu_bmp)) {
3812                         domain->iommu_count--;
3813                         domain_update_iommu_cap(domain);
3814                 }
3815                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3816
3817                 free_devinfo_mem(info);
3818                 spin_lock_irqsave(&device_domain_lock, flags1);
3819         }
3820         spin_unlock_irqrestore(&device_domain_lock, flags1);
3821 }
3822
3823 /* domain id for virtual machines; it won't be set in a context entry */
3824 static unsigned long vm_domid;
3825
3826 static struct dmar_domain *iommu_alloc_vm_domain(void)
3827 {
3828         struct dmar_domain *domain;
3829
3830         domain = alloc_domain_mem();
3831         if (!domain)
3832                 return NULL;
3833
3834         domain->id = vm_domid++;
3835         domain->nid = -1;
3836         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3837         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3838
3839         return domain;
3840 }
3841
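/*
 * Initialize a freshly allocated VM domain: set up its IOVA allocator
 * and reserved ranges, derive the adjusted guest address width, and
 * allocate the top-level page directory.
 */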
3842 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3843 {
3844         int adjust_width;
3845
3846         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3847         spin_lock_init(&domain->iommu_lock);
3848
3849         domain_reserve_special_ranges(domain);
3850
3851         /* calculate AGAW */
3852         domain->gaw = guest_width;
3853         adjust_width = guestwidth_to_adjustwidth(guest_width);
3854         domain->agaw = width_to_agaw(adjust_width);
3855
3856         INIT_LIST_HEAD(&domain->devices);
3857
3858         domain->iommu_count = 0;
3859         domain->iommu_coherency = 0;
3860         domain->iommu_snooping = 0;
3861         domain->iommu_superpage = 0;
3862         domain->max_addr = 0;
3863         domain->nid = -1;
3864
3865         /* always allocate the top pgd */
3866         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3867         if (!domain->pgd)
3868                 return -ENOMEM;
3869         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3870         return 0;
3871 }
3872
3873 static void iommu_free_vm_domain(struct dmar_domain *domain)
3874 {
3875         unsigned long flags;
3876         struct dmar_drhd_unit *drhd;
3877         struct intel_iommu *iommu;
3878         unsigned long i;
3879         unsigned long ndomains;
3880
3881         for_each_drhd_unit(drhd) {
3882                 if (drhd->ignored)
3883                         continue;
3884                 iommu = drhd->iommu;
3885
3886                 ndomains = cap_ndoms(iommu->cap);
3887                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3888                         if (iommu->domains[i] == domain) {
3889                                 spin_lock_irqsave(&iommu->lock, flags);
3890                                 clear_bit(i, iommu->domain_ids);
3891                                 iommu->domains[i] = NULL;
3892                                 spin_unlock_irqrestore(&iommu->lock, flags);
3893                                 break;
3894                         }
3895                 }
3896         }
3897 }
3898
3899 static void vm_domain_exit(struct dmar_domain *domain)
3900 {
3901         /* Domain 0 is reserved, so don't process it */
3902         if (!domain)
3903                 return;
3904
3905         vm_domain_remove_all_dev_info(domain);
3906         /* destroy iovas */
3907         put_iova_domain(&domain->iovad);
3908
3909         /* clear ptes */
3910         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3911
3912         /* free page tables */
3913         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3914
3915         iommu_free_vm_domain(domain);
3916         free_domain_mem(domain);
3917 }
3918
3919 static int intel_iommu_domain_init(struct iommu_domain *domain)
3920 {
3921         struct dmar_domain *dmar_domain;
3922
3923         dmar_domain = iommu_alloc_vm_domain();
3924         if (!dmar_domain) {
3925                 printk(KERN_ERR
3926                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3927                 return -ENOMEM;
3928         }
3929         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3930                 printk(KERN_ERR
3931                         "intel_iommu_domain_init() failed\n");
3932                 vm_domain_exit(dmar_domain);
3933                 return -ENOMEM;
3934         }
3935         domain_update_iommu_cap(dmar_domain);
3936         domain->priv = dmar_domain;
3937
3938         domain->geometry.aperture_start = 0;
3939         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3940         domain->geometry.force_aperture = true;
3941
3942         return 0;
3943 }
3944
3945 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3946 {
3947         struct dmar_domain *dmar_domain = domain->priv;
3948
3949         domain->priv = NULL;
3950         vm_domain_exit(dmar_domain);
3951 }
3952
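/*
 * IOMMU API attach callback: detach the device from any previous
 * domain, cap the domain's address width to what this IOMMU supports,
 * drop extra page-table levels if necessary, and add the device to the
 * domain with multi-level translation.
 */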
3953 static int intel_iommu_attach_device(struct iommu_domain *domain,
3954                                      struct device *dev)
3955 {
3956         struct dmar_domain *dmar_domain = domain->priv;
3957         struct pci_dev *pdev = to_pci_dev(dev);
3958         struct intel_iommu *iommu;
3959         int addr_width;
3960
3961         /* normally pdev is not mapped */
3962         if (unlikely(domain_context_mapped(pdev))) {
3963                 struct dmar_domain *old_domain;
3964
3965                 old_domain = find_domain(pdev);
3966                 if (old_domain) {
3967                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3968                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3969                                 domain_remove_one_dev_info(old_domain, pdev);
3970                         else
3971                                 domain_remove_dev_info(old_domain);
3972                 }
3973         }
3974
3975         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3976                                 pdev->devfn);
3977         if (!iommu)
3978                 return -ENODEV;
3979
3980         /* check if this iommu agaw is sufficient for max mapped address */
3981         addr_width = agaw_to_width(iommu->agaw);
3982         if (addr_width > cap_mgaw(iommu->cap))
3983                 addr_width = cap_mgaw(iommu->cap);
3984
3985         if (dmar_domain->max_addr > (1LL << addr_width)) {
3986                 printk(KERN_ERR "%s: iommu width (%d) is not "
3987                        "sufficient for the mapped address (%llx)\n",
3988                        __func__, addr_width, dmar_domain->max_addr);
3989                 return -EFAULT;
3990         }
3991         dmar_domain->gaw = addr_width;
3992
3993         /*
3994          * Knock out extra levels of page tables if necessary
3995          */
3996         while (iommu->agaw < dmar_domain->agaw) {
3997                 struct dma_pte *pte;
3998
3999                 pte = dmar_domain->pgd;
4000                 if (dma_pte_present(pte)) {
4001                         dmar_domain->pgd = (struct dma_pte *)
4002                                 phys_to_virt(dma_pte_addr(pte));
4003                         free_pgtable_page(pte);
4004                 }
4005                 dmar_domain->agaw--;
4006         }
4007
4008         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4009 }
4010
4011 static void intel_iommu_detach_device(struct iommu_domain *domain,
4012                                       struct device *dev)
4013 {
4014         struct dmar_domain *dmar_domain = domain->priv;
4015         struct pci_dev *pdev = to_pci_dev(dev);
4016
4017         domain_remove_one_dev_info(dmar_domain, pdev);
4018 }
4019
4020 static int intel_iommu_map(struct iommu_domain *domain,
4021                            unsigned long iova, phys_addr_t hpa,
4022                            size_t size, int iommu_prot)
4023 {
4024         struct dmar_domain *dmar_domain = domain->priv;
4025         u64 max_addr;
4026         int prot = 0;
4027         int ret;
4028
4029         if (iommu_prot & IOMMU_READ)
4030                 prot |= DMA_PTE_READ;
4031         if (iommu_prot & IOMMU_WRITE)
4032                 prot |= DMA_PTE_WRITE;
4033         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4034                 prot |= DMA_PTE_SNP;
4035
4036         max_addr = iova + size;
4037         if (dmar_domain->max_addr < max_addr) {
4038                 u64 end;
4039
4040                 /* check if minimum agaw is sufficient for mapped address */
4041                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4042                 if (end < max_addr) {
4043                         printk(KERN_ERR "%s: iommu width (%d) is not "
4044                                "sufficient for the mapped address (%llx)\n",
4045                                __func__, dmar_domain->gaw, max_addr);
4046                         return -EFAULT;
4047                 }
4048                 dmar_domain->max_addr = max_addr;
4049         }
4050         /* Round up size to next multiple of PAGE_SIZE, if it and
4051            the low bits of hpa would take us onto the next page */
4052         size = aligned_nrpages(hpa, size);
4053         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4054                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4055         return ret;
4056 }
4057
4058 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4059                              unsigned long iova, size_t size)
4060 {
4061         struct dmar_domain *dmar_domain = domain->priv;
4062         int order;
4063
4064         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4065                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4066
4067         if (dmar_domain->max_addr == iova + size)
4068                 dmar_domain->max_addr = iova;
4069
4070         return PAGE_SIZE << order;
4071 }
4072
4073 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4074                                             unsigned long iova)
4075 {
4076         struct dmar_domain *dmar_domain = domain->priv;
4077         struct dma_pte *pte;
4078         u64 phys = 0;
4079
4080         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4081         if (pte)
4082                 phys = dma_pte_addr(pte);
4083
4084         return phys;
4085 }
4086
4087 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4088                                       unsigned long cap)
4089 {
4090         struct dmar_domain *dmar_domain = domain->priv;
4091
4092         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4093                 return dmar_domain->iommu_snooping;
4094         if (cap == IOMMU_CAP_INTR_REMAP)
4095                 return irq_remapping_enabled;
4096
4097         return 0;
4098 }
4099
4100 static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4101 {
4102         pci_dev_put(*from);
4103         *from = to;
4104 }
4105
4106 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4107
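/*
 * IOMMU API add_device callback: walk from the device towards the root
 * bus, following DMA source quirks, non-ACS multifunction siblings and
 * legacy PCI bridges, to find the device that actually provides
 * isolation, and place @dev in that device's iommu_group (allocating a
 * new group if none exists).
 */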
4108 static int intel_iommu_add_device(struct device *dev)
4109 {
4110         struct pci_dev *pdev = to_pci_dev(dev);
4111         struct pci_dev *bridge, *dma_pdev;
4112         struct iommu_group *group;
4113         int ret;
4114
4115         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4116                              pdev->bus->number, pdev->devfn))
4117                 return -ENODEV;
4118
4119         bridge = pci_find_upstream_pcie_bridge(pdev);
4120         if (bridge) {
4121                 if (pci_is_pcie(bridge))
4122                         dma_pdev = pci_get_domain_bus_and_slot(
4123                                                 pci_domain_nr(pdev->bus),
4124                                                 bridge->subordinate->number, 0);
4125                 else
4126                         dma_pdev = pci_dev_get(bridge);
4127         } else
4128                 dma_pdev = pci_dev_get(pdev);
4129
4130         /* Account for quirked devices */
4131         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4132
4133         /*
4134          * If it's a multifunction device that does not support our
4135          * required ACS flags, add to the same group as function 0.
4136          */
4137         if (dma_pdev->multifunction &&
4138             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4139                 swap_pci_ref(&dma_pdev,
4140                              pci_get_slot(dma_pdev->bus,
4141                                           PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4142                                           0)));
4143
4144         /*
4145          * Devices on the root bus go through the iommu.  If that's not us,
4146          * find the next upstream device and test ACS up to the root bus.
4147          * Finding the next device may require skipping virtual buses.
4148          */
4149         while (!pci_is_root_bus(dma_pdev->bus)) {
4150                 struct pci_bus *bus = dma_pdev->bus;
4151
4152                 while (!bus->self) {
4153                         if (!pci_is_root_bus(bus))
4154                                 bus = bus->parent;
4155                         else
4156                                 goto root_bus;
4157                 }
4158
4159                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4160                         break;
4161
4162                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4163         }
4164
4165 root_bus:
4166         group = iommu_group_get(&dma_pdev->dev);
4167         pci_dev_put(dma_pdev);
4168         if (!group) {
4169                 group = iommu_group_alloc();
4170                 if (IS_ERR(group))
4171                         return PTR_ERR(group);
4172         }
4173
4174         ret = iommu_group_add_device(group, dev);
4175
4176         iommu_group_put(group);
4177         return ret;
4178 }
4179
4180 static void intel_iommu_remove_device(struct device *dev)
4181 {
4182         iommu_group_remove_device(dev);
4183 }
4184
4185 static struct iommu_ops intel_iommu_ops = {
4186         .domain_init    = intel_iommu_domain_init,
4187         .domain_destroy = intel_iommu_domain_destroy,
4188         .attach_dev     = intel_iommu_attach_device,
4189         .detach_dev     = intel_iommu_detach_device,
4190         .map            = intel_iommu_map,
4191         .unmap          = intel_iommu_unmap,
4192         .iova_to_phys   = intel_iommu_iova_to_phys,
4193         .domain_has_cap = intel_iommu_domain_has_cap,
4194         .add_device     = intel_iommu_add_device,
4195         .remove_device  = intel_iommu_remove_device,
4196         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4197 };
4198
4199 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4200 {
4201         /*
4202          * Mobile 4 Series Chipset neglects to set RWBF capability,
4203          * but needs it:
4204          */
4205         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4206         rwbf_quirk = 1;
4207
4208         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4209         if (dev->revision == 0x07) {
4210                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4211                 dmar_map_gfx = 0;
4212         }
4213 }
4214
4215 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4216
4217 #define GGC 0x52
4218 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4219 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4220 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4221 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4222 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4223 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4224 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4225 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4226
4227 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4228 {
4229         unsigned short ggc;
4230
4231         if (pci_read_config_word(dev, GGC, &ggc))
4232                 return;
4233
4234         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4235                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4236                 dmar_map_gfx = 0;
4237         } else if (dmar_map_gfx) {
4238                 /* we have to ensure the gfx device is idle before we flush */
4239                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4240                 intel_iommu_strict = 1;
4241         }
4242 }
4243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4247
4248 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4249    ISOCH DMAR unit for the Azalia sound device, but not give it any
4250    TLB entries, which causes it to deadlock. Check for that.  We do
4251    this in a function called from init_dmars(), instead of in a PCI
4252    quirk, because we don't want to print the obnoxious "BIOS broken"
4253    message if VT-d is actually disabled.
4254 */
4255 static void __init check_tylersburg_isoch(void)
4256 {
4257         struct pci_dev *pdev;
4258         uint32_t vtisochctrl;
4259
4260         /* If there's no Azalia in the system anyway, forget it. */
4261         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4262         if (!pdev)
4263                 return;
4264         pci_dev_put(pdev);
4265
4266         /* System Management Registers. Might be hidden, in which case
4267            we can't do the sanity check. But that's OK, because the
4268            known-broken BIOSes _don't_ actually hide it, so far. */
4269         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4270         if (!pdev)
4271                 return;
4272
4273         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4274                 pci_dev_put(pdev);
4275                 return;
4276         }
4277
4278         pci_dev_put(pdev);
4279
4280         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4281         if (vtisochctrl & 1)
4282                 return;
4283
4284         /* Drop all bits other than the number of TLB entries */
4285         vtisochctrl &= 0x1c;
4286
4287         /* If we have the recommended number of TLB entries (16), fine. */
4288         if (vtisochctrl == 0x10)
4289                 return;
4290
4291         /* Zero TLB entries? You get to ride the short bus to school. */
4292         if (!vtisochctrl) {
4293                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4294                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4295                      dmi_get_system_info(DMI_BIOS_VENDOR),
4296                      dmi_get_system_info(DMI_BIOS_VERSION),
4297                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4298                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4299                 return;
4300         }
4301
4302         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4303                vtisochctrl);
4304 }