drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
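/*
 * Illustrative note (not in the original source): with 4KiB pages
 * (PAGE_SHIFT == 12), IOVA_PFN(DMA_BIT_MASK(32)) == 0xfffff, so
 * DMA_32BIT_PFN is the last page frame reachable with a 32-bit DMA
 * mask and DMA_64BIT_PFN the last 64-bit-reachable frame;
 * DMA_32BIT_PFN is used below as the limit pfn when initialising
 * iova domains.
 */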
73
74
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
86 static inline unsigned long page_to_dma_pfn(struct page *pg)
87 {
88         return mm_to_dma_pfn(page_to_pfn(pg));
89 }
90 static inline unsigned long virt_to_dma_pfn(void *p)
91 {
92         return page_to_dma_pfn(virt_to_page(p));
93 }
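/*
 * Illustrative example, assuming 4KiB MM pages and 4KiB VT-d pages
 * (PAGE_SHIFT == VTD_PAGE_SHIFT == 12): the shift above is zero and
 * the conversions are identities. On a hypothetical 16KiB-MM-page
 * configuration the shift would be 2, so MM pfn 5 would cover DMA
 * pfns 20..23 of the same physical page.
 */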
94
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
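/*
 * Illustrative sketch (hypothetical names, not part of the driver):
 * pointing the root entry for bus 'b' at a context table whose
 * physical address is 'ctx_phys' uses the helpers above as
 *
 *	set_root_value(&iommu->root_entry[b], ctx_phys);
 *	set_root_present(&iommu->root_entry[b]);
 *
 * which is what device_to_context_entry() below does when it finds no
 * context table allocated for a bus.
 */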
133
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: available
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
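/*
 * Illustrative sketch (hypothetical names 'ce' and 'pgd_phys', not
 * part of the driver): filling a context entry for domain id 42 with
 * a 4-level (agaw 2) page table would, with the setters above, look
 * roughly like
 *
 *	context_set_domain_id(ce, 42);
 *	context_set_address_root(ce, pgd_phys);
 *	context_set_address_width(ce, 2);
 *	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ce);
 *	context_set_present(ce);
 *
 * domain_context_mapping_one() below performs this sequence for real
 * devices.
 */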
194
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physical address
203  */
204 struct dma_pte {
205         u64 val;
206 };
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
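/*
 * Illustrative note (not in the original source): a leaf PTE mapping
 * host pfn 0x12345 read/write holds
 * ((u64)0x12345 << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE,
 * i.e. 0x12345003 with 4KiB VT-d pages; dma_pte_present() tests just
 * those low two permission bits.
 */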
257
258 /*
259  * This domain is a static identity mapping domain.
260  *      1. This domain creates a static 1:1 mapping to all usable memory.
261  *      2. It maps to each iommu if successful.
262  *      3. Each iommu maps to this domain if successful.
263  */
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
266
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
269
270 /* domain represents a virtual machine; more than one device
271  * across iommus may be owned by one domain, e.g. a kvm guest.
272  */
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
274
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         int     nid;                    /* node id */
281         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses */
282
283         struct list_head devices;       /* all devices' list */
284         struct iova_domain iovad;       /* iova's that belong to this domain */
285
286         struct dma_pte  *pgd;           /* virtual address */
287         int             gaw;            /* max guest address width */
288
289         /* adjusted guest address width, 0 is level 2 30-bit */
290         int             agaw;
291
292         int             flags;          /* flags to find out type of domain */
293
294         int             iommu_coherency;/* indicate coherency of iommu access */
295         int             iommu_snooping; /* indicate snooping control feature */
296         int             iommu_count;    /* reference count of iommu */
297         spinlock_t      iommu_lock;     /* protect iommu set in domain */
298         u64             max_addr;       /* maximum mapped address */
299 };
300
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303         struct list_head link;  /* link to domain siblings */
304         struct list_head global; /* link to global list */
305         int segment;            /* PCI domain */
306         u8 bus;                 /* PCI bus number */
307         u8 devfn;               /* PCI devfn number */
308         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
309         struct intel_iommu *iommu; /* IOMMU used by this device */
310         struct dmar_domain *domain; /* pointer to domain */
311 };
312
313 static void flush_unmaps_timeout(unsigned long data);
314
315 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
316
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319         int next;
320         struct iova *iova[HIGH_WATER_MARK];
321         struct dmar_domain *domain[HIGH_WATER_MARK];
322 };
323
324 static struct deferred_flush_tables *deferred_flush;
325
326 /* number of iommus, used to size and index g_iommus */
327 static int g_num_of_iommus;
328
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
331
332 static int timer_on;
333 static long list_size;
334
335 static void domain_remove_dev_info(struct dmar_domain *domain);
336
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
342
343 static int __initdata dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
346
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
350
351 static struct iommu_ops intel_iommu_ops;
352
353 static int __init intel_iommu_setup(char *str)
354 {
355         if (!str)
356                 return -EINVAL;
357         while (*str) {
358                 if (!strncmp(str, "on", 2)) {
359                         dmar_disabled = 0;
360                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
361                 } else if (!strncmp(str, "off", 3)) {
362                         dmar_disabled = 1;
363                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
364                 } else if (!strncmp(str, "igfx_off", 8)) {
365                         dmar_map_gfx = 0;
366                         printk(KERN_INFO
367                                 "Intel-IOMMU: disable GFX device mapping\n");
368                 } else if (!strncmp(str, "forcedac", 8)) {
369                         printk(KERN_INFO
370                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371                         dmar_forcedac = 1;
372                 } else if (!strncmp(str, "strict", 6)) {
373                         printk(KERN_INFO
374                                 "Intel-IOMMU: disable batched IOTLB flush\n");
375                         intel_iommu_strict = 1;
376                 }
377
378                 str += strcspn(str, ",");
379                 while (*str == ',')
380                         str++;
381         }
382         return 0;
383 }
384 __setup("intel_iommu=", intel_iommu_setup);
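/*
 * Usage note (illustrative): several options can be combined on the
 * kernel command line, e.g. "intel_iommu=on,strict,igfx_off"; the
 * parser above splits on commas and applies them left to right.
 */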
385
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
389
390 static inline void *alloc_pgtable_page(int node)
391 {
392         struct page *page;
393         void *vaddr = NULL;
394
395         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
396         if (page)
397                 vaddr = page_address(page);
398         return vaddr;
399 }
400
401 static inline void free_pgtable_page(void *vaddr)
402 {
403         free_page((unsigned long)vaddr);
404 }
405
406 static inline void *alloc_domain_mem(void)
407 {
408         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
409 }
410
411 static void free_domain_mem(void *vaddr)
412 {
413         kmem_cache_free(iommu_domain_cache, vaddr);
414 }
415
416 static inline void * alloc_devinfo_mem(void)
417 {
418         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
419 }
420
421 static inline void free_devinfo_mem(void *vaddr)
422 {
423         kmem_cache_free(iommu_devinfo_cache, vaddr);
424 }
425
426 struct iova *alloc_iova_mem(void)
427 {
428         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
429 }
430
431 void free_iova_mem(struct iova *iova)
432 {
433         kmem_cache_free(iommu_iova_cache, iova);
434 }
435
436
437 static inline int width_to_agaw(int width);
438
439 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
440 {
441         unsigned long sagaw;
442         int agaw = -1;
443
444         sagaw = cap_sagaw(iommu->cap);
445         for (agaw = width_to_agaw(max_gaw);
446              agaw >= 0; agaw--) {
447                 if (test_bit(agaw, &sagaw))
448                         break;
449         }
450
451         return agaw;
452 }
453
454 /*
455  * Calculate max SAGAW for each iommu.
456  */
457 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
458 {
459         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
460 }
461
462 /*
463  * Calculate agaw for each iommu.
464  * "SAGAW" may differ across iommus, so use a default agaw and fall
465  * back to a smaller supported agaw for iommus that don't support it.
466  */
467 int iommu_calculate_agaw(struct intel_iommu *iommu)
468 {
469         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
470 }
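/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH of
 * 48 bits, width_to_agaw(48) is (48 - 30) / 9 == 2, i.e. a 4-level
 * page table. If SAGAW bit 2 is clear in the capability register,
 * __iommu_calculate_agaw() keeps walking downwards and returns the
 * next smaller supported value, or -1 if none is supported.
 */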
471
472 /* This function only returns a single iommu for a domain */
473 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
474 {
475         int iommu_id;
476
477         /* si_domain and vm domain should not get here. */
478         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
479         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
480
481         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
482         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
483                 return NULL;
484
485         return g_iommus[iommu_id];
486 }
487
488 static void domain_update_iommu_coherency(struct dmar_domain *domain)
489 {
490         int i;
491
492         domain->iommu_coherency = 1;
493
494         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
495         for (; i < g_num_of_iommus; ) {
496                 if (!ecap_coherent(g_iommus[i]->ecap)) {
497                         domain->iommu_coherency = 0;
498                         break;
499                 }
500                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
501         }
502 }
503
504 static void domain_update_iommu_snooping(struct dmar_domain *domain)
505 {
506         int i;
507
508         domain->iommu_snooping = 1;
509
510         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
511         for (; i < g_num_of_iommus; ) {
512                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
513                         domain->iommu_snooping = 0;
514                         break;
515                 }
516                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
517         }
518 }
519
520 /* Some capabilities may be different across iommus */
521 static void domain_update_iommu_cap(struct dmar_domain *domain)
522 {
523         domain_update_iommu_coherency(domain);
524         domain_update_iommu_snooping(domain);
525 }
526
527 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
528 {
529         struct dmar_drhd_unit *drhd = NULL;
530         int i;
531
532         for_each_drhd_unit(drhd) {
533                 if (drhd->ignored)
534                         continue;
535                 if (segment != drhd->segment)
536                         continue;
537
538                 for (i = 0; i < drhd->devices_cnt; i++) {
539                         if (drhd->devices[i] &&
540                             drhd->devices[i]->bus->number == bus &&
541                             drhd->devices[i]->devfn == devfn)
542                                 return drhd->iommu;
543                         if (drhd->devices[i] &&
544                             drhd->devices[i]->subordinate &&
545                             drhd->devices[i]->subordinate->number <= bus &&
546                             drhd->devices[i]->subordinate->subordinate >= bus)
547                                 return drhd->iommu;
548                 }
549
550                 if (drhd->include_all)
551                         return drhd->iommu;
552         }
553
554         return NULL;
555 }
556
557 static void domain_flush_cache(struct dmar_domain *domain,
558                                void *addr, int size)
559 {
560         if (!domain->iommu_coherency)
561                 clflush_cache_range(addr, size);
562 }
563
564 /* Gets context entry for a given bus and devfn */
565 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
566                 u8 bus, u8 devfn)
567 {
568         struct root_entry *root;
569         struct context_entry *context;
570         unsigned long phy_addr;
571         unsigned long flags;
572
573         spin_lock_irqsave(&iommu->lock, flags);
574         root = &iommu->root_entry[bus];
575         context = get_context_addr_from_root(root);
576         if (!context) {
577                 context = (struct context_entry *)
578                                 alloc_pgtable_page(iommu->node);
579                 if (!context) {
580                         spin_unlock_irqrestore(&iommu->lock, flags);
581                         return NULL;
582                 }
583                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
584                 phy_addr = virt_to_phys((void *)context);
585                 set_root_value(root, phy_addr);
586                 set_root_present(root);
587                 __iommu_flush_cache(iommu, root, sizeof(*root));
588         }
589         spin_unlock_irqrestore(&iommu->lock, flags);
590         return &context[devfn];
591 }
592
593 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
594 {
595         struct root_entry *root;
596         struct context_entry *context;
597         int ret;
598         unsigned long flags;
599
600         spin_lock_irqsave(&iommu->lock, flags);
601         root = &iommu->root_entry[bus];
602         context = get_context_addr_from_root(root);
603         if (!context) {
604                 ret = 0;
605                 goto out;
606         }
607         ret = context_present(&context[devfn]);
608 out:
609         spin_unlock_irqrestore(&iommu->lock, flags);
610         return ret;
611 }
612
613 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
614 {
615         struct root_entry *root;
616         struct context_entry *context;
617         unsigned long flags;
618
619         spin_lock_irqsave(&iommu->lock, flags);
620         root = &iommu->root_entry[bus];
621         context = get_context_addr_from_root(root);
622         if (context) {
623                 context_clear_entry(&context[devfn]);
624                 __iommu_flush_cache(iommu, &context[devfn], \
625                         sizeof(*context));
626         }
627         spin_unlock_irqrestore(&iommu->lock, flags);
628 }
629
630 static void free_context_table(struct intel_iommu *iommu)
631 {
632         struct root_entry *root;
633         int i;
634         unsigned long flags;
635         struct context_entry *context;
636
637         spin_lock_irqsave(&iommu->lock, flags);
638         if (!iommu->root_entry) {
639                 goto out;
640         }
641         for (i = 0; i < ROOT_ENTRY_NR; i++) {
642                 root = &iommu->root_entry[i];
643                 context = get_context_addr_from_root(root);
644                 if (context)
645                         free_pgtable_page(context);
646         }
647         free_pgtable_page(iommu->root_entry);
648         iommu->root_entry = NULL;
649 out:
650         spin_unlock_irqrestore(&iommu->lock, flags);
651 }
652
653 /* page table handling */
654 #define LEVEL_STRIDE            (9)
655 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
656
657 static inline int agaw_to_level(int agaw)
658 {
659         return agaw + 2;
660 }
661
662 static inline int agaw_to_width(int agaw)
663 {
664         return 30 + agaw * LEVEL_STRIDE;
665
666 }
667
668 static inline int width_to_agaw(int width)
669 {
670         return (width - 30) / LEVEL_STRIDE;
671 }
672
673 static inline unsigned int level_to_offset_bits(int level)
674 {
675         return (level - 1) * LEVEL_STRIDE;
676 }
677
678 static inline int pfn_level_offset(unsigned long pfn, int level)
679 {
680         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
681 }
682
683 static inline unsigned long level_mask(int level)
684 {
685         return -1UL << level_to_offset_bits(level);
686 }
687
688 static inline unsigned long level_size(int level)
689 {
690         return 1UL << level_to_offset_bits(level);
691 }
692
693 static inline unsigned long align_to_level(unsigned long pfn, int level)
694 {
695         return (pfn + level_size(level) - 1) & level_mask(level);
696 }
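/*
 * Worked example (illustrative), for a 4-level table (agaw 2, 48-bit):
 * level_to_offset_bits(3) == 18, so pfn_level_offset(pfn, 3) selects
 * bits 18..26 of the DMA pfn, level_size(3) == 2^18 pfns (1GiB with
 * 4KiB pages), and align_to_level(pfn, 3) rounds pfn up to the next
 * such boundary.
 */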
697
698 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
699                                       unsigned long pfn)
700 {
701         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
702         struct dma_pte *parent, *pte = NULL;
703         int level = agaw_to_level(domain->agaw);
704         int offset;
705
706         BUG_ON(!domain->pgd);
707         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
708         parent = domain->pgd;
709
710         while (level > 0) {
711                 void *tmp_page;
712
713                 offset = pfn_level_offset(pfn, level);
714                 pte = &parent[offset];
715                 if (level == 1)
716                         break;
717
718                 if (!dma_pte_present(pte)) {
719                         uint64_t pteval;
720
721                         tmp_page = alloc_pgtable_page(domain->nid);
722
723                         if (!tmp_page)
724                                 return NULL;
725
726                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
727                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
728                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
729                                 /* Someone else set it while we were thinking; use theirs. */
730                                 free_pgtable_page(tmp_page);
731                         } else {
732                                 dma_pte_addr(pte);
733                                 domain_flush_cache(domain, pte, sizeof(*pte));
734                         }
735                 }
736                 parent = phys_to_virt(dma_pte_addr(pte));
737                 level--;
738         }
739
740         return pte;
741 }
742
743 /* return address's pte at specific level */
744 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
745                                          unsigned long pfn,
746                                          int level)
747 {
748         struct dma_pte *parent, *pte = NULL;
749         int total = agaw_to_level(domain->agaw);
750         int offset;
751
752         parent = domain->pgd;
753         while (level <= total) {
754                 offset = pfn_level_offset(pfn, total);
755                 pte = &parent[offset];
756                 if (level == total)
757                         return pte;
758
759                 if (!dma_pte_present(pte))
760                         break;
761                 parent = phys_to_virt(dma_pte_addr(pte));
762                 total--;
763         }
764         return NULL;
765 }
766
767 /* clear last-level ptes; a TLB flush should follow */
768 static void dma_pte_clear_range(struct dmar_domain *domain,
769                                 unsigned long start_pfn,
770                                 unsigned long last_pfn)
771 {
772         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
773         struct dma_pte *first_pte, *pte;
774
775         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
776         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
777         BUG_ON(start_pfn > last_pfn);
778
779         /* we don't need lock here; nobody else touches the iova range */
780         do {
781                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
782                 if (!pte) {
783                         start_pfn = align_to_level(start_pfn + 1, 2);
784                         continue;
785                 }
786                 do { 
787                         dma_clear_pte(pte);
788                         start_pfn++;
789                         pte++;
790                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
791
792                 domain_flush_cache(domain, first_pte,
793                                    (void *)pte - (void *)first_pte);
794
795         } while (start_pfn && start_pfn <= last_pfn);
796 }
797
798 /* free page table pages. last level pte should already be cleared */
799 static void dma_pte_free_pagetable(struct dmar_domain *domain,
800                                    unsigned long start_pfn,
801                                    unsigned long last_pfn)
802 {
803         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
804         struct dma_pte *first_pte, *pte;
805         int total = agaw_to_level(domain->agaw);
806         int level;
807         unsigned long tmp;
808
809         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
810         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
811         BUG_ON(start_pfn > last_pfn);
812
813         /* We don't need lock here; nobody else touches the iova range */
814         level = 2;
815         while (level <= total) {
816                 tmp = align_to_level(start_pfn, level);
817
818                 /* If we can't even clear one PTE at this level, we're done */
819                 if (tmp + level_size(level) - 1 > last_pfn)
820                         return;
821
822                 do {
823                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
824                         if (!pte) {
825                                 tmp = align_to_level(tmp + 1, level + 1);
826                                 continue;
827                         }
828                         do {
829                                 if (dma_pte_present(pte)) {
830                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
831                                         dma_clear_pte(pte);
832                                 }
833                                 pte++;
834                                 tmp += level_size(level);
835                         } while (!first_pte_in_page(pte) &&
836                                  tmp + level_size(level) - 1 <= last_pfn);
837
838                         domain_flush_cache(domain, first_pte,
839                                            (void *)pte - (void *)first_pte);
840                         
841                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
842                 level++;
843         }
844         /* free pgd */
845         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
846                 free_pgtable_page(domain->pgd);
847                 domain->pgd = NULL;
848         }
849 }
850
851 /* iommu handling */
852 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
853 {
854         struct root_entry *root;
855         unsigned long flags;
856
857         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
858         if (!root)
859                 return -ENOMEM;
860
861         __iommu_flush_cache(iommu, root, ROOT_SIZE);
862
863         spin_lock_irqsave(&iommu->lock, flags);
864         iommu->root_entry = root;
865         spin_unlock_irqrestore(&iommu->lock, flags);
866
867         return 0;
868 }
869
870 static void iommu_set_root_entry(struct intel_iommu *iommu)
871 {
872         void *addr;
873         u32 sts;
874         unsigned long flag;
875
876         addr = iommu->root_entry;
877
878         spin_lock_irqsave(&iommu->register_lock, flag);
879         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
880
881         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
882
883         /* Make sure hardware completes it */
884         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
885                       readl, (sts & DMA_GSTS_RTPS), sts);
886
887         spin_unlock_irqrestore(&iommu->register_lock, flag);
888 }
889
890 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
891 {
892         u32 val;
893         unsigned long flag;
894
895         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
896                 return;
897
898         spin_lock_irqsave(&iommu->register_lock, flag);
899         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
900
901         /* Make sure hardware completes it */
902         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
903                       readl, (!(val & DMA_GSTS_WBFS)), val);
904
905         spin_unlock_irqrestore(&iommu->register_lock, flag);
906 }
907
908 /* return value determines whether we need a write buffer flush */
909 static void __iommu_flush_context(struct intel_iommu *iommu,
910                                   u16 did, u16 source_id, u8 function_mask,
911                                   u64 type)
912 {
913         u64 val = 0;
914         unsigned long flag;
915
916         switch (type) {
917         case DMA_CCMD_GLOBAL_INVL:
918                 val = DMA_CCMD_GLOBAL_INVL;
919                 break;
920         case DMA_CCMD_DOMAIN_INVL:
921                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
922                 break;
923         case DMA_CCMD_DEVICE_INVL:
924                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
925                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
926                 break;
927         default:
928                 BUG();
929         }
930         val |= DMA_CCMD_ICC;
931
932         spin_lock_irqsave(&iommu->register_lock, flag);
933         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
934
935         /* Make sure hardware completes it */
936         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
937                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
938
939         spin_unlock_irqrestore(&iommu->register_lock, flag);
940 }
941
942 /* return value determines whether we need a write buffer flush */
943 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
944                                 u64 addr, unsigned int size_order, u64 type)
945 {
946         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
947         u64 val = 0, val_iva = 0;
948         unsigned long flag;
949
950         switch (type) {
951         case DMA_TLB_GLOBAL_FLUSH:
952                 /* global flush doesn't need to set IVA_REG */
953                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
954                 break;
955         case DMA_TLB_DSI_FLUSH:
956                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
957                 break;
958         case DMA_TLB_PSI_FLUSH:
959                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
960                 /* Note: always flush non-leaf currently */
961                 val_iva = size_order | addr;
962                 break;
963         default:
964                 BUG();
965         }
966         /* Note: set drain read/write */
967 #if 0
968         /*
969          * This is probably only needed to be extra safe; it looks like
970          * we can ignore it without any impact.
971          */
972         if (cap_read_drain(iommu->cap))
973                 val |= DMA_TLB_READ_DRAIN;
974 #endif
975         if (cap_write_drain(iommu->cap))
976                 val |= DMA_TLB_WRITE_DRAIN;
977
978         spin_lock_irqsave(&iommu->register_lock, flag);
979         /* Note: Only uses first TLB reg currently */
980         if (val_iva)
981                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
982         dmar_writeq(iommu->reg + tlb_offset + 8, val);
983
984         /* Make sure hardware completes it */
985         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
986                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
987
988         spin_unlock_irqrestore(&iommu->register_lock, flag);
989
990         /* check IOTLB invalidation granularity */
991         if (DMA_TLB_IAIG(val) == 0)
992                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
993         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
994                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
995                         (unsigned long long)DMA_TLB_IIRG(type),
996                         (unsigned long long)DMA_TLB_IAIG(val));
997 }
998
999 static struct device_domain_info *iommu_support_dev_iotlb(
1000         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1001 {
1002         int found = 0;
1003         unsigned long flags;
1004         struct device_domain_info *info;
1005         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1006
1007         if (!ecap_dev_iotlb_support(iommu->ecap))
1008                 return NULL;
1009
1010         if (!iommu->qi)
1011                 return NULL;
1012
1013         spin_lock_irqsave(&device_domain_lock, flags);
1014         list_for_each_entry(info, &domain->devices, link)
1015                 if (info->bus == bus && info->devfn == devfn) {
1016                         found = 1;
1017                         break;
1018                 }
1019         spin_unlock_irqrestore(&device_domain_lock, flags);
1020
1021         if (!found || !info->dev)
1022                 return NULL;
1023
1024         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1025                 return NULL;
1026
1027         if (!dmar_find_matched_atsr_unit(info->dev))
1028                 return NULL;
1029
1030         info->iommu = iommu;
1031
1032         return info;
1033 }
1034
1035 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1036 {
1037         if (!info)
1038                 return;
1039
1040         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1041 }
1042
1043 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1044 {
1045         if (!info->dev || !pci_ats_enabled(info->dev))
1046                 return;
1047
1048         pci_disable_ats(info->dev);
1049 }
1050
1051 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1052                                   u64 addr, unsigned mask)
1053 {
1054         u16 sid, qdep;
1055         unsigned long flags;
1056         struct device_domain_info *info;
1057
1058         spin_lock_irqsave(&device_domain_lock, flags);
1059         list_for_each_entry(info, &domain->devices, link) {
1060                 if (!info->dev || !pci_ats_enabled(info->dev))
1061                         continue;
1062
1063                 sid = info->bus << 8 | info->devfn;
1064                 qdep = pci_ats_queue_depth(info->dev);
1065                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1066         }
1067         spin_unlock_irqrestore(&device_domain_lock, flags);
1068 }
1069
1070 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1071                                   unsigned long pfn, unsigned int pages)
1072 {
1073         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1074         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1075
1076         BUG_ON(pages == 0);
1077
1078         /*
1079          * Fall back to domain-selective flush if there is no PSI support or
1080          * the size is too big.
1081          * PSI requires the flush size to be a power of two pages, and the
1082          * base address to be naturally aligned to that size.
1083          */
1084         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1085                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1086                                                 DMA_TLB_DSI_FLUSH);
1087         else
1088                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1089                                                 DMA_TLB_PSI_FLUSH);
1090
1091         /*
1092          * In caching mode, domain ID 0 is reserved for non-present to present
1093          * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1094          */
1095         if (!cap_caching_mode(iommu->cap) || did)
1096                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1097 }
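/*
 * Worked example (illustrative): flushing 9 pages rounds up to 16, so
 * mask == ilog2(16) == 4 and the hardware invalidates a naturally
 * aligned 16-page (64KiB) region around the address; if mask exceeded
 * cap_max_amask_val(), iommu_flush_iotlb_psi() above would fall back
 * to a domain-selective flush instead.
 */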
1098
1099 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1100 {
1101         u32 pmen;
1102         unsigned long flags;
1103
1104         spin_lock_irqsave(&iommu->register_lock, flags);
1105         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1106         pmen &= ~DMA_PMEN_EPM;
1107         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1108
1109         /* wait for the protected region status bit to clear */
1110         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1111                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1112
1113         spin_unlock_irqrestore(&iommu->register_lock, flags);
1114 }
1115
1116 static int iommu_enable_translation(struct intel_iommu *iommu)
1117 {
1118         u32 sts;
1119         unsigned long flags;
1120
1121         spin_lock_irqsave(&iommu->register_lock, flags);
1122         iommu->gcmd |= DMA_GCMD_TE;
1123         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1124
1125         /* Make sure hardware completes it */
1126         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1127                       readl, (sts & DMA_GSTS_TES), sts);
1128
1129         spin_unlock_irqrestore(&iommu->register_lock, flags);
1130         return 0;
1131 }
1132
1133 static int iommu_disable_translation(struct intel_iommu *iommu)
1134 {
1135         u32 sts;
1136         unsigned long flag;
1137
1138         spin_lock_irqsave(&iommu->register_lock, flag);
1139         iommu->gcmd &= ~DMA_GCMD_TE;
1140         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1141
1142         /* Make sure hardware completes it */
1143         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1144                       readl, (!(sts & DMA_GSTS_TES)), sts);
1145
1146         spin_unlock_irqrestore(&iommu->register_lock, flag);
1147         return 0;
1148 }
1149
1150
1151 static int iommu_init_domains(struct intel_iommu *iommu)
1152 {
1153         unsigned long ndomains;
1154         unsigned long nlongs;
1155
1156         ndomains = cap_ndoms(iommu->cap);
1157         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1158         nlongs = BITS_TO_LONGS(ndomains);
1159
1160         spin_lock_init(&iommu->lock);
1161
1162         /* TBD: there might be 64K domains;
1163          * consider another allocation scheme for future chips
1164          */
1165         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1166         if (!iommu->domain_ids) {
1167                 printk(KERN_ERR "Allocating domain id array failed\n");
1168                 return -ENOMEM;
1169         }
1170         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1171                         GFP_KERNEL);
1172         if (!iommu->domains) {
1173                 printk(KERN_ERR "Allocating domain array failed\n");
1174                 return -ENOMEM;
1175         }
1176
1177         /*
1178          * If Caching mode is set, then invalid translations are tagged
1179          * with domain id 0, hence we need to pre-allocate it.
1180          */
1181         if (cap_caching_mode(iommu->cap))
1182                 set_bit(0, iommu->domain_ids);
1183         return 0;
1184 }
1185
1186
1187 static void domain_exit(struct dmar_domain *domain);
1188 static void vm_domain_exit(struct dmar_domain *domain);
1189
1190 void free_dmar_iommu(struct intel_iommu *iommu)
1191 {
1192         struct dmar_domain *domain;
1193         int i;
1194         unsigned long flags;
1195
1196         if ((iommu->domains) && (iommu->domain_ids)) {
1197                 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1198                 for (; i < cap_ndoms(iommu->cap); ) {
1199                         domain = iommu->domains[i];
1200                         clear_bit(i, iommu->domain_ids);
1201
1202                         spin_lock_irqsave(&domain->iommu_lock, flags);
1203                         if (--domain->iommu_count == 0) {
1204                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1205                                         vm_domain_exit(domain);
1206                                 else
1207                                         domain_exit(domain);
1208                         }
1209                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1210
1211                         i = find_next_bit(iommu->domain_ids,
1212                                 cap_ndoms(iommu->cap), i+1);
1213                 }
1214         }
1215
1216         if (iommu->gcmd & DMA_GCMD_TE)
1217                 iommu_disable_translation(iommu);
1218
1219         if (iommu->irq) {
1220                 set_irq_data(iommu->irq, NULL);
1221                 /* This will mask the irq */
1222                 free_irq(iommu->irq, iommu);
1223                 destroy_irq(iommu->irq);
1224         }
1225
1226         kfree(iommu->domains);
1227         kfree(iommu->domain_ids);
1228
1229         g_iommus[iommu->seq_id] = NULL;
1230
1231         /* if all iommus are freed, free g_iommus */
1232         for (i = 0; i < g_num_of_iommus; i++) {
1233                 if (g_iommus[i])
1234                         break;
1235         }
1236
1237         if (i == g_num_of_iommus)
1238                 kfree(g_iommus);
1239
1240         /* free context mapping */
1241         free_context_table(iommu);
1242 }
1243
1244 static struct dmar_domain *alloc_domain(void)
1245 {
1246         struct dmar_domain *domain;
1247
1248         domain = alloc_domain_mem();
1249         if (!domain)
1250                 return NULL;
1251
1252         domain->nid = -1;
1253         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1254         domain->flags = 0;
1255
1256         return domain;
1257 }
1258
1259 static int iommu_attach_domain(struct dmar_domain *domain,
1260                                struct intel_iommu *iommu)
1261 {
1262         int num;
1263         unsigned long ndomains;
1264         unsigned long flags;
1265
1266         ndomains = cap_ndoms(iommu->cap);
1267
1268         spin_lock_irqsave(&iommu->lock, flags);
1269
1270         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1271         if (num >= ndomains) {
1272                 spin_unlock_irqrestore(&iommu->lock, flags);
1273                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1274                 return -ENOMEM;
1275         }
1276
1277         domain->id = num;
1278         set_bit(num, iommu->domain_ids);
1279         set_bit(iommu->seq_id, &domain->iommu_bmp);
1280         iommu->domains[num] = domain;
1281         spin_unlock_irqrestore(&iommu->lock, flags);
1282
1283         return 0;
1284 }
1285
1286 static void iommu_detach_domain(struct dmar_domain *domain,
1287                                 struct intel_iommu *iommu)
1288 {
1289         unsigned long flags;
1290         int num, ndomains;
1291         int found = 0;
1292
1293         spin_lock_irqsave(&iommu->lock, flags);
1294         ndomains = cap_ndoms(iommu->cap);
1295         num = find_first_bit(iommu->domain_ids, ndomains);
1296         for (; num < ndomains; ) {
1297                 if (iommu->domains[num] == domain) {
1298                         found = 1;
1299                         break;
1300                 }
1301                 num = find_next_bit(iommu->domain_ids,
1302                                     cap_ndoms(iommu->cap), num+1);
1303         }
1304
1305         if (found) {
1306                 clear_bit(num, iommu->domain_ids);
1307                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1308                 iommu->domains[num] = NULL;
1309         }
1310         spin_unlock_irqrestore(&iommu->lock, flags);
1311 }
1312
1313 static struct iova_domain reserved_iova_list;
1314 static struct lock_class_key reserved_rbtree_key;
1315
1316 static void dmar_init_reserved_ranges(void)
1317 {
1318         struct pci_dev *pdev = NULL;
1319         struct iova *iova;
1320         int i;
1321
1322         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1323
1324         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1325                 &reserved_rbtree_key);
1326
1327         /* IOAPIC ranges shouldn't be accessed by DMA */
1328         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1329                 IOVA_PFN(IOAPIC_RANGE_END));
1330         if (!iova)
1331                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1332
1333         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1334         for_each_pci_dev(pdev) {
1335                 struct resource *r;
1336
1337                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1338                         r = &pdev->resource[i];
1339                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1340                                 continue;
1341                         iova = reserve_iova(&reserved_iova_list,
1342                                             IOVA_PFN(r->start),
1343                                             IOVA_PFN(r->end));
1344                         if (!iova)
1345                                 printk(KERN_ERR "Reserve iova failed\n");
1346                 }
1347         }
1348
1349 }
1350
1351 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1352 {
1353         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1354 }
1355
1356 static inline int guestwidth_to_adjustwidth(int gaw)
1357 {
1358         int agaw;
1359         int r = (gaw - 12) % 9;
1360
1361         if (r == 0)
1362                 agaw = gaw;
1363         else
1364                 agaw = gaw + 9 - r;
1365         if (agaw > 64)
1366                 agaw = 64;
1367         return agaw;
1368 }
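/*
 * Worked example (illustrative): a guest width of 48 bits gives
 * r == (48 - 12) % 9 == 0, so the adjusted width stays 48; a guest
 * width of 40 gives r == 1 and is rounded up to 48, the next width a
 * whole number of page-table levels can express. Anything above 64 is
 * clamped to 64.
 */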
1369
1370 static int domain_init(struct dmar_domain *domain, int guest_width)
1371 {
1372         struct intel_iommu *iommu;
1373         int adjust_width, agaw;
1374         unsigned long sagaw;
1375
1376         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1377         spin_lock_init(&domain->iommu_lock);
1378
1379         domain_reserve_special_ranges(domain);
1380
1381         /* calculate AGAW */
1382         iommu = domain_get_iommu(domain);
1383         if (guest_width > cap_mgaw(iommu->cap))
1384                 guest_width = cap_mgaw(iommu->cap);
1385         domain->gaw = guest_width;
1386         adjust_width = guestwidth_to_adjustwidth(guest_width);
1387         agaw = width_to_agaw(adjust_width);
1388         sagaw = cap_sagaw(iommu->cap);
1389         if (!test_bit(agaw, &sagaw)) {
1390                 /* hardware doesn't support it, choose a bigger one */
1391                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1392                 agaw = find_next_bit(&sagaw, 5, agaw);
1393                 if (agaw >= 5)
1394                         return -ENODEV;
1395         }
1396         domain->agaw = agaw;
1397         INIT_LIST_HEAD(&domain->devices);
1398
1399         if (ecap_coherent(iommu->ecap))
1400                 domain->iommu_coherency = 1;
1401         else
1402                 domain->iommu_coherency = 0;
1403
1404         if (ecap_sc_support(iommu->ecap))
1405                 domain->iommu_snooping = 1;
1406         else
1407                 domain->iommu_snooping = 0;
1408
1409         domain->iommu_count = 1;
1410         domain->nid = iommu->node;
1411
1412         /* always allocate the top pgd */
1413         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1414         if (!domain->pgd)
1415                 return -ENOMEM;
1416         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1417         return 0;
1418 }
1419
1420 static void domain_exit(struct dmar_domain *domain)
1421 {
1422         struct dmar_drhd_unit *drhd;
1423         struct intel_iommu *iommu;
1424
1425         /* Domain 0 is reserved, so don't process it */
1426         if (!domain)
1427                 return;
1428
1429         domain_remove_dev_info(domain);
1430         /* destroy iovas */
1431         put_iova_domain(&domain->iovad);
1432
1433         /* clear ptes */
1434         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1435
1436         /* free page tables */
1437         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1438
1439         for_each_active_iommu(iommu, drhd)
1440                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1441                         iommu_detach_domain(domain, iommu);
1442
1443         free_domain_mem(domain);
1444 }
1445
1446 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1447                                  u8 bus, u8 devfn, int translation)
1448 {
1449         struct context_entry *context;
1450         unsigned long flags;
1451         struct intel_iommu *iommu;
1452         struct dma_pte *pgd;
1453         unsigned long num;
1454         unsigned long ndomains;
1455         int id;
1456         int agaw;
1457         struct device_domain_info *info = NULL;
1458
1459         pr_debug("Set context mapping for %02x:%02x.%d\n",
1460                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1461
1462         BUG_ON(!domain->pgd);
1463         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1464                translation != CONTEXT_TT_MULTI_LEVEL);
1465
1466         iommu = device_to_iommu(segment, bus, devfn);
1467         if (!iommu)
1468                 return -ENODEV;
1469
1470         context = device_to_context_entry(iommu, bus, devfn);
1471         if (!context)
1472                 return -ENOMEM;
1473         spin_lock_irqsave(&iommu->lock, flags);
1474         if (context_present(context)) {
1475                 spin_unlock_irqrestore(&iommu->lock, flags);
1476                 return 0;
1477         }
1478
1479         id = domain->id;
1480         pgd = domain->pgd;
1481
1482         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1483             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1484                 int found = 0;
1485
1486                 /* find an available domain id for this device in iommu */
1487                 ndomains = cap_ndoms(iommu->cap);
1488                 num = find_first_bit(iommu->domain_ids, ndomains);
1489                 for (; num < ndomains; ) {
1490                         if (iommu->domains[num] == domain) {
1491                                 id = num;
1492                                 found = 1;
1493                                 break;
1494                         }
1495                         num = find_next_bit(iommu->domain_ids,
1496                                             cap_ndoms(iommu->cap), num+1);
1497                 }
1498
1499                 if (found == 0) {
1500                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1501                         if (num >= ndomains) {
1502                                 spin_unlock_irqrestore(&iommu->lock, flags);
1503                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1504                                 return -EFAULT;
1505                         }
1506
1507                         set_bit(num, iommu->domain_ids);
1508                         iommu->domains[num] = domain;
1509                         id = num;
1510                 }
1511
1512                 /* Skip the top levels of the page tables for
1513                  * an iommu whose agaw is smaller than the default.
1514                  * Unnecessary for pass-through mode.
1515                  */
1516                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1517                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1518                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1519                                 if (!dma_pte_present(pgd)) {
1520                                         spin_unlock_irqrestore(&iommu->lock, flags);
1521                                         return -ENOMEM;
1522                                 }
1523                         }
1524                 }
1525         }
1526
1527         context_set_domain_id(context, id);
1528
1529         if (translation != CONTEXT_TT_PASS_THROUGH) {
1530                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1531                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1532                                      CONTEXT_TT_MULTI_LEVEL;
1533         }
1534         /*
1535          * In pass-through mode, AW must be programmed to indicate the largest
1536          * AGAW value supported by hardware, and ASR is ignored by hardware.
1537          */
1538         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1539                 context_set_address_width(context, iommu->msagaw);
1540         else {
1541                 context_set_address_root(context, virt_to_phys(pgd));
1542                 context_set_address_width(context, iommu->agaw);
1543         }
1544
1545         context_set_translation_type(context, translation);
1546         context_set_fault_enable(context);
1547         context_set_present(context);
1548         domain_flush_cache(domain, context, sizeof(*context));
1549
1550         /*
1551          * It's a non-present to present mapping. If hardware doesn't cache
1552          * non-present entries, we only need to flush the write buffer. If it
1553          * _does_ cache non-present entries, then it does so in the special
1554          * domain #0, which we have to flush:
1555          */
1556         if (cap_caching_mode(iommu->cap)) {
1557                 iommu->flush.flush_context(iommu, 0,
1558                                            (((u16)bus) << 8) | devfn,
1559                                            DMA_CCMD_MASK_NOBIT,
1560                                            DMA_CCMD_DEVICE_INVL);
1561                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1562         } else {
1563                 iommu_flush_write_buffer(iommu);
1564         }
1565         iommu_enable_dev_iotlb(info);
1566         spin_unlock_irqrestore(&iommu->lock, flags);
1567
1568         spin_lock_irqsave(&domain->iommu_lock, flags);
1569         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1570                 domain->iommu_count++;
1571                 if (domain->iommu_count == 1)
1572                         domain->nid = iommu->node;
1573                 domain_update_iommu_cap(domain);
1574         }
1575         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1576         return 0;
1577 }
1578
1579 static int
1580 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1581                         int translation)
1582 {
1583         int ret;
1584         struct pci_dev *tmp, *parent;
1585
1586         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1587                                          pdev->bus->number, pdev->devfn,
1588                                          translation);
1589         if (ret)
1590                 return ret;
1591
1592         /* dependent device mapping */
1593         tmp = pci_find_upstream_pcie_bridge(pdev);
1594         if (!tmp)
1595                 return 0;
1596         /* Secondary interface's bus number and devfn 0 */
1597         parent = pdev->bus->self;
1598         while (parent != tmp) {
1599                 ret = domain_context_mapping_one(domain,
1600                                                  pci_domain_nr(parent->bus),
1601                                                  parent->bus->number,
1602                                                  parent->devfn, translation);
1603                 if (ret)
1604                         return ret;
1605                 parent = parent->bus->self;
1606         }
1607         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1608                 return domain_context_mapping_one(domain,
1609                                         pci_domain_nr(tmp->subordinate),
1610                                         tmp->subordinate->number, 0,
1611                                         translation);
1612         else /* this is a legacy PCI bridge */
1613                 return domain_context_mapping_one(domain,
1614                                                   pci_domain_nr(tmp->bus),
1615                                                   tmp->bus->number,
1616                                                   tmp->devfn,
1617                                                   translation);
1618 }
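
/*
 * Editor's illustration (hypothetical topology, not from the original
 * source): consider a conventional PCI device 0000:05:02.0 that sits
 * directly behind a PCIe-to-PCI bridge 0000:00:1c.0 whose secondary bus
 * is 05.  domain_context_mapping() then issues
 *
 *   domain_context_mapping_one(domain, 0, 0x05, PCI_DEVFN(2, 0), translation);
 *   domain_context_mapping_one(domain, 0, 0x05, 0, translation);
 *
 * i.e. one context entry for the device itself and one for the bridge's
 * (secondary bus, devfn 0), because DMA from the legacy device shows up
 * on the PCIe side with that source-id.  The while loop above only runs
 * when further PCI-PCI bridges separate the device from the PCIe bridge.
 */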
1619
1620 static int domain_context_mapped(struct pci_dev *pdev)
1621 {
1622         int ret;
1623         struct pci_dev *tmp, *parent;
1624         struct intel_iommu *iommu;
1625
1626         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1627                                 pdev->devfn);
1628         if (!iommu)
1629                 return -ENODEV;
1630
1631         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1632         if (!ret)
1633                 return ret;
1634         /* dependent device mapping */
1635         tmp = pci_find_upstream_pcie_bridge(pdev);
1636         if (!tmp)
1637                 return ret;
1638         /* Secondary interface's bus number and devfn 0 */
1639         parent = pdev->bus->self;
1640         while (parent != tmp) {
1641                 ret = device_context_mapped(iommu, parent->bus->number,
1642                                             parent->devfn);
1643                 if (!ret)
1644                         return ret;
1645                 parent = parent->bus->self;
1646         }
1647         if (pci_is_pcie(tmp))
1648                 return device_context_mapped(iommu, tmp->subordinate->number,
1649                                              0);
1650         else
1651                 return device_context_mapped(iommu, tmp->bus->number,
1652                                              tmp->devfn);
1653 }
1654
1655 /* Returns a number of VTD pages, but aligned to MM page size */
1656 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1657                                             size_t size)
1658 {
1659         host_addr &= ~PAGE_MASK;
1660         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1661 }
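
/*
 * Editor's worked example (assuming 4KiB MM pages and 4KiB VT-d pages):
 * for host_addr = 0x12345678 and size = 0x1000,
 *
 *   host_addr & ~PAGE_MASK     == 0x678
 *   PAGE_ALIGN(0x678 + 0x1000) == 0x2000
 *   0x2000 >> VTD_PAGE_SHIFT   == 2
 *
 * so a buffer that straddles a page boundary costs one extra VT-d page.
 */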
1662
1663 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1664                             struct scatterlist *sg, unsigned long phys_pfn,
1665                             unsigned long nr_pages, int prot)
1666 {
1667         struct dma_pte *first_pte = NULL, *pte = NULL;
1668         phys_addr_t uninitialized_var(pteval);
1669         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1670         unsigned long sg_res;
1671
1672         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1673
1674         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1675                 return -EINVAL;
1676
1677         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1678
1679         if (sg)
1680                 sg_res = 0;
1681         else {
1682                 sg_res = nr_pages + 1;
1683                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1684         }
1685
1686         while (nr_pages--) {
1687                 uint64_t tmp;
1688
1689                 if (!sg_res) {
1690                         sg_res = aligned_nrpages(sg->offset, sg->length);
1691                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1692                         sg->dma_length = sg->length;
1693                         pteval = page_to_phys(sg_page(sg)) | prot;
1694                 }
1695                 if (!pte) {
1696                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1697                         if (!pte)
1698                                 return -ENOMEM;
1699                 }
1700                 /* We don't need a lock here; nobody else
1701                  * touches this iova range
1702                  */
1703                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1704                 if (tmp) {
1705                         static int dumps = 5;
1706                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1707                                iov_pfn, tmp, (unsigned long long)pteval);
1708                         if (dumps) {
1709                                 dumps--;
1710                                 debug_dma_dump_mappings(NULL);
1711                         }
1712                         WARN_ON(1);
1713                 }
1714                 pte++;
1715                 if (!nr_pages || first_pte_in_page(pte)) {
1716                         domain_flush_cache(domain, first_pte,
1717                                            (void *)pte - (void *)first_pte);
1718                         pte = NULL;
1719                 }
1720                 iov_pfn++;
1721                 pteval += VTD_PAGE_SIZE;
1722                 sg_res--;
1723                 if (!sg_res)
1724                         sg = sg_next(sg);
1725         }
1726         return 0;
1727 }
1728
1729 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1730                                     struct scatterlist *sg, unsigned long nr_pages,
1731                                     int prot)
1732 {
1733         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1734 }
1735
1736 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1737                                      unsigned long phys_pfn, unsigned long nr_pages,
1738                                      int prot)
1739 {
1740         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1741 }
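
/*
 * Editor's usage sketch (hypothetical values): mapping a contiguous 16KiB
 * physical buffer at 0x80000000 to IOVA 0x1000 with read/write permission
 * comes down to
 *
 *   ret = domain_pfn_mapping(domain, 0x1, 0x80000, 4,
 *                            DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * where 0x1 is 0x1000 >> VTD_PAGE_SHIFT and 0x80000 is
 * 0x80000000 >> VTD_PAGE_SHIFT.  This is the same shape of call that
 * iommu_domain_identity_map() and __intel_map_single() make below.
 */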
1742
1743 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1744 {
1745         if (!iommu)
1746                 return;
1747
1748         clear_context_table(iommu, bus, devfn);
1749         iommu->flush.flush_context(iommu, 0, 0, 0,
1750                                            DMA_CCMD_GLOBAL_INVL);
1751         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1752 }
1753
1754 static void domain_remove_dev_info(struct dmar_domain *domain)
1755 {
1756         struct device_domain_info *info;
1757         unsigned long flags;
1758         struct intel_iommu *iommu;
1759
1760         spin_lock_irqsave(&device_domain_lock, flags);
1761         while (!list_empty(&domain->devices)) {
1762                 info = list_entry(domain->devices.next,
1763                         struct device_domain_info, link);
1764                 list_del(&info->link);
1765                 list_del(&info->global);
1766                 if (info->dev)
1767                         info->dev->dev.archdata.iommu = NULL;
1768                 spin_unlock_irqrestore(&device_domain_lock, flags);
1769
1770                 iommu_disable_dev_iotlb(info);
1771                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1772                 iommu_detach_dev(iommu, info->bus, info->devfn);
1773                 free_devinfo_mem(info);
1774
1775                 spin_lock_irqsave(&device_domain_lock, flags);
1776         }
1777         spin_unlock_irqrestore(&device_domain_lock, flags);
1778 }
1779
1780 /*
1781  * find_domain
1782  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1783  */
1784 static struct dmar_domain *
1785 find_domain(struct pci_dev *pdev)
1786 {
1787         struct device_domain_info *info;
1788
1789         /* No lock here, assumes no domain exit in normal case */
1790         info = pdev->dev.archdata.iommu;
1791         if (info)
1792                 return info->domain;
1793         return NULL;
1794 }
1795
1796 /* domain is initialized */
1797 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1798 {
1799         struct dmar_domain *domain, *found = NULL;
1800         struct intel_iommu *iommu;
1801         struct dmar_drhd_unit *drhd;
1802         struct device_domain_info *info, *tmp;
1803         struct pci_dev *dev_tmp;
1804         unsigned long flags;
1805         int bus = 0, devfn = 0;
1806         int segment;
1807         int ret;
1808
1809         domain = find_domain(pdev);
1810         if (domain)
1811                 return domain;
1812
1813         segment = pci_domain_nr(pdev->bus);
1814
1815         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1816         if (dev_tmp) {
1817                 if (pci_is_pcie(dev_tmp)) {
1818                         bus = dev_tmp->subordinate->number;
1819                         devfn = 0;
1820                 } else {
1821                         bus = dev_tmp->bus->number;
1822                         devfn = dev_tmp->devfn;
1823                 }
1824                 spin_lock_irqsave(&device_domain_lock, flags);
1825                 list_for_each_entry(info, &device_domain_list, global) {
1826                         if (info->segment == segment &&
1827                             info->bus == bus && info->devfn == devfn) {
1828                                 found = info->domain;
1829                                 break;
1830                         }
1831                 }
1832                 spin_unlock_irqrestore(&device_domain_lock, flags);
1833                 /* pcie-to-pci bridge already has a domain, use it */
1834                 if (found) {
1835                         domain = found;
1836                         goto found_domain;
1837                 }
1838         }
1839
1840         domain = alloc_domain();
1841         if (!domain)
1842                 goto error;
1843
1844         /* Allocate new domain for the device */
1845         drhd = dmar_find_matched_drhd_unit(pdev);
1846         if (!drhd) {
1847                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1848                         pci_name(pdev));
1849                 return NULL;
1850         }
1851         iommu = drhd->iommu;
1852
1853         ret = iommu_attach_domain(domain, iommu);
1854         if (ret) {
1855                 domain_exit(domain);
1856                 goto error;
1857         }
1858
1859         if (domain_init(domain, gaw)) {
1860                 domain_exit(domain);
1861                 goto error;
1862         }
1863
1864         /* register pcie-to-pci device */
1865         if (dev_tmp) {
1866                 info = alloc_devinfo_mem();
1867                 if (!info) {
1868                         domain_exit(domain);
1869                         goto error;
1870                 }
1871                 info->segment = segment;
1872                 info->bus = bus;
1873                 info->devfn = devfn;
1874                 info->dev = NULL;
1875                 info->domain = domain;
1876                 /* This domain is shared by devices under p2p bridge */
1877                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1878
1879                 /* pcie-to-pci bridge already has a domain, use it */
1880                 found = NULL;
1881                 spin_lock_irqsave(&device_domain_lock, flags);
1882                 list_for_each_entry(tmp, &device_domain_list, global) {
1883                         if (tmp->segment == segment &&
1884                             tmp->bus == bus && tmp->devfn == devfn) {
1885                                 found = tmp->domain;
1886                                 break;
1887                         }
1888                 }
1889                 if (found) {
1890                         free_devinfo_mem(info);
1891                         domain_exit(domain);
1892                         domain = found;
1893                 } else {
1894                         list_add(&info->link, &domain->devices);
1895                         list_add(&info->global, &device_domain_list);
1896                 }
1897                 spin_unlock_irqrestore(&device_domain_lock, flags);
1898         }
1899
1900 found_domain:
1901         info = alloc_devinfo_mem();
1902         if (!info)
1903                 goto error;
1904         info->segment = segment;
1905         info->bus = pdev->bus->number;
1906         info->devfn = pdev->devfn;
1907         info->dev = pdev;
1908         info->domain = domain;
1909         spin_lock_irqsave(&device_domain_lock, flags);
1910         /* somebody else was faster and already set it up */
1911         found = find_domain(pdev);
1912         if (found != NULL) {
1913                 spin_unlock_irqrestore(&device_domain_lock, flags);
1914                 if (found != domain) {
1915                         domain_exit(domain);
1916                         domain = found;
1917                 }
1918                 free_devinfo_mem(info);
1919                 return domain;
1920         }
1921         list_add(&info->link, &domain->devices);
1922         list_add(&info->global, &device_domain_list);
1923         pdev->dev.archdata.iommu = info;
1924         spin_unlock_irqrestore(&device_domain_lock, flags);
1925         return domain;
1926 error:
1927         /* recheck it here; maybe somebody else has set it meanwhile */
1928         return find_domain(pdev);
1929 }
1930
1931 static int iommu_identity_mapping;
1932 #define IDENTMAP_ALL            1
1933 #define IDENTMAP_GFX            2
1934 #define IDENTMAP_AZALIA         4
1935
1936 static int iommu_domain_identity_map(struct dmar_domain *domain,
1937                                      unsigned long long start,
1938                                      unsigned long long end)
1939 {
1940         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1941         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1942
1943         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1944                           dma_to_mm_pfn(last_vpfn))) {
1945                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1946                 return -ENOMEM;
1947         }
1948
1949         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1950                  start, end, domain->id);
1951         /*
1952          * The RMRR range might overlap with the physical memory range;
1953          * clear it first
1954          */
1955         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1956
1957         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1958                                   last_vpfn - first_vpfn + 1,
1959                                   DMA_PTE_READ|DMA_PTE_WRITE);
1960 }
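
/*
 * Editor's trace with hypothetical numbers: identity-mapping a 1MiB
 * region start = 0x7f000000, end = 0x7f0fffff gives
 *
 *   first_vpfn = 0x7f000000 >> VTD_PAGE_SHIFT == 0x7f000
 *   last_vpfn  = 0x7f0fffff >> VTD_PAGE_SHIFT == 0x7f0ff
 *
 * so 0x100 PTEs are installed with iov_pfn == phys_pfn, which is exactly
 * what makes the mapping a 1:1 (identity) one.
 */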
1961
1962 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1963                                       unsigned long long start,
1964                                       unsigned long long end)
1965 {
1966         struct dmar_domain *domain;
1967         int ret;
1968
1969         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1970         if (!domain)
1971                 return -ENOMEM;
1972
1973         /* For _hardware_ passthrough, don't bother. But for software
1974            passthrough, we do it anyway -- it may indicate a memory
1975            range which is reserved in E820, and so didn't get set
1976            up to start with in si_domain */
1977         if (domain == si_domain && hw_pass_through) {
1978                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1979                        pci_name(pdev), start, end);
1980                 return 0;
1981         }
1982
1983         printk(KERN_INFO
1984                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1985                pci_name(pdev), start, end);
1986         
1987         if (end < start) {
1988                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1989                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1990                         dmi_get_system_info(DMI_BIOS_VENDOR),
1991                         dmi_get_system_info(DMI_BIOS_VERSION),
1992                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1993                 ret = -EIO;
1994                 goto error;
1995         }
1996
1997         if (end >> agaw_to_width(domain->agaw)) {
1998                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1999                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2000                      agaw_to_width(domain->agaw),
2001                      dmi_get_system_info(DMI_BIOS_VENDOR),
2002                      dmi_get_system_info(DMI_BIOS_VERSION),
2003                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2004                 ret = -EIO;
2005                 goto error;
2006         }
2007
2008         ret = iommu_domain_identity_map(domain, start, end);
2009         if (ret)
2010                 goto error;
2011
2012         /* context entry init */
2013         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2014         if (ret)
2015                 goto error;
2016
2017         return 0;
2018
2019  error:
2020         domain_exit(domain);
2021         return ret;
2022 }
2023
2024 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2025         struct pci_dev *pdev)
2026 {
2027         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2028                 return 0;
2029         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2030                 rmrr->end_address + 1);
2031 }
2032
2033 #ifdef CONFIG_DMAR_FLOPPY_WA
2034 static inline void iommu_prepare_isa(void)
2035 {
2036         struct pci_dev *pdev;
2037         int ret;
2038
2039         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2040         if (!pdev)
2041                 return;
2042
2043         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2044         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2045
2046         if (ret)
2047                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2048                        "floppy might not work\n");
2049
2050 }
2051 #else
2052 static inline void iommu_prepare_isa(void)
2053 {
2054         return;
2055 }
2056 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2057
2058 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2059
2060 static int __init si_domain_work_fn(unsigned long start_pfn,
2061                                     unsigned long end_pfn, void *datax)
2062 {
2063         int *ret = datax;
2064
2065         *ret = iommu_domain_identity_map(si_domain,
2066                                          (uint64_t)start_pfn << PAGE_SHIFT,
2067                                          (uint64_t)end_pfn << PAGE_SHIFT);
2068         return *ret;
2069
2070 }
2071
2072 static int __init si_domain_init(int hw)
2073 {
2074         struct dmar_drhd_unit *drhd;
2075         struct intel_iommu *iommu;
2076         int nid, ret = 0;
2077
2078         si_domain = alloc_domain();
2079         if (!si_domain)
2080                 return -EFAULT;
2081
2082         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2083
2084         for_each_active_iommu(iommu, drhd) {
2085                 ret = iommu_attach_domain(si_domain, iommu);
2086                 if (ret) {
2087                         domain_exit(si_domain);
2088                         return -EFAULT;
2089                 }
2090         }
2091
2092         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2093                 domain_exit(si_domain);
2094                 return -EFAULT;
2095         }
2096
2097         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2098
2099         if (hw)
2100                 return 0;
2101
2102         for_each_online_node(nid) {
2103                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2104                 if (ret)
2105                         return ret;
2106         }
2107
2108         return 0;
2109 }
2110
2111 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2112                                           struct pci_dev *pdev);
2113 static int identity_mapping(struct pci_dev *pdev)
2114 {
2115         struct device_domain_info *info;
2116
2117         if (likely(!iommu_identity_mapping))
2118                 return 0;
2119
2120
2121         list_for_each_entry(info, &si_domain->devices, link)
2122                 if (info->dev == pdev)
2123                         return 1;
2124         return 0;
2125 }
2126
2127 static int domain_add_dev_info(struct dmar_domain *domain,
2128                                struct pci_dev *pdev,
2129                                int translation)
2130 {
2131         struct device_domain_info *info;
2132         unsigned long flags;
2133         int ret;
2134
2135         info = alloc_devinfo_mem();
2136         if (!info)
2137                 return -ENOMEM;
2138
2139         ret = domain_context_mapping(domain, pdev, translation);
2140         if (ret) {
2141                 free_devinfo_mem(info);
2142                 return ret;
2143         }
2144
2145         info->segment = pci_domain_nr(pdev->bus);
2146         info->bus = pdev->bus->number;
2147         info->devfn = pdev->devfn;
2148         info->dev = pdev;
2149         info->domain = domain;
2150
2151         spin_lock_irqsave(&device_domain_lock, flags);
2152         list_add(&info->link, &domain->devices);
2153         list_add(&info->global, &device_domain_list);
2154         pdev->dev.archdata.iommu = info;
2155         spin_unlock_irqrestore(&device_domain_lock, flags);
2156
2157         return 0;
2158 }
2159
2160 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2161 {
2162         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2163                 return 1;
2164
2165         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2166                 return 1;
2167
2168         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2169                 return 0;
2170
2171         /*
2172          * We want to start off with all devices in the 1:1 domain, and
2173          * take them out later if we find they can't access all of memory.
2174          *
2175          * However, we can't do this for PCI devices behind bridges,
2176          * because all PCI devices behind the same bridge will end up
2177          * with the same source-id on their transactions.
2178          *
2179          * Practically speaking, we can't change things around for these
2180          * devices at run-time, because we can't be sure there'll be no
2181          * DMA transactions in flight for any of their siblings.
2182          * 
2183          * So PCI devices (unless they're on the root bus) as well as
2184          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2185          * the 1:1 domain, just in _case_ one of their siblings turns out
2186          * not to be able to map all of memory.
2187          */
2188         if (!pci_is_pcie(pdev)) {
2189                 if (!pci_is_root_bus(pdev->bus))
2190                         return 0;
2191                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2192                         return 0;
2193         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2194                 return 0;
2195
2196         /* 
2197          * At boot time, we don't yet know if devices will be 64-bit capable.
2198          * Assume that they will -- if they turn out not to be, then we can 
2199          * take them out of the 1:1 domain later.
2200          */
2201         if (!startup)
2202                 return pdev->dma_mask > DMA_BIT_MASK(32);
2203
2204         return 1;
2205 }
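
/*
 * Editor's summary of the policy above, with hypothetical devices and
 * IDENTMAP_ALL in effect:
 *
 *   - an integrated graphics device when IDENTMAP_GFX is set     -> 1
 *   - a conventional PCI NIC behind a PCIe-to-PCI bridge         -> 0
 *     (it shares a source-id with its bridge siblings)
 *   - at run time (startup == 0), a root-bus device whose
 *     dma_mask is only 32 bits wide                              -> 0
 *   - the same device at boot (startup == 1)                     -> 1,
 *     optimistically, until it proves not to be 64-bit capable
 */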
2206
2207 static int __init iommu_prepare_static_identity_mapping(int hw)
2208 {
2209         struct pci_dev *pdev = NULL;
2210         int ret;
2211
2212         ret = si_domain_init(hw);
2213         if (ret)
2214                 return -EFAULT;
2215
2216         for_each_pci_dev(pdev) {
2217                 if (iommu_should_identity_map(pdev, 1)) {
2218                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2219                                hw ? "hardware" : "software", pci_name(pdev));
2220
2221                         ret = domain_add_dev_info(si_domain, pdev,
2222                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2223                                                      CONTEXT_TT_MULTI_LEVEL);
2224                         if (ret)
2225                                 return ret;
2226                 }
2227         }
2228
2229         return 0;
2230 }
2231
2232 int __init init_dmars(void)
2233 {
2234         struct dmar_drhd_unit *drhd;
2235         struct dmar_rmrr_unit *rmrr;
2236         struct pci_dev *pdev;
2237         struct intel_iommu *iommu;
2238         int i, ret;
2239
2240         /*
2241          * for each drhd
2242          *    allocate root
2243          *    initialize and program root entry to not present
2244          * endfor
2245          */
2246         for_each_drhd_unit(drhd) {
2247                 g_num_of_iommus++;
2248                 /*
2249                  * lock not needed as this is only incremented in the single
2250                  * threaded kernel __init code path; all other accesses are
2251                  * read only
2252                  */
2253         }
2254
2255         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2256                         GFP_KERNEL);
2257         if (!g_iommus) {
2258                 printk(KERN_ERR "Allocating global iommu array failed\n");
2259                 ret = -ENOMEM;
2260                 goto error;
2261         }
2262
2263         deferred_flush = kzalloc(g_num_of_iommus *
2264                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2265         if (!deferred_flush) {
2266                 ret = -ENOMEM;
2267                 goto error;
2268         }
2269
2270         for_each_drhd_unit(drhd) {
2271                 if (drhd->ignored)
2272                         continue;
2273
2274                 iommu = drhd->iommu;
2275                 g_iommus[iommu->seq_id] = iommu;
2276
2277                 ret = iommu_init_domains(iommu);
2278                 if (ret)
2279                         goto error;
2280
2281                 /*
2282                  * TBD:
2283                  * we could share the same root & context tables
2284                  * among all IOMMUs. Need to split it later.
2285                  */
2286                 ret = iommu_alloc_root_entry(iommu);
2287                 if (ret) {
2288                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2289                         goto error;
2290                 }
2291                 if (!ecap_pass_through(iommu->ecap))
2292                         hw_pass_through = 0;
2293         }
2294
2295         /*
2296          * Start from a sane iommu hardware state.
2297          */
2298         for_each_drhd_unit(drhd) {
2299                 if (drhd->ignored)
2300                         continue;
2301
2302                 iommu = drhd->iommu;
2303
2304                 /*
2305                  * If the queued invalidation is already initialized by us
2306                  * (for example, while enabling interrupt-remapping) then
2307          * we already have things rolling from a sane state.
2308                  */
2309                 if (iommu->qi)
2310                         continue;
2311
2312                 /*
2313                  * Clear any previous faults.
2314                  */
2315                 dmar_fault(-1, iommu);
2316                 /*
2317                  * Disable queued invalidation if supported and already enabled
2318                  * before OS handover.
2319                  */
2320                 dmar_disable_qi(iommu);
2321         }
2322
2323         for_each_drhd_unit(drhd) {
2324                 if (drhd->ignored)
2325                         continue;
2326
2327                 iommu = drhd->iommu;
2328
2329                 if (dmar_enable_qi(iommu)) {
2330                         /*
2331                          * Queued Invalidate not enabled, use Register Based
2332                          * Invalidate
2333                          */
2334                         iommu->flush.flush_context = __iommu_flush_context;
2335                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2336                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2337                                "invalidation\n",
2338                                (unsigned long long)drhd->reg_base_addr);
2339                 } else {
2340                         iommu->flush.flush_context = qi_flush_context;
2341                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2342                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2343                                "invalidation\n",
2344                                (unsigned long long)drhd->reg_base_addr);
2345                 }
2346         }
2347
2348         if (iommu_pass_through)
2349                 iommu_identity_mapping |= IDENTMAP_ALL;
2350
2351 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2352         iommu_identity_mapping |= IDENTMAP_GFX;
2353 #endif
2354
2355         check_tylersburg_isoch();
2356
2357         /*
2358          * If pass-through is not set or not enabled, set up context entries
2359          * for identity mappings for RMRR, GFX, and ISA, and possibly fall
2360          * back to static identity mapping if iommu_identity_mapping is set.
2361          */
2362         if (iommu_identity_mapping) {
2363                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2364                 if (ret) {
2365                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2366                         goto error;
2367                 }
2368         }
2369         /*
2370          * For each rmrr
2371          *   for each dev attached to rmrr
2372          *   do
2373          *     locate drhd for dev, alloc domain for dev
2374          *     allocate free domain
2375          *     allocate page table entries for rmrr
2376          *     if context not allocated for bus
2377          *           allocate and init context
2378          *           set present in root table for this bus
2379          *     init context with domain, translation etc
2380          *    endfor
2381          * endfor
2382          */
2383         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2384         for_each_rmrr_units(rmrr) {
2385                 for (i = 0; i < rmrr->devices_cnt; i++) {
2386                         pdev = rmrr->devices[i];
2387                         /*
2388                          * some BIOSes list non-existent devices in the
2389                          * DMAR table.
2390                          */
2391                         if (!pdev)
2392                                 continue;
2393                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2394                         if (ret)
2395                                 printk(KERN_ERR
2396                                        "IOMMU: mapping reserved region failed\n");
2397                 }
2398         }
2399
2400         iommu_prepare_isa();
2401
2402         /*
2403          * for each drhd
2404          *   enable fault log
2405          *   global invalidate context cache
2406          *   global invalidate iotlb
2407          *   enable translation
2408          */
2409         for_each_drhd_unit(drhd) {
2410                 if (drhd->ignored)
2411                         continue;
2412                 iommu = drhd->iommu;
2413
2414                 iommu_flush_write_buffer(iommu);
2415
2416                 ret = dmar_set_interrupt(iommu);
2417                 if (ret)
2418                         goto error;
2419
2420                 iommu_set_root_entry(iommu);
2421
2422                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2423                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2424
2425                 ret = iommu_enable_translation(iommu);
2426                 if (ret)
2427                         goto error;
2428
2429                 iommu_disable_protect_mem_regions(iommu);
2430         }
2431
2432         return 0;
2433 error:
2434         for_each_drhd_unit(drhd) {
2435                 if (drhd->ignored)
2436                         continue;
2437                 iommu = drhd->iommu;
2438                 free_iommu(iommu);
2439         }
2440         kfree(g_iommus);
2441         return ret;
2442 }
2443
2444 /* This takes a number of _MM_ pages, not VTD pages */
2445 static struct iova *intel_alloc_iova(struct device *dev,
2446                                      struct dmar_domain *domain,
2447                                      unsigned long nrpages, uint64_t dma_mask)
2448 {
2449         struct pci_dev *pdev = to_pci_dev(dev);
2450         struct iova *iova = NULL;
2451
2452         /* Restrict dma_mask to the width that the iommu can handle */
2453         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2454
2455         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2456                 /*
2457                  * First try to allocate an io virtual address in
2458                  * DMA_BIT_MASK(32) and if that fails then try allocating
2459                  * from higher range
2460                  */
2461                 iova = alloc_iova(&domain->iovad, nrpages,
2462                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2463                 if (iova)
2464                         return iova;
2465         }
2466         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2467         if (unlikely(!iova)) {
2468                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2469                        nrpages, pci_name(pdev));
2470                 return NULL;
2471         }
2472
2473         return iova;
2474 }
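
/*
 * Editor's illustration (hypothetical request): for a device with a
 * 64-bit dma_mask, dmar_forcedac clear and a 16KiB mapping
 * (nrpages == 4 MM pages), the allocation is first attempted below 4GiB,
 *
 *   iova = alloc_iova(&domain->iovad, 4, IOVA_PFN(DMA_BIT_MASK(32)), 1);
 *
 * and only if that range is exhausted does it retry against the full
 * mask-limited pfn.  The final argument requests a size-aligned
 * allocation.
 */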
2475
2476 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2477 {
2478         struct dmar_domain *domain;
2479         int ret;
2480
2481         domain = get_domain_for_dev(pdev,
2482                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2483         if (!domain) {
2484                 printk(KERN_ERR
2485                         "Allocating domain for %s failed\n", pci_name(pdev));
2486                 return NULL;
2487         }
2488
2489         /* make sure context mapping is ok */
2490         if (unlikely(!domain_context_mapped(pdev))) {
2491                 ret = domain_context_mapping(domain, pdev,
2492                                              CONTEXT_TT_MULTI_LEVEL);
2493                 if (ret) {
2494                         printk(KERN_ERR
2495                                 "Domain context map for %s failed\n",
2496                                 pci_name(pdev));
2497                         return NULL;
2498                 }
2499         }
2500
2501         return domain;
2502 }
2503
2504 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2505 {
2506         struct device_domain_info *info;
2507
2508         /* No lock here, assumes no domain exit in normal case */
2509         info = dev->dev.archdata.iommu;
2510         if (likely(info))
2511                 return info->domain;
2512
2513         return __get_valid_domain_for_dev(dev);
2514 }
2515
2516 static int iommu_dummy(struct pci_dev *pdev)
2517 {
2518         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2519 }
2520
2521 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2522 static int iommu_no_mapping(struct device *dev)
2523 {
2524         struct pci_dev *pdev;
2525         int found;
2526
2527         if (unlikely(dev->bus != &pci_bus_type))
2528                 return 1;
2529
2530         pdev = to_pci_dev(dev);
2531         if (iommu_dummy(pdev))
2532                 return 1;
2533
2534         if (!iommu_identity_mapping)
2535                 return 0;
2536
2537         found = identity_mapping(pdev);
2538         if (found) {
2539                 if (iommu_should_identity_map(pdev, 0))
2540                         return 1;
2541                 else {
2542                         /*
2543                          * The 32 bit DMA device is removed from si_domain
2544                          * and falls back to non-identity mapping.
2545                          */
2546                         domain_remove_one_dev_info(si_domain, pdev);
2547                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2548                                pci_name(pdev));
2549                         return 0;
2550                 }
2551         } else {
2552                 /*
2553                  * In case a 64 bit DMA device is detached from a VM, the device
2554                  * is put into si_domain for identity mapping.
2555                  */
2556                 if (iommu_should_identity_map(pdev, 0)) {
2557                         int ret;
2558                         ret = domain_add_dev_info(si_domain, pdev,
2559                                                   hw_pass_through ?
2560                                                   CONTEXT_TT_PASS_THROUGH :
2561                                                   CONTEXT_TT_MULTI_LEVEL);
2562                         if (!ret) {
2563                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2564                                        pci_name(pdev));
2565                                 return 1;
2566                         }
2567                 }
2568         }
2569
2570         return 0;
2571 }
2572
2573 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2574                                      size_t size, int dir, u64 dma_mask)
2575 {
2576         struct pci_dev *pdev = to_pci_dev(hwdev);
2577         struct dmar_domain *domain;
2578         phys_addr_t start_paddr;
2579         struct iova *iova;
2580         int prot = 0;
2581         int ret;
2582         struct intel_iommu *iommu;
2583         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2584
2585         BUG_ON(dir == DMA_NONE);
2586
2587         if (iommu_no_mapping(hwdev))
2588                 return paddr;
2589
2590         domain = get_valid_domain_for_dev(pdev);
2591         if (!domain)
2592                 return 0;
2593
2594         iommu = domain_get_iommu(domain);
2595         size = aligned_nrpages(paddr, size);
2596
2597         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2598                                 pdev->dma_mask);
2599         if (!iova)
2600                 goto error;
2601
2602         /*
2603          * Check if DMAR supports zero-length reads on write-only
2604          * mappings.
2605          */
2606         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2607                         !cap_zlr(iommu->cap))
2608                 prot |= DMA_PTE_READ;
2609         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2610                 prot |= DMA_PTE_WRITE;
2611         /*
2612          * The range paddr .. paddr + size might cover only part of a page, so
2613          * we should map the whole page.  Note: if two parts of one page are
2614          * mapped separately, we might have two guest addresses mapping to the
2615          * same host paddr, but this is not a big problem
2616          */
2617         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2618                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2619         if (ret)
2620                 goto error;
2621
2622         /* it's a non-present to present mapping. Only flush if caching mode */
2623         if (cap_caching_mode(iommu->cap))
2624                 iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2625         else
2626                 iommu_flush_write_buffer(iommu);
2627
2628         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2629         start_paddr += paddr & ~PAGE_MASK;
2630         return start_paddr;
2631
2632 error:
2633         if (iova)
2634                 __free_iova(&domain->iovad, iova);
2635         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2636                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2637         return 0;
2638 }
2639
2640 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2641                                  unsigned long offset, size_t size,
2642                                  enum dma_data_direction dir,
2643                                  struct dma_attrs *attrs)
2644 {
2645         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2646                                   dir, to_pci_dev(dev)->dma_mask);
2647 }
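
/*
 * Editor's sketch of the path above for a hypothetical streaming mapping:
 * a driver call such as
 *
 *   dma_addr_t handle = dma_map_single(&pdev->dev, buf, len,
 *                                      DMA_TO_DEVICE);
 *
 * is dispatched (through the dma_map_ops table below) to intel_map_page(),
 * which hands page_to_phys(page) + offset to __intel_map_single().  There
 * the length is rounded up to whole VT-d pages, an IOVA below the device's
 * dma_mask is allocated, the range is mapped with DMA_PTE_READ only (the
 * device may read but not write for DMA_TO_DEVICE), and the returned
 * handle is that IOVA plus the original sub-page offset.
 */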
2648
2649 static void flush_unmaps(void)
2650 {
2651         int i, j;
2652
2653         timer_on = 0;
2654
2655         /* just flush them all */
2656         for (i = 0; i < g_num_of_iommus; i++) {
2657                 struct intel_iommu *iommu = g_iommus[i];
2658                 if (!iommu)
2659                         continue;
2660
2661                 if (!deferred_flush[i].next)
2662                         continue;
2663
2664                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2665                                          DMA_TLB_GLOBAL_FLUSH);
2666                 for (j = 0; j < deferred_flush[i].next; j++) {
2667                         unsigned long mask;
2668                         struct iova *iova = deferred_flush[i].iova[j];
2669
2670                         mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2671                         iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2672                                         (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2673                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2674                 }
2675                 deferred_flush[i].next = 0;
2676         }
2677
2678         list_size = 0;
2679 }
2680
2681 static void flush_unmaps_timeout(unsigned long data)
2682 {
2683         unsigned long flags;
2684
2685         spin_lock_irqsave(&async_umap_flush_lock, flags);
2686         flush_unmaps();
2687         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2688 }
2689
2690 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2691 {
2692         unsigned long flags;
2693         int next, iommu_id;
2694         struct intel_iommu *iommu;
2695
2696         spin_lock_irqsave(&async_umap_flush_lock, flags);
2697         if (list_size == HIGH_WATER_MARK)
2698                 flush_unmaps();
2699
2700         iommu = domain_get_iommu(dom);
2701         iommu_id = iommu->seq_id;
2702
2703         next = deferred_flush[iommu_id].next;
2704         deferred_flush[iommu_id].domain[next] = dom;
2705         deferred_flush[iommu_id].iova[next] = iova;
2706         deferred_flush[iommu_id].next++;
2707
2708         if (!timer_on) {
2709                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2710                 timer_on = 1;
2711         }
2712         list_size++;
2713         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2714 }
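
/*
 * Editor's note on the batching above: in non-strict mode each unmap is
 * queued into deferred_flush[iommu->seq_id] rather than flushed at once.
 * The queue drains either when HIGH_WATER_MARK entries have piled up or
 * when the 10ms unmap_timer fires; flush_unmaps() then does one global
 * IOTLB flush per IOMMU, issues the per-device IOTLB invalidations, and
 * only afterwards frees the IOVAs, so an IOVA is never reused while a
 * stale translation could still be cached.
 */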
2715
2716 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2717                              size_t size, enum dma_data_direction dir,
2718                              struct dma_attrs *attrs)
2719 {
2720         struct pci_dev *pdev = to_pci_dev(dev);
2721         struct dmar_domain *domain;
2722         unsigned long start_pfn, last_pfn;
2723         struct iova *iova;
2724         struct intel_iommu *iommu;
2725
2726         if (iommu_no_mapping(dev))
2727                 return;
2728
2729         domain = find_domain(pdev);
2730         BUG_ON(!domain);
2731
2732         iommu = domain_get_iommu(domain);
2733
2734         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2735         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2736                       (unsigned long long)dev_addr))
2737                 return;
2738
2739         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2740         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2741
2742         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2743                  pci_name(pdev), start_pfn, last_pfn);
2744
2745         /*  clear the whole page */
2746         dma_pte_clear_range(domain, start_pfn, last_pfn);
2747
2748         /* free page tables */
2749         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2750
2751         if (intel_iommu_strict) {
2752                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2753                                       last_pfn - start_pfn + 1);
2754                 /* free iova */
2755                 __free_iova(&domain->iovad, iova);
2756         } else {
2757                 add_unmap(domain, iova);
2758                 /*
2759                  * queue up the release of the unmap to save the roughly 1/6th
2760                  * of the CPU time used up by the iotlb flush operation...
2761                  */
2762         }
2763 }
2764
2765 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2766                                   dma_addr_t *dma_handle, gfp_t flags)
2767 {
2768         void *vaddr;
2769         int order;
2770
2771         size = PAGE_ALIGN(size);
2772         order = get_order(size);
2773
2774         if (!iommu_no_mapping(hwdev))
2775                 flags &= ~(GFP_DMA | GFP_DMA32);
2776         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2777                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2778                         flags |= GFP_DMA;
2779                 else
2780                         flags |= GFP_DMA32;
2781         }
2782
2783         vaddr = (void *)__get_free_pages(flags, order);
2784         if (!vaddr)
2785                 return NULL;
2786         memset(vaddr, 0, size);
2787
2788         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2789                                          DMA_BIDIRECTIONAL,
2790                                          hwdev->coherent_dma_mask);
2791         if (*dma_handle)
2792                 return vaddr;
2793         free_pages((unsigned long)vaddr, order);
2794         return NULL;
2795 }
2796
2797 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2798                                 dma_addr_t dma_handle)
2799 {
2800         int order;
2801
2802         size = PAGE_ALIGN(size);
2803         order = get_order(size);
2804
2805         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2806         free_pages((unsigned long)vaddr, order);
2807 }
2808
2809 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2810                            int nelems, enum dma_data_direction dir,
2811                            struct dma_attrs *attrs)
2812 {
2813         struct pci_dev *pdev = to_pci_dev(hwdev);
2814         struct dmar_domain *domain;
2815         unsigned long start_pfn, last_pfn;
2816         struct iova *iova;
2817         struct intel_iommu *iommu;
2818
2819         if (iommu_no_mapping(hwdev))
2820                 return;
2821
2822         domain = find_domain(pdev);
2823         BUG_ON(!domain);
2824
2825         iommu = domain_get_iommu(domain);
2826
2827         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2828         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2829                       (unsigned long long)sglist[0].dma_address))
2830                 return;
2831
2832         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2833         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2834
2835         /*  clear the whole page */
2836         dma_pte_clear_range(domain, start_pfn, last_pfn);
2837
2838         /* free page tables */
2839         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2840
2841         if (intel_iommu_strict) {
2842                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2843                                       last_pfn - start_pfn + 1);
2844                 /* free iova */
2845                 __free_iova(&domain->iovad, iova);
2846         } else {
2847                 add_unmap(domain, iova);
2848                 /*
2849                  * queue up the release of the unmap to save the roughly 1/6th
2850                  * of the CPU time used up by the iotlb flush operation...
2851                  */
2852         }
2853 }
2854
2855 static int intel_nontranslate_map_sg(struct device *hddev,
2856         struct scatterlist *sglist, int nelems, int dir)
2857 {
2858         int i;
2859         struct scatterlist *sg;
2860
2861         for_each_sg(sglist, sg, nelems, i) {
2862                 BUG_ON(!sg_page(sg));
2863                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2864                 sg->dma_length = sg->length;
2865         }
2866         return nelems;
2867 }
2868
2869 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2870                         enum dma_data_direction dir, struct dma_attrs *attrs)
2871 {
2872         int i;
2873         struct pci_dev *pdev = to_pci_dev(hwdev);
2874         struct dmar_domain *domain;
2875         size_t size = 0;
2876         int prot = 0;
2877         size_t offset_pfn = 0;
2878         struct iova *iova = NULL;
2879         int ret;
2880         struct scatterlist *sg;
2881         unsigned long start_vpfn;
2882         struct intel_iommu *iommu;
2883
2884         BUG_ON(dir == DMA_NONE);
2885         if (iommu_no_mapping(hwdev))
2886                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2887
2888         domain = get_valid_domain_for_dev(pdev);
2889         if (!domain)
2890                 return 0;
2891
2892         iommu = domain_get_iommu(domain);
2893
2894         for_each_sg(sglist, sg, nelems, i)
2895                 size += aligned_nrpages(sg->offset, sg->length);
2896
2897         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2898                                 pdev->dma_mask);
2899         if (!iova) {
2900                 sglist->dma_length = 0;
2901                 return 0;
2902         }
2903
2904         /*
2905          * Check if DMAR supports zero-length reads on write-only
2906          * mappings.
2907          */
2908         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2909                         !cap_zlr(iommu->cap))
2910                 prot |= DMA_PTE_READ;
2911         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2912                 prot |= DMA_PTE_WRITE;
2913
2914         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2915
2916         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2917         if (unlikely(ret)) {
2918                 /*  clear the page */
2919                 dma_pte_clear_range(domain, start_vpfn,
2920                                     start_vpfn + size - 1);
2921                 /* free page tables */
2922                 dma_pte_free_pagetable(domain, start_vpfn,
2923                                        start_vpfn + size - 1);
2924                 /* free iova */
2925                 __free_iova(&domain->iovad, iova);
2926                 return 0;
2927         }
2928
2929         /* it's a non-present to present mapping. Only flush if caching mode */
2930         if (cap_caching_mode(iommu->cap))
2931                 iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn);
2932         else
2933                 iommu_flush_write_buffer(iommu);
2934
2935         return nelems;
2936 }
2937
2938 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2939 {
2940         return !dma_addr;
2941 }
2942
2943 struct dma_map_ops intel_dma_ops = {
2944         .alloc_coherent = intel_alloc_coherent,
2945         .free_coherent = intel_free_coherent,
2946         .map_sg = intel_map_sg,
2947         .unmap_sg = intel_unmap_sg,
2948         .map_page = intel_map_page,
2949         .unmap_page = intel_unmap_page,
2950         .mapping_error = intel_mapping_error,
2951 };
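
/*
 * Editor's note: intel_iommu_init(), later in this file, installs this
 * table as the global dma_ops, so ordinary DMA-API calls are routed here.
 * A hypothetical scatter-gather request, for example,
 *
 *   int n = dma_map_sg(&pdev->dev, sglist, nelems, DMA_FROM_DEVICE);
 *
 * lands in intel_map_sg() above, and the matching
 * dma_unmap_sg(&pdev->dev, sglist, nelems, DMA_FROM_DEVICE) lands in
 * intel_unmap_sg().
 */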
2952
2953 static inline int iommu_domain_cache_init(void)
2954 {
2955         int ret = 0;
2956
2957         iommu_domain_cache = kmem_cache_create("iommu_domain",
2958                                          sizeof(struct dmar_domain),
2959                                          0,
2960                                          SLAB_HWCACHE_ALIGN,
2961                                          NULL);
2962
2963         if (!iommu_domain_cache) {
2964                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2965                 ret = -ENOMEM;
2966         }
2967
2968         return ret;
2969 }
2970
2971 static inline int iommu_devinfo_cache_init(void)
2972 {
2973         int ret = 0;
2974
2975         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2976                                          sizeof(struct device_domain_info),
2977                                          0,
2978                                          SLAB_HWCACHE_ALIGN,
2979                                          NULL);
2980         if (!iommu_devinfo_cache) {
2981                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2982                 ret = -ENOMEM;
2983         }
2984
2985         return ret;
2986 }
2987
2988 static inline int iommu_iova_cache_init(void)
2989 {
2990         int ret = 0;
2991
2992         iommu_iova_cache = kmem_cache_create("iommu_iova",
2993                                          sizeof(struct iova),
2994                                          0,
2995                                          SLAB_HWCACHE_ALIGN,
2996                                          NULL);
2997         if (!iommu_iova_cache) {
2998                 printk(KERN_ERR "Couldn't create iova cache\n");
2999                 ret = -ENOMEM;
3000         }
3001
3002         return ret;
3003 }
3004
3005 static int __init iommu_init_mempool(void)
3006 {
3007         int ret;
3008         ret = iommu_iova_cache_init();
3009         if (ret)
3010                 return ret;
3011
3012         ret = iommu_domain_cache_init();
3013         if (ret)
3014                 goto domain_error;
3015
3016         ret = iommu_devinfo_cache_init();
3017         if (!ret)
3018                 return ret;
3019
3020         kmem_cache_destroy(iommu_domain_cache);
3021 domain_error:
3022         kmem_cache_destroy(iommu_iova_cache);
3023
3024         return -ENOMEM;
3025 }
3026
3027 static void __init iommu_exit_mempool(void)
3028 {
3029         kmem_cache_destroy(iommu_devinfo_cache);
3030         kmem_cache_destroy(iommu_domain_cache);
3031         kmem_cache_destroy(iommu_iova_cache);
3032
3033 }
3034
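/*
 * Two passes over the DRHD units: first mark any unit whose device scope
 * matched no PCI devices as ignored; then, unless gfx mapping is enabled
 * (dmar_map_gfx), also bypass units that cover only graphics devices by
 * giving their devices a dummy archdata.iommu pointer.
 */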
3035 static void __init init_no_remapping_devices(void)
3036 {
3037         struct dmar_drhd_unit *drhd;
3038
3039         for_each_drhd_unit(drhd) {
3040                 if (!drhd->include_all) {
3041                         int i;
3042                         for (i = 0; i < drhd->devices_cnt; i++)
3043                                 if (drhd->devices[i] != NULL)
3044                                         break;
3045                         /* ignore DMAR unit if no PCI devices exist */
3046                         if (i == drhd->devices_cnt)
3047                                 drhd->ignored = 1;
3048                 }
3049         }
3050
3051         if (dmar_map_gfx)
3052                 return;
3053
3054         for_each_drhd_unit(drhd) {
3055                 int i;
3056                 if (drhd->ignored || drhd->include_all)
3057                         continue;
3058
3059                 for (i = 0; i < drhd->devices_cnt; i++)
3060                         if (drhd->devices[i] &&
3061                                 !IS_GFX_DEVICE(drhd->devices[i]))
3062                                 break;
3063
3064                 if (i < drhd->devices_cnt)
3065                         continue;
3066
3067                 /* bypass IOMMU if it is just for gfx devices */
3068                 drhd->ignored = 1;
3069                 for (i = 0; i < drhd->devices_cnt; i++) {
3070                         if (!drhd->devices[i])
3071                                 continue;
3072                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3073                 }
3074         }
3075 }
3076
3077 #ifdef CONFIG_SUSPEND
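/*
 * Reprogram every active IOMMU from scratch on resume: re-enable queued
 * invalidation, set the root entry, perform global context and IOTLB
 * flushes, then turn translation back on.
 */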
3078 static int init_iommu_hw(void)
3079 {
3080         struct dmar_drhd_unit *drhd;
3081         struct intel_iommu *iommu = NULL;
3082
3083         for_each_active_iommu(iommu, drhd)
3084                 if (iommu->qi)
3085                         dmar_reenable_qi(iommu);
3086
3087         for_each_active_iommu(iommu, drhd) {
3088                 iommu_flush_write_buffer(iommu);
3089
3090                 iommu_set_root_entry(iommu);
3091
3092                 iommu->flush.flush_context(iommu, 0, 0, 0,
3093                                            DMA_CCMD_GLOBAL_INVL);
3094                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3095                                          DMA_TLB_GLOBAL_FLUSH);
3096                 iommu_enable_translation(iommu);
3097                 iommu_disable_protect_mem_regions(iommu);
3098         }
3099
3100         return 0;
3101 }
3102
3103 static void iommu_flush_all(void)
3104 {
3105         struct dmar_drhd_unit *drhd;
3106         struct intel_iommu *iommu;
3107
3108         for_each_active_iommu(iommu, drhd) {
3109                 iommu->flush.flush_context(iommu, 0, 0, 0,
3110                                            DMA_CCMD_GLOBAL_INVL);
3111                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3112                                          DMA_TLB_GLOBAL_FLUSH);
3113         }
3114 }
3115
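/*
 * Save the fault-event control/data/address registers of each active
 * IOMMU and disable translation; iommu_resume() re-initializes the
 * hardware via init_iommu_hw() and then writes these registers back.
 */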
3116 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3117 {
3118         struct dmar_drhd_unit *drhd;
3119         struct intel_iommu *iommu = NULL;
3120         unsigned long flag;
3121
3122         for_each_active_iommu(iommu, drhd) {
3123                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3124                                                  GFP_ATOMIC);
3125                 if (!iommu->iommu_state)
3126                         goto nomem;
3127         }
3128
3129         iommu_flush_all();
3130
3131         for_each_active_iommu(iommu, drhd) {
3132                 iommu_disable_translation(iommu);
3133
3134                 spin_lock_irqsave(&iommu->register_lock, flag);
3135
3136                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3137                         readl(iommu->reg + DMAR_FECTL_REG);
3138                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3139                         readl(iommu->reg + DMAR_FEDATA_REG);
3140                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3141                         readl(iommu->reg + DMAR_FEADDR_REG);
3142                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3143                         readl(iommu->reg + DMAR_FEUADDR_REG);
3144
3145                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3146         }
3147         return 0;
3148
3149 nomem:
3150         for_each_active_iommu(iommu, drhd)
3151                 kfree(iommu->iommu_state);
3152
3153         return -ENOMEM;
3154 }
3155
3156 static int iommu_resume(struct sys_device *dev)
3157 {
3158         struct dmar_drhd_unit *drhd;
3159         struct intel_iommu *iommu = NULL;
3160         unsigned long flag;
3161
3162         if (init_iommu_hw()) {
3163                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3164                 return -EIO;
3165         }
3166
3167         for_each_active_iommu(iommu, drhd) {
3168
3169                 spin_lock_irqsave(&iommu->register_lock, flag);
3170
3171                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3172                         iommu->reg + DMAR_FECTL_REG);
3173                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3174                         iommu->reg + DMAR_FEDATA_REG);
3175                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3176                         iommu->reg + DMAR_FEADDR_REG);
3177                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3178                         iommu->reg + DMAR_FEUADDR_REG);
3179
3180                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3181         }
3182
3183         for_each_active_iommu(iommu, drhd)
3184                 kfree(iommu->iommu_state);
3185
3186         return 0;
3187 }
3188
3189 static struct sysdev_class iommu_sysclass = {
3190         .name           = "iommu",
3191         .resume         = iommu_resume,
3192         .suspend        = iommu_suspend,
3193 };
3194
3195 static struct sys_device device_iommu = {
3196         .cls    = &iommu_sysclass,
3197 };
3198
3199 static int __init init_iommu_sysfs(void)
3200 {
3201         int error;
3202
3203         error = sysdev_class_register(&iommu_sysclass);
3204         if (error)
3205                 return error;
3206
3207         error = sysdev_register(&device_iommu);
3208         if (error)
3209                 sysdev_class_unregister(&iommu_sysclass);
3210
3211         return error;
3212 }
3213
3214 #else
3215 static int __init init_iommu_sysfs(void)
3216 {
3217         return 0;
3218 }
3219 #endif  /* CONFIG_SUSPEND */
3220
3221 /*
3222  * Here we only respond to the driver-unbind action for a device.
3223  *
3224  * A newly added device is not attached to its DMAR domain here yet; that
3225  * happens when the device is first mapped to an iova.
3226  */
3227 static int device_notifier(struct notifier_block *nb,
3228                                   unsigned long action, void *data)
3229 {
3230         struct device *dev = data;
3231         struct pci_dev *pdev = to_pci_dev(dev);
3232         struct dmar_domain *domain;
3233
3234         if (iommu_no_mapping(dev))
3235                 return 0;
3236
3237         domain = find_domain(pdev);
3238         if (!domain)
3239                 return 0;
3240
3241         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3242                 domain_remove_one_dev_info(domain, pdev);
3243
3244         return 0;
3245 }
3246
3247 static struct notifier_block device_nb = {
3248         .notifier_call = device_notifier,
3249 };
3250
3251 int __init intel_iommu_init(void)
3252 {
3253         int ret = 0;
3254         int force_on = 0;
3255
3256         /* VT-d is required for a TXT/tboot launch, so enforce that */
3257         force_on = tboot_force_iommu();
3258
3259         if (dmar_table_init()) {
3260                 if (force_on)
3261                         panic("tboot: Failed to initialize DMAR table\n");
3262                 return  -ENODEV;
3263         }
3264
3265         if (dmar_dev_scope_init()) {
3266                 if (force_on)
3267                         panic("tboot: Failed to initialize DMAR device scope\n");
3268                 return  -ENODEV;
3269         }
3270
3271         /*
3272          * Check the need for DMA-remapping initialization now.
3273          * Above initialization will also be used by Interrupt-remapping.
3274          */
3275         if (no_iommu || dmar_disabled)
3276                 return -ENODEV;
3277
3278         iommu_init_mempool();
3279         dmar_init_reserved_ranges();
3280
3281         init_no_remapping_devices();
3282
3283         ret = init_dmars();
3284         if (ret) {
3285                 if (force_on)
3286                         panic("tboot: Failed to initialize DMARs\n");
3287                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3288                 put_iova_domain(&reserved_iova_list);
3289                 iommu_exit_mempool();
3290                 return ret;
3291         }
3292         printk(KERN_INFO
3293         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3294
3295         init_timer(&unmap_timer);
3296 #ifdef CONFIG_SWIOTLB
3297         swiotlb = 0;
3298 #endif
3299         dma_ops = &intel_dma_ops;
3300
3301         init_iommu_sysfs();
3302
3303         register_iommu(&intel_iommu_ops);
3304
3305         bus_register_notifier(&pci_bus_type, &device_nb);
3306
3307         return 0;
3308 }
3309
3310 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3311                                            struct pci_dev *pdev)
3312 {
3313         struct pci_dev *tmp, *parent;
3314
3315         if (!iommu || !pdev)
3316                 return;
3317
3318         /* dependent device detach */
3319         tmp = pci_find_upstream_pcie_bridge(pdev);
3320         /* Secondary interface's bus number and devfn 0 */
3321         if (tmp) {
3322                 parent = pdev->bus->self;
3323                 while (parent != tmp) {
3324                         iommu_detach_dev(iommu, parent->bus->number,
3325                                          parent->devfn);
3326                         parent = parent->bus->self;
3327                 }
3328                 if (pci_is_pcie(tmp)) /* this is a PCIE-to-PCI bridge */
3329                         iommu_detach_dev(iommu,
3330                                 tmp->subordinate->number, 0);
3331                 else /* this is a legacy PCI bridge */
3332                         iommu_detach_dev(iommu, tmp->bus->number,
3333                                          tmp->devfn);
3334         }
3335 }
3336
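/*
 * Detach one PCI device from @domain: drop its device_domain_info, its
 * context entry and dev-IOTLB, and, if no other device behind the same
 * hardware unit remains in the domain, clear that unit from iommu_bmp
 * and update the domain's iommu count and capabilities.
 */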
3337 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3338                                           struct pci_dev *pdev)
3339 {
3340         struct device_domain_info *info;
3341         struct intel_iommu *iommu;
3342         unsigned long flags;
3343         int found = 0;
3344         struct list_head *entry, *tmp;
3345
3346         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3347                                 pdev->devfn);
3348         if (!iommu)
3349                 return;
3350
3351         spin_lock_irqsave(&device_domain_lock, flags);
3352         list_for_each_safe(entry, tmp, &domain->devices) {
3353                 info = list_entry(entry, struct device_domain_info, link);
3354                 /* No need to compare PCI domain; it has to be the same */
3355                 if (info->bus == pdev->bus->number &&
3356                     info->devfn == pdev->devfn) {
3357                         list_del(&info->link);
3358                         list_del(&info->global);
3359                         if (info->dev)
3360                                 info->dev->dev.archdata.iommu = NULL;
3361                         spin_unlock_irqrestore(&device_domain_lock, flags);
3362
3363                         iommu_disable_dev_iotlb(info);
3364                         iommu_detach_dev(iommu, info->bus, info->devfn);
3365                         iommu_detach_dependent_devices(iommu, pdev);
3366                         free_devinfo_mem(info);
3367
3368                         spin_lock_irqsave(&device_domain_lock, flags);
3369
3370                         if (found)
3371                                 break;
3372                         else
3373                                 continue;
3374                 }
3375
3376                 /* If there are no other devices under the same iommu
3377                  * owned by this domain, clear this iommu in iommu_bmp,
3378                  * and update the iommu count and coherency.
3379                  */
3380                 if (iommu == device_to_iommu(info->segment, info->bus,
3381                                             info->devfn))
3382                         found = 1;
3383         }
3384
3385         if (found == 0) {
3386                 unsigned long tmp_flags;
3387                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3388                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3389                 domain->iommu_count--;
3390                 domain_update_iommu_cap(domain);
3391                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3392         }
3393
3394         spin_unlock_irqrestore(&device_domain_lock, flags);
3395 }
3396
3397 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3398 {
3399         struct device_domain_info *info;
3400         struct intel_iommu *iommu;
3401         unsigned long flags1, flags2;
3402
3403         spin_lock_irqsave(&device_domain_lock, flags1);
3404         while (!list_empty(&domain->devices)) {
3405                 info = list_entry(domain->devices.next,
3406                         struct device_domain_info, link);
3407                 list_del(&info->link);
3408                 list_del(&info->global);
3409                 if (info->dev)
3410                         info->dev->dev.archdata.iommu = NULL;
3411
3412                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3413
3414                 iommu_disable_dev_iotlb(info);
3415                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3416                 iommu_detach_dev(iommu, info->bus, info->devfn);
3417                 iommu_detach_dependent_devices(iommu, info->dev);
3418
3419                 /* clear this iommu in iommu_bmp, update iommu count
3420                  * and capabilities
3421                  */
3422                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3423                 if (test_and_clear_bit(iommu->seq_id,
3424                                        &domain->iommu_bmp)) {
3425                         domain->iommu_count--;
3426                         domain_update_iommu_cap(domain);
3427                 }
3428                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3429
3430                 free_devinfo_mem(info);
3431                 spin_lock_irqsave(&device_domain_lock, flags1);
3432         }
3433         spin_unlock_irqrestore(&device_domain_lock, flags1);
3434 }
3435
3436 /* domain id for virtual machine; it won't be set in context entries */
3437 static unsigned long vm_domid;
3438
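/*
 * Smallest adjusted guest address width among the hardware units backing
 * this domain; intel_iommu_map_range() uses it to reject mappings that
 * not every unit could translate.
 */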
3439 static int vm_domain_min_agaw(struct dmar_domain *domain)
3440 {
3441         int i;
3442         int min_agaw = domain->agaw;
3443
3444         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3445         for (; i < g_num_of_iommus; ) {
3446                 if (min_agaw > g_iommus[i]->agaw)
3447                         min_agaw = g_iommus[i]->agaw;
3448
3449                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3450         }
3451
3452         return min_agaw;
3453 }
3454
3455 static struct dmar_domain *iommu_alloc_vm_domain(void)
3456 {
3457         struct dmar_domain *domain;
3458
3459         domain = alloc_domain_mem();
3460         if (!domain)
3461                 return NULL;
3462
3463         domain->id = vm_domid++;
3464         domain->nid = -1;
3465         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3466         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3467
3468         return domain;
3469 }
3470
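/*
 * Minimal domain setup for the IOMMU-API path: an iova allocator with the
 * special ranges reserved, an AGAW derived from the requested guest width,
 * and a freshly allocated top-level page directory.
 */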
3471 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3472 {
3473         int adjust_width;
3474
3475         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3476         spin_lock_init(&domain->iommu_lock);
3477
3478         domain_reserve_special_ranges(domain);
3479
3480         /* calculate AGAW */
3481         domain->gaw = guest_width;
3482         adjust_width = guestwidth_to_adjustwidth(guest_width);
3483         domain->agaw = width_to_agaw(adjust_width);
3484
3485         INIT_LIST_HEAD(&domain->devices);
3486
3487         domain->iommu_count = 0;
3488         domain->iommu_coherency = 0;
3489         domain->iommu_snooping = 0;
3490         domain->max_addr = 0;
3491         domain->nid = -1;
3492
3493         /* always allocate the top pgd */
3494         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3495         if (!domain->pgd)
3496                 return -ENOMEM;
3497         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3498         return 0;
3499 }
3500
3501 static void iommu_free_vm_domain(struct dmar_domain *domain)
3502 {
3503         unsigned long flags;
3504         struct dmar_drhd_unit *drhd;
3505         struct intel_iommu *iommu;
3506         unsigned long i;
3507         unsigned long ndomains;
3508
3509         for_each_drhd_unit(drhd) {
3510                 if (drhd->ignored)
3511                         continue;
3512                 iommu = drhd->iommu;
3513
3514                 ndomains = cap_ndoms(iommu->cap);
3515                 i = find_first_bit(iommu->domain_ids, ndomains);
3516                 for (; i < ndomains; ) {
3517                         if (iommu->domains[i] == domain) {
3518                                 spin_lock_irqsave(&iommu->lock, flags);
3519                                 clear_bit(i, iommu->domain_ids);
3520                                 iommu->domains[i] = NULL;
3521                                 spin_unlock_irqrestore(&iommu->lock, flags);
3522                                 break;
3523                         }
3524                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3525                 }
3526         }
3527 }
3528
3529 static void vm_domain_exit(struct dmar_domain *domain)
3530 {
3531         /* Domain 0 is reserved, so don't process it */
3532         if (!domain)
3533                 return;
3534
3535         vm_domain_remove_all_dev_info(domain);
3536         /* destroy iovas */
3537         put_iova_domain(&domain->iovad);
3538
3539         /* clear ptes */
3540         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3541
3542         /* free page tables */
3543         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3544
3545         iommu_free_vm_domain(domain);
3546         free_domain_mem(domain);
3547 }
3548
3549 static int intel_iommu_domain_init(struct iommu_domain *domain)
3550 {
3551         struct dmar_domain *dmar_domain;
3552
3553         dmar_domain = iommu_alloc_vm_domain();
3554         if (!dmar_domain) {
3555                 printk(KERN_ERR
3556                         "intel_iommu_domain_init: failed to allocate dmar_domain\n");
3557                 return -ENOMEM;
3558         }
3559         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3560                 printk(KERN_ERR
3561                         "intel_iommu_domain_init() failed\n");
3562                 vm_domain_exit(dmar_domain);
3563                 return -ENOMEM;
3564         }
3565         domain->priv = dmar_domain;
3566
3567         return 0;
3568 }
3569
3570 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3571 {
3572         struct dmar_domain *dmar_domain = domain->priv;
3573
3574         domain->priv = NULL;
3575         vm_domain_exit(dmar_domain);
3576 }
3577
3578 static int intel_iommu_attach_device(struct iommu_domain *domain,
3579                                      struct device *dev)
3580 {
3581         struct dmar_domain *dmar_domain = domain->priv;
3582         struct pci_dev *pdev = to_pci_dev(dev);
3583         struct intel_iommu *iommu;
3584         int addr_width;
3585         u64 end;
3586
3587         /* normally pdev is not mapped */
3588         if (unlikely(domain_context_mapped(pdev))) {
3589                 struct dmar_domain *old_domain;
3590
3591                 old_domain = find_domain(pdev);
3592                 if (old_domain) {
3593                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3594                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3595                                 domain_remove_one_dev_info(old_domain, pdev);
3596                         else
3597                                 domain_remove_dev_info(old_domain);
3598                 }
3599         }
3600
3601         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3602                                 pdev->devfn);
3603         if (!iommu)
3604                 return -ENODEV;
3605
3606         /* check if this iommu agaw is sufficient for max mapped address */
3607         addr_width = agaw_to_width(iommu->agaw);
3608         end = DOMAIN_MAX_ADDR(addr_width);
3609         end = end & VTD_PAGE_MASK;
3610         if (end < dmar_domain->max_addr) {
3611                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3612                        "sufficient for the mapped address (%llx)\n",
3613                        __func__, iommu->agaw, dmar_domain->max_addr);
3614                 return -EFAULT;
3615         }
3616
3617         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3618 }
3619
3620 static void intel_iommu_detach_device(struct iommu_domain *domain,
3621                                       struct device *dev)
3622 {
3623         struct dmar_domain *dmar_domain = domain->priv;
3624         struct pci_dev *pdev = to_pci_dev(dev);
3625
3626         domain_remove_one_dev_info(dmar_domain, pdev);
3627 }
3628
3629 static int intel_iommu_map_range(struct iommu_domain *domain,
3630                                  unsigned long iova, phys_addr_t hpa,
3631                                  size_t size, int iommu_prot)
3632 {
3633         struct dmar_domain *dmar_domain = domain->priv;
3634         u64 max_addr;
3635         int addr_width;
3636         int prot = 0;
3637         int ret;
3638
3639         if (iommu_prot & IOMMU_READ)
3640                 prot |= DMA_PTE_READ;
3641         if (iommu_prot & IOMMU_WRITE)
3642                 prot |= DMA_PTE_WRITE;
3643         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3644                 prot |= DMA_PTE_SNP;
3645
3646         max_addr = iova + size;
3647         if (dmar_domain->max_addr < max_addr) {
3648                 int min_agaw;
3649                 u64 end;
3650
3651                 /* check if minimum agaw is sufficient for mapped address */
3652                 min_agaw = vm_domain_min_agaw(dmar_domain);
3653                 addr_width = agaw_to_width(min_agaw);
3654                 end = DOMAIN_MAX_ADDR(addr_width);
3655                 end = end & VTD_PAGE_MASK;
3656                 if (end < max_addr) {
3657                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3658                                "sufficient for the mapped address (%llx)\n",
3659                                __func__, min_agaw, max_addr);
3660                         return -EFAULT;
3661                 }
3662                 dmar_domain->max_addr = max_addr;
3663         }
3664         /* Round up size to next multiple of PAGE_SIZE, if it and
3665            the low bits of hpa would take us onto the next page */
3666         size = aligned_nrpages(hpa, size);
3667         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3668                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3669         return ret;
3670 }
3671
3672 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3673                                     unsigned long iova, size_t size)
3674 {
3675         struct dmar_domain *dmar_domain = domain->priv;
3676
3677         if (!size)
3678                 return;
3679
3680         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3681                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3682
3683         if (dmar_domain->max_addr == iova + size)
3684                 dmar_domain->max_addr = iova;
3685 }
3686
3687 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3688                                             unsigned long iova)
3689 {
3690         struct dmar_domain *dmar_domain = domain->priv;
3691         struct dma_pte *pte;
3692         u64 phys = 0;
3693
3694         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3695         if (pte)
3696                 phys = dma_pte_addr(pte);
3697
3698         return phys;
3699 }
3700
3701 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3702                                       unsigned long cap)
3703 {
3704         struct dmar_domain *dmar_domain = domain->priv;
3705
3706         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3707                 return dmar_domain->iommu_snooping;
3708
3709         return 0;
3710 }
3711
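/*
 * Hooks for the generic IOMMU API, registered via register_iommu() in
 * intel_iommu_init().  A consumer such as KVM device assignment reaches
 * them through the generic layer in drivers/base/iommu.c, roughly
 * (sketch only -- the generic helper names vary between kernel versions):
 *
 *	domain = iommu_domain_alloc();                  ->domain_init()
 *	iommu_attach_device(domain, &pdev->dev);        ->attach_dev()
 *	iommu_map_range(domain, iova, hpa, size,
 *			IOMMU_READ | IOMMU_WRITE);      ->map()
 */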
3712 static struct iommu_ops intel_iommu_ops = {
3713         .domain_init    = intel_iommu_domain_init,
3714         .domain_destroy = intel_iommu_domain_destroy,
3715         .attach_dev     = intel_iommu_attach_device,
3716         .detach_dev     = intel_iommu_detach_device,
3717         .map            = intel_iommu_map_range,
3718         .unmap          = intel_iommu_unmap_range,
3719         .iova_to_phys   = intel_iommu_iova_to_phys,
3720         .domain_has_cap = intel_iommu_domain_has_cap,
3721 };
3722
3723 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3724 {
3725         /*
3726          * Mobile 4 Series Chipset neglects to set RWBF capability,
3727          * but needs it:
3728          */
3729         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3730         rwbf_quirk = 1;
3731 }
3732
3733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3734
3735 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3736    ISOCH DMAR unit for the Azalia sound device, but not give it any
3737    TLB entries, which causes it to deadlock. Check for that.  We do
3738    this in a function called from init_dmars(), instead of in a PCI
3739    quirk, because we don't want to print the obnoxious "BIOS broken"
3740    message if VT-d is actually disabled.
3741 */
3742 static void __init check_tylersburg_isoch(void)
3743 {
3744         struct pci_dev *pdev;
3745         uint32_t vtisochctrl;
3746
3747         /* If there's no Azalia in the system anyway, forget it. */
3748         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3749         if (!pdev)
3750                 return;
3751         pci_dev_put(pdev);
3752
3753         /* System Management Registers. Might be hidden, in which case
3754            we can't do the sanity check. But that's OK, because the
3755            known-broken BIOSes _don't_ actually hide it, so far. */
3756         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3757         if (!pdev)
3758                 return;
3759
3760         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3761                 pci_dev_put(pdev);
3762                 return;
3763         }
3764
3765         pci_dev_put(pdev);
3766
3767         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3768         if (vtisochctrl & 1)
3769                 return;
3770
3771         /* Drop all bits other than the number of TLB entries */
3772         vtisochctrl &= 0x1c;
3773
3774         /* If we have the recommended number of TLB entries (16), fine. */
3775         if (vtisochctrl == 0x10)
3776                 return;
3777
3778         /* Zero TLB entries? You get to ride the short bus to school. */
3779         if (!vtisochctrl) {
3780                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3781                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3782                      dmi_get_system_info(DMI_BIOS_VENDOR),
3783                      dmi_get_system_info(DMI_BIOS_VERSION),
3784                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3785                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3786                 return;
3787         }
3788
3789         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3790                vtisochctrl);
3791 }